path: root/contrib/masmx64
author     Mark Adler <zlib@madler.net>  2017-10-12 20:08:53 -0700
committer  Mark Adler <zlib@madler.net>  2017-10-12 20:27:14 -0700
commit     288f1080317b954b6bdca33708631c011549c008 (patch)
tree       9629f01104722ba8e490f04a0790c56513ba989a /contrib/masmx64
parent     a5773513942b1c57d0eff51fcb2ebac72796ed95 (diff)
download   zlib-288f1080317b954b6bdca33708631c011549c008.tar.gz
zlib-288f1080317b954b6bdca33708631c011549c008.tar.bz2
zlib-288f1080317b954b6bdca33708631c011549c008.zip
Remove old assembler code in which bugs have manifested.
In addition, there is not sufficient gain from the inflate assembler code to warrant its inclusion.
Diffstat (limited to 'contrib/masmx64')
-rw-r--r--  contrib/masmx64/bld_ml64.bat     2
-rw-r--r--  contrib/masmx64/gvmat64.asm    553
-rw-r--r--  contrib/masmx64/inffas8664.c   186
-rw-r--r--  contrib/masmx64/inffasx64.asm  396
-rw-r--r--  contrib/masmx64/readme.txt      31
5 files changed, 0 insertions, 1168 deletions
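
Note on build impact: the files below were optional drop-in replacements, compiled in only when a project defined ASMV (assembler longest_match from gvmat64.obj) and ASMINF (assembler inflate_fast via inffas8664.c and inffasx64.obj); without those defines zlib already uses its portable C implementations, so the public API is unchanged by the removal. A minimal round-trip sketch of that unchanged API follows; the file name and the build line in the comment are illustrative assumptions, while compress2() and uncompress() are the standard zlib calls.

    /* roundtrip.c -- illustrative name; build sketch: cc roundtrip.c -lz */
    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        const Bytef src[] = "the quick brown fox jumps over the lazy dog, "
                            "the quick brown fox jumps over the lazy dog";
        uLong srcLen = (uLong)sizeof(src);
        Bytef comp[512], out[512];
        uLongf compLen = sizeof(comp), outLen = sizeof(out);

        /* deflate path (the portable C longest_match in a default build) */
        if (compress2(comp, &compLen, src, srcLen, Z_BEST_COMPRESSION) != Z_OK)
            return 1;
        /* inflate path (the portable C inflate_fast in a default build) */
        if (uncompress(out, &outLen, comp, compLen) != Z_OK)
            return 1;

        printf("%lu -> %lu -> %lu bytes: %s\n", srcLen, (uLong)compLen, (uLong)outLen,
               (outLen == srcLen && memcmp(src, out, (size_t)srcLen) == 0) ? "ok" : "mismatch");
        return 0;
    }
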
diff --git a/contrib/masmx64/bld_ml64.bat b/contrib/masmx64/bld_ml64.bat
deleted file mode 100644
index 8f9343d..0000000
--- a/contrib/masmx64/bld_ml64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
1ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
2ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm
deleted file mode 100644
index 9879c28..0000000
--- a/contrib/masmx64/gvmat64.asm
+++ /dev/null
@@ -1,553 +0,0 @@
1;uInt longest_match_x64(
2; deflate_state *s,
3; IPos cur_match); /* current match */
4
5; gvmat64.asm -- Asm portion of the optimized longest_match for 64-bit x86_64
6; (AMD64 on Athlon 64, Opteron, Phenom
7; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core i5/i7)
8; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
9;
10; File written by Gilles Vollant, by converting to assembly the longest_match
11; function from Jean-loup Gailly's deflate.c in zLib and Info-ZIP's Zip.
12;
13; and by taking inspiration from asm686 with masm, the optimised assembly code
14; by Brian Raiter, written in 1998
15;
16; This software is provided 'as-is', without any express or implied
17; warranty. In no event will the authors be held liable for any damages
18; arising from the use of this software.
19;
20; Permission is granted to anyone to use this software for any purpose,
21; including commercial applications, and to alter it and redistribute it
22; freely, subject to the following restrictions:
23;
24; 1. The origin of this software must not be misrepresented; you must not
25; claim that you wrote the original software. If you use this software
26; in a product, an acknowledgment in the product documentation would be
27; appreciated but is not required.
28; 2. Altered source versions must be plainly marked as such, and must not be
29; misrepresented as being the original software.
30; 3. This notice may not be removed or altered from any source distribution.
31;
32;
33;
34; http://www.zlib.net
35; http://www.winimage.com/zLibDll
36; http://www.muppetlabs.com/~breadbox/software/assembly.html
37;
38; to compile this file for Info-ZIP's Zip, I use the option:
39; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
40;
41; to compile this file for zLib, I use the option:
42; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43; Be careful to adapt zlib1222add below to your version of zLib
44; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
45; the value of zlib1222add below)
46;
47; This file compiles with Microsoft Macro Assembler (x64) for AMD64
48;
49; ml64.exe ships with Visual Studio 2005/2008/2010 and the Windows WDK
50;
51; (you can get Windows WDK with ml64 for AMD64 from
52; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for a low price)
53;
54
55
56;uInt longest_match(s, cur_match)
57; deflate_state *s;
58; IPos cur_match; /* current match */
59.code
60longest_match PROC
61
62
63;LocalVarsSize equ 88
64 LocalVarsSize equ 72
65
66; registers used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
67; free registers : r14,r15
68; register can be saved : rsp
69
70 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
71 ; low word: s->wmask
72;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
73;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
74;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
75;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
76;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
77;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
78;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
79IFDEF INFOZIP
80ELSE
81 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
82ENDIF
83
84save_rdi equ rsp + 24 - LocalVarsSize
85save_rsi equ rsp + 32 - LocalVarsSize
86save_rbx equ rsp + 40 - LocalVarsSize
87save_rbp equ rsp + 48 - LocalVarsSize
88save_r12 equ rsp + 56 - LocalVarsSize
89save_r13 equ rsp + 64 - LocalVarsSize
90;save_r14 equ rsp + 72 - LocalVarsSize
91;save_r15 equ rsp + 80 - LocalVarsSize
92
93
94; summary of register usage
95; scanend ebx
96; scanendw bx
97; chainlenwmask edx
98; curmatch rsi
99; curmatchd esi
100; windowbestlen r8
101; scanalign r9
102; scanalignd r9d
103; window r10
104; bestlen r11
105; bestlend r11d
106; scanstart r12d
107; scanstartw r12w
108; scan r13
109; nicematch r14d
110; limit r15
111; limitd r15d
112; prev rcx
113
114; all the +4 offsets are due to the addition of pending_buf_size (in zlib)
115; in the deflate_state structure since the asm code was first written
116; (if you compile with zlib 1.0.4 or older, remove the +4).
117; Note : these values are valid for a structure packed on 8-byte boundaries
118
119
120 MAX_MATCH equ 258
121 MIN_MATCH equ 3
122 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
123
124
125;;; Offsets for fields in the deflate_state structure. These numbers
126;;; are calculated from the definition of deflate_state, with the
127;;; assumption that the compiler will dword-align the fields. (Thus,
128;;; changing the definition of deflate_state could easily cause this
129;;; program to crash horribly, without so much as a warning at
130;;; compile time. Sigh.)
131
132; all the +zlib1222add offsets are due to the addition of fields
133; in zlib in the deflate_state structure since the asm code was first written
134; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
137
138
139IFDEF INFOZIP
140
141_DATA SEGMENT
142COMM window_size:DWORD
143; WMask ; 7fff
144COMM window:BYTE:010040H
145COMM prev:WORD:08000H
146; MatchLen : unused
147; PrevMatch : unused
148COMM strstart:DWORD
149COMM match_start:DWORD
150; Lookahead : ignore
151COMM prev_length:DWORD ; PrevLen
152COMM max_chain_length:DWORD
153COMM good_match:DWORD
154COMM nice_match:DWORD
155prev_ad equ OFFSET prev
156window_ad equ OFFSET window
157nicematch equ nice_match
158_DATA ENDS
159WMask equ 07fffh
160
161ELSE
162
163 IFNDEF zlib1222add
164 zlib1222add equ 8
165 ENDIF
166dsWSize equ 56+zlib1222add+(zlib1222add/2)
167dsWMask equ 64+zlib1222add+(zlib1222add/2)
168dsWindow equ 72+zlib1222add
169dsPrev equ 88+zlib1222add
170dsMatchLen equ 128+zlib1222add
171dsPrevMatch equ 132+zlib1222add
172dsStrStart equ 140+zlib1222add
173dsMatchStart equ 144+zlib1222add
174dsLookahead equ 148+zlib1222add
175dsPrevLen equ 152+zlib1222add
176dsMaxChainLen equ 156+zlib1222add
177dsGoodMatch equ 172+zlib1222add
178dsNiceMatch equ 176+zlib1222add
179
180window_size equ [ rcx + dsWSize]
181WMask equ [ rcx + dsWMask]
182window_ad equ [ rcx + dsWindow]
183prev_ad equ [ rcx + dsPrev]
184strstart equ [ rcx + dsStrStart]
185match_start equ [ rcx + dsMatchStart]
186Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
187prev_length equ [ rcx + dsPrevLen]
188max_chain_length equ [ rcx + dsMaxChainLen]
189good_match equ [ rcx + dsGoodMatch]
190nice_match equ [ rcx + dsNiceMatch]
191ENDIF
192
193; parameter 1 in rcx (deflate_state *s), param 2 in rdx (cur_match)
194
195; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
197;
198; All registers must be preserved across the call, except for
199; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
200
201
202
203;;; Save registers that the compiler may be using, and adjust rsp to
204;;; make room for our stack frame.
205
206
207;;; Retrieve the function arguments. r8d will hold cur_match
208;;; throughout the entire function. rcx will hold the pointer to the
209;;; deflate_state structure during the function's setup (before
210;;; entering the main loop).
211
212; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
213
215; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx
215
216 mov [save_rdi],rdi
217 mov [save_rsi],rsi
218 mov [save_rbx],rbx
219 mov [save_rbp],rbp
220IFDEF INFOZIP
221 mov r8d,ecx
222ELSE
223 mov r8d,edx
224ENDIF
225 mov [save_r12],r12
226 mov [save_r13],r13
227; mov [save_r14],r14
228; mov [save_r15],r15
229
230
231;;; uInt wmask = s->w_mask;
232;;; unsigned chain_length = s->max_chain_length;
233;;; if (s->prev_length >= s->good_match) {
234;;; chain_length >>= 2;
235;;; }
236
237 mov edi, prev_length
238 mov esi, good_match
239 mov eax, WMask
240 mov ebx, max_chain_length
241 cmp edi, esi
242 jl LastMatchGood
243 shr ebx, 2
244LastMatchGood:
245
246;;; chainlen is decremented once beforehand so that the function can
247;;; use the sign flag instead of the zero flag for the exit test.
248;;; It is then shifted into the high word, to make room for the wmask
249;;; value, which it will always accompany.
250
251 dec ebx
252 shl ebx, 16
253 or ebx, eax
254
255;;; on zlib only
256;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
257
258IFDEF INFOZIP
259 mov [chainlenwmask], ebx
260; on infozip nice_match = [nice_match]
261ELSE
262 mov eax, nice_match
263 mov [chainlenwmask], ebx
264 mov r10d, Lookahead
265 cmp r10d, eax
266 cmovnl r10d, eax
267 mov [nicematch],r10d
268ENDIF
269
270;;; register Bytef *scan = s->window + s->strstart;
271 mov r10, window_ad
272 mov ebp, strstart
273 lea r13, [r10 + rbp]
274
275;;; Determine how many bytes the scan ptr is off from being
276;;; dword-aligned.
277
278 mov r9,r13
279 neg r13
280 and r13,3
281
282;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
284IFDEF INFOZIP
285        mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) = (0x8000-(258+3+1))
286ELSE
287 mov eax, window_size
288 sub eax, MIN_LOOKAHEAD
289ENDIF
290 xor edi,edi
291 sub ebp, eax
292
293 mov r11d, prev_length
294
295 cmovng ebp,edi
296
297;;; int best_len = s->prev_length;
298
299
300;;; Store the sum of s->window + best_len locally in rsi.
301
302 lea rsi,[r10+r11]
303
304;;; register ush scan_start = *(ushf*)scan;
305;;; register ush scan_end = *(ushf*)(scan+best_len-1);
306;;; Posf *prev = s->prev;
307
308 movzx r12d,word ptr [r9]
309 movzx ebx, word ptr [r9 + r11 - 1]
310
311 mov rdi, prev_ad
312
313;;; Jump into the main loop.
314
315 mov edx, [chainlenwmask]
316
317 cmp bx,word ptr [rsi + r8 - 1]
318 jz LookupLoopIsZero
319
320LookupLoop1:
321 and r8d, edx
322
323 movzx r8d, word ptr [rdi + r8*2]
324 cmp r8d, ebp
325 jbe LeaveNow
326 sub edx, 00010000h
327 js LeaveNow
328
329LoopEntry1:
330 cmp bx,word ptr [rsi + r8 - 1]
331 jz LookupLoopIsZero
332
333LookupLoop2:
334 and r8d, edx
335
336 movzx r8d, word ptr [rdi + r8*2]
337 cmp r8d, ebp
338 jbe LeaveNow
339 sub edx, 00010000h
340 js LeaveNow
341
342LoopEntry2:
343 cmp bx,word ptr [rsi + r8 - 1]
344 jz LookupLoopIsZero
345
346LookupLoop4:
347 and r8d, edx
348
349 movzx r8d, word ptr [rdi + r8*2]
350 cmp r8d, ebp
351 jbe LeaveNow
352 sub edx, 00010000h
353 js LeaveNow
354
355LoopEntry4:
356
357 cmp bx,word ptr [rsi + r8 - 1]
358 jnz LookupLoop1
359 jmp LookupLoopIsZero
360
361
362;;; do {
363;;; match = s->window + cur_match;
364;;; if (*(ushf*)(match+best_len-1) != scan_end ||
365;;; *(ushf*)match != scan_start) continue;
366;;; [...]
367;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368;;; && --chain_length != 0);
369;;;
370;;; Here is the inner loop of the function. The function will spend the
371;;; majority of its time in this loop, and the majority of that time will
372;;; be spent in the first ten instructions.
373;;;
374;;; Within this loop:
375;;; ebx = scanend
376;;; r8d = curmatch
377;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378;;; esi = windowbestlen - i.e., (window + bestlen)
379;;; edi = prev
380;;; ebp = limit
381
382LookupLoop:
383 and r8d, edx
384
385 movzx r8d, word ptr [rdi + r8*2]
386 cmp r8d, ebp
387 jbe LeaveNow
388 sub edx, 00010000h
389 js LeaveNow
390
391LoopEntry:
392
393 cmp bx,word ptr [rsi + r8 - 1]
394 jnz LookupLoop1
395LookupLoopIsZero:
396 cmp r12w, word ptr [r10 + r8]
397 jnz LookupLoop1
398
399
400;;; Store the current value of chainlen.
401 mov [chainlenwmask], edx
402
403;;; Point edi to the string under scrutiny, and esi to the string we
404;;; are hoping to match it up with. In actuality, esi and edi are
405;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
406;;; initialized to -(MAX_MATCH_8 - scanalign).
407
408 lea rsi,[r8+r10]
409 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
410 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
411 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
412
413 prefetcht1 [rsi+rdx]
414 prefetcht1 [rdi+rdx]
415
416
417;;; Test the strings for equality, 8 bytes at a time. At the end,
418;;; adjust rdx so that it is offset to the exact byte that mismatched.
419;;;
420;;; We already know at this point that the first three bytes of the
421;;; strings match each other, and they can be safely passed over before
422;;; starting the compare loop. So what this code does is skip over 0-3
423;;; bytes, as much as necessary in order to dword-align the edi
424;;; pointer. (rsi will still be misaligned three times out of four.)
425;;;
426;;; It should be confessed that this loop usually does not represent
427;;; much of the total running time. Replacing it with a more
428;;; straightforward "rep cmpsb" would not drastically degrade
429;;; performance.
430
431
432LoopCmps:
433 mov rax, [rsi + rdx]
434 xor rax, [rdi + rdx]
435 jnz LeaveLoopCmps
436
437 mov rax, [rsi + rdx + 8]
438 xor rax, [rdi + rdx + 8]
439 jnz LeaveLoopCmps8
440
441
442 mov rax, [rsi + rdx + 8+8]
443 xor rax, [rdi + rdx + 8+8]
444 jnz LeaveLoopCmps16
445
446 add rdx,8+8+8
447
448 jnz short LoopCmps
449 jmp short LenMaximum
450LeaveLoopCmps16: add rdx,8
451LeaveLoopCmps8: add rdx,8
452LeaveLoopCmps:
453
454 test eax, 0000FFFFh
455 jnz LenLower
456
457 test eax,0ffffffffh
458
459 jnz LenLower32
460
461 add rdx,4
462 shr rax,32
463 or ax,ax
464 jnz LenLower
465
466LenLower32:
467 shr eax,16
468 add rdx,2
469LenLower: sub al, 1
470 adc rdx, 0
471;;; Calculate the length of the match. If it is longer than MAX_MATCH,
472;;; then automatically accept it as the best possible match and leave.
473
474 lea rax, [rdi + rdx]
475 sub rax, r9
476 cmp eax, MAX_MATCH
477 jge LenMaximum
478
479;;; If the length of the match is not longer than the best match we
480;;; have so far, then forget it and return to the lookup loop.
481;///////////////////////////////////
482
483 cmp eax, r11d
484 jg LongerMatch
485
486 lea rsi,[r10+r11]
487
488 mov rdi, prev_ad
489 mov edx, [chainlenwmask]
490 jmp LookupLoop
491
492;;; s->match_start = cur_match;
493;;; best_len = len;
494;;; if (len >= nice_match) break;
495;;; scan_end = *(ushf*)(scan+best_len-1);
496
497LongerMatch:
498 mov r11d, eax
499 mov match_start, r8d
500 cmp eax, [nicematch]
501 jge LeaveNow
502
503 lea rsi,[r10+rax]
504
505 movzx ebx, word ptr [r9 + rax - 1]
506 mov rdi, prev_ad
507 mov edx, [chainlenwmask]
508 jmp LookupLoop
509
510;;; Accept the current string, with the maximum possible length.
511
512LenMaximum:
513 mov r11d,MAX_MATCH
514 mov match_start, r8d
515
516;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517;;; return s->lookahead;
518
519LeaveNow:
520IFDEF INFOZIP
521 mov eax,r11d
522ELSE
523 mov eax, Lookahead
524 cmp r11d, eax
525 cmovng eax, r11d
526ENDIF
527
528;;; Restore the stack and return from whence we came.
529
530
531 mov rsi,[save_rsi]
532 mov rdi,[save_rdi]
533 mov rbx,[save_rbx]
534 mov rbp,[save_rbp]
535 mov r12,[save_r12]
536 mov r13,[save_r13]
537; mov r14,[save_r14]
538; mov r15,[save_r15]
539
540
541 ret 0
542; please don't remove this string!
543; You can freely use gvmat64 in any free or commercial app,
544; but it is far better not to remove the string from the binary!
545 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
546longest_match ENDP
547
548match_init PROC
549 ret 0
550match_init ENDP
551
552
553END
diff --git a/contrib/masmx64/inffas8664.c b/contrib/masmx64/inffas8664.c
deleted file mode 100644
index e8af06f..0000000
--- a/contrib/masmx64/inffas8664.c
+++ /dev/null
@@ -1,186 +0,0 @@
1/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
2 * version for AMD64 on Windows using Microsoft C compiler
3 *
4 * Copyright (C) 1995-2003 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6 *
7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
8 * Please use the copyright conditions above.
9 *
10 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
11 *
12 * inffas8664.c calls the function inffas8664fnc in inffasx64.asm
13 * inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
14 *
15 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
16 * slightly quicker on x86 systems because, instead of using rep movsb to copy
17 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
18 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
19 * from http://fedora.linux.duke.edu/fc1_x86_64
20 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
21 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
22 * when decompressing mozilla-source-1.3.tar.gz.
23 *
24 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
25 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
26 * the moment. I have successfully compiled and tested this code with gcc2.96,
27 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
28 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
29 * enabled. I will attempt to merge the MMX code into this version. Newer
30 * versions of this and inffast.S can be found at
31 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
32 *
33 */
34
35#include <stdio.h>
36#include "zutil.h"
37#include "inftrees.h"
38#include "inflate.h"
39#include "inffast.h"
40
41/* Mark Adler's comments from inffast.c: */
42
43/*
44 Decode literal, length, and distance codes and write out the resulting
45 literal and match bytes until either not enough input or output is
46 available, an end-of-block is encountered, or a data error is encountered.
47 When large enough input and output buffers are supplied to inflate(), for
48 example, a 16K input buffer and a 64K output buffer, more than 95% of the
49 inflate execution time is spent in this routine.
50
51 Entry assumptions:
52
53 state->mode == LEN
54 strm->avail_in >= 6
55 strm->avail_out >= 258
56 start >= strm->avail_out
57 state->bits < 8
58
59 On return, state->mode is one of:
60
61 LEN -- ran out of enough output space or enough available input
62 TYPE -- reached end of block code, inflate() to interpret next block
63 BAD -- error in block data
64
65 Notes:
66
67 - The maximum input bits used by a length/distance pair is 15 bits for the
68 length code, 5 bits for the length extra, 15 bits for the distance code,
69 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
70 Therefore if strm->avail_in >= 6, then there is enough input to avoid
71 checking for available input while decoding.
72
73 - The maximum bytes that a single length/distance pair can output is 258
74 bytes, which is the maximum length that can be coded. inflate_fast()
75 requires strm->avail_out >= 258 for each loop to avoid checking for
76 output space.
77 */
78
79
80
81 typedef struct inffast_ar {
82/* 64 32 x86 x86_64 */
83/* ar offset register */
84/* 0 0 */ void *esp; /* esp save */
85/* 8 4 */ void *ebp; /* ebp save */
86/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
87/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
88/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
89/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
90/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
91/* 56 28 */ unsigned char FAR *window; /* sliding window, used if wsize != 0 */
92/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
93/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
94/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
95/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
96/* 92 48 */ unsigned wsize; /* window size */
97/* 96 52 */ unsigned write; /* window write index */
98/*100 56 */ unsigned lmask; /* r12 mask for lcode */
99/*104 60 */ unsigned dmask; /* r13 mask for dcode */
100/*108 64 */ unsigned len; /* r14 match length */
101/*112 68 */ unsigned dist; /* r15 match distance */
102/*116 72 */ unsigned status; /* set when state changes */
103 } type_ar;
104#ifdef ASMINF
105
106void inflate_fast(strm, start)
107z_streamp strm;
108unsigned start; /* inflate()'s starting value for strm->avail_out */
109{
110 struct inflate_state FAR *state;
111 type_ar ar;
112 void inffas8664fnc(struct inffast_ar * par);
113
114
115
116#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
117#define PAD_AVAIL_IN 6
118#define PAD_AVAIL_OUT 258
119#else
120#define PAD_AVAIL_IN 5
121#define PAD_AVAIL_OUT 257
122#endif
123
124 /* copy state to local variables */
125 state = (struct inflate_state FAR *)strm->state;
126
127 ar.in = strm->next_in;
128 ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
129 ar.out = strm->next_out;
130 ar.beg = ar.out - (start - strm->avail_out);
131 ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
132 ar.wsize = state->wsize;
133 ar.write = state->wnext;
134 ar.window = state->window;
135 ar.hold = state->hold;
136 ar.bits = state->bits;
137 ar.lcode = state->lencode;
138 ar.dcode = state->distcode;
139 ar.lmask = (1U << state->lenbits) - 1;
140 ar.dmask = (1U << state->distbits) - 1;
141
142 /* decode literals and length/distances until end-of-block or not enough
143 input data or output space */
144
145 /* align in on 1/2 hold size boundary */
146 while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
147 ar.hold += (unsigned long)*ar.in++ << ar.bits;
148 ar.bits += 8;
149 }
150
151 inffas8664fnc(&ar);
152
153 if (ar.status > 1) {
154 if (ar.status == 2)
155 strm->msg = "invalid literal/length code";
156 else if (ar.status == 3)
157 strm->msg = "invalid distance code";
158 else
159 strm->msg = "invalid distance too far back";
160 state->mode = BAD;
161 }
162 else if ( ar.status == 1 ) {
163 state->mode = TYPE;
164 }
165
166 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
167 ar.len = ar.bits >> 3;
168 ar.in -= ar.len;
169 ar.bits -= ar.len << 3;
170 ar.hold &= (1U << ar.bits) - 1;
171
172 /* update state and return */
173 strm->next_in = ar.in;
174 strm->next_out = ar.out;
175 strm->avail_in = (unsigned)(ar.in < ar.last ?
176 PAD_AVAIL_IN + (ar.last - ar.in) :
177 PAD_AVAIL_IN - (ar.in - ar.last));
178 strm->avail_out = (unsigned)(ar.out < ar.end ?
179 PAD_AVAIL_OUT + (ar.end - ar.out) :
180 PAD_AVAIL_OUT - (ar.out - ar.end));
181 state->hold = (unsigned long)ar.hold;
182 state->bits = ar.bits;
183 return;
184}
185
186#endif
diff --git a/contrib/masmx64/inffasx64.asm b/contrib/masmx64/inffasx64.asm
deleted file mode 100644
index 60a8d89..0000000
--- a/contrib/masmx64/inffasx64.asm
+++ /dev/null
@@ -1,396 +0,0 @@
1; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
2; version for AMD64 on Windows using Microsoft C compiler
3;
4; inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
5; inffasx64.asm is called by inffas8664.c, which contains more information.
6
7
8; to compile this file, I use the option:
9; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
10; with Microsoft Macro Assembler (x64) for AMD64
11;
12
13; This file compiles with Microsoft Macro Assembler (x64) for AMD64
14;
15; ml64.exe ships with Visual Studio 2005/2008/2010 and the Windows WDK
16;
17; (you can get Windows WDK with ml64 for AMD64 from
18; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for a low price)
19;
20
21
22.code
23inffas8664fnc PROC
24
25; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
26; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
27;
28; All registers must be preserved across the call, except for
29; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
30
31
32 mov [rsp-8],rsi
33 mov [rsp-16],rdi
34 mov [rsp-24],r12
35 mov [rsp-32],r13
36 mov [rsp-40],r14
37 mov [rsp-48],r15
38 mov [rsp-56],rbx
39
40 mov rax,rcx
41
42 mov [rax+8], rbp ; /* save regs rbp and rsp */
43 mov [rax], rsp
44
45 mov rsp, rax ; /* make rsp point to &ar */
46
47 mov rsi, [rsp+16] ; /* rsi = in */
48 mov rdi, [rsp+32] ; /* rdi = out */
49 mov r9, [rsp+24] ; /* r9 = last */
50 mov r10, [rsp+48] ; /* r10 = end */
51 mov rbp, [rsp+64] ; /* rbp = lcode */
52 mov r11, [rsp+72] ; /* r11 = dcode */
53 mov rdx, [rsp+80] ; /* rdx = hold */
54 mov ebx, [rsp+88] ; /* ebx = bits */
55 mov r12d, [rsp+100] ; /* r12d = lmask */
56 mov r13d, [rsp+104] ; /* r13d = dmask */
57 ; /* r14d = len */
58 ; /* r15d = dist */
59
60
61 cld
62 cmp r10, rdi
63 je L_one_time ; /* if only one decode left */
64 cmp r9, rsi
65
66 jne L_do_loop
67
68
69L_one_time:
70 mov r8, r12 ; /* r8 = lmask */
71 cmp bl, 32
72 ja L_get_length_code_one_time
73
74 lodsd ; /* eax = *(uint *)in++ */
75 mov cl, bl ; /* cl = bits, needs it for shifting */
76 add bl, 32 ; /* bits += 32 */
77 shl rax, cl
78 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
79 jmp L_get_length_code_one_time
80
81ALIGN 4
82L_while_test:
83 cmp r10, rdi
84 jbe L_break_loop
85 cmp r9, rsi
86 jbe L_break_loop
87
88L_do_loop:
89 mov r8, r12 ; /* r8 = lmask */
90 cmp bl, 32
91 ja L_get_length_code ; /* if (32 < bits) */
92
93 lodsd ; /* eax = *(uint *)in++ */
94 mov cl, bl ; /* cl = bits, needs it for shifting */
95 add bl, 32 ; /* bits += 32 */
96 shl rax, cl
97 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
98
99L_get_length_code:
100 and r8, rdx ; /* r8 &= hold */
101 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
102
103 mov cl, ah ; /* cl = this.bits */
104 sub bl, ah ; /* bits -= this.bits */
105 shr rdx, cl ; /* hold >>= this.bits */
106
107 test al, al
108 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
109
110 mov r8, r12 ; /* r8 = lmask */
111 shr eax, 16 ; /* output this.val char */
112 stosb
113
114L_get_length_code_one_time:
115 and r8, rdx ; /* r8 &= hold */
116 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
117
118L_dolen:
119 mov cl, ah ; /* cl = this.bits */
120 sub bl, ah ; /* bits -= this.bits */
121 shr rdx, cl ; /* hold >>= this.bits */
122
123 test al, al
124 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
125
126 shr eax, 16 ; /* output this.val char */
127 stosb
128 jmp L_while_test
129
130ALIGN 4
131L_test_for_length_base:
132 mov r14d, eax ; /* len = this */
133 shr r14d, 16 ; /* len = this.val */
134 mov cl, al
135
136 test al, 16
137 jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
138 and cl, 15 ; /* op &= 15 */
139 jz L_decode_distance ; /* if (!op) */
140
141L_add_bits_to_len:
142 sub bl, cl
143 xor eax, eax
144 inc eax
145 shl eax, cl
146 dec eax
147 and eax, edx ; /* eax &= hold */
148 shr rdx, cl
149 add r14d, eax ; /* len += hold & mask[op] */
150
151L_decode_distance:
152 mov r8, r13 ; /* r8 = dmask */
153 cmp bl, 32
154 ja L_get_distance_code ; /* if (32 < bits) */
155
156 lodsd ; /* eax = *(uint *)in++ */
157 mov cl, bl ; /* cl = bits, needs it for shifting */
158 add bl, 32 ; /* bits += 32 */
159 shl rax, cl
160 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
161
162L_get_distance_code:
163 and r8, rdx ; /* r8 &= hold */
164 mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
165
166L_dodist:
167 mov r15d, eax ; /* dist = this */
168 shr r15d, 16 ; /* dist = this.val */
169 mov cl, ah
170 sub bl, ah ; /* bits -= this.bits */
171 shr rdx, cl ; /* hold >>= this.bits */
172 mov cl, al ; /* cl = this.op */
173
174 test al, 16 ; /* if ((op & 16) == 0) */
175 jz L_test_for_second_level_dist
176 and cl, 15 ; /* op &= 15 */
177 jz L_check_dist_one
178
179L_add_bits_to_dist:
180 sub bl, cl
181 xor eax, eax
182 inc eax
183 shl eax, cl
184 dec eax ; /* (1 << op) - 1 */
185 and eax, edx ; /* eax &= hold */
186 shr rdx, cl
187 add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
188
189L_check_window:
190        mov r8, rsi            ; /* save in so from can use its reg */
191 mov rax, rdi
192 sub rax, [rsp+40] ; /* nbytes = out - beg */
193
194 cmp eax, r15d
195 jb L_clip_window ; /* if (dist > nbytes) 4.2% */
196
197 mov ecx, r14d ; /* ecx = len */
198 mov rsi, rdi
199 sub rsi, r15 ; /* from = out - dist */
200
201 sar ecx, 1
202 jnc L_copy_two ; /* if len % 2 == 0 */
203
204 rep movsw
205 mov al, [rsi]
206 mov [rdi], al
207 inc rdi
208
209 mov rsi, r8 ; /* move in back to %rsi, toss from */
210 jmp L_while_test
211
212L_copy_two:
213 rep movsw
214 mov rsi, r8 ; /* move in back to %rsi, toss from */
215 jmp L_while_test
216
217ALIGN 4
218L_check_dist_one:
219 cmp r15d, 1 ; /* if dist 1, is a memset */
220 jne L_check_window
221 cmp [rsp+40], rdi ; /* if out == beg, outside window */
222 je L_check_window
223
224 mov ecx, r14d ; /* ecx = len */
225 mov al, [rdi-1]
226 mov ah, al
227
228 sar ecx, 1
229 jnc L_set_two
230 mov [rdi], al
231 inc rdi
232
233L_set_two:
234 rep stosw
235 jmp L_while_test
236
237ALIGN 4
238L_test_for_second_level_length:
239 test al, 64
240 jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
241
242 xor eax, eax
243 inc eax
244 shl eax, cl
245 dec eax
246 and eax, edx ; /* eax &= hold */
247 add eax, r14d ; /* eax += len */
248 mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
249 jmp L_dolen
250
251ALIGN 4
252L_test_for_second_level_dist:
253 test al, 64
254 jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
255
256 xor eax, eax
257 inc eax
258 shl eax, cl
259 dec eax
260 and eax, edx ; /* eax &= hold */
261 add eax, r15d ; /* eax += dist */
262 mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
263 jmp L_dodist
264
265ALIGN 4
266L_clip_window:
267 mov ecx, eax ; /* ecx = nbytes */
268 mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
269 neg ecx ; /* nbytes = -nbytes */
270
271 cmp eax, r15d
272 jb L_invalid_distance_too_far ; /* if (dist > wsize) */
273
274 add ecx, r15d ; /* nbytes = dist - nbytes */
275 cmp dword ptr [rsp+96], 0
276 jne L_wrap_around_window ; /* if (write != 0) */
277
278 mov rsi, [rsp+56] ; /* from = window */
279 sub eax, ecx ; /* eax -= nbytes */
280 add rsi, rax ; /* from += wsize - nbytes */
281
282 mov eax, r14d ; /* eax = len */
283 cmp r14d, ecx
284 jbe L_do_copy ; /* if (nbytes >= len) */
285
286 sub eax, ecx ; /* eax -= nbytes */
287 rep movsb
288 mov rsi, rdi
289 sub rsi, r15 ; /* from = &out[ -dist ] */
290 jmp L_do_copy
291
292ALIGN 4
293L_wrap_around_window:
294 mov eax, [rsp+96] ; /* eax = write */
295 cmp ecx, eax
296 jbe L_contiguous_in_window ; /* if (write >= nbytes) */
297
298 mov esi, [rsp+92] ; /* from = wsize */
299 add rsi, [rsp+56] ; /* from += window */
300 add rsi, rax ; /* from += write */
301 sub rsi, rcx ; /* from -= nbytes */
302 sub ecx, eax ; /* nbytes -= write */
303
304 mov eax, r14d ; /* eax = len */
305 cmp eax, ecx
306 jbe L_do_copy ; /* if (nbytes >= len) */
307
308 sub eax, ecx ; /* len -= nbytes */
309 rep movsb
310 mov rsi, [rsp+56] ; /* from = window */
311 mov ecx, [rsp+96] ; /* nbytes = write */
312 cmp eax, ecx
313 jbe L_do_copy ; /* if (nbytes >= len) */
314
315 sub eax, ecx ; /* len -= nbytes */
316 rep movsb
317 mov rsi, rdi
318 sub rsi, r15 ; /* from = out - dist */
319 jmp L_do_copy
320
321ALIGN 4
322L_contiguous_in_window:
323 mov rsi, [rsp+56] ; /* rsi = window */
324 add rsi, rax
325 sub rsi, rcx ; /* from += write - nbytes */
326
327 mov eax, r14d ; /* eax = len */
328 cmp eax, ecx
329 jbe L_do_copy ; /* if (nbytes >= len) */
330
331 sub eax, ecx ; /* len -= nbytes */
332 rep movsb
333 mov rsi, rdi
334 sub rsi, r15 ; /* from = out - dist */
335 jmp L_do_copy ; /* if (nbytes >= len) */
336
337ALIGN 4
338L_do_copy:
339 mov ecx, eax ; /* ecx = len */
340 rep movsb
341
342 mov rsi, r8 ; /* move in back to %esi, toss from */
343 jmp L_while_test
344
345L_test_for_end_of_block:
346 test al, 32
347 jz L_invalid_literal_length_code
348 mov dword ptr [rsp+116], 1
349 jmp L_break_loop_with_status
350
351L_invalid_literal_length_code:
352 mov dword ptr [rsp+116], 2
353 jmp L_break_loop_with_status
354
355L_invalid_distance_code:
356 mov dword ptr [rsp+116], 3
357 jmp L_break_loop_with_status
358
359L_invalid_distance_too_far:
360 mov dword ptr [rsp+116], 4
361 jmp L_break_loop_with_status
362
363L_break_loop:
364 mov dword ptr [rsp+116], 0
365
366L_break_loop_with_status:
367; /* put in, out, bits, and hold back into ar and pop esp */
368 mov [rsp+16], rsi ; /* in */
369 mov [rsp+32], rdi ; /* out */
370 mov [rsp+88], ebx ; /* bits */
371 mov [rsp+80], rdx ; /* hold */
372
373 mov rax, [rsp] ; /* restore rbp and rsp */
374 mov rbp, [rsp+8]
375 mov rsp, rax
376
377
378
379 mov rsi,[rsp-8]
380 mov rdi,[rsp-16]
381 mov r12,[rsp-24]
382 mov r13,[rsp-32]
383 mov r14,[rsp-40]
384 mov r15,[rsp-48]
385 mov rbx,[rsp-56]
386
387 ret 0
388; :
389; : "m" (ar)
390; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
391; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
392; );
393
394inffas8664fnc ENDP
395;_TEXT ENDS
396END
diff --git a/contrib/masmx64/readme.txt b/contrib/masmx64/readme.txt
deleted file mode 100644
index 2da6733..0000000
--- a/contrib/masmx64/readme.txt
+++ /dev/null
@@ -1,31 +0,0 @@
1Summary
2-------
3This directory contains ASM implementations of the functions
4longest_match() and inflate_fast(), for 64-bit x86 (both AMD64 and Intel EM64T),
5for use with the Microsoft Macro Assembler (x64) for AMD64 and 64-bit Microsoft C++.
6
7gvmat64.asm was written by Gilles Vollant (2005), based on Brian Raiter's optimized
8 686/32-bit assembly version of Jean-loup Gailly's original longest_match function.
9
10inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing the
11 original function from Mark Adler.
12
13Use instructions
14----------------
15Assemble the .asm files using MASM and put the object files into the zlib source
16directory. You can also get object files here:
17
18 http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
19
20Define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
21and add inffasx64.obj and gvmat64.obj as objects to link.
22
23
24Build instructions
25------------------
26run bld_ml64.bat with the Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
27
28ml64.exe ships with Visual Studio 2005 and the Windows 2003 Server DDK
29
30You can get the Windows 2003 Server DDK, with ml64 and cl for AMD64, from
31 http://www.microsoft.com/whdc/devtools/ddk/default.mspx for a low price.
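
For projects that followed the instructions above, whether a given zlib binary was actually built with the assembler hooks can be checked at run time. A minimal sketch, assuming the zlibCompileFlags() documentation in zlib.h of this era, where bit 9 is described as "ASMV or ASMINF -- use ASM code":

    #include <stdio.h>
    #include <zlib.h>

    int main(void)
    {
        uLong flags = zlibCompileFlags();
        /* Assumed from zlib.h: bit 9 of the compile flags reports whether the
           library was compiled with ASMV and/or ASMINF defined. */
        printf("zlib %s, assembler code paths: %s\n", zlibVersion(),
               (flags & (1UL << 9)) ? "compiled in (ASMV/ASMINF)" : "not compiled in");
        return 0;
    }
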