Diffstat (limited to 'contrib/masmx64/gvmat64.asm')
-rw-r--r--  contrib/masmx64/gvmat64.asm  464
1 file changed, 464 insertions, 0 deletions
diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm
new file mode 100644
index 0000000..cee2145
--- /dev/null
+++ b/contrib/masmx64/gvmat64.asm
@@ -0,0 +1,464 @@
;uInt longest_match_x64(
;    deflate_state *s,
;    IPos cur_match);                             /* current match */

; gvmat64.asm -- Asm portion of the optimized longest_match for AMD64 (x86-64)
; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
; File written by Gilles Vollant, by modifying the longest_match
;  from Jean-loup Gailly in deflate.c
;  and the asm686 MASM port of Brian Raiter's optimised assembly code,
;  written 1998
;  http://www.zlib.net
;  http://www.winimage.com/zLibDll
;  http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; To compile this file, I use the option
;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
;   with the Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe ships with Visual Studio 2005 and the Windows 2003 Server DDK
;
; (you can get the Windows 2003 Server DDK, with ml64 and cl for AMD64, from
;   http://www.microsoft.com/whdc/devtools/ddk/default.mspx for a low price)
;
; Be careful to adapt zlib1222add below to your version of zlib


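; A hedged sketch of how this file is typically wired into a zlib build
; (assuming the standard ASMV hook in deflate.c; adjust paths and flags to
; your own build):
;
;   ml64.exe /c /Zi gvmat64.asm              ; assemble to gvmat64.obj
;   cl /c /DASMV ... deflate.c ...           ; compile zlib with ASMV defined
;   link ... deflate.obj gvmat64.obj ...     ; link the object file in
;
; With ASMV defined, deflate.c uses the asm routine instead of its own C
; longest_match, via declarations of this form:
;
;   /* inside deflate.c, under #ifdef ASMV */
;   void match_init OF((void));                               /* asm init */
;   uInt longest_match OF((deflate_state *s, IPos cur_match));
;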
;uInt longest_match(s, cur_match)
;    deflate_state *s;
;    IPos cur_match;                             /* current match */
.code
longest_match PROC


;LocalVarsSize   equ 88
 LocalVarsSize   equ 72

; registers used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
; free registers : r14,r15
; register that can be saved : rsp

 chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
                                                 ; low word: s->wmask
;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
 nicematch       equ  rsp + 16 - LocalVarsSize   ; a good enough match size -> r14

save_rdi        equ  rsp + 24 - LocalVarsSize
save_rsi        equ  rsp + 32 - LocalVarsSize
save_rbx        equ  rsp + 40 - LocalVarsSize
save_rbp        equ  rsp + 48 - LocalVarsSize
save_r12        equ  rsp + 56 - LocalVarsSize
save_r13        equ  rsp + 64 - LocalVarsSize
;save_r14        equ  rsp + 72 - LocalVarsSize
;save_r15        equ  rsp + 80 - LocalVarsSize



; all the +4 offsets are due to the addition of pending_buf_size (in zlib)
; to the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, remove the +4).
; Note : these values are correct for a structure packed on an 8-byte boundary


MAX_MATCH       equ     258
MIN_MATCH       equ     3
MIN_LOOKAHEAD   equ     (MAX_MATCH+MIN_MATCH+1)


;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)

; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").

zlib1222add     equ     8

dsWSize         equ 56+zlib1222add+(zlib1222add/2)
dsWMask         equ 64+zlib1222add+(zlib1222add/2)
dsWindow        equ 72+zlib1222add
dsPrev          equ 88+zlib1222add
dsMatchLen      equ 128+zlib1222add
dsPrevMatch     equ 132+zlib1222add
dsStrStart      equ 140+zlib1222add
dsMatchStart    equ 144+zlib1222add
dsLookahead     equ 148+zlib1222add
dsPrevLen       equ 152+zlib1222add
dsMaxChainLen   equ 156+zlib1222add
dsGoodMatch     equ 172+zlib1222add
dsNiceMatch     equ 176+zlib1222add


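; If you change zlib versions, one way to double-check the dsXxx offsets above
; is a small, hypothetical C helper built against your zlib sources (it only
; assumes that deflate.h defines deflate_state with these field names):
;
;   #include <stdio.h>
;   #include <stddef.h>
;   #include "deflate.h"
;
;   int main(void) {
;       printf("w_size      %u\n", (unsigned)offsetof(deflate_state, w_size));
;       printf("w_mask      %u\n", (unsigned)offsetof(deflate_state, w_mask));
;       printf("window      %u\n", (unsigned)offsetof(deflate_state, window));
;       printf("prev        %u\n", (unsigned)offsetof(deflate_state, prev));
;       printf("strstart    %u\n", (unsigned)offsetof(deflate_state, strstart));
;       printf("match_start %u\n", (unsigned)offsetof(deflate_state, match_start));
;       printf("lookahead   %u\n", (unsigned)offsetof(deflate_state, lookahead));
;       printf("prev_length %u\n", (unsigned)offsetof(deflate_state, prev_length));
;       printf("good_match  %u\n", (unsigned)offsetof(deflate_state, good_match));
;       printf("nice_match  %u\n", (unsigned)offsetof(deflate_state, nice_match));
;       return 0;
;   }
;
; Each printed value should match the corresponding dsXxx constant for the
; zlib version you are building against.
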
; parameter 1 in rcx (deflate_state *s), parameter 2 in rdx (cur_match)

; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.



;;; Save the registers that the compiler may be using; locals are addressed
;;; relative to rsp.


;;; Retrieve the function arguments. r8d will hold cur_match
;;; throughout the entire function. rcx will hold the pointer to the
;;; deflate_state structure during the function's setup (before
;;; entering the main loop).

; parameter 1 in rcx (deflate_state *s), param 2 in edx -> r8d (cur_match)

; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx

    mov [save_rdi],rdi
    mov [save_rsi],rsi
    mov [save_rbx],rbx
    mov [save_rbp],rbp
    mov r8d,edx
    mov [save_r12],r12
    mov [save_r13],r13
;   mov [save_r14],r14
;   mov [save_r15],r15


;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;;     chain_length >>= 2;
;;; }

    mov edi, [rcx + dsPrevLen]
    mov esi, [rcx + dsGoodMatch]
    mov eax, [rcx + dsWMask]
    mov ebx, [rcx + dsMaxChainLen]
    cmp edi, esi
    jl  LastMatchGood
    shr ebx, 2
LastMatchGood:

;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.

    dec ebx
    shl ebx, 16
    or  ebx, eax
    mov [chainlenwmask], ebx
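;;; The equivalent C of the packing above, for reference (a sketch only;
;;; chainlenwmask is a local of this asm routine, not a zlib variable):
;;;     chainlenwmask = ((chain_length - 1) << 16) | wmask;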

;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

    mov eax, [rcx + dsNiceMatch]
    mov r10d, [rcx + dsLookahead]
    cmp r10d, eax
    cmovnl r10d, eax
    mov [nicematch],r10d
LookaheadLess:

;;; register Bytef *scan = s->window + s->strstart;

    mov r10, [rcx + dsWindow]
    mov ebp, [rcx + dsStrStart]
    lea r13, [r10 + rbp]

;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned.

    mov r9,r13
    neg r13
    and r13,3
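;;; Equivalent C, for reference (a sketch; scan_align is the value kept in r13):
;;;     scan_align = (0 - (uintptr_t)scan) & 3;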

;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;

    mov eax, [rcx + dsWSize]
    sub eax, MIN_LOOKAHEAD
    xor edi,edi
    sub ebp, eax

    mov r11d, [rcx + dsPrevLen]

    cmovng ebp,edi

;;; int best_len = s->prev_length;


;;; Store the sum of s->window + best_len in rsi locally.

    lea rsi,[r10+r11]

;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;

    movzx r12d,word ptr [r9]
    movzx ebx, word ptr [r9 + r11 - 1]
    mov rdi, [rcx + dsPrev]

;;; Jump into the main loop.

    mov edx, [chainlenwmask]

    cmp bx,word ptr [rsi + r8 - 1]
    jz  LookupLoopIsZero

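; LookupLoop1/2/4 below are unrolled copies of the LookupLoop body further
; down: they walk the prev[] hash chain (cur_match = prev[cur_match & wmask])
; and decrement the chain length kept in the high word of edx, bailing out to
; LeaveNow when the chain is exhausted or cur_match falls to the limit.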
LookupLoop1:
    and r8d, edx

    movzx r8d, word ptr [rdi + r8*2]
    cmp r8d, ebp
    jbe LeaveNow
    sub edx, 00010000h
    js  LeaveNow

LoopEntry1:
    cmp bx,word ptr [rsi + r8 - 1]
    jz  LookupLoopIsZero

LookupLoop2:
    and r8d, edx

    movzx r8d, word ptr [rdi + r8*2]
    cmp r8d, ebp
    jbe LeaveNow
    sub edx, 00010000h
    js  LeaveNow

LoopEntry2:
    cmp bx,word ptr [rsi + r8 - 1]
    jz  LookupLoopIsZero

LookupLoop4:
    and r8d, edx

    movzx r8d, word ptr [rdi + r8*2]
    cmp r8d, ebp
    jbe LeaveNow
    sub edx, 00010000h
    js  LeaveNow

LoopEntry4:

    cmp bx,word ptr [rsi + r8 - 1]
    jnz LookupLoop1
    jmp LookupLoopIsZero


;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and the majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;;     ebx = scanend
;;;     r8d = curmatch
;;;     edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;;     rsi = windowbestlen - i.e., (window + bestlen)
;;;     rdi = prev
;;;     ebp = limit

LookupLoop:
    and r8d, edx

    movzx r8d, word ptr [rdi + r8*2]
    cmp r8d, ebp
    jbe LeaveNow
    sub edx, 00010000h
    js  LeaveNow

LoopEntry:

    cmp bx,word ptr [rsi + r8 - 1]
    jnz LookupLoop1
LookupLoopIsZero:
    cmp r12w, word ptr [r10 + r8]
    jnz LookupLoop1


;;; Store the current value of chainlen.
    mov [chainlenwmask], edx

;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8 - scanalign).

    lea rsi,[r8+r10]
    mov rdx, 0fffffffffffffef8h  ; -(MAX_MATCH_8)
    lea rsi, [rsi + r13 + 0108h] ; MAX_MATCH_8
    lea rdi, [r9 + r13 + 0108h]  ; MAX_MATCH_8

    prefetcht1 [rsi+rdx]
    prefetcht1 [rdi+rdx]
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the rdi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.

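;;; A rough C equivalent of the compare loop below (a sketch only; the names
;;; are illustrative, not zlib identifiers, and the asm unrolls the loop three
;;; quadwords deep):
;;;     const Bytef *match = window + cur_match + scan_align + MAX_MATCH_8;
;;;     const Bytef *here  = scan   + scan_align + MAX_MATCH_8;
;;;     __int64 off = -(__int64)MAX_MATCH_8;
;;;     while (*(unsigned __int64 *)(match + off) ==
;;;            *(unsigned __int64 *)(here  + off))
;;;         off += 8;
;;;     /* then find the first differing byte within the mismatching
;;;        quadword, giving len = (here + off) - scan */
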
;LoopCmps:
;   mov eax, [rsi + rdx]
;   xor eax, [rdi + rdx]
;   jnz LeaveLoopCmps
;   mov eax, [rsi + rdx + 4]
;   xor eax, [rdi + rdx + 4]
;   jnz LeaveLoopCmps4
;   add rdx, 8
;   jnz LoopCmps
;   jmp LenMaximum
;LeaveLoopCmps4: add rdx, 4
;LeaveLoopCmps:  test eax, 0000FFFFh
;   jnz LenLower
;   add rdx, 2
;   shr eax, 16
;LenLower:  sub al, 1
;   adc rdx, 0


LoopCmps:
    mov rax, [rsi + rdx]
    xor rax, [rdi + rdx]
    jnz LeaveLoopCmps

    mov rax, [rsi + rdx + 8]
    xor rax, [rdi + rdx + 8]
    jnz LeaveLoopCmps8


    mov rax, [rsi + rdx + 8+8]
    xor rax, [rdi + rdx + 8+8]
    jnz LeaveLoopCmps16

    add rdx,8+8+8

    jmp short LoopCmps
LeaveLoopCmps16: add rdx,8
LeaveLoopCmps8:  add rdx,8
LeaveLoopCmps:

    test eax, 0000FFFFh
    jnz LenLower

    test eax,0ffffffffh

    jnz LenLower32

    add rdx,4
    shr rax,32
    or  ax,ax
    jnz LenLower

LenLower32:
    shr eax,16
    add rdx,2
LenLower:   sub al, 1
    adc rdx, 0
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.

    lea rax, [rdi + rdx]
    sub rax, r9
    cmp eax, MAX_MATCH
    jge LenMaximum

;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////

    cmp eax, r11d
    jg  LongerMatch

    lea rsi,[r10+r11]

    mov rdi, [rcx + dsPrev]
    mov edx, [chainlenwmask]
    jmp LookupLoop

;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);

LongerMatch:
    mov r11d, eax
    mov [rcx + dsMatchStart], r8d
    cmp eax, [nicematch]
    jge LeaveNow

    lea rsi,[r10+rax]

    movzx ebx, word ptr [r9 + rax - 1]
    mov rdi, [rcx + dsPrev]
    mov edx, [chainlenwmask]
    jmp LookupLoop

;;; Accept the current string, with the maximum possible length.

LenMaximum:
    mov r11d,MAX_MATCH
    mov [rcx + dsMatchStart], r8d

;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;

LeaveNow:
    mov eax, [rcx + dsLookahead]
    cmp r11d, eax
    cmovng eax, r11d


;;; Restore the saved registers and return from whence we came.


    mov rsi,[save_rsi]
    mov rdi,[save_rdi]
    mov rbx,[save_rbx]
    mov rbp,[save_rbp]
    mov r12,[save_r12]
    mov r13,[save_r13]
;   mov r14,[save_r14]
;   mov r15,[save_r15]


    ret 0
; please don't remove this string !
; You can freely use gvmat64 in any free or commercial app if you don't remove the string in the binary!
    db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match ENDP
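; deflate.c calls match_init() once (under #ifdef ASMV) to let asm
; implementations set themselves up; this x64 port needs no initialisation,
; so the stub below simply returns.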

match_init PROC
    ret 0
match_init ENDP


END