diff options
author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:25:17 -0700 |
---|---|---|
committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:25:17 -0700 |
commit | abf180a067223611620dd97dd5681df7c7fa7c9b (patch) | |
tree | 48ce6022aa1670380c098bd0abed2ac4aa1d9ca0 /contrib/masmx64 | |
parent | 9c3a5830218c4e7fff23b8fc4386269db77a03a9 (diff) | |
download | zlib-1.2.3.tar.gz zlib-1.2.3.tar.bz2 zlib-1.2.3.zip |
zlib 1.2.3 (v1.2.3)
Diffstat (limited to 'contrib/masmx64')
-rw-r--r-- | contrib/masmx64/gvmat64.asm | 159 | ||||
-rw-r--r-- | contrib/masmx64/gvmat64.obj | bin | 4155 -> 4119 bytes |
2 files changed, 104 insertions, 55 deletions
diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm index cee2145..790d655 100644 --- a/contrib/masmx64/gvmat64.asm +++ b/contrib/masmx64/gvmat64.asm | |||
@@ -4,24 +4,33 @@ | |||
4 | 4 | ||
5 | ; gvmat64.asm -- Asm portion of the optimized longest_match for AMD64 (x86-64) | 5 | ; gvmat64.asm -- Asm portion of the optimized longest_match for AMD64 (x86-64) |
6 | ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant. | 6 | ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant. |
7 | ; File written by Gilles Vollant, by modifiying the longest_match | 7 | ; |
8 | ; from Jean-loup Gailly in deflate.c | 8 | ; File written by Gilles Vollant, by converting to assembly the longest_match |
9 | ; and modifying asm686 with masm, optimised assembly code from Brian Raiter, | 9 | ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. |
10 | ; written 1998 | 10 | ; |
11 | ; and by taking inspiration on asm686 with masm, optimised assembly code | ||
12 | ; from Brian Raiter, written 1998 | ||
13 | ; | ||
11 | ; http://www.zlib.net | 14 | ; http://www.zlib.net |
12 | ; http://www.winimage.com/zLibDll | 15 | ; http://www.winimage.com/zLibDll |
13 | ; http://www.muppetlabs.com/~breadbox/software/assembly.html | 16 | ; http://www.muppetlabs.com/~breadbox/software/assembly.html |
14 | ; | 17 | ; |
15 | ; to compile this file, I use option | 18 | ; to compile this file for infozip Zip, I use option: |
19 | ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm | ||
20 | ; | ||
21 | ; to compile this file for zLib, I use option: | ||
16 | ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm | 22 | ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm |
17 | ; with Microsoft Macro Assembler (x64) for AMD64 | 23 | ; Be careful to adapt zlib1222add below to your version of zLib |
24 | ; (the default value 8 is for zLib 1.2.2.2 or later; for zLib 1.0.5 | ||
25 | ; to 1.2.2.1, change the value of zlib1222add to 0, as described later) | ||
18 | ; | 26 | ; |
19 | ; ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK | 27 | ; This file compiles with Microsoft Macro Assembler (x64) for AMD64 |
28 | ; | ||
29 | ; ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK | ||
20 | ; | 30 | ; |
21 | ; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from | 31 | ; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from |
22 | ; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) | 32 | ; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) |
23 | ; | 33 | ; |
24 | ; Be carrefull to adapt zlib1222add below to your version of zLib | ||
25 | 34 | ||
26 | 35 | ||
27 | ;uInt longest_match(s, cur_match) | 36 | ;uInt longest_match(s, cur_match) |
@@ -47,7 +56,10 @@ longest_match PROC | |||
47 | ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 | 56 | ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 |
48 | ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d | 57 | ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d |
49 | ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 | 58 | ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 |
50 | nicematch equ rsp + 16 - LocalVarsSize ; a good enough match size -> r14 | 59 | IFDEF INFOZIP |
60 | ELSE | ||
61 | nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size | ||
62 | ENDIF | ||
51 | 63 | ||
52 | save_rdi equ rsp + 24 - LocalVarsSize | 64 | save_rdi equ rsp + 24 - LocalVarsSize |
53 | save_rsi equ rsp + 32 - LocalVarsSize | 65 | save_rsi equ rsp + 32 - LocalVarsSize |
@@ -84,8 +96,34 @@ save_r13 equ rsp + 64 - LocalVarsSize | |||
84 | ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). | 96 | ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). |
85 | ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). | 97 | ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). |
86 | 98 | ||
87 | zlib1222add equ 8 | ||
88 | 99 | ||
100 | IFDEF INFOZIP | ||
101 | |||
102 | _DATA SEGMENT | ||
103 | COMM window_size:DWORD | ||
104 | ; WMask ; 7fff | ||
105 | COMM window:BYTE:010040H | ||
106 | COMM prev:WORD:08000H | ||
107 | ; MatchLen : unused | ||
108 | ; PrevMatch : unused | ||
109 | COMM strstart:DWORD | ||
110 | COMM match_start:DWORD | ||
111 | ; Lookahead : ignore | ||
112 | COMM prev_length:DWORD ; PrevLen | ||
113 | COMM max_chain_length:DWORD | ||
114 | COMM good_match:DWORD | ||
115 | COMM nice_match:DWORD | ||
116 | prev_ad equ OFFSET prev | ||
117 | window_ad equ OFFSET window | ||
118 | nicematch equ nice_match | ||
119 | _DATA ENDS | ||
120 | WMask equ 07fffh | ||
121 | |||
122 | ELSE | ||
123 | |||
124 | IFNDEF zlib1222add | ||
125 | zlib1222add equ 8 | ||
126 | ENDIF | ||
89 | dsWSize equ 56+zlib1222add+(zlib1222add/2) | 127 | dsWSize equ 56+zlib1222add+(zlib1222add/2) |
90 | dsWMask equ 64+zlib1222add+(zlib1222add/2) | 128 | dsWMask equ 64+zlib1222add+(zlib1222add/2) |
91 | dsWindow equ 72+zlib1222add | 129 | dsWindow equ 72+zlib1222add |
@@ -100,6 +138,18 @@ dsMaxChainLen equ 156+zlib1222add | |||
100 | dsGoodMatch equ 172+zlib1222add | 138 | dsGoodMatch equ 172+zlib1222add |
101 | dsNiceMatch equ 176+zlib1222add | 139 | dsNiceMatch equ 176+zlib1222add |
102 | 140 | ||
141 | window_size equ [ rcx + dsWSize] | ||
142 | WMask equ [ rcx + dsWMask] | ||
143 | window_ad equ [ rcx + dsWindow] | ||
144 | prev_ad equ [ rcx + dsPrev] | ||
145 | strstart equ [ rcx + dsStrStart] | ||
146 | match_start equ [ rcx + dsMatchStart] | ||
147 | Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip | ||
148 | prev_length equ [ rcx + dsPrevLen] | ||
149 | max_chain_length equ [ rcx + dsMaxChainLen] | ||
150 | good_match equ [ rcx + dsGoodMatch] | ||
151 | nice_match equ [ rcx + dsNiceMatch] | ||
152 | ENDIF | ||
103 | 153 | ||
104 | ; parameter 1 in rcx (deflate state s), param 2 in rdx (cur match) | 154 | ; parameter 1 in rcx (deflate state s), param 2 in rdx (cur match) |
105 | 155 | ||
@@ -107,7 +157,7 @@ dsNiceMatch equ 176+zlib1222add | |||
107 | ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp | 157 | ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp |
108 | ; | 158 | ; |
109 | ; All registers must be preserved across the call, except for | 159 | ; All registers must be preserved across the call, except for |
110 | ; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch. | 160 | ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. |
111 | 161 | ||
112 | 162 | ||
113 | 163 | ||
@@ -124,12 +174,15 @@ dsNiceMatch equ 176+zlib1222add | |||
124 | 174 | ||
125 | ; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx | 175 | ; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx |
126 | 176 | ||
127 | |||
128 | mov [save_rdi],rdi | 177 | mov [save_rdi],rdi |
129 | mov [save_rsi],rsi | 178 | mov [save_rsi],rsi |
130 | mov [save_rbx],rbx | 179 | mov [save_rbx],rbx |
131 | mov [save_rbp],rbp | 180 | mov [save_rbp],rbp |
181 | IFDEF INFOZIP | ||
182 | mov r8d,ecx | ||
183 | ELSE | ||
132 | mov r8d,edx | 184 | mov r8d,edx |
185 | ENDIF | ||
133 | mov [save_r12],r12 | 186 | mov [save_r12],r12 |
134 | mov [save_r13],r13 | 187 | mov [save_r13],r13 |
135 | ; mov [save_r14],r14 | 188 | ; mov [save_r14],r14 |
@@ -142,10 +195,10 @@ dsNiceMatch equ 176+zlib1222add | |||
142 | ;;; chain_length >>= 2; | 195 | ;;; chain_length >>= 2; |
143 | ;;; } | 196 | ;;; } |
144 | 197 | ||
145 | mov edi, [rcx + dsPrevLen] | 198 | mov edi, prev_length |
146 | mov esi, [rcx + dsGoodMatch] | 199 | mov esi, good_match |
147 | mov eax, [rcx + dsWMask] | 200 | mov eax, WMask |
148 | mov ebx, [rcx + dsMaxChainLen] | 201 | mov ebx, max_chain_length |
149 | cmp edi, esi | 202 | cmp edi, esi |
150 | jl LastMatchGood | 203 | jl LastMatchGood |
151 | shr ebx, 2 | 204 | shr ebx, 2 |
@@ -159,21 +212,25 @@ LastMatchGood: | |||
159 | dec ebx | 212 | dec ebx |
160 | shl ebx, 16 | 213 | shl ebx, 16 |
161 | or ebx, eax | 214 | or ebx, eax |
162 | mov [chainlenwmask], ebx | ||
163 | 215 | ||
216 | ;;; on zlib only | ||
164 | ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; | 217 | ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; |
165 | 218 | ||
166 | mov eax, [rcx + dsNiceMatch] | 219 | IFDEF INFOZIP |
167 | mov r10d, [rcx + dsLookahead] | 220 | mov [chainlenwmask], ebx |
221 | ; on infozip nice_match = [nice_match] | ||
222 | ELSE | ||
223 | mov eax, nice_match | ||
224 | mov [chainlenwmask], ebx | ||
225 | mov r10d, Lookahead | ||
168 | cmp r10d, eax | 226 | cmp r10d, eax |
169 | cmovnl r10d, eax | 227 | cmovnl r10d, eax |
170 | mov [nicematch],r10d | 228 | mov [nicematch],r10d |
171 | LookaheadLess: | 229 | ENDIF |
172 | 230 | ||
173 | ;;; register Bytef *scan = s->window + s->strstart; | 231 | ;;; register Bytef *scan = s->window + s->strstart; |
174 | 232 | mov r10, window_ad | |
175 | mov r10, [rcx + dsWindow] | 233 | mov ebp, strstart |
176 | mov ebp, [rcx + dsStrStart] | ||
177 | lea r13, [r10 + rbp] | 234 | lea r13, [r10 + rbp] |
178 | 235 | ||
179 | ;;; Determine how many bytes the scan ptr is off from being | 236 | ;;; Determine how many bytes the scan ptr is off from being |
@@ -185,13 +242,16 @@ LookaheadLess: | |||
185 | 242 | ||
186 | ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? | 243 | ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? |
187 | ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; | 244 | ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; |
188 | 245 | IFDEF INFOZIP | |
189 | mov eax, [rcx + dsWSize] | 246 | mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(258+3+1)) |
247 | ELSE | ||
248 | mov eax, window_size | ||
190 | sub eax, MIN_LOOKAHEAD | 249 | sub eax, MIN_LOOKAHEAD |
250 | ENDIF | ||
191 | xor edi,edi | 251 | xor edi,edi |
192 | sub ebp, eax | 252 | sub ebp, eax |
193 | 253 | ||
194 | mov r11d, [rcx + dsPrevLen] | 254 | mov r11d, prev_length |
195 | 255 | ||
196 | cmovng ebp,edi | 256 | cmovng ebp,edi |
197 | 257 | ||
@@ -207,8 +267,9 @@ LookaheadLess: | |||
207 | ;;; Posf *prev = s->prev; | 267 | ;;; Posf *prev = s->prev; |
208 | 268 | ||
209 | movzx r12d,word ptr [r9] | 269 | movzx r12d,word ptr [r9] |
210 | movzx ebx, word ptr [r9 + r11 - 1] | 270 | movzx ebx, word ptr [r9 + r11 - 1] |
211 | mov rdi, [rcx + dsPrev] | 271 | |
272 | mov rdi, prev_ad | ||
212 | 273 | ||
213 | ;;; Jump into the main loop. | 274 | ;;; Jump into the main loop. |
214 | 275 | ||
@@ -312,38 +373,22 @@ LookupLoopIsZero: | |||
312 | 373 | ||
313 | prefetcht1 [rsi+rdx] | 374 | prefetcht1 [rsi+rdx] |
314 | prefetcht1 [rdi+rdx] | 375 | prefetcht1 [rdi+rdx] |
376 | |||
377 | |||
315 | ;;; Test the strings for equality, 8 bytes at a time. At the end, | 378 | ;;; Test the strings for equality, 8 bytes at a time. At the end, |
316 | ;;; adjust edx so that it is offset to the exact byte that mismatched. | 379 | ;;; adjust rdx so that it is offset to the exact byte that mismatched. |
317 | ;;; | 380 | ;;; |
318 | ;;; We already know at this point that the first three bytes of the | 381 | ;;; We already know at this point that the first three bytes of the |
319 | ;;; strings match each other, and they can be safely passed over before | 382 | ;;; strings match each other, and they can be safely passed over before |
320 | ;;; starting the compare loop. So what this code does is skip over 0-3 | 383 | ;;; starting the compare loop. So what this code does is skip over 0-3 |
321 | ;;; bytes, as much as necessary in order to dword-align the edi | 384 | ;;; bytes, as much as necessary in order to dword-align the edi |
322 | ;;; pointer. (esi will still be misaligned three times out of four.) | 385 | ;;; pointer. (rsi will still be misaligned three times out of four.) |
323 | ;;; | 386 | ;;; |
324 | ;;; It should be confessed that this loop usually does not represent | 387 | ;;; It should be confessed that this loop usually does not represent |
325 | ;;; much of the total running time. Replacing it with a more | 388 | ;;; much of the total running time. Replacing it with a more |
326 | ;;; straightforward "rep cmpsb" would not drastically degrade | 389 | ;;; straightforward "rep cmpsb" would not drastically degrade |
327 | ;;; performance. | 390 | ;;; performance. |
328 | 391 | ||
329 | ;LoopCmps: | ||
330 | ; mov eax, [rsi + rdx] | ||
331 | ; xor eax, [rdi + rdx] | ||
332 | ; jnz LeaveLoopCmps | ||
333 | ; mov eax, [rsi + rdx + 4] | ||
334 | ; xor eax, [rdi + rdx + 4] | ||
335 | ; jnz LeaveLoopCmps4 | ||
336 | ; add rdx, 8 | ||
337 | ; jnz LoopCmps | ||
338 | ; jmp LenMaximum | ||
339 | ;LeaveLoopCmps4: add rdx, 4 | ||
340 | ;LeaveLoopCmps: test eax, 0000FFFFh | ||
341 | ; jnz LenLower | ||
342 | ; add rdx, 2 | ||
343 | ; shr eax, 16 | ||
344 | ;LenLower: sub al, 1 | ||
345 | ; adc rdx, 0 | ||
346 | |||
347 | 392 | ||
348 | LoopCmps: | 393 | LoopCmps: |
349 | mov rax, [rsi + rdx] | 394 | mov rax, [rsi + rdx] |
@@ -400,7 +445,7 @@ LenLower: sub al, 1 | |||
400 | 445 | ||
401 | lea rsi,[r10+r11] | 446 | lea rsi,[r10+r11] |
402 | 447 | ||
403 | mov rdi, [rcx + dsPrev] | 448 | mov rdi, prev_ad |
404 | mov edx, [chainlenwmask] | 449 | mov edx, [chainlenwmask] |
405 | jmp LookupLoop | 450 | jmp LookupLoop |
406 | 451 | ||
@@ -411,14 +456,14 @@ LenLower: sub al, 1 | |||
411 | 456 | ||
412 | LongerMatch: | 457 | LongerMatch: |
413 | mov r11d, eax | 458 | mov r11d, eax |
414 | mov [rcx + dsMatchStart], r8d | 459 | mov match_start, r8d |
415 | cmp eax, [nicematch] | 460 | cmp eax, [nicematch] |
416 | jge LeaveNow | 461 | jge LeaveNow |
417 | 462 | ||
418 | lea rsi,[r10+rax] | 463 | lea rsi,[r10+rax] |
419 | 464 | ||
420 | movzx ebx, word ptr [r9 + rax - 1] | 465 | movzx ebx, word ptr [r9 + rax - 1] |
421 | mov rdi, [rcx + dsPrev] | 466 | mov rdi, prev_ad |
422 | mov edx, [chainlenwmask] | 467 | mov edx, [chainlenwmask] |
423 | jmp LookupLoop | 468 | jmp LookupLoop |
424 | 469 | ||
@@ -426,16 +471,19 @@ LongerMatch: | |||
426 | 471 | ||
427 | LenMaximum: | 472 | LenMaximum: |
428 | mov r11d,MAX_MATCH | 473 | mov r11d,MAX_MATCH |
429 | mov [rcx + dsMatchStart], r8d | 474 | mov match_start, r8d |
430 | 475 | ||
431 | ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; | 476 | ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; |
432 | ;;; return s->lookahead; | 477 | ;;; return s->lookahead; |
433 | 478 | ||
434 | LeaveNow: | 479 | LeaveNow: |
435 | mov eax, [rcx + dsLookahead] | 480 | IFDEF INFOZIP |
481 | mov eax,r11d | ||
482 | ELSE | ||
483 | mov eax, Lookahead | ||
436 | cmp r11d, eax | 484 | cmp r11d, eax |
437 | cmovng eax, r11d | 485 | cmovng eax, r11d |
438 | 486 | ENDIF | |
439 | 487 | ||
440 | ;;; Restore the stack and return from whence we came. | 488 | ;;; Restore the stack and return from whence we came. |
441 | 489 | ||
@@ -452,7 +500,8 @@ LeaveNow: | |||
452 | 500 | ||
453 | ret 0 | 501 | ret 0 |
454 | ; please don't remove this string ! | 502 | ; please don't remove this string ! |
455 | ; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! | 503 | ; You can freely use gvmat64 in any free or commercial app, |
504 | ; but it is far better not to remove the string from the binary! | ||
456 | db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 | 505 | db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 |
457 | longest_match ENDP | 506 | longest_match ENDP |
458 | 507 | ||
diff --git a/contrib/masmx64/gvmat64.obj b/contrib/masmx64/gvmat64.obj index 10a73fe..a49ca02 100644 --- a/contrib/masmx64/gvmat64.obj +++ b/contrib/masmx64/gvmat64.obj | |||
Binary files differ | |||