summaryrefslogtreecommitdiff
path: root/contrib/masmx86
diff options
context:
space:
mode:
author Mark Adler <madler@alumni.caltech.edu> 2011-09-09 23:24:43 -0700
committer Mark Adler <madler@alumni.caltech.edu> 2011-09-09 23:24:43 -0700
commit 6b8233bfe00e79134cb1b84fc49d4f750a797f79 (patch)
tree ca2b03b0169568681dc3d9c823e9f0bc4417d6b5 /contrib/masmx86
parent 0484693e1723bbab791c56f95597bd7dbe867d03 (diff)
downloadzlib-1.2.2.3.tar.gz
zlib-1.2.2.3.tar.bz2
zlib-1.2.2.3.zip
zlib 1.2.2.3v1.2.2.3
Diffstat (limited to 'contrib/masmx86')
-rw-r--r-- contrib/masmx86/bld_ml32.bat 2
-rw-r--r-- contrib/masmx86/gvmat32.asm 1881
-rw-r--r-- contrib/masmx86/gvmat32c.c 268
-rw-r--r-- contrib/masmx86/inffas32.asm 2119
4 files changed, 2119 insertions, 2151 deletions
diff --git a/contrib/masmx86/bld_ml32.bat b/contrib/masmx86/bld_ml32.bat
new file mode 100644
index 0000000..99144d0
--- /dev/null
+++ b/contrib/masmx86/bld_ml32.bat
@@ -0,0 +1,2 @@
1ml /coff /Zi /c /Flgvmat32.lst gvmat32.asm
2ml /coff /Zi /c /Flinffas32.lst inffas32.asm
diff --git a/contrib/masmx86/gvmat32.asm b/contrib/masmx86/gvmat32.asm
index e841a7f..874bb2d 100644
--- a/contrib/masmx86/gvmat32.asm
+++ b/contrib/masmx86/gvmat32.asm
@@ -1,909 +1,972 @@
1; 1; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
2; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 2; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
3; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. 3; File written by Gilles Vollant, by modifying the longest_match
4; File written by Gilles Vollant, by modifying the longest_match 4; from Jean-loup Gailly in deflate.c
5; from Jean-loup Gailly in deflate.c 5;
6; It need wmask == 0x7fff 6; http://www.zlib.net
7; (assembly code is faster with a fixed wmask) 7; http://www.winimage.com/zLibDll
8; 8; http://www.muppetlabs.com/~breadbox/software/assembly.html
9; For Visual C++ 4.2 and ML 6.11c (version in directory \MASM611C of Win95 DDK) 9;
10; I compile with : "ml /coff /Zi /c gvmat32.asm" 10; For Visual C++ 4.x and higher and ML 6.x and higher
11; 11; ml.exe is in directory \MASM611C of Win95 DDK
12 12; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
13;uInt longest_match_7fff(s, cur_match) 13; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
14; deflate_state *s; 14;
15; IPos cur_match; /* current match */ 15; this file contains two implementations of longest_match
16 16;
17 NbStack equ 76 17; longest_match_7fff : written 1996 by Gilles Vollant optimized for
18 cur_match equ dword ptr[esp+NbStack-0] 18; first Pentium. Assume s->w_mask == 0x7fff
19 str_s equ dword ptr[esp+NbStack-4] 19; longest_match_686 : written by Brian Raiter (1998), optimized for Pentium Pro
20; 5 dword on top (ret,ebp,esi,edi,ebx) 20;
21 adrret equ dword ptr[esp+NbStack-8] 21; to use an assembly version of longest_match, you need to define ASMV in your project
22 pushebp equ dword ptr[esp+NbStack-12] 22; There are two ways to use gvmat32.asm
23 pushedi equ dword ptr[esp+NbStack-16] 23;
24 pushesi equ dword ptr[esp+NbStack-20] 24; A) Suggested method
25 pushebx equ dword ptr[esp+NbStack-24] 25; if you want include both longest_match_7fff and longest_match_686
26 26; compile the asm file running
27 chain_length equ dword ptr [esp+NbStack-28] 27; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm
28 limit equ dword ptr [esp+NbStack-32] 28; and include gvmat32c.c in your project
29 best_len equ dword ptr [esp+NbStack-36] 29; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff,
30 window equ dword ptr [esp+NbStack-40] 30; longest_match_7fff will be used
31 prev equ dword ptr [esp+NbStack-44] 31; if you have a more modern CPU (Pentium Pro, II and higher)
32 scan_start equ word ptr [esp+NbStack-48] 32; longest_match_686 will be used
33 wmask equ dword ptr [esp+NbStack-52] 33; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used,
34 match_start_ptr equ dword ptr [esp+NbStack-56] 34; but this is not a situation you'll find often
35 nice_match equ dword ptr [esp+NbStack-60] 35;
36 scan equ dword ptr [esp+NbStack-64] 36; B) Alternative
37 37; if you are not interested in old cpu performance and want the smallest
38 windowlen equ dword ptr [esp+NbStack-68] 38; binaries possible
39 match_start equ dword ptr [esp+NbStack-72] 39;
40 strend equ dword ptr [esp+NbStack-76] 40; compile the asm file running
41 NbStackAdd equ (NbStack-24) 41; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm
42 42; and do not include gvmat32c.c in your project (or also define
43 .386p 43; NOOLDPENTIUMCODE)
44 44;
45 name gvmatch 45; note : as far as I know, longest_match_686 is much faster than longest_match_7fff
46 .MODEL FLAT 46; on Pentium Pro/II/III, faster (but less so) on P4, but it seems
47 47; longest_match_7fff can be faster (very, very little) on AMD Athlon64/K8
48 48;
49 49; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2
50; all the +addstr offsets are due to the addition of pending_buf_size in zlib 1.04 50
51; and adding gzhead and gzindex in zlib 1.2.2.1 51;uInt longest_match_7fff(s, cur_match)
52; in the deflate_state structure since the asm code was first written 52; deflate_state *s;
53; (if you compile with zlib 1.0.4 or older, set addstr to 0). 53; IPos cur_match; /* current match */
54; (if you compile with zlib between 1.04 and 1.2.1, set addstr to 4) 54
55; Note : these value are good with a 8 bytes boundary pack structure 55 NbStack equ 76
56 56 cur_match equ dword ptr[esp+NbStack-0]
57 addstr equ 4+8 57 str_s equ dword ptr[esp+NbStack-4]
58 dep_chain_length equ 70h+addstr 58; 5 dword on top (ret,ebp,esi,edi,ebx)
59 dep_window equ 2ch+addstr 59 adrret equ dword ptr[esp+NbStack-8]
60 dep_strstart equ 60h+addstr 60 pushebp equ dword ptr[esp+NbStack-12]
61 dep_prev_length equ 6ch+addstr 61 pushedi equ dword ptr[esp+NbStack-16]
62 dep_nice_match equ 84h+addstr 62 pushesi equ dword ptr[esp+NbStack-20]
63 dep_w_size equ 20h+addstr 63 pushebx equ dword ptr[esp+NbStack-24]
64 dep_prev equ 34h+addstr 64
65 dep_w_mask equ 28h+addstr 65 chain_length equ dword ptr [esp+NbStack-28]
66 dep_good_match equ 80h+addstr 66 limit equ dword ptr [esp+NbStack-32]
67 dep_match_start equ 64h+addstr 67 best_len equ dword ptr [esp+NbStack-36]
68 dep_lookahead equ 68h+addstr 68 window equ dword ptr [esp+NbStack-40]
69 69 prev equ dword ptr [esp+NbStack-44]
70 70 scan_start equ word ptr [esp+NbStack-48]
71_TEXT segment 71 wmask equ dword ptr [esp+NbStack-52]
72 72 match_start_ptr equ dword ptr [esp+NbStack-56]
73IFDEF NOUNDERLINE 73 nice_match equ dword ptr [esp+NbStack-60]
74 public longest_match_7fff 74 scan equ dword ptr [esp+NbStack-64]
75 public longest_match_686 75
76; public match_init 76 windowlen equ dword ptr [esp+NbStack-68]
77ELSE 77 match_start equ dword ptr [esp+NbStack-72]
78 public _longest_match_7fff 78 strend equ dword ptr [esp+NbStack-76]
79 public _longest_match_686 79 NbStackAdd equ (NbStack-24)
80; public _match_init 80
81ENDIF 81 .386p
82 82
83 MAX_MATCH equ 258 83 name gvmatch
84 MIN_MATCH equ 3 84 .MODEL FLAT
85 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 85
86 86
87 87
88 88; all the +zlib1222add offsets are due to the addition of fields
89IFDEF NOUNDERLINE 89; in zlib in the deflate_state structure since the asm code was first written
90;match_init proc near 90; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
91; ret 91; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
92;match_init endp 92; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
93ELSE 93
94;_match_init proc near 94 zlib1222add equ 8
95; ret 95
96;_match_init endp 96; Note : these value are good with a 8 bytes boundary pack structure
97ENDIF 97 dep_chain_length equ 74h+zlib1222add
98 98 dep_window equ 30h+zlib1222add
99 99 dep_strstart equ 64h+zlib1222add
100IFDEF NOUNDERLINE 100 dep_prev_length equ 70h+zlib1222add
101longest_match_7fff proc near 101 dep_nice_match equ 88h+zlib1222add
102ELSE 102 dep_w_size equ 24h+zlib1222add
103_longest_match_7fff proc near 103 dep_prev equ 38h+zlib1222add
104ENDIF 104 dep_w_mask equ 2ch+zlib1222add
105 105 dep_good_match equ 84h+zlib1222add
106 mov edx,[esp+4] 106 dep_match_start equ 68h+zlib1222add
107 107 dep_lookahead equ 6ch+zlib1222add
108 108
109 109
110 push ebp 110_TEXT segment
111 push edi 111
112 push esi 112IFDEF NOUNDERLINE
113 push ebx 113 IFDEF NOOLDPENTIUMCODE
114 114 public longest_match
115 sub esp,NbStackAdd 115 public match_init
116 116 ELSE
117; initialize or check the variables used in match.asm. 117 public longest_match_7fff
118 mov ebp,edx 118 public cpudetect32
119 119 public longest_match_686
120; chain_length = s->max_chain_length 120 ENDIF
121; if (prev_length>=good_match) chain_length >>= 2 121ELSE
122 mov edx,[ebp+dep_chain_length] 122 IFDEF NOOLDPENTIUMCODE
123 mov ebx,[ebp+dep_prev_length] 123 public _longest_match
124 cmp [ebp+dep_good_match],ebx 124 public _match_init
125 ja noshr 125 ELSE
126 shr edx,2 126 public _longest_match_7fff
127noshr: 127 public _cpudetect32
128; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop 128 inc edx
129 inc edx 129 ENDIF
130 mov edi,[ebp+dep_nice_match] 130ENDIF
131 mov chain_length,edx 131
132 mov eax,[ebp+dep_lookahead] 132 MAX_MATCH equ 258
133 cmp eax,edi 133 MIN_MATCH equ 3
134; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 134 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
135 jae nolookaheadnicematch 135
136 mov edi,eax 136
137nolookaheadnicematch: 137
138; best_len = s->prev_length 138IFNDEF NOOLDPENTIUMCODE
139 mov best_len,ebx 139IFDEF NOUNDERLINE
140 140longest_match_7fff proc near
141; window = s->window 141ELSE
142 mov esi,[ebp+dep_window] 142_longest_match_7fff proc near
143 mov ecx,[ebp+dep_strstart] 143ENDIF
144 mov window,esi 144
145 145 mov edx,[esp+4]
146 mov nice_match,edi 146
147; scan = window + strstart 147
148 add esi,ecx 148
149 mov scan,esi 149 push ebp
150; dx = *scan 150 push edi
151 mov dx,word ptr [esi] 151 push esi
152; bx = *(scan+best_len-1) 152 push ebx
153 mov bx,word ptr [esi+ebx-1] 153
154 add esi,MAX_MATCH-1 154 sub esp,NbStackAdd
155; scan_start = *scan 155
156 mov scan_start,dx 156; initialize or check the variables used in match.asm.
157; strend = scan + MAX_MATCH-1 157 mov ebp,edx
158 mov strend,esi 158
159; bx = scan_end = *(window+best_len-1) 159; chain_length = s->max_chain_length
160 160; if (prev_length>=good_match) chain_length >>= 2
161; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 161 mov edx,[ebp+dep_chain_length]
162; s->strstart - (IPos)MAX_DIST(s) : NIL; 162 mov ebx,[ebp+dep_prev_length]
163 163 cmp [ebp+dep_good_match],ebx
164 mov esi,[ebp+dep_w_size] 164 ja noshr
165 sub esi,MIN_LOOKAHEAD 165 shr edx,2
166; here esi = MAX_DIST(s) 166noshr:
167 sub ecx,esi 167; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop
168 ja nodist 168 inc edx
169 xor ecx,ecx 169 mov edi,[ebp+dep_nice_match]
170nodist: 170 mov chain_length,edx
171 mov limit,ecx 171 mov eax,[ebp+dep_lookahead]
172 172 cmp eax,edi
173; prev = s->prev 173; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
174 mov edx,[ebp+dep_prev] 174 jae nolookaheadnicematch
175 mov prev,edx 175 mov edi,eax
176 176nolookaheadnicematch:
177; 177; best_len = s->prev_length
178 mov edx,dword ptr [ebp+dep_match_start] 178 mov best_len,ebx
179 mov bp,scan_start 179
180 mov eax,cur_match 180; window = s->window
181 mov match_start,edx 181 mov esi,[ebp+dep_window]
182 182 mov ecx,[ebp+dep_strstart]
183 mov edx,window 183 mov window,esi
184 mov edi,edx 184
185 add edi,best_len 185 mov nice_match,edi
186 mov esi,prev 186; scan = window + strstart
187 dec edi 187 add esi,ecx
188; windowlen = window + best_len -1 188 mov scan,esi
189 mov windowlen,edi 189; dx = *scan
190 190 mov dx,word ptr [esi]
191 jmp beginloop2 191; bx = *(scan+best_len-1)
192 align 4 192 mov bx,word ptr [esi+ebx-1]
193 193 add esi,MAX_MATCH-1
194; here, in the loop 194; scan_start = *scan
195; eax = ax = cur_match 195 mov scan_start,dx
196; ecx = limit 196; strend = scan + MAX_MATCH-1
197; bx = scan_end 197 mov strend,esi
198; bp = scan_start 198; bx = scan_end = *(window+best_len-1)
199; edi = windowlen (window + best_len -1) 199
200; esi = prev 200; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
201 201; s->strstart - (IPos)MAX_DIST(s) : NIL;
202 202
203;// here; chain_length <=16 203 mov esi,[ebp+dep_w_size]
204normalbeg0add16: 204 sub esi,MIN_LOOKAHEAD
205 add chain_length,16 205; here esi = MAX_DIST(s)
206 jz exitloop 206 sub ecx,esi
207normalbeg0: 207 ja nodist
208 cmp word ptr[edi+eax],bx 208 xor ecx,ecx
209 je normalbeg2noroll 209nodist:
210rcontlabnoroll: 210 mov limit,ecx
211; cur_match = prev[cur_match & wmask] 211
212 and eax,7fffh 212; prev = s->prev
213 mov ax,word ptr[esi+eax*2] 213 mov edx,[ebp+dep_prev]
214; if cur_match <= limit, go to exitloop 214 mov prev,edx
215 cmp ecx,eax 215
216 jnb exitloop 216;
217; if --chain_length != 0, continue at normalbeg0; otherwise exit 217 mov edx,dword ptr [ebp+dep_match_start]
218 dec chain_length 218 mov bp,scan_start
219 jnz normalbeg0 219 mov eax,cur_match
220 jmp exitloop 220 mov match_start,edx
221 221
222normalbeg2noroll: 222 mov edx,window
223; if (scan_start==*(cur_match+window)) goto normalbeg2 223 mov edi,edx
224 cmp bp,word ptr[edx+eax] 224 add edi,best_len
225 jne rcontlabnoroll 225 mov esi,prev
226 jmp normalbeg2 226 dec edi
227 227; windowlen = window + best_len -1
228contloop3: 228 mov windowlen,edi
229 mov edi,windowlen 229
230 230 jmp beginloop2
231; cur_match = prev[cur_match & wmask] 231 align 4
232 and eax,7fffh 232
233 mov ax,word ptr[esi+eax*2] 233; here, in the loop
234; if cur_match > limit, go to exitloop 234; eax = ax = cur_match
235 cmp ecx,eax 235; ecx = limit
236jnbexitloopshort1: 236; bx = scan_end
237 jnb exitloop 237; bp = scan_start
238; if --chain_length != 0, go to exitloop 238; edi = windowlen (window + best_len -1)
239 239; esi = prev
240 240
241; begin the main loop 241
242beginloop2: 242;// here; chain_length <=16
243 sub chain_length,16+1 243normalbeg0add16:
244; if chain_length <=16, don't use the unrolled loop 244 add chain_length,16
245 jna normalbeg0add16 245 jz exitloop
246 246normalbeg0:
247do16: 247 cmp word ptr[edi+eax],bx
248 cmp word ptr[edi+eax],bx 248 je normalbeg2noroll
249 je normalbeg2dc0 249rcontlabnoroll:
250 250; cur_match = prev[cur_match & wmask]
251maccn MACRO lab 251 and eax,7fffh
252 and eax,7fffh 252 mov ax,word ptr[esi+eax*2]
253 mov ax,word ptr[esi+eax*2] 253; if cur_match <= limit, go to exitloop
254 cmp ecx,eax 254 cmp ecx,eax
255 jnb exitloop 255 jnb exitloop
256 cmp word ptr[edi+eax],bx 256; if --chain_length != 0, continue at normalbeg0; otherwise exit
257 je lab 257 dec chain_length
258 ENDM 258 jnz normalbeg0
259 259 jmp exitloop
260rcontloop0: 260
261 maccn normalbeg2dc1 261normalbeg2noroll:
262 262; if (scan_start==*(cur_match+window)) goto normalbeg2
263rcontloop1: 263 cmp bp,word ptr[edx+eax]
264 maccn normalbeg2dc2 264 jne rcontlabnoroll
265 265 jmp normalbeg2
266rcontloop2: 266
267 maccn normalbeg2dc3 267contloop3:
268 268 mov edi,windowlen
269rcontloop3: 269
270 maccn normalbeg2dc4 270; cur_match = prev[cur_match & wmask]
271 271 and eax,7fffh
272rcontloop4: 272 mov ax,word ptr[esi+eax*2]
273 maccn normalbeg2dc5 273; if cur_match > limit, go to exitloop
274 274 cmp ecx,eax
275rcontloop5: 275jnbexitloopshort1:
276 maccn normalbeg2dc6 276 jnb exitloop
277 277; if --chain_length != 0, go to exitloop
278rcontloop6: 278
279 maccn normalbeg2dc7 279
280 280; begin the main loop
281rcontloop7: 281beginloop2:
282 maccn normalbeg2dc8 282 sub chain_length,16+1
283 283; if chain_length <=16, don't use the unrolled loop
284rcontloop8: 284 jna normalbeg0add16
285 maccn normalbeg2dc9 285
286 286do16:
287rcontloop9: 287 cmp word ptr[edi+eax],bx
288 maccn normalbeg2dc10 288 je normalbeg2dc0
289 289
290rcontloop10: 290maccn MACRO lab
291 maccn short normalbeg2dc11 291 and eax,7fffh
292 292 mov ax,word ptr[esi+eax*2]
293rcontloop11: 293 cmp ecx,eax
294 maccn short normalbeg2dc12 294 jnb exitloop
295 295 cmp word ptr[edi+eax],bx
296rcontloop12: 296 je lab
297 maccn short normalbeg2dc13 297 ENDM
298 298
299rcontloop13: 299rcontloop0:
300 maccn short normalbeg2dc14 300 maccn normalbeg2dc1
301 301
302rcontloop14: 302rcontloop1:
303 maccn short normalbeg2dc15 303 maccn normalbeg2dc2
304 304
305rcontloop15: 305rcontloop2:
306 and eax,7fffh 306 maccn normalbeg2dc3
307 mov ax,word ptr[esi+eax*2] 307
308 cmp ecx,eax 308rcontloop3:
309 jnb exitloop 309 maccn normalbeg2dc4
310 310
311 sub chain_length,16 311rcontloop4:
312 ja do16 312 maccn normalbeg2dc5
313 jmp normalbeg0add16 313
314 314rcontloop5:
315;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 315 maccn normalbeg2dc6
316 316
317normbeg MACRO rcontlab,valsub 317rcontloop6:
318; if we are here, we know that *(match+best_len-1) == scan_end 318 maccn normalbeg2dc7
319 cmp bp,word ptr[edx+eax] 319
320; if (match != scan_start) goto rcontlab 320rcontloop7:
321 jne rcontlab 321 maccn normalbeg2dc8
322; calculate the good chain_length, and we'll compare scan and match string 322
323 add chain_length,16-valsub 323rcontloop8:
324 jmp iseq 324 maccn normalbeg2dc9
325 ENDM 325
326 326rcontloop9:
327 327 maccn normalbeg2dc10
328normalbeg2dc11: 328
329 normbeg rcontloop11,11 329rcontloop10:
330 330 maccn short normalbeg2dc11
331normalbeg2dc12: 331
332 normbeg short rcontloop12,12 332rcontloop11:
333 333 maccn short normalbeg2dc12
334normalbeg2dc13: 334
335 normbeg short rcontloop13,13 335rcontloop12:
336 336 maccn short normalbeg2dc13
337normalbeg2dc14: 337
338 normbeg short rcontloop14,14 338rcontloop13:
339 339 maccn short normalbeg2dc14
340normalbeg2dc15: 340
341 normbeg short rcontloop15,15 341rcontloop14:
342 342 maccn short normalbeg2dc15
343normalbeg2dc10: 343
344 normbeg rcontloop10,10 344rcontloop15:
345 345 and eax,7fffh
346normalbeg2dc9: 346 mov ax,word ptr[esi+eax*2]
347 normbeg rcontloop9,9 347 cmp ecx,eax
348 348 jnb exitloop
349normalbeg2dc8: 349
350 normbeg rcontloop8,8 350 sub chain_length,16
351 351 ja do16
352normalbeg2dc7: 352 jmp normalbeg0add16
353 normbeg rcontloop7,7 353
354 354;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
355normalbeg2dc6: 355
356 normbeg rcontloop6,6 356normbeg MACRO rcontlab,valsub
357 357; if we are here, we know that *(match+best_len-1) == scan_end
358normalbeg2dc5: 358 cmp bp,word ptr[edx+eax]
359 normbeg rcontloop5,5 359; if (match != scan_start) goto rcontlab
360 360 jne rcontlab
361normalbeg2dc4: 361; calculate the good chain_length, and we'll compare scan and match string
362 normbeg rcontloop4,4 362 add chain_length,16-valsub
363 363 jmp iseq
364normalbeg2dc3: 364 ENDM
365 normbeg rcontloop3,3 365
366 366
367normalbeg2dc2: 367normalbeg2dc11:
368 normbeg rcontloop2,2 368 normbeg rcontloop11,11
369 369
370normalbeg2dc1: 370normalbeg2dc12:
371 normbeg rcontloop1,1 371 normbeg short rcontloop12,12
372 372
373normalbeg2dc0: 373normalbeg2dc13:
374 normbeg rcontloop0,0 374 normbeg short rcontloop13,13
375 375
376 376normalbeg2dc14:
377; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end 377 normbeg short rcontloop14,14
378 378
379normalbeg2: 379normalbeg2dc15:
380 mov edi,window 380 normbeg short rcontloop15,15
381 381
382 cmp bp,word ptr[edi+eax] 382normalbeg2dc10:
383 jne contloop3 ; if *(ushf*)match != scan_start, continue 383 normbeg rcontloop10,10
384 384
385iseq: 385normalbeg2dc9:
386; if we are here, we know that *(match+best_len-1) == scan_end 386 normbeg rcontloop9,9
387; and (match == scan_start) 387
388 388normalbeg2dc8:
389 mov edi,edx 389 normbeg rcontloop8,8
390 mov esi,scan ; esi = scan 390
391 add edi,eax ; edi = window + cur_match = match 391normalbeg2dc7:
392 392 normbeg rcontloop7,7
393 mov edx,[esi+3] ; compare manually dword at match+3 393
394 xor edx,[edi+3] ; and scan +3 394normalbeg2dc6:
395 395 normbeg rcontloop6,6
396 jz begincompare ; if equal, go to long compare 396
397 397normalbeg2dc5:
398; we will determine the unmatch byte and calculate len (in esi) 398 normbeg rcontloop5,5
399 or dl,dl 399
400 je eq1rr 400normalbeg2dc4:
401 mov esi,3 401 normbeg rcontloop4,4
402 jmp trfinval 402
403eq1rr: 403normalbeg2dc3:
404 or dx,dx 404 normbeg rcontloop3,3
405 je eq1 405
406 406normalbeg2dc2:
407 mov esi,4 407 normbeg rcontloop2,2
408 jmp trfinval 408
409eq1: 409normalbeg2dc1:
410 and edx,0ffffffh 410 normbeg rcontloop1,1
411 jz eq11 411
412 mov esi,5 412normalbeg2dc0:
413 jmp trfinval 413 normbeg rcontloop0,0
414eq11: 414
415 mov esi,6 415
416 jmp trfinval 416; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
417 417
418begincompare: 418normalbeg2:
419 ; here we now scan and match begin same 419 mov edi,window
420 add edi,6 420
421 add esi,6 421 cmp bp,word ptr[edi+eax]
422 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes 422 jne contloop3 ; if *(ushf*)match != scan_start, continue
423 repe cmpsd ; loop until mismatch 423
424 424iseq:
425 je trfin ; go to trfin if not unmatch 425; if we are here, we know that *(match+best_len-1) == scan_end
426; we determine the unmatch byte 426; and (match == scan_start)
427 sub esi,4 427
428 mov edx,[edi-4] 428 mov edi,edx
429 xor edx,[esi] 429 mov esi,scan ; esi = scan
430 430 add edi,eax ; edi = window + cur_match = match
431 or dl,dl 431
432 jnz trfin 432 mov edx,[esi+3] ; compare manually dword at match+3
433 inc esi 433 xor edx,[edi+3] ; and scan +3
434 434
435 or dx,dx 435 jz begincompare ; if equal, go to long compare
436 jnz trfin 436
437 inc esi 437; we will determine the unmatch byte and calculate len (in esi)
438 438 or dl,dl
439 and edx,0ffffffh 439 je eq1rr
440 jnz trfin 440 mov esi,3
441 inc esi 441 jmp trfinval
442 442eq1rr:
443trfin: 443 or dx,dx
444 sub esi,scan ; esi = len 444 je eq1
445trfinval: 445
446; here we have finished the compare, and esi contains the length of the equal string 446 mov esi,4
447 cmp esi,best_len ; if len > best_len, go newbestlen 447 jmp trfinval
448 ja short newbestlen 448eq1:
449; now we restore edx, ecx and esi, for the big loop 449 and edx,0ffffffh
450 mov esi,prev 450 jz eq11
451 mov ecx,limit 451 mov esi,5
452 mov edx,window 452 jmp trfinval
453 jmp contloop3 453eq11:
454 454 mov esi,6
455newbestlen: 455 jmp trfinval
456 mov best_len,esi ; len become best_len 456
457 457begincompare:
458 mov match_start,eax ; save new position as match_start 458 ; here we now scan and match begin same
459 cmp esi,nice_match ; if best_len >= nice_match, exit 459 add edi,6
460 jae exitloop 460 add esi,6
461 mov ecx,scan 461 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes
462 mov edx,window ; restore edx=window 462 repe cmpsd ; loop until mismatch
463 add ecx,esi 463
464 add esi,edx 464 je trfin ; go to trfin if not unmatch
465 465; we determine the unmatch byte
466 dec esi 466 sub esi,4
467 mov windowlen,esi ; windowlen = window + best_len-1 467 mov edx,[edi-4]
468 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end 468 xor edx,[esi]
469 469
470; now we restore ecx and esi, for the big loop : 470 or dl,dl
471 mov esi,prev 471 jnz trfin
472 mov ecx,limit 472 inc esi
473 jmp contloop3 473
474 474 or dx,dx
475exitloop: 475 jnz trfin
476; exit : s->match_start=match_start 476 inc esi
477 mov ebx,match_start 477
478 mov ebp,str_s 478 and edx,0ffffffh
479 mov ecx,best_len 479 jnz trfin
480 mov dword ptr [ebp+dep_match_start],ebx 480 inc esi
481 mov eax,dword ptr [ebp+dep_lookahead] 481
482 cmp ecx,eax 482trfin:
483 ja minexlo 483 sub esi,scan ; esi = len
484 mov eax,ecx 484trfinval:
485minexlo: 485; here we have finished the compare, and esi contains the length of the equal string
486; return min(best_len,s->lookahead) 486 cmp esi,best_len ; if len > best_len, go newbestlen
487 487 ja short newbestlen
488; restore stack and register ebx,esi,edi,ebp 488; now we restore edx, ecx and esi, for the big loop
489 add esp,NbStackAdd 489 mov esi,prev
490 490 mov ecx,limit
491 pop ebx 491 mov edx,window
492 pop esi 492 jmp contloop3
493 pop edi 493
494 pop ebp 494newbestlen:
495 ret 495 mov best_len,esi ; len become best_len
496InfoAuthor: 496
497; please don't remove this string ! 497 mov match_start,eax ; save new position as match_start
498; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary! 498 cmp esi,nice_match ; if best_len >= nice_match, exit
499 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah 499 jae exitloop
500 500 mov ecx,scan
501 501 mov edx,window ; restore edx=window
502 502 add ecx,esi
503IFDEF NOUNDERLINE 503 add esi,edx
504longest_match_7fff endp 504
505ELSE 505 dec esi
506_longest_match_7fff endp 506 mov windowlen,esi ; windowlen = window + best_len-1
507ENDIF 507 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end
508 508
509 509; now we restore ecx and esi, for the big loop :
510IFDEF NOUNDERLINE 510 mov esi,prev
511cpudetect32 proc near 511 mov ecx,limit
512ELSE 512 jmp contloop3
513_cpudetect32 proc near 513
514ENDIF 514exitloop:
515 515; exit : s->match_start=match_start
516 push ebx 516 mov ebx,match_start
517 517 mov ebp,str_s
518 pushfd ; push original EFLAGS 518 mov ecx,best_len
519 pop eax ; get original EFLAGS 519 mov dword ptr [ebp+dep_match_start],ebx
520 mov ecx, eax ; save original EFLAGS 520 mov eax,dword ptr [ebp+dep_lookahead]
521 xor eax, 40000h ; flip AC bit in EFLAGS 521 cmp ecx,eax
522 push eax ; save new EFLAGS value on stack 522 ja minexlo
523 popfd ; replace current EFLAGS value 523 mov eax,ecx
524 pushfd ; get new EFLAGS 524minexlo:
525 pop eax ; store new EFLAGS in EAX 525; return min(best_len,s->lookahead)
526 xor eax, ecx ; can’t toggle AC bit, processor=80386 526
527 jz end_cpu_is_386 ; jump if 80386 processor 527; restore stack and register ebx,esi,edi,ebp
528 push ecx 528 add esp,NbStackAdd
529 popfd ; restore AC bit in EFLAGS first 529
530 530 pop ebx
531 pushfd 531 pop esi
532 pushfd 532 pop edi
533 pop ecx 533 pop ebp
534 534 ret
535 mov eax, ecx ; get original EFLAGS 535InfoAuthor:
536 xor eax, 200000h ; flip ID bit in EFLAGS 536; please don't remove this string !
537 push eax ; save new EFLAGS value on stack 537; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary!
538 popfd ; replace current EFLAGS value 538 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
539 pushfd ; get new EFLAGS 539
540 pop eax ; store new EFLAGS in EAX 540
541 popfd ; restore original EFLAGS 541
542 xor eax, ecx ; can’t toggle ID bit, 542IFDEF NOUNDERLINE
543 je is_old_486 ; processor=old 543longest_match_7fff endp
544 544ELSE
545 mov eax,1 545_longest_match_7fff endp
546 db 0fh,0a2h ;CPUID 546ENDIF
547 547
548exitcpudetect: 548
549 pop ebx 549IFDEF NOUNDERLINE
550 ret 550cpudetect32 proc near
551 551ELSE
552end_cpu_is_386: 552_cpudetect32 proc near
553 mov eax,0300h 553ENDIF
554 jmp exitcpudetect 554
555 555 push ebx
556is_old_486: 556
557 mov eax,0400h 557 pushfd ; push original EFLAGS
558 jmp exitcpudetect 558 pop eax ; get original EFLAGS
559 559 mov ecx, eax ; save original EFLAGS
560IFDEF NOUNDERLINE 560 xor eax, 40000h ; flip AC bit in EFLAGS
561cpudetect32 endp 561 push eax ; save new EFLAGS value on stack
562ELSE 562 popfd ; replace current EFLAGS value
563_cpudetect32 endp 563 pushfd ; get new EFLAGS
564ENDIF 564 pop eax ; store new EFLAGS in EAX
565 565 xor eax, ecx ; can’t toggle AC bit, processor=80386
566 566 jz end_cpu_is_386 ; jump if 80386 processor
567 567 push ecx
568 568 popfd ; restore AC bit in EFLAGS first
569MAX_MATCH equ 258 569
570MIN_MATCH equ 3 570 pushfd
571MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) 571 pushfd
572MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) 572 pop ecx
573 573
574 574 mov eax, ecx ; get original EFLAGS
575;;; stack frame offsets 575 xor eax, 200000h ; flip ID bit in EFLAGS
576 576 push eax ; save new EFLAGS value on stack
577chainlenwmask equ esp + 0 ; high word: current chain len 577 popfd ; replace current EFLAGS value
578 ; low word: s->wmask 578 pushfd ; get new EFLAGS
579window equ esp + 4 ; local copy of s->window 579 pop eax ; store new EFLAGS in EAX
580windowbestlen equ esp + 8 ; s->window + bestlen 580 popfd ; restore original EFLAGS
581scanstart equ esp + 16 ; first two bytes of string 581 xor eax, ecx ; can’t toggle ID bit,
582scanend equ esp + 12 ; last two bytes of string 582 je is_old_486 ; processor=old
583scanalign equ esp + 20 ; dword-misalignment of string 583
584nicematch equ esp + 24 ; a good enough match size 584 mov eax,1
585bestlen equ esp + 28 ; size of best match so far 585 db 0fh,0a2h ;CPUID
586scan equ esp + 32 ; ptr to string wanting match 586
587 587exitcpudetect:
588LocalVarsSize equ 36 588 pop ebx
589; saved ebx byte esp + 36 589 ret
590; saved edi byte esp + 40 590
591; saved esi byte esp + 44 591end_cpu_is_386:
592; saved ebp byte esp + 48 592 mov eax,0300h
593; return address byte esp + 52 593 jmp exitcpudetect
594deflatestate equ esp + 56 ; the function arguments 594
595curmatch equ esp + 60 595is_old_486:
596 596 mov eax,0400h
597;;; Offsets for fields in the deflate_state structure. These numbers 597 jmp exitcpudetect
598;;; are calculated from the definition of deflate_state, with the 598
599;;; assumption that the compiler will dword-align the fields. (Thus, 599IFDEF NOUNDERLINE
600;;; changing the definition of deflate_state could easily cause this 600cpudetect32 endp
601;;; program to crash horribly, without so much as a warning at 601ELSE
602;;; compile time. Sigh.) 602_cpudetect32 endp
603 603ENDIF
604dsWSize equ 36+addstr-4 604ENDIF
605dsWMask equ 44+addstr-4 605
606dsWindow equ 48+addstr-4 606MAX_MATCH equ 258
607dsPrev equ 56+addstr-4 607MIN_MATCH equ 3
608dsMatchLen equ 88+addstr-4 608MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
609dsPrevMatch equ 92+addstr-4 609MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
610dsStrStart equ 100+addstr-4 610
611dsMatchStart equ 104+addstr-4 611
612dsLookahead equ 108+addstr-4 612;;; stack frame offsets
613dsPrevLen equ 112+addstr-4 613
614dsMaxChainLen equ 116+addstr-4 614chainlenwmask equ esp + 0 ; high word: current chain len
615dsGoodMatch equ 132+addstr-4 615 ; low word: s->wmask
616dsNiceMatch equ 136+addstr-4 616window equ esp + 4 ; local copy of s->window
617 617windowbestlen equ esp + 8 ; s->window + bestlen
618 618scanstart equ esp + 16 ; first two bytes of string
619;;; match.asm -- Pentium-Pro-optimized version of longest_match() 619scanend equ esp + 12 ; last two bytes of string
620;;; Written for zlib 1.1.2 620scanalign equ esp + 20 ; dword-misalignment of string
621;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 621nicematch equ esp + 24 ; a good enough match size
622;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html 622bestlen equ esp + 28 ; size of best match so far
623;;; 623scan equ esp + 32 ; ptr to string wanting match
624;;; This is free software; you can redistribute it and/or modify it 624
625;;; under the terms of the GNU General Public License. 625LocalVarsSize equ 36
626 626; saved ebx byte esp + 36
627;GLOBAL _longest_match, _match_init 627; saved edi byte esp + 40
628 628; saved esi byte esp + 44
629 629; saved ebp byte esp + 48
630;SECTION .text 630; return address byte esp + 52
631 631deflatestate equ esp + 56 ; the function arguments
632;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 632curmatch equ esp + 60
633 633
634;_longest_match: 634;;; Offsets for fields in the deflate_state structure. These numbers
635IFDEF NOUNDERLINE 635;;; are calculated from the definition of deflate_state, with the
636longest_match_686 proc near 636;;; assumption that the compiler will dword-align the fields. (Thus,
637ELSE 637;;; changing the definition of deflate_state could easily cause this
638_longest_match_686 proc near 638;;; program to crash horribly, without so much as a warning at
639ENDIF 639;;; compile time. Sigh.)
640 640
641 641dsWSize equ 36+zlib1222add
642;;; Save registers that the compiler may be using, and adjust esp to 642dsWMask equ 44+zlib1222add
643;;; make room for our stack frame. 643dsWindow equ 48+zlib1222add
644 644dsPrev equ 56+zlib1222add
645 push ebp 645dsMatchLen equ 88+zlib1222add
646 push edi 646dsPrevMatch equ 92+zlib1222add
647 push esi 647dsStrStart equ 100+zlib1222add
648 push ebx 648dsMatchStart equ 104+zlib1222add
649 sub esp, LocalVarsSize 649dsLookahead equ 108+zlib1222add
650 650dsPrevLen equ 112+zlib1222add
651;;; Retrieve the function arguments. ecx will hold cur_match 651dsMaxChainLen equ 116+zlib1222add
652;;; throughout the entire function. edx will hold the pointer to the 652dsGoodMatch equ 132+zlib1222add
653;;; deflate_state structure during the function's setup (before 653dsNiceMatch equ 136+zlib1222add
654;;; entering the main loop. 654
655 655
656 mov edx, [deflatestate] 656;;; match.asm -- Pentium-Pro-optimized version of longest_match()
657 mov ecx, [curmatch] 657;;; Written for zlib 1.1.2
658 658;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
659;;; uInt wmask = s->w_mask; 659;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
660;;; unsigned chain_length = s->max_chain_length; 660;;;
661;;; if (s->prev_length >= s->good_match) { 661;;; This is free software; you can redistribute it and/or modify it
662;;; chain_length >>= 2; 662;;; under the terms of the GNU General Public License.
663;;; } 663
664 664;GLOBAL _longest_match, _match_init
665 mov eax, [edx + dsPrevLen] 665
666 mov ebx, [edx + dsGoodMatch] 666
667 cmp eax, ebx 667;SECTION .text
668 mov eax, [edx + dsWMask] 668
669 mov ebx, [edx + dsMaxChainLen] 669;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
670 jl LastMatchGood 670
671 shr ebx, 2 671;_longest_match:
672LastMatchGood: 672IFDEF NOOLDPENTIUMCODE
673 673 IFDEF NOUNDERLINE
674;;; chainlen is decremented once beforehand so that the function can 674 longest_match proc near
675;;; use the sign flag instead of the zero flag for the exit test. 675 ELSE
676;;; It is then shifted into the high word, to make room for the wmask 676 _longest_match proc near
677;;; value, which it will always accompany. 677 ENDIF
678 678ELSE
679 dec ebx 679 IFDEF NOUNDERLINE
680 shl ebx, 16 680 longest_match_686 proc near
681 or ebx, eax 681 ELSE
682 mov [chainlenwmask], ebx 682 _longest_match_686 proc near
683 683 ENDIF
684;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 684ENDIF
685 685
686 mov eax, [edx + dsNiceMatch] 686;;; Save registers that the compiler may be using, and adjust esp to
687 mov ebx, [edx + dsLookahead] 687;;; make room for our stack frame.
688 cmp ebx, eax 688
689 jl LookaheadLess 689 push ebp
690 mov ebx, eax 690 push edi
691LookaheadLess: mov [nicematch], ebx 691 push esi
692 692 push ebx
693;;; register Bytef *scan = s->window + s->strstart; 693 sub esp, LocalVarsSize
694 694
695 mov esi, [edx + dsWindow] 695;;; Retrieve the function arguments. ecx will hold cur_match
696 mov [window], esi 696;;; throughout the entire function. edx will hold the pointer to the
697 mov ebp, [edx + dsStrStart] 697;;; deflate_state structure during the function's setup (before
698 lea edi, [esi + ebp] 698;;; entering the main loop).
699 mov [scan], edi 699
700 700 mov edx, [deflatestate]
701;;; Determine how many bytes the scan ptr is off from being 701 mov ecx, [curmatch]
702;;; dword-aligned. 702
703 703;;; uInt wmask = s->w_mask;
704 mov eax, edi 704;;; unsigned chain_length = s->max_chain_length;
705 neg eax 705;;; if (s->prev_length >= s->good_match) {
706 and eax, 3 706;;; chain_length >>= 2;
707 mov [scanalign], eax 707;;; }
708 708
709;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 709 mov eax, [edx + dsPrevLen]
710;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 710 mov ebx, [edx + dsGoodMatch]
711 711 cmp eax, ebx
712 mov eax, [edx + dsWSize] 712 mov eax, [edx + dsWMask]
713 sub eax, MIN_LOOKAHEAD 713 mov ebx, [edx + dsMaxChainLen]
714 sub ebp, eax 714 jl LastMatchGood
715 jg LimitPositive 715 shr ebx, 2
716 xor ebp, ebp 716LastMatchGood:
717LimitPositive: 717
718 718;;; chainlen is decremented once beforehand so that the function can
719;;; int best_len = s->prev_length; 719;;; use the sign flag instead of the zero flag for the exit test.
720 720;;; It is then shifted into the high word, to make room for the wmask
721 mov eax, [edx + dsPrevLen] 721;;; value, which it will always accompany.
722 mov [bestlen], eax 722
723 723 dec ebx
724;;; Store the sum of s->window + best_len in esi locally, and in esi. 724 shl ebx, 16
725 725 or ebx, eax
726 add esi, eax 726 mov [chainlenwmask], ebx
727 mov [windowbestlen], esi 727
728 728;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
729;;; register ush scan_start = *(ushf*)scan; 729
730;;; register ush scan_end = *(ushf*)(scan+best_len-1); 730 mov eax, [edx + dsNiceMatch]
731;;; Posf *prev = s->prev; 731 mov ebx, [edx + dsLookahead]
732 732 cmp ebx, eax
733 movzx ebx, word ptr [edi] 733 jl LookaheadLess
734 mov [scanstart], ebx 734 mov ebx, eax
735 movzx ebx, word ptr [edi + eax - 1] 735LookaheadLess: mov [nicematch], ebx
736 mov [scanend], ebx 736
737 mov edi, [edx + dsPrev] 737;;; register Bytef *scan = s->window + s->strstart;
738 738
739;;; Jump into the main loop. 739 mov esi, [edx + dsWindow]
740 740 mov [window], esi
741 mov edx, [chainlenwmask] 741 mov ebp, [edx + dsStrStart]
742 jmp short LoopEntry 742 lea edi, [esi + ebp]
743 743 mov [scan], edi
744align 4 744
745 745;;; Determine how many bytes the scan ptr is off from being
746;;; do { 746;;; dword-aligned.
747;;; match = s->window + cur_match; 747
748;;; if (*(ushf*)(match+best_len-1) != scan_end || 748 mov eax, edi
749;;; *(ushf*)match != scan_start) continue; 749 neg eax
750;;; [...] 750 and eax, 3
751;;; } while ((cur_match = prev[cur_match & wmask]) > limit 751 mov [scanalign], eax
752;;; && --chain_length != 0); 752
753;;; 753;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
754;;; Here is the inner loop of the function. The function will spend the 754;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
755;;; majority of its time in this loop, and majority of that time will 755
756;;; be spent in the first ten instructions. 756 mov eax, [edx + dsWSize]
757;;; 757 sub eax, MIN_LOOKAHEAD
758;;; Within this loop: 758 sub ebp, eax
759;;; ebx = scanend 759 jg LimitPositive
760;;; ecx = curmatch 760 xor ebp, ebp
761;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 761LimitPositive:
762;;; esi = windowbestlen - i.e., (window + bestlen) 762
763;;; edi = prev 763;;; int best_len = s->prev_length;
764;;; ebp = limit 764
765 765 mov eax, [edx + dsPrevLen]
766LookupLoop: 766 mov [bestlen], eax
767 and ecx, edx 767
768 movzx ecx, word ptr [edi + ecx*2] 768;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi.
769 cmp ecx, ebp 769
770 jbe LeaveNow 770 add esi, eax
771 sub edx, 00010000h 771 mov [windowbestlen], esi
772 js LeaveNow 772
773LoopEntry: movzx eax, word ptr [esi + ecx - 1] 773;;; register ush scan_start = *(ushf*)scan;
774 cmp eax, ebx 774;;; register ush scan_end = *(ushf*)(scan+best_len-1);
775 jnz LookupLoop 775;;; Posf *prev = s->prev;
776 mov eax, [window] 776
777 movzx eax, word ptr [eax + ecx] 777 movzx ebx, word ptr [edi]
778 cmp eax, [scanstart] 778 mov [scanstart], ebx
779 jnz LookupLoop 779 movzx ebx, word ptr [edi + eax - 1]
780 780 mov [scanend], ebx
781;;; Store the current value of chainlen. 781 mov edi, [edx + dsPrev]
782 782
783 mov [chainlenwmask], edx 783;;; Jump into the main loop.
784 784
785;;; Point edi to the string under scrutiny, and esi to the string we 785 mov edx, [chainlenwmask]
786;;; are hoping to match it up with. In actuality, esi and edi are 786 jmp short LoopEntry
787;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 787
788;;; initialized to -(MAX_MATCH_8 - scanalign). 788align 4
789 789
790 mov esi, [window] 790;;; do {
791 mov edi, [scan] 791;;; match = s->window + cur_match;
792 add esi, ecx 792;;; if (*(ushf*)(match+best_len-1) != scan_end ||
793 mov eax, [scanalign] 793;;; *(ushf*)match != scan_start) continue;
794 mov edx, 0fffffef8h; -(MAX_MATCH_8) 794;;; [...]
795 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] 795;;; } while ((cur_match = prev[cur_match & wmask]) > limit
796 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] 796;;; && --chain_length != 0);
797 797;;;
798;;; Test the strings for equality, 8 bytes at a time. At the end, 798;;; Here is the inner loop of the function. The function will spend the
799;;; adjust edx so that it is offset to the exact byte that mismatched. 799;;; majority of its time in this loop, and majority of that time will
800;;; 800;;; be spent in the first ten instructions.
801;;; We already know at this point that the first three bytes of the 801;;;
802;;; strings match each other, and they can be safely passed over before 802;;; Within this loop:
803;;; starting the compare loop. So what this code does is skip over 0-3 803;;; ebx = scanend
804;;; bytes, as much as necessary in order to dword-align the edi 804;;; ecx = curmatch
805;;; pointer. (esi will still be misaligned three times out of four.) 805;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
806;;; 806;;; esi = windowbestlen - i.e., (window + bestlen)
807;;; It should be confessed that this loop usually does not represent 807;;; edi = prev
808;;; much of the total running time. Replacing it with a more 808;;; ebp = limit
809;;; straightforward "rep cmpsb" would not drastically degrade 809
810;;; performance. 810LookupLoop:
811 811 and ecx, edx
812LoopCmps: 812 movzx ecx, word ptr [edi + ecx*2]
813 mov eax, [esi + edx] 813 cmp ecx, ebp
814 xor eax, [edi + edx] 814 jbe LeaveNow
815 jnz LeaveLoopCmps 815 sub edx, 00010000h
816 mov eax, [esi + edx + 4] 816 js LeaveNow
817 xor eax, [edi + edx + 4] 817LoopEntry: movzx eax, word ptr [esi + ecx - 1]
818 jnz LeaveLoopCmps4 818 cmp eax, ebx
819 add edx, 8 819 jnz LookupLoop
820 jnz LoopCmps 820 mov eax, [window]
821 jmp short LenMaximum 821 movzx eax, word ptr [eax + ecx]
822LeaveLoopCmps4: add edx, 4 822 cmp eax, [scanstart]
823LeaveLoopCmps: test eax, 0000FFFFh 823 jnz LookupLoop
824 jnz LenLower 824
825 add edx, 2 825;;; Store the current value of chainlen.
826 shr eax, 16 826
827LenLower: sub al, 1 827 mov [chainlenwmask], edx
828 adc edx, 0 828
829 829;;; Point edi to the string under scrutiny, and esi to the string we
830;;; Calculate the length of the match. If it is longer than MAX_MATCH, 830;;; are hoping to match it up with. In actuality, esi and edi are
831;;; then automatically accept it as the best possible match and leave. 831;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is
832 832;;; initialized to -MAX_MATCH_8.
833 lea eax, [edi + edx] 833
834 mov edi, [scan] 834 mov esi, [window]
835 sub eax, edi 835 mov edi, [scan]
836 cmp eax, MAX_MATCH 836 add esi, ecx
837 jge LenMaximum 837 mov eax, [scanalign]
838 838 mov edx, 0fffffef8h; -(MAX_MATCH_8)
839;;; If the length of the match is not longer than the best match we 839 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
840;;; have so far, then forget it and return to the lookup loop. 840 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
841 841
842 mov edx, [deflatestate] 842;;; Test the strings for equality, 8 bytes at a time. At the end,
843 mov ebx, [bestlen] 843;;; adjust edx so that it is offset to the exact byte that mismatched.
844 cmp eax, ebx 844;;;
845 jg LongerMatch 845;;; We already know at this point that the first three bytes of the
846 mov esi, [windowbestlen] 846;;; strings match each other, and they can be safely passed over before
847 mov edi, [edx + dsPrev] 847;;; starting the compare loop. So what this code does is skip over 0-3
848 mov ebx, [scanend] 848;;; bytes, as much as necessary in order to dword-align the edi
849 mov edx, [chainlenwmask] 849;;; pointer. (esi will still be misaligned three times out of four.)
850 jmp LookupLoop 850;;;
851 851;;; It should be confessed that this loop usually does not represent
852;;; s->match_start = cur_match; 852;;; much of the total running time. Replacing it with a more
853;;; best_len = len; 853;;; straightforward "rep cmpsb" would not drastically degrade
854;;; if (len >= nice_match) break; 854;;; performance.
855;;; scan_end = *(ushf*)(scan+best_len-1); 855
856 856LoopCmps:
857LongerMatch: mov ebx, [nicematch] 857 mov eax, [esi + edx]
858 mov [bestlen], eax 858 xor eax, [edi + edx]
859 mov [edx + dsMatchStart], ecx 859 jnz LeaveLoopCmps
860 cmp eax, ebx 860 mov eax, [esi + edx + 4]
861 jge LeaveNow 861 xor eax, [edi + edx + 4]
862 mov esi, [window] 862 jnz LeaveLoopCmps4
863 add esi, eax 863 add edx, 8
864 mov [windowbestlen], esi 864 jnz LoopCmps
865 movzx ebx, word ptr [edi + eax - 1] 865 jmp short LenMaximum
866 mov edi, [edx + dsPrev] 866LeaveLoopCmps4: add edx, 4
867 mov [scanend], ebx 867LeaveLoopCmps: test eax, 0000FFFFh
868 mov edx, [chainlenwmask] 868 jnz LenLower
869 jmp LookupLoop 869 add edx, 2
870 870 shr eax, 16
871;;; Accept the current string, with the maximum possible length. 871LenLower: sub al, 1
872 872 adc edx, 0
873LenMaximum: mov edx, [deflatestate] 873
874 mov dword ptr [bestlen], MAX_MATCH 874;;; Calculate the length of the match. If it is longer than MAX_MATCH,
875 mov [edx + dsMatchStart], ecx 875;;; then automatically accept it as the best possible match and leave.
876 876
877;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 877 lea eax, [edi + edx]
878;;; return s->lookahead; 878 mov edi, [scan]
879 879 sub eax, edi
880LeaveNow: 880 cmp eax, MAX_MATCH
881 mov edx, [deflatestate] 881 jge LenMaximum
882 mov ebx, [bestlen] 882
883 mov eax, [edx + dsLookahead] 883;;; If the length of the match is not longer than the best match we
884 cmp ebx, eax 884;;; have so far, then forget it and return to the lookup loop.
885 jg LookaheadRet 885
886 mov eax, ebx 886 mov edx, [deflatestate]
887LookaheadRet: 887 mov ebx, [bestlen]
888 888 cmp eax, ebx
889;;; Restore the stack and return from whence we came. 889 jg LongerMatch
890 890 mov esi, [windowbestlen]
891 add esp, LocalVarsSize 891 mov edi, [edx + dsPrev]
892 pop ebx 892 mov ebx, [scanend]
893 pop esi 893 mov edx, [chainlenwmask]
894 pop edi 894 jmp LookupLoop
895 pop ebp 895
896 896;;; s->match_start = cur_match;
897 ret 897;;; best_len = len;
898; please don't remove this string ! 898;;; if (len >= nice_match) break;
899; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! 899;;; scan_end = *(ushf*)(scan+best_len-1);
900 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah 900
901 901LongerMatch: mov ebx, [nicematch]
902IFDEF NOUNDERLINE 902 mov [bestlen], eax
903longest_match_686 endp 903 mov [edx + dsMatchStart], ecx
904ELSE 904 cmp eax, ebx
905_longest_match_686 endp 905 jge LeaveNow
906ENDIF 906 mov esi, [window]
907 907 add esi, eax
908_TEXT ends 908 mov [windowbestlen], esi
909end 909 movzx ebx, word ptr [edi + eax - 1]
910 mov edi, [edx + dsPrev]
911 mov [scanend], ebx
912 mov edx, [chainlenwmask]
913 jmp LookupLoop
914
915;;; Accept the current string, with the maximum possible length.
916
917LenMaximum: mov edx, [deflatestate]
918 mov dword ptr [bestlen], MAX_MATCH
919 mov [edx + dsMatchStart], ecx
920
921;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
922;;; return s->lookahead;
923
924LeaveNow:
925 mov edx, [deflatestate]
926 mov ebx, [bestlen]
927 mov eax, [edx + dsLookahead]
928 cmp ebx, eax
929 jg LookaheadRet
930 mov eax, ebx
931LookaheadRet:
932
933;;; Restore the stack and return from whence we came.
934
935 add esp, LocalVarsSize
936 pop ebx
937 pop esi
938 pop edi
939 pop ebp
940
941 ret
942; please don't remove this string !
943; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
944 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
945
946
947IFDEF NOOLDPENTIUMCODE
948 IFDEF NOUNDERLINE
949 longest_match endp
950 ELSE
951 _longest_match endp
952 ENDIF
953
954 IFDEF NOUNDERLINE
955 match_init proc near
956 ret
957 match_init endp
958 ELSE
959 _match_init proc near
960 ret
961 _match_init endp
962 ENDIF
963ELSE
964 IFDEF NOUNDERLINE
965 longest_match_686 endp
966 ELSE
967 _longest_match_686 endp
968 ENDIF
969ENDIF
970
971_TEXT ends
972end
diff --git a/contrib/masmx86/gvmat32c.c b/contrib/masmx86/gvmat32c.c
index 9ed25f3..7ad2b27 100644
--- a/contrib/masmx86/gvmat32c.c
+++ b/contrib/masmx86/gvmat32c.c
@@ -1,206 +1,62 @@
1/* gvmat32.c -- C portion of the optimized longest_match for 32 bits x86 1/* gvmat32.c -- C portion of the optimized longest_match for 32 bits x86
2 * Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. 2 * Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
3 * File written by Gilles Vollant, by modifiying the longest_match 3 * File written by Gilles Vollant, by modifiying the longest_match
4 * from Jean-loup Gailly in deflate.c 4 * from Jean-loup Gailly in deflate.c
5 * it prepare all parameters and call the assembly longest_match_gvasm 5 * it prepare all parameters and call the assembly longest_match_gvasm
6 * longest_match execute standard C code is wmask != 0x7fff 6 * longest_match execute standard C code is wmask != 0x7fff
7 * (assembly code is faster with a fixed wmask) 7 * (assembly code is faster with a fixed wmask)
8 * 8 *
9 */ 9 * Read comment at beginning of gvmat32.asm for more information
10 10 */
11#include "deflate.h" 11
12 12#if defined(ASMV) && (!defined(NOOLDPENTIUMCODE))
13#ifdef ASMV 13#include "deflate.h"
14#define NIL 0 14
15 15/* if your C compiler don't add underline before function name,
16#define UNALIGNED_OK 16 define ADD_UNDERLINE_ASMFUNC */
17 17#ifdef ADD_UNDERLINE_ASMFUNC
18 18#define longest_match_7fff _longest_match_7fff
19/* if your C compiler don't add underline before function name, 19#define longest_match_686 _longest_match_686
20 define ADD_UNDERLINE_ASMFUNC */ 20#define cpudetect32 _cpudetect32
21#ifdef ADD_UNDERLINE_ASMFUNC 21#endif
22#define longest_match_7fff _longest_match_7fff 22
23#define longest_match_686 _longest_match_686 23
24#define cpudetect32 _cpudetect32 24unsigned long cpudetect32();
25#endif 25
26 26uInt longest_match_c(
27 27 deflate_state *s,
28 28 IPos cur_match); /* current match */
29void match_init() 29
30{ 30
31} 31uInt longest_match_7fff(
32 32 deflate_state *s,
33unsigned long cpudetect32(); 33 IPos cur_match); /* current match */
34 34
35uInt longest_match_c( 35uInt longest_match_686(
36 deflate_state *s, 36 deflate_state *s,
37 IPos cur_match); /* current match */ 37 IPos cur_match); /* current match */
38 38
39 39
40uInt longest_match_7fff( 40static uInt iIsPPro=2;
41 deflate_state *s, 41
42 IPos cur_match); /* current match */ 42void match_init ()
43 43{
44uInt longest_match_686( 44 iIsPPro = (((cpudetect32()/0x100)&0xf)>=6) ? 1 : 0;
45 deflate_state *s, 45}
46 IPos cur_match); /* current match */ 46
47 47uInt longest_match(
48uInt longest_match( 48 deflate_state *s,
49 deflate_state *s, 49 IPos cur_match) /* current match */
50 IPos cur_match) /* current match */ 50{
51{ 51 if (iIsPPro!=0)
52 static uInt iIsPPro=2; 52 return longest_match_686(s,cur_match);
53 53
54 if ((s->w_mask == 0x7fff) && (iIsPPro==0)) 54 if (s->w_mask != 0x7fff)
55 return longest_match_7fff(s,cur_match); 55 return longest_match_686(s,cur_match);
56 56
57 if (iIsPPro==1) 57 /* now ((s->w_mask == 0x7fff) && (iIsPPro==0)) */
58 return longest_match_686(s,cur_match); 58 return longest_match_7fff(s,cur_match);
59 59}
60 if (iIsPPro==2) 60
61 iIsPPro = (((cpudetect32()/0x100)&0xf)>=6) ? 1 : 0; 61
62 62#endif /* defined(ASMV) && (!defined(NOOLDPENTIUMCODE)) */
63 return longest_match_c(s,cur_match);
64}
65
66
67
68uInt longest_match_c(s, cur_match)
69 deflate_state *s;
70 IPos cur_match; /* current match */
71{
72 unsigned chain_length = s->max_chain_length;/* max hash chain length */
73 register Bytef *scan = s->window + s->strstart; /* current string */
74 register Bytef *match; /* matched string */
75 register int len; /* length of current match */
76 int best_len = s->prev_length; /* best match length so far */
77 int nice_match = s->nice_match; /* stop if match long enough */
78 IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
79 s->strstart - (IPos)MAX_DIST(s) : NIL;
80 /* Stop when cur_match becomes <= limit. To simplify the code,
81 * we prevent matches with the string of window index 0.
82 */
83 Posf *prev = s->prev;
84 uInt wmask = s->w_mask;
85
86#ifdef UNALIGNED_OK
87 /* Compare two bytes at a time. Note: this is not always beneficial.
88 * Try with and without -DUNALIGNED_OK to check.
89 */
90 register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
91 register ush scan_start = *(ushf*)scan;
92 register ush scan_end = *(ushf*)(scan+best_len-1);
93#else
94 register Bytef *strend = s->window + s->strstart + MAX_MATCH;
95 register Byte scan_end1 = scan[best_len-1];
96 register Byte scan_end = scan[best_len];
97#endif
98
99 /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
100 * It is easy to get rid of this optimization if necessary.
101 */
102 Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
103
104 /* Do not waste too much time if we already have a good match: */
105 if (s->prev_length >= s->good_match) {
106 chain_length >>= 2;
107 }
108 /* Do not look for matches beyond the end of the input. This is necessary
109 * to make deflate deterministic.
110 */
111 if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
112
113 Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
114
115 do {
116 Assert(cur_match < s->strstart, "no future");
117 match = s->window + cur_match;
118
119 /* Skip to next match if the match length cannot increase
120 * or if the match length is less than 2:
121 */
122#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
123 /* This code assumes sizeof(unsigned short) == 2. Do not use
124 * UNALIGNED_OK if your compiler uses a different size.
125 */
126 if (*(ushf*)(match+best_len-1) != scan_end ||
127 *(ushf*)match != scan_start) continue;
128
129 /* It is not necessary to compare scan[2] and match[2] since they are
130 * always equal when the other bytes match, given that the hash keys
131 * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
132 * strstart+3, +5, ... up to strstart+257. We check for insufficient
133 * lookahead only every 4th comparison; the 128th check will be made
134 * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
135 * necessary to put more guard bytes at the end of the window, or
136 * to check more often for insufficient lookahead.
137 */
138 Assert(scan[2] == match[2], "scan[2]?");
139 scan++, match++;
140 do {
141 } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
142 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
143 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
144 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
145 scan < strend);
146 /* The funny "do {}" generates better code on most compilers */
147
148 /* Here, scan <= window+strstart+257 */
149 Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
150 if (*scan == *match) scan++;
151
152 len = (MAX_MATCH - 1) - (int)(strend-scan);
153 scan = strend - (MAX_MATCH-1);
154
155#else /* UNALIGNED_OK */
156
157 if (match[best_len] != scan_end ||
158 match[best_len-1] != scan_end1 ||
159 *match != *scan ||
160 *++match != scan[1]) continue;
161
162 /* The check at best_len-1 can be removed because it will be made
163 * again later. (This heuristic is not always a win.)
164 * It is not necessary to compare scan[2] and match[2] since they
165 * are always equal when the other bytes match, given that
166 * the hash keys are equal and that HASH_BITS >= 8.
167 */
168 scan += 2, match++;
169 Assert(*scan == *match, "match[2]?");
170
171 /* We check for insufficient lookahead only every 8th comparison;
172 * the 256th check will be made at strstart+258.
173 */
174 do {
175 } while (*++scan == *++match && *++scan == *++match &&
176 *++scan == *++match && *++scan == *++match &&
177 *++scan == *++match && *++scan == *++match &&
178 *++scan == *++match && *++scan == *++match &&
179 scan < strend);
180
181 Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
182
183 len = MAX_MATCH - (int)(strend - scan);
184 scan = strend - MAX_MATCH;
185
186#endif /* UNALIGNED_OK */
187
188 if (len > best_len) {
189 s->match_start = cur_match;
190 best_len = len;
191 if (len >= nice_match) break;
192#ifdef UNALIGNED_OK
193 scan_end = *(ushf*)(scan+best_len-1);
194#else
195 scan_end1 = scan[best_len-1];
196 scan_end = scan[best_len];
197#endif
198 }
199 } while ((cur_match = prev[cur_match & wmask]) > limit
200 && --chain_length != 0);
201
202 if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
203 return s->lookahead;
204}
205
206#endif /* ASMV */
diff --git a/contrib/masmx86/inffas32.asm b/contrib/masmx86/inffas32.asm
index 531bcef..4a20512 100644
--- a/contrib/masmx86/inffas32.asm
+++ b/contrib/masmx86/inffas32.asm
@@ -1,1036 +1,1083 @@
1; 75 "inffast.S" 1;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding
2;FILE "inffast.S" 2; *
3 3; * inffas32.asm is derived from inffas86.c, with translation of assembly code
4;;;GLOBAL _inflate_fast 4; *
5 5; * Copyright (C) 1995-2003 Mark Adler
6;;;SECTION .text 6; * For conditions of distribution and use, see copyright notice in zlib.h
7 7; *
8 8; * Copyright (C) 2003 Chris Anderson <christop@charm.net>
9 9; * Please use the copyright conditions above.
10 .586p 10; *
11 .mmx 11; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
12 12; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
13 name inflate_fast_x86 13; * the moment. I have successfully compiled and tested this code with gcc2.96,
14 .MODEL FLAT 14; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
15 15; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
16_DATA segment 16; * enabled. I will attempt to merge the MMX code into this version. Newer
17inflate_fast_use_mmx: 17; * versions of this and inffast.S can be found at
18 dd 1 18; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
19 19; *
20 20; * 2005 : modification by Gilles Vollant
21_TEXT segment 21; */
22PUBLIC _inflate_fast 22; For Visual C++ 4.x and higher and ML 6.x and higher
23 23; ml.exe is in directory \MASM611C of Win95 DDK
24ALIGN 4 24; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
25_inflate_fast: 25; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
26 jmp inflate_fast_entry 26;
27 27;
28 28; compile with command line option
29 29; ml /coff /Zi /c /Flinffas32.lst inffas32.asm
30ALIGN 4 30
31 db 'Fast decoding Code from Chris Anderson' 31; if you define NO_GZIP (see inflate.h), compile with
32 db 0 32; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm
33 33
34ALIGN 4 34
35invalid_literal_length_code_msg: 35; zlib1222sup is 0 for zlib 1.2.2.1 and lower
36 db 'invalid literal/length code' 36; zlib1222sup is 8 for zlib 1.2.2.2 and later (with addition of dmax and head
37 db 0 37; in inflate_state in inflate.h)
38 38zlib1222sup equ 8
39ALIGN 4 39
40invalid_distance_code_msg: 40
41 db 'invalid distance code' 41IFDEF GUNZIP
42 db 0 42 INFLATE_MODE_TYPE equ 11
43 43 INFLATE_MODE_BAD equ 26
44ALIGN 4 44ELSE
45invalid_distance_too_far_msg: 45 IFNDEF NO_GUNZIP
46 db 'invalid distance too far back' 46 INFLATE_MODE_TYPE equ 11
47 db 0 47 INFLATE_MODE_BAD equ 26
48 48 ELSE
49 49 INFLATE_MODE_TYPE equ 3
50ALIGN 4 50 INFLATE_MODE_BAD equ 17
51inflate_fast_mask: 51 ENDIF
52dd 0 52ENDIF
53dd 1 53
54dd 3 54
55dd 7 55; 75 "inffast.S"
56dd 15 56;FILE "inffast.S"
57dd 31 57
58dd 63 58;;;GLOBAL _inflate_fast
59dd 127 59
60dd 255 60;;;SECTION .text
61dd 511 61
62dd 1023 62
63dd 2047 63
64dd 4095 64 .586p
65dd 8191 65 .mmx
66dd 16383 66
67dd 32767 67 name inflate_fast_x86
68dd 65535 68 .MODEL FLAT
69dd 131071 69
70dd 262143 70_DATA segment
71dd 524287 71inflate_fast_use_mmx:
72dd 1048575 72 dd 1
73dd 2097151 73
74dd 4194303 74
75dd 8388607 75_TEXT segment
76dd 16777215 76PUBLIC _inflate_fast
77dd 33554431 77
78dd 67108863 78ALIGN 4
79dd 134217727 79_inflate_fast:
80dd 268435455 80 jmp inflate_fast_entry
81dd 536870911 81
82dd 1073741823 82
83dd 2147483647 83
84dd 4294967295 84ALIGN 4
85 85 db 'Fast decoding Code from Chris Anderson'
86 86 db 0
87; head was added in zlib 1.2.2.1, so we add addstr 87
88; set addstr to 0 with zlib 1.2.1 of below 88ALIGN 4
89addstr equ 4 89invalid_literal_length_code_msg:
90 90 db 'invalid literal/length code'
91mode_state equ 0 ;/* state->mode */ 91 db 0
92wsize_state equ 32+addstr ;/* state->wsize */ 92
93write_state equ (36+4+addstr) ;/* state->write */ 93ALIGN 4
94window_state equ (40+4+addstr) ;/* state->window */ 94invalid_distance_code_msg:
95hold_state equ (44+4+addstr) ;/* state->hold */ 95 db 'invalid distance code'
96bits_state equ (48+4+addstr) ;/* state->bits */ 96 db 0
97lencode_state equ (64+4+addstr) ;/* state->lencode */ 97
98distcode_state equ (68+4+addstr) ;/* state->distcode */ 98ALIGN 4
99lenbits_state equ (72+4+addstr) ;/* state->lenbits */ 99invalid_distance_too_far_msg:
100distbits_state equ (76+4+addstr) ;/* state->distbits */ 100 db 'invalid distance too far back'
101 101 db 0
102 102
103;;SECTION .text 103
104; 205 "inffast.S" 104ALIGN 4
105;GLOBAL inflate_fast_use_mmx 105inflate_fast_mask:
106 106dd 0
107;SECTION .data 107dd 1
108 108dd 3
109 109dd 7
110; GLOBAL inflate_fast_use_mmx:object 110dd 15
111;.size inflate_fast_use_mmx, 4 111dd 31
112; 226 "inffast.S" 112dd 63
113;SECTION .text 113dd 127
114 114dd 255
115ALIGN 4 115dd 511
116inflate_fast_entry: 116dd 1023
117 push edi 117dd 2047
118 push esi 118dd 4095
119 push ebp 119dd 8191
120 push ebx 120dd 16383
121 pushfd 121dd 32767
122 sub esp,64 122dd 65535
123 cld 123dd 131071
124 124dd 262143
125 125dd 524287
126 126dd 1048575
127 127dd 2097151
128 mov esi, [esp+88] 128dd 4194303
129 mov edi, [esi+28] 129dd 8388607
130 130dd 16777215
131 131dd 33554431
132 132dd 67108863
133 133dd 134217727
134 134dd 268435455
135 135dd 536870911
136 136dd 1073741823
137 mov edx, [esi+4] 137dd 2147483647
138 mov eax, [esi+0] 138dd 4294967295
139 139
140 add edx,eax 140
141 sub edx,11 141mode_state equ 0 ;/* state->mode */
142 142wsize_state equ (32+zlib1222sup) ;/* state->wsize */
143 mov [esp+44],eax 143write_state equ (36+4+zlib1222sup) ;/* state->write */
144 mov [esp+20],edx 144window_state equ (40+4+zlib1222sup) ;/* state->window */
145 145hold_state equ (44+4+zlib1222sup) ;/* state->hold */
146 mov ebp, [esp+92] 146bits_state equ (48+4+zlib1222sup) ;/* state->bits */
147 mov ecx, [esi+16] 147lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */
148 mov ebx, [esi+12] 148distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */
149 149lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */
150 sub ebp,ecx 150distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */
151 neg ebp 151
152 add ebp,ebx 152
153 153;;SECTION .text
154 sub ecx,257 154; 205 "inffast.S"
155 add ecx,ebx 155;GLOBAL inflate_fast_use_mmx
156 156
157 mov [esp+60],ebx 157;SECTION .data
158 mov [esp+40],ebp 158
159 mov [esp+16],ecx 159
160; 285 "inffast.S" 160; GLOBAL inflate_fast_use_mmx:object
161 mov eax, [edi+lencode_state] 161;.size inflate_fast_use_mmx, 4
162 mov ecx, [edi+distcode_state] 162; 226 "inffast.S"
163 163;SECTION .text
164 mov [esp+8],eax 164
165 mov [esp+12],ecx 165ALIGN 4
166 166inflate_fast_entry:
167 mov eax,1 167 push edi
168 mov ecx, [edi+lenbits_state] 168 push esi
169 shl eax,cl 169 push ebp
170 dec eax 170 push ebx
171 mov [esp+0],eax 171 pushfd
172 172 sub esp,64
173 mov eax,1 173 cld
174 mov ecx, [edi+distbits_state] 174
175 shl eax,cl 175
176 dec eax 176
177 mov [esp+4],eax 177
178 178 mov esi, [esp+88]
179 mov eax, [edi+wsize_state] 179 mov edi, [esi+28]
180 mov ecx, [edi+write_state] 180
181 mov edx, [edi+window_state] 181
182 182
183 mov [esp+52],eax 183
184 mov [esp+48],ecx 184
185 mov [esp+56],edx 185
186 186
187 mov ebp, [edi+hold_state] 187 mov edx, [esi+4]
188 mov ebx, [edi+bits_state] 188 mov eax, [esi+0]
189; 321 "inffast.S" 189
190 mov esi, [esp+44] 190 add edx,eax
191 mov ecx, [esp+20] 191 sub edx,11
192 cmp ecx,esi 192
193 ja L_align_long 193 mov [esp+44],eax
194 194 mov [esp+20],edx
195 add ecx,11 195
196 sub ecx,esi 196 mov ebp, [esp+92]
197 mov eax,12 197 mov ecx, [esi+16]
198 sub eax,ecx 198 mov ebx, [esi+12]
199 lea edi, [esp+28] 199
200 rep movsb 200 sub ebp,ecx
201 mov ecx,eax 201 neg ebp
202 xor eax,eax 202 add ebp,ebx
203 rep stosb 203
204 lea esi, [esp+28] 204 sub ecx,257
205 mov [esp+20],esi 205 add ecx,ebx
206 jmp L_is_aligned 206
207 207 mov [esp+60],ebx
208 208 mov [esp+40],ebp
209L_align_long: 209 mov [esp+16],ecx
210 test esi,3 210; 285 "inffast.S"
211 jz L_is_aligned 211 mov eax, [edi+lencode_state]
212 xor eax,eax 212 mov ecx, [edi+distcode_state]
213 mov al, [esi] 213
214 inc esi 214 mov [esp+8],eax
215 mov ecx,ebx 215 mov [esp+12],ecx
216 add ebx,8 216
217 shl eax,cl 217 mov eax,1
218 or ebp,eax 218 mov ecx, [edi+lenbits_state]
219 jmp L_align_long 219 shl eax,cl
220 220 dec eax
221L_is_aligned: 221 mov [esp+0],eax
222 mov edi, [esp+60] 222
223; 366 "inffast.S" 223 mov eax,1
224L_check_mmx: 224 mov ecx, [edi+distbits_state]
225 cmp dword ptr [inflate_fast_use_mmx],2 225 shl eax,cl
226 je L_init_mmx 226 dec eax
227 ja L_do_loop 227 mov [esp+4],eax
228 228
229 push eax 229 mov eax, [edi+wsize_state]
230 push ebx 230 mov ecx, [edi+write_state]
231 push ecx 231 mov edx, [edi+window_state]
232 push edx 232
233 pushfd 233 mov [esp+52],eax
234 mov eax, [esp] 234 mov [esp+48],ecx
235 xor dword ptr [esp],0200000h 235 mov [esp+56],edx
236 236
237 237 mov ebp, [edi+hold_state]
238 238 mov ebx, [edi+bits_state]
239 239; 321 "inffast.S"
240 popfd 240 mov esi, [esp+44]
241 pushfd 241 mov ecx, [esp+20]
242 pop edx 242 cmp ecx,esi
243 xor edx,eax 243 ja L_align_long
244 jz L_dont_use_mmx 244
245 xor eax,eax 245 add ecx,11
246 cpuid 246 sub ecx,esi
247 cmp ebx,0756e6547h 247 mov eax,12
248 jne L_dont_use_mmx 248 sub eax,ecx
249 cmp ecx,06c65746eh 249 lea edi, [esp+28]
250 jne L_dont_use_mmx 250 rep movsb
251 cmp edx,049656e69h 251 mov ecx,eax
252 jne L_dont_use_mmx 252 xor eax,eax
253 mov eax,1 253 rep stosb
254 cpuid 254 lea esi, [esp+28]
255 shr eax,8 255 mov [esp+20],esi
256 and eax,15 256 jmp L_is_aligned
257 cmp eax,6 257
258 jne L_dont_use_mmx 258
259 test edx,0800000h 259L_align_long:
260 jnz L_use_mmx 260 test esi,3
261 jmp L_dont_use_mmx 261 jz L_is_aligned
262L_use_mmx: 262 xor eax,eax
263 mov dword ptr [inflate_fast_use_mmx],2 263 mov al, [esi]
264 jmp L_check_mmx_pop 264 inc esi
265L_dont_use_mmx: 265 mov ecx,ebx
266 mov dword ptr [inflate_fast_use_mmx],3 266 add ebx,8
267L_check_mmx_pop: 267 shl eax,cl
268 pop edx 268 or ebp,eax
269 pop ecx 269 jmp L_align_long
270 pop ebx 270
271 pop eax 271L_is_aligned:
272 jmp L_check_mmx 272 mov edi, [esp+60]
273; 426 "inffast.S" 273; 366 "inffast.S"
274ALIGN 4 274L_check_mmx:
275L_do_loop: 275 cmp dword ptr [inflate_fast_use_mmx],2
276; 437 "inffast.S" 276 je L_init_mmx
277 cmp bl,15 277 ja L_do_loop
278 ja L_get_length_code 278
279 279 push eax
280 xor eax,eax 280 push ebx
281 lodsw 281 push ecx
282 mov cl,bl 282 push edx
283 add bl,16 283 pushfd
284 shl eax,cl 284 mov eax, [esp]
285 or ebp,eax 285 xor dword ptr [esp],0200000h
286 286
287L_get_length_code: 287
288 mov edx, [esp+0] 288
289 mov ecx, [esp+8] 289
290 and edx,ebp 290 popfd
291 mov eax, [ecx+edx*4] 291 pushfd
292 292 pop edx
293L_dolen: 293 xor edx,eax
294 294 jz L_dont_use_mmx
295 295 xor eax,eax
296 296 cpuid
297 297 cmp ebx,0756e6547h
298 298 jne L_dont_use_mmx
299 299 cmp ecx,06c65746eh
300 mov cl,ah 300 jne L_dont_use_mmx
301 sub bl,ah 301 cmp edx,049656e69h
302 shr ebp,cl 302 jne L_dont_use_mmx
303 303 mov eax,1
304 304 cpuid
305 305 shr eax,8
306 306 and eax,15
307 307 cmp eax,6
308 308 jne L_dont_use_mmx
309 test al,al 309 test edx,0800000h
310 jnz L_test_for_length_base 310 jnz L_use_mmx
311 311 jmp L_dont_use_mmx
312 shr eax,16 312L_use_mmx:
313 stosb 313 mov dword ptr [inflate_fast_use_mmx],2
314 314 jmp L_check_mmx_pop
315L_while_test: 315L_dont_use_mmx:
316 316 mov dword ptr [inflate_fast_use_mmx],3
317 317L_check_mmx_pop:
318 cmp [esp+16],edi 318 pop edx
319 jbe L_break_loop 319 pop ecx
320 320 pop ebx
321 cmp [esp+20],esi 321 pop eax
322 ja L_do_loop 322 jmp L_check_mmx
323 jmp L_break_loop 323; 426 "inffast.S"
324 324ALIGN 4
325L_test_for_length_base: 325L_do_loop:
326; 502 "inffast.S" 326; 437 "inffast.S"
327 mov edx,eax 327 cmp bl,15
328 shr edx,16 328 ja L_get_length_code
329 mov cl,al 329
330 330 xor eax,eax
331 test al,16 331 lodsw
332 jz L_test_for_second_level_length 332 mov cl,bl
333 and cl,15 333 add bl,16
334 jz L_save_len 334 shl eax,cl
335 cmp bl,cl 335 or ebp,eax
336 jae L_add_bits_to_len 336
337 337L_get_length_code:
338 mov ch,cl 338 mov edx, [esp+0]
339 xor eax,eax 339 mov ecx, [esp+8]
340 lodsw 340 and edx,ebp
341 mov cl,bl 341 mov eax, [ecx+edx*4]
342 add bl,16 342
343 shl eax,cl 343L_dolen:
344 or ebp,eax 344
345 mov cl,ch 345
346 346
347L_add_bits_to_len: 347
348 mov eax,1 348
349 shl eax,cl 349
350 dec eax 350 mov cl,ah
351 sub bl,cl 351 sub bl,ah
352 and eax,ebp 352 shr ebp,cl
353 shr ebp,cl 353
354 add edx,eax 354
355 355
356L_save_len: 356
357 mov [esp+24],edx 357
358 358
359 359 test al,al
360L_decode_distance: 360 jnz L_test_for_length_base
361; 549 "inffast.S" 361
362 cmp bl,15 362 shr eax,16
363 ja L_get_distance_code 363 stosb
364 364
365 xor eax,eax 365L_while_test:
366 lodsw 366
367 mov cl,bl 367
368 add bl,16 368 cmp [esp+16],edi
369 shl eax,cl 369 jbe L_break_loop
370 or ebp,eax 370
371 371 cmp [esp+20],esi
372L_get_distance_code: 372 ja L_do_loop
373 mov edx, [esp+4] 373 jmp L_break_loop
374 mov ecx, [esp+12] 374
375 and edx,ebp 375L_test_for_length_base:
376 mov eax, [ecx+edx*4] 376; 502 "inffast.S"
377 377 mov edx,eax
378 378 shr edx,16
379L_dodist: 379 mov cl,al
380 mov edx,eax 380
381 shr edx,16 381 test al,16
382 mov cl,ah 382 jz L_test_for_second_level_length
383 sub bl,ah 383 and cl,15
384 shr ebp,cl 384 jz L_save_len
385; 584 "inffast.S" 385 cmp bl,cl
386 mov cl,al 386 jae L_add_bits_to_len
387 387
388 test al,16 388 mov ch,cl
389 jz L_test_for_second_level_dist 389 xor eax,eax
390 and cl,15 390 lodsw
391 jz L_check_dist_one 391 mov cl,bl
392 cmp bl,cl 392 add bl,16
393 jae L_add_bits_to_dist 393 shl eax,cl
394 394 or ebp,eax
395 mov ch,cl 395 mov cl,ch
396 xor eax,eax 396
397 lodsw 397L_add_bits_to_len:
398 mov cl,bl 398 mov eax,1
399 add bl,16 399 shl eax,cl
400 shl eax,cl 400 dec eax
401 or ebp,eax 401 sub bl,cl
402 mov cl,ch 402 and eax,ebp
403 403 shr ebp,cl
404L_add_bits_to_dist: 404 add edx,eax
405 mov eax,1 405
406 shl eax,cl 406L_save_len:
407 dec eax 407 mov [esp+24],edx
408 sub bl,cl 408
409 and eax,ebp 409
410 shr ebp,cl 410L_decode_distance:
411 add edx,eax 411; 549 "inffast.S"
412 jmp L_check_window 412 cmp bl,15
413 413 ja L_get_distance_code
414L_check_window: 414
415; 625 "inffast.S" 415 xor eax,eax
416 mov [esp+44],esi 416 lodsw
417 mov eax,edi 417 mov cl,bl
418 sub eax, [esp+40] 418 add bl,16
419 419 shl eax,cl
420 cmp eax,edx 420 or ebp,eax
421 jb L_clip_window 421
422 422L_get_distance_code:
423 mov ecx, [esp+24] 423 mov edx, [esp+4]
424 mov esi,edi 424 mov ecx, [esp+12]
425 sub esi,edx 425 and edx,ebp
426 426 mov eax, [ecx+edx*4]
427 sub ecx,3 427
428 mov al, [esi] 428
429 mov [edi],al 429L_dodist:
430 mov al, [esi+1] 430 mov edx,eax
431 mov dl, [esi+2] 431 shr edx,16
432 add esi,3 432 mov cl,ah
433 mov [edi+1],al 433 sub bl,ah
434 mov [edi+2],dl 434 shr ebp,cl
435 add edi,3 435; 584 "inffast.S"
436 rep movsb 436 mov cl,al
437 437
438 mov esi, [esp+44] 438 test al,16
439 jmp L_while_test 439 jz L_test_for_second_level_dist
440 440 and cl,15
441ALIGN 4 441 jz L_check_dist_one
442L_check_dist_one: 442 cmp bl,cl
443 cmp edx,1 443 jae L_add_bits_to_dist
444 jne L_check_window 444
445 cmp [esp+40],edi 445 mov ch,cl
446 je L_check_window 446 xor eax,eax
447 447 lodsw
448 dec edi 448 mov cl,bl
449 mov ecx, [esp+24] 449 add bl,16
450 mov al, [edi] 450 shl eax,cl
451 sub ecx,3 451 or ebp,eax
452 452 mov cl,ch
453 mov [edi+1],al 453
454 mov [edi+2],al 454L_add_bits_to_dist:
455 mov [edi+3],al 455 mov eax,1
456 add edi,4 456 shl eax,cl
457 rep stosb 457 dec eax
458 458 sub bl,cl
459 jmp L_while_test 459 and eax,ebp
460 460 shr ebp,cl
461ALIGN 4 461 add edx,eax
462L_test_for_second_level_length: 462 jmp L_check_window
463 463
464 464L_check_window:
465 465; 625 "inffast.S"
466 466 mov [esp+44],esi
467 test al,64 467 mov eax,edi
468 jnz L_test_for_end_of_block 468 sub eax, [esp+40]
469 469
470 mov eax,1 470 cmp eax,edx
471 shl eax,cl 471 jb L_clip_window
472 dec eax 472
473 and eax,ebp 473 mov ecx, [esp+24]
474 add eax,edx 474 mov esi,edi
475 mov edx, [esp+8] 475 sub esi,edx
476 mov eax, [edx+eax*4] 476
477 jmp L_dolen 477 sub ecx,3
478 478 mov al, [esi]
479ALIGN 4 479 mov [edi],al
480L_test_for_second_level_dist: 480 mov al, [esi+1]
481 481 mov dl, [esi+2]
482 482 add esi,3
483 483 mov [edi+1],al
484 484 mov [edi+2],dl
485 test al,64 485 add edi,3
486 jnz L_invalid_distance_code 486 rep movsb
487 487
488 mov eax,1 488 mov esi, [esp+44]
489 shl eax,cl 489 jmp L_while_test
490 dec eax 490
491 and eax,ebp 491ALIGN 4
492 add eax,edx 492L_check_dist_one:
493 mov edx, [esp+12] 493 cmp edx,1
494 mov eax, [edx+eax*4] 494 jne L_check_window
495 jmp L_dodist 495 cmp [esp+40],edi
496 496 je L_check_window
497ALIGN 4 497
498L_clip_window: 498 dec edi
499; 721 "inffast.S" 499 mov ecx, [esp+24]
500 mov ecx,eax 500 mov al, [edi]
501 mov eax, [esp+52] 501 sub ecx,3
502 neg ecx 502
503 mov esi, [esp+56] 503 mov [edi+1],al
504 504 mov [edi+2],al
505 cmp eax,edx 505 mov [edi+3],al
506 jb L_invalid_distance_too_far 506 add edi,4
507 507 rep stosb
508 add ecx,edx 508
509 cmp dword ptr [esp+48],0 509 jmp L_while_test
510 jne L_wrap_around_window 510
511 511ALIGN 4
512 sub eax,ecx 512L_test_for_second_level_length:
513 add esi,eax 513
514; 749 "inffast.S" 514
515 mov eax, [esp+24] 515
516 cmp eax,ecx 516
517 jbe L_do_copy1 517 test al,64
518 518 jnz L_test_for_end_of_block
519 sub eax,ecx 519
520 rep movsb 520 mov eax,1
521 mov esi,edi 521 shl eax,cl
522 sub esi,edx 522 dec eax
523 jmp L_do_copy1 523 and eax,ebp
524 524 add eax,edx
525 cmp eax,ecx 525 mov edx, [esp+8]
526 jbe L_do_copy1 526 mov eax, [edx+eax*4]
527 527 jmp L_dolen
528 sub eax,ecx 528
529 rep movsb 529ALIGN 4
530 mov esi,edi 530L_test_for_second_level_dist:
531 sub esi,edx 531
532 jmp L_do_copy1 532
533 533
534L_wrap_around_window: 534
535; 793 "inffast.S" 535 test al,64
536 mov eax, [esp+48] 536 jnz L_invalid_distance_code
537 cmp ecx,eax 537
538 jbe L_contiguous_in_window 538 mov eax,1
539 539 shl eax,cl
540 add esi, [esp+52] 540 dec eax
541 add esi,eax 541 and eax,ebp
542 sub esi,ecx 542 add eax,edx
543 sub ecx,eax 543 mov edx, [esp+12]
544 544 mov eax, [edx+eax*4]
545 545 jmp L_dodist
546 mov eax, [esp+24] 546
547 cmp eax,ecx 547ALIGN 4
548 jbe L_do_copy1 548L_clip_window:
549 549; 721 "inffast.S"
550 sub eax,ecx 550 mov ecx,eax
551 rep movsb 551 mov eax, [esp+52]
552 mov esi, [esp+56] 552 neg ecx
553 mov ecx, [esp+48] 553 mov esi, [esp+56]
554 cmp eax,ecx 554
555 jbe L_do_copy1 555 cmp eax,edx
556 556 jb L_invalid_distance_too_far
557 sub eax,ecx 557
558 rep movsb 558 add ecx,edx
559 mov esi,edi 559 cmp dword ptr [esp+48],0
560 sub esi,edx 560 jne L_wrap_around_window
561 jmp L_do_copy1 561
562 562 sub eax,ecx
563L_contiguous_in_window: 563 add esi,eax
564; 836 "inffast.S" 564; 749 "inffast.S"
565 add esi,eax 565 mov eax, [esp+24]
566 sub esi,ecx 566 cmp eax,ecx
567 567 jbe L_do_copy1
568 568
569 mov eax, [esp+24] 569 sub eax,ecx
570 cmp eax,ecx 570 rep movsb
571 jbe L_do_copy1 571 mov esi,edi
572 572 sub esi,edx
573 sub eax,ecx 573 jmp L_do_copy1
574 rep movsb 574
575 mov esi,edi 575 cmp eax,ecx
576 sub esi,edx 576 jbe L_do_copy1
577 577
578L_do_copy1: 578 sub eax,ecx
579; 862 "inffast.S" 579 rep movsb
580 mov ecx,eax 580 mov esi,edi
581 rep movsb 581 sub esi,edx
582 582 jmp L_do_copy1
583 mov esi, [esp+44] 583
584 jmp L_while_test 584L_wrap_around_window:
585; 878 "inffast.S" 585; 793 "inffast.S"
586ALIGN 4 586 mov eax, [esp+48]
587L_init_mmx: 587 cmp ecx,eax
588 emms 588 jbe L_contiguous_in_window
589 589
590 590 add esi, [esp+52]
591 591 add esi,eax
592 592 sub esi,ecx
593 593 sub ecx,eax
594 movd mm0,ebp 594
595 mov ebp,ebx 595
596; 896 "inffast.S" 596 mov eax, [esp+24]
597 movd mm4,[esp+0] 597 cmp eax,ecx
598 movq mm3,mm4 598 jbe L_do_copy1
599 movd mm5,[esp+4] 599
600 movq mm2,mm5 600 sub eax,ecx
601 pxor mm1,mm1 601 rep movsb
602 mov ebx, [esp+8] 602 mov esi, [esp+56]
603 jmp L_do_loop_mmx 603 mov ecx, [esp+48]
604 604 cmp eax,ecx
605ALIGN 4 605 jbe L_do_copy1
606L_do_loop_mmx: 606
607 psrlq mm0,mm1 607 sub eax,ecx
608 608 rep movsb
609 cmp ebp,32 609 mov esi,edi
610 ja L_get_length_code_mmx 610 sub esi,edx
611 611 jmp L_do_copy1
612 movd mm6,ebp 612
613 movd mm7,[esi] 613L_contiguous_in_window:
614 add esi,4 614; 836 "inffast.S"
615 psllq mm7,mm6 615 add esi,eax
616 add ebp,32 616 sub esi,ecx
617 por mm0,mm7 617
618 618
619L_get_length_code_mmx: 619 mov eax, [esp+24]
620 pand mm4,mm0 620 cmp eax,ecx
621 movd eax,mm4 621 jbe L_do_copy1
622 movq mm4,mm3 622
623 mov eax, [ebx+eax*4] 623 sub eax,ecx
624 624 rep movsb
625L_dolen_mmx: 625 mov esi,edi
626 movzx ecx,ah 626 sub esi,edx
627 movd mm1,ecx 627
628 sub ebp,ecx 628L_do_copy1:
629 629; 862 "inffast.S"
630 test al,al 630 mov ecx,eax
631 jnz L_test_for_length_base_mmx 631 rep movsb
632 632
633 shr eax,16 633 mov esi, [esp+44]
634 stosb 634 jmp L_while_test
635 635; 878 "inffast.S"
636L_while_test_mmx: 636ALIGN 4
637 637L_init_mmx:
638 638 emms
639 cmp [esp+16],edi 639
640 jbe L_break_loop 640
641 641
642 cmp [esp+20],esi 642
643 ja L_do_loop_mmx 643
644 jmp L_break_loop 644 movd mm0,ebp
645 645 mov ebp,ebx
646L_test_for_length_base_mmx: 646; 896 "inffast.S"
647 647 movd mm4,[esp+0]
648 mov edx,eax 648 movq mm3,mm4
649 shr edx,16 649 movd mm5,[esp+4]
650 650 movq mm2,mm5
651 test al,16 651 pxor mm1,mm1
652 jz L_test_for_second_level_length_mmx 652 mov ebx, [esp+8]
653 and eax,15 653 jmp L_do_loop_mmx
654 jz L_decode_distance_mmx 654
655 655ALIGN 4
656 psrlq mm0,mm1 656L_do_loop_mmx:
657 movd mm1,eax 657 psrlq mm0,mm1
658 movd ecx,mm0 658
659 sub ebp,eax 659 cmp ebp,32
660 and ecx, [inflate_fast_mask+eax*4] 660 ja L_get_length_code_mmx
661 add edx,ecx 661
662 662 movd mm6,ebp
663L_decode_distance_mmx: 663 movd mm7,[esi]
664 psrlq mm0,mm1 664 add esi,4
665 665 psllq mm7,mm6
666 cmp ebp,32 666 add ebp,32
667 ja L_get_dist_code_mmx 667 por mm0,mm7
668 668
669 movd mm6,ebp 669L_get_length_code_mmx:
670 movd mm7,[esi] 670 pand mm4,mm0
671 add esi,4 671 movd eax,mm4
672 psllq mm7,mm6 672 movq mm4,mm3
673 add ebp,32 673 mov eax, [ebx+eax*4]
674 por mm0,mm7 674
675 675L_dolen_mmx:
676L_get_dist_code_mmx: 676 movzx ecx,ah
677 mov ebx, [esp+12] 677 movd mm1,ecx
678 pand mm5,mm0 678 sub ebp,ecx
679 movd eax,mm5 679
680 movq mm5,mm2 680 test al,al
681 mov eax, [ebx+eax*4] 681 jnz L_test_for_length_base_mmx
682 682
683L_dodist_mmx: 683 shr eax,16
684 684 stosb
685 movzx ecx,ah 685
686 mov ebx,eax 686L_while_test_mmx:
687 shr ebx,16 687
688 sub ebp,ecx 688
689 movd mm1,ecx 689 cmp [esp+16],edi
690 690 jbe L_break_loop
691 test al,16 691
692 jz L_test_for_second_level_dist_mmx 692 cmp [esp+20],esi
693 and eax,15 693 ja L_do_loop_mmx
694 jz L_check_dist_one_mmx 694 jmp L_break_loop
695 695
696L_add_bits_to_dist_mmx: 696L_test_for_length_base_mmx:
697 psrlq mm0,mm1 697
698 movd mm1,eax 698 mov edx,eax
699 movd ecx,mm0 699 shr edx,16
700 sub ebp,eax 700
701 and ecx, [inflate_fast_mask+eax*4] 701 test al,16
702 add ebx,ecx 702 jz L_test_for_second_level_length_mmx
703 703 and eax,15
704L_check_window_mmx: 704 jz L_decode_distance_mmx
705 mov [esp+44],esi 705
706 mov eax,edi 706 psrlq mm0,mm1
707 sub eax, [esp+40] 707 movd mm1,eax
708 708 movd ecx,mm0
709 cmp eax,ebx 709 sub ebp,eax
710 jb L_clip_window_mmx 710 and ecx, [inflate_fast_mask+eax*4]
711 711 add edx,ecx
712 mov ecx,edx 712
713 mov esi,edi 713L_decode_distance_mmx:
714 sub esi,ebx 714 psrlq mm0,mm1
715 715
716 sub ecx,3 716 cmp ebp,32
717 mov al, [esi] 717 ja L_get_dist_code_mmx
718 mov [edi],al 718
719 mov al, [esi+1] 719 movd mm6,ebp
720 mov dl, [esi+2] 720 movd mm7,[esi]
721 add esi,3 721 add esi,4
722 mov [edi+1],al 722 psllq mm7,mm6
723 mov [edi+2],dl 723 add ebp,32
724 add edi,3 724 por mm0,mm7
725 rep movsb 725
726 726L_get_dist_code_mmx:
727 mov esi, [esp+44] 727 mov ebx, [esp+12]
728 mov ebx, [esp+8] 728 pand mm5,mm0
729 jmp L_while_test_mmx 729 movd eax,mm5
730 730 movq mm5,mm2
731ALIGN 4 731 mov eax, [ebx+eax*4]
732L_check_dist_one_mmx: 732
733 cmp ebx,1 733L_dodist_mmx:
734 jne L_check_window_mmx 734
735 cmp [esp+40],edi 735 movzx ecx,ah
736 je L_check_window_mmx 736 mov ebx,eax
737 737 shr ebx,16
738 dec edi 738 sub ebp,ecx
739 mov ecx,edx 739 movd mm1,ecx
740 mov al, [edi] 740
741 sub ecx,3 741 test al,16
742 742 jz L_test_for_second_level_dist_mmx
743 mov [edi+1],al 743 and eax,15
744 mov [edi+2],al 744 jz L_check_dist_one_mmx
745 mov [edi+3],al 745
746 add edi,4 746L_add_bits_to_dist_mmx:
747 rep stosb 747 psrlq mm0,mm1
748 748 movd mm1,eax
749 mov ebx, [esp+8] 749 movd ecx,mm0
750 jmp L_while_test_mmx 750 sub ebp,eax
751 751 and ecx, [inflate_fast_mask+eax*4]
752ALIGN 4 752 add ebx,ecx
753L_test_for_second_level_length_mmx: 753
754 test al,64 754L_check_window_mmx:
755 jnz L_test_for_end_of_block 755 mov [esp+44],esi
756 756 mov eax,edi
757 and eax,15 757 sub eax, [esp+40]
758 psrlq mm0,mm1 758
759 movd ecx,mm0 759 cmp eax,ebx
760 and ecx, [inflate_fast_mask+eax*4] 760 jb L_clip_window_mmx
761 add ecx,edx 761
762 mov eax, [ebx+ecx*4] 762 mov ecx,edx
763 jmp L_dolen_mmx 763 mov esi,edi
764 764 sub esi,ebx
765ALIGN 4 765
766L_test_for_second_level_dist_mmx: 766 sub ecx,3
767 test al,64 767 mov al, [esi]
768 jnz L_invalid_distance_code 768 mov [edi],al
769 769 mov al, [esi+1]
770 and eax,15 770 mov dl, [esi+2]
771 psrlq mm0,mm1 771 add esi,3
772 movd ecx,mm0 772 mov [edi+1],al
773 and ecx, [inflate_fast_mask+eax*4] 773 mov [edi+2],dl
774 mov eax, [esp+12] 774 add edi,3
775 add ecx,ebx 775 rep movsb
776 mov eax, [eax+ecx*4] 776
777 jmp L_dodist_mmx 777 mov esi, [esp+44]
778 778 mov ebx, [esp+8]
779ALIGN 4 779 jmp L_while_test_mmx
780L_clip_window_mmx: 780
781 781ALIGN 4
782 mov ecx,eax 782L_check_dist_one_mmx:
783 mov eax, [esp+52] 783 cmp ebx,1
784 neg ecx 784 jne L_check_window_mmx
785 mov esi, [esp+56] 785 cmp [esp+40],edi
786 786 je L_check_window_mmx
787 cmp eax,ebx 787
788 jb L_invalid_distance_too_far 788 dec edi
789 789 mov ecx,edx
790 add ecx,ebx 790 mov al, [edi]
791 cmp dword ptr [esp+48],0 791 sub ecx,3
792 jne L_wrap_around_window_mmx 792
793 793 mov [edi+1],al
794 sub eax,ecx 794 mov [edi+2],al
795 add esi,eax 795 mov [edi+3],al
796 796 add edi,4
797 cmp edx,ecx 797 rep stosb
798 jbe L_do_copy1_mmx 798
799 799 mov ebx, [esp+8]
800 sub edx,ecx 800 jmp L_while_test_mmx
801 rep movsb 801
802 mov esi,edi 802ALIGN 4
803 sub esi,ebx 803L_test_for_second_level_length_mmx:
804 jmp L_do_copy1_mmx 804 test al,64
805 805 jnz L_test_for_end_of_block
806 cmp edx,ecx 806
807 jbe L_do_copy1_mmx 807 and eax,15
808 808 psrlq mm0,mm1
809 sub edx,ecx 809 movd ecx,mm0
810 rep movsb 810 and ecx, [inflate_fast_mask+eax*4]
811 mov esi,edi 811 add ecx,edx
812 sub esi,ebx 812 mov eax, [ebx+ecx*4]
813 jmp L_do_copy1_mmx 813 jmp L_dolen_mmx
814 814
815L_wrap_around_window_mmx: 815ALIGN 4
816 816L_test_for_second_level_dist_mmx:
817 mov eax, [esp+48] 817 test al,64
818 cmp ecx,eax 818 jnz L_invalid_distance_code
819 jbe L_contiguous_in_window_mmx 819
820 820 and eax,15
821 add esi, [esp+52] 821 psrlq mm0,mm1
822 add esi,eax 822 movd ecx,mm0
823 sub esi,ecx 823 and ecx, [inflate_fast_mask+eax*4]
824 sub ecx,eax 824 mov eax, [esp+12]
825 825 add ecx,ebx
826 826 mov eax, [eax+ecx*4]
827 cmp edx,ecx 827 jmp L_dodist_mmx
828 jbe L_do_copy1_mmx 828
829 829ALIGN 4
830 sub edx,ecx 830L_clip_window_mmx:
831 rep movsb 831
832 mov esi, [esp+56] 832 mov ecx,eax
833 mov ecx, [esp+48] 833 mov eax, [esp+52]
834 cmp edx,ecx 834 neg ecx
835 jbe L_do_copy1_mmx 835 mov esi, [esp+56]
836 836
837 sub edx,ecx 837 cmp eax,ebx
838 rep movsb 838 jb L_invalid_distance_too_far
839 mov esi,edi 839
840 sub esi,ebx 840 add ecx,ebx
841 jmp L_do_copy1_mmx 841 cmp dword ptr [esp+48],0
842 842 jne L_wrap_around_window_mmx
843L_contiguous_in_window_mmx: 843
844 844 sub eax,ecx
845 add esi,eax 845 add esi,eax
846 sub esi,ecx 846
847 847 cmp edx,ecx
848 848 jbe L_do_copy1_mmx
849 cmp edx,ecx 849
850 jbe L_do_copy1_mmx 850 sub edx,ecx
851 851 rep movsb
852 sub edx,ecx 852 mov esi,edi
853 rep movsb 853 sub esi,ebx
854 mov esi,edi 854 jmp L_do_copy1_mmx
855 sub esi,ebx 855
856 856 cmp edx,ecx
857L_do_copy1_mmx: 857 jbe L_do_copy1_mmx
858 858
859 859 sub edx,ecx
860 mov ecx,edx 860 rep movsb
861 rep movsb 861 mov esi,edi
862 862 sub esi,ebx
863 mov esi, [esp+44] 863 jmp L_do_copy1_mmx
864 mov ebx, [esp+8] 864
865 jmp L_while_test_mmx 865L_wrap_around_window_mmx:
866; 1174 "inffast.S" 866
867L_invalid_distance_code: 867 mov eax, [esp+48]
868 868 cmp ecx,eax
869 869 jbe L_contiguous_in_window_mmx
870 870
871 871 add esi, [esp+52]
872 872 add esi,eax
873 mov ecx, invalid_distance_code_msg 873 sub esi,ecx
874 mov edx,26 874 sub ecx,eax
875 jmp L_update_stream_state 875
876 876
877L_test_for_end_of_block: 877 cmp edx,ecx
878 878 jbe L_do_copy1_mmx
879 879
880 880 sub edx,ecx
881 881 rep movsb
882 882 mov esi, [esp+56]
883 test al,32 883 mov ecx, [esp+48]
884 jz L_invalid_literal_length_code 884 cmp edx,ecx
885 885 jbe L_do_copy1_mmx
886 mov ecx,0 886
887 mov edx,11 887 sub edx,ecx
888 jmp L_update_stream_state 888 rep movsb
889 889 mov esi,edi
890L_invalid_literal_length_code: 890 sub esi,ebx
891 891 jmp L_do_copy1_mmx
892 892
893 893L_contiguous_in_window_mmx:
894 894
895 895 add esi,eax
896 mov ecx, invalid_literal_length_code_msg 896 sub esi,ecx
897 mov edx,26 897
898 jmp L_update_stream_state 898
899 899 cmp edx,ecx
900L_invalid_distance_too_far: 900 jbe L_do_copy1_mmx
901 901
902 902 sub edx,ecx
903 903 rep movsb
904 mov esi, [esp+44] 904 mov esi,edi
905 mov ecx, invalid_distance_too_far_msg 905 sub esi,ebx
906 mov edx,26 906
907 jmp L_update_stream_state 907L_do_copy1_mmx:
908 908
909L_update_stream_state: 909
910 910 mov ecx,edx
911 mov eax, [esp+88] 911 rep movsb
912 test ecx,ecx 912
913 jz L_skip_msg 913 mov esi, [esp+44]
914 mov [eax+24],ecx 914 mov ebx, [esp+8]
915L_skip_msg: 915 jmp L_while_test_mmx
916 mov eax, [eax+28] 916; 1174 "inffast.S"
917 mov [eax+mode_state],edx 917L_invalid_distance_code:
918 jmp L_break_loop 918
919 919
920ALIGN 4 920
921L_break_loop: 921
922; 1243 "inffast.S" 922
923 cmp dword ptr [inflate_fast_use_mmx],2 923 mov ecx, invalid_distance_code_msg
924 jne L_update_next_in 924 mov edx,INFLATE_MODE_BAD
925 925 jmp L_update_stream_state
926 926
927 927L_test_for_end_of_block:
928 mov ebx,ebp 928
929 929
930L_update_next_in: 930
931; 1266 "inffast.S" 931
932 mov eax, [esp+88] 932
933 mov ecx,ebx 933 test al,32
934 mov edx, [eax+28] 934 jz L_invalid_literal_length_code
935 shr ecx,3 935
936 sub esi,ecx 936 mov ecx,0
937 shl ecx,3 937 mov edx,INFLATE_MODE_TYPE
938 sub ebx,ecx 938 jmp L_update_stream_state
939 mov [eax+12],edi 939
940 mov [edx+bits_state],ebx 940L_invalid_literal_length_code:
941 mov ecx,ebx 941
942 942
943 lea ebx, [esp+28] 943
944 cmp [esp+20],ebx 944
945 jne L_buf_not_used 945
946 946 mov ecx, invalid_literal_length_code_msg
947 sub esi,ebx 947 mov edx,INFLATE_MODE_BAD
948 mov ebx, [eax+0] 948 jmp L_update_stream_state
949 mov [esp+20],ebx 949
950 add esi,ebx 950L_invalid_distance_too_far:
951 mov ebx, [eax+4] 951
952 sub ebx,11 952
953 add [esp+20],ebx 953
954 954 mov esi, [esp+44]
955L_buf_not_used: 955 mov ecx, invalid_distance_too_far_msg
956 mov [eax+0],esi 956 mov edx,INFLATE_MODE_BAD
957 957 jmp L_update_stream_state
958 mov ebx,1 958
959 shl ebx,cl 959L_update_stream_state:
960 dec ebx 960
961 961 mov eax, [esp+88]
962 962 test ecx,ecx
963 963 jz L_skip_msg
964 964 mov [eax+24],ecx
965 965L_skip_msg:
966 cmp dword ptr [inflate_fast_use_mmx],2 966 mov eax, [eax+28]
967 jne L_update_hold 967 mov [eax+mode_state],edx
968 968 jmp L_break_loop
969 969
970 970ALIGN 4
971 psrlq mm0,mm1 971L_break_loop:
972 movd ebp,mm0 972; 1243 "inffast.S"
973 973 cmp dword ptr [inflate_fast_use_mmx],2
974 emms 974 jne L_update_next_in
975 975
976L_update_hold: 976
977 977
978 978 mov ebx,ebp
979 979
980 and ebp,ebx 980L_update_next_in:
981 mov [edx+hold_state],ebp 981; 1266 "inffast.S"
982 982 mov eax, [esp+88]
983 983 mov ecx,ebx
984 984 mov edx, [eax+28]
985 985 shr ecx,3
986 mov ebx, [esp+20] 986 sub esi,ecx
987 cmp ebx,esi 987 shl ecx,3
988 jbe L_last_is_smaller 988 sub ebx,ecx
989 989 mov [eax+12],edi
990 sub ebx,esi 990 mov [edx+bits_state],ebx
991 add ebx,11 991 mov ecx,ebx
992 mov [eax+4],ebx 992
993 jmp L_fixup_out 993 lea ebx, [esp+28]
994L_last_is_smaller: 994 cmp [esp+20],ebx
995 sub esi,ebx 995 jne L_buf_not_used
996 neg esi 996
997 add esi,11 997 sub esi,ebx
998 mov [eax+4],esi 998 mov ebx, [eax+0]
999 999 mov [esp+20],ebx
1000 1000 add esi,ebx
1001 1001 mov ebx, [eax+4]
1002 1002 sub ebx,11
1003L_fixup_out: 1003 add [esp+20],ebx
1004 1004
1005 mov ebx, [esp+16] 1005L_buf_not_used:
1006 cmp ebx,edi 1006 mov [eax+0],esi
1007 jbe L_end_is_smaller 1007
1008 1008 mov ebx,1
1009 sub ebx,edi 1009 shl ebx,cl
1010 add ebx,257 1010 dec ebx
1011 mov [eax+16],ebx 1011
1012 jmp L_done 1012
1013L_end_is_smaller: 1013
1014 sub edi,ebx 1014
1015 neg edi 1015
1016 add edi,257 1016 cmp dword ptr [inflate_fast_use_mmx],2
1017 mov [eax+16],edi 1017 jne L_update_hold
1018 1018
1019 1019
1020 1020
1021 1021 psrlq mm0,mm1
1022 1022 movd ebp,mm0
1023L_done: 1023
1024 add esp,64 1024 emms
1025 popfd 1025
1026 pop ebx 1026L_update_hold:
1027 pop ebp 1027
1028 pop esi 1028
1029 pop edi 1029
1030 ret 1030 and ebp,ebx
1031 1031 mov [edx+hold_state],ebp
1032 1032
1033 1033
1034 1034
1035_TEXT ends 1035
1036end 1036 mov ebx, [esp+20]
1037 cmp ebx,esi
1038 jbe L_last_is_smaller
1039
1040 sub ebx,esi
1041 add ebx,11
1042 mov [eax+4],ebx
1043 jmp L_fixup_out
1044L_last_is_smaller:
1045 sub esi,ebx
1046 neg esi
1047 add esi,11
1048 mov [eax+4],esi
1049
1050
1051
1052
1053L_fixup_out:
1054
1055 mov ebx, [esp+16]
1056 cmp ebx,edi
1057 jbe L_end_is_smaller
1058
1059 sub ebx,edi
1060 add ebx,257
1061 mov [eax+16],ebx
1062 jmp L_done
1063L_end_is_smaller:
1064 sub edi,ebx
1065 neg edi
1066 add edi,257
1067 mov [eax+16],edi
1068
1069
1070
1071
1072
1073L_done:
1074 add esp,64
1075 popfd
1076 pop ebx
1077 pop ebp
1078 pop esi
1079 pop edi
1080 ret
1081
1082_TEXT ends
1083end