summaryrefslogtreecommitdiff
path: root/contrib/masmx86/gvmat32.asm
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/masmx86/gvmat32.asm')
-rw-r--r--contrib/masmx86/gvmat32.asm1881
1 files changed, 972 insertions, 909 deletions
diff --git a/contrib/masmx86/gvmat32.asm b/contrib/masmx86/gvmat32.asm
index e841a7f..874bb2d 100644
--- a/contrib/masmx86/gvmat32.asm
+++ b/contrib/masmx86/gvmat32.asm
@@ -1,909 +1,972 @@
1; 1; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
2; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 2; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
3; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. 3; File written by Gilles Vollant, by modifying the longest_match
4; File written by Gilles Vollant, by modifying the longest_match 4; from Jean-loup Gailly in deflate.c
5; from Jean-loup Gailly in deflate.c 5;
6; It needs wmask == 0x7fff 6; http://www.zlib.net
7; (assembly code is faster with a fixed wmask) 7; http://www.winimage.com/zLibDll
8; 8; http://www.muppetlabs.com/~breadbox/software/assembly.html
9; For Visual C++ 4.2 and ML 6.11c (version in directory \MASM611C of Win95 DDK) 9;
10; I compile with : "ml /coff /Zi /c gvmat32.asm" 10; For Visual C++ 4.x and higher and ML 6.x and higher
11; 11; ml.exe is in directory \MASM611C of Win95 DDK
12 12; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
13;uInt longest_match_7fff(s, cur_match) 13; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
14; deflate_state *s; 14;
15; IPos cur_match; /* current match */ 15; this file contains two implementations of longest_match
16 16;
17 NbStack equ 76 17; longest_match_7fff : written 1996 by Gilles Vollant optimized for
18 cur_match equ dword ptr[esp+NbStack-0] 18; first Pentium. Assume s->w_mask == 0x7fff
19 str_s equ dword ptr[esp+NbStack-4] 19; longest_match_686 : written by Brian Raiter (1998), optimized for Pentium Pro
20; 5 dword on top (ret,ebp,esi,edi,ebx) 20;
21 adrret equ dword ptr[esp+NbStack-8] 21; to use an assembly version of longest_match, you need to define ASMV in your project
22 pushebp equ dword ptr[esp+NbStack-12] 22; There are two ways to use gvmat32.asm
23 pushedi equ dword ptr[esp+NbStack-16] 23;
24 pushesi equ dword ptr[esp+NbStack-20] 24; A) Suggested method
25 pushebx equ dword ptr[esp+NbStack-24] 25; if you want include both longest_match_7fff and longest_match_686
26 26; compile the asm file running
27 chain_length equ dword ptr [esp+NbStack-28] 27; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm
28 limit equ dword ptr [esp+NbStack-32] 28; and include gvmat32c.c in your project
29 best_len equ dword ptr [esp+NbStack-36] 29; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff,
30 window equ dword ptr [esp+NbStack-40] 30; longest_match_7fff will be used
31 prev equ dword ptr [esp+NbStack-44] 31; if you have a more modern CPU (Pentium Pro, II and higher)
32 scan_start equ word ptr [esp+NbStack-48] 32; longest_match_686 will be used
33 wmask equ dword ptr [esp+NbStack-52] 33; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used,
34 match_start_ptr equ dword ptr [esp+NbStack-56] 34; but this is not a situation you'll find often
35 nice_match equ dword ptr [esp+NbStack-60] 35;
36 scan equ dword ptr [esp+NbStack-64] 36; B) Alternative
37 37; if you are not interested in old cpu performance and want the smallest
38 windowlen equ dword ptr [esp+NbStack-68] 38; binaries possible
39 match_start equ dword ptr [esp+NbStack-72] 39;
40 strend equ dword ptr [esp+NbStack-76] 40; compile the asm file running
41 NbStackAdd equ (NbStack-24) 41; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm
42 42; and do not include gvmat32c.c in your project (or also define
43 .386p 43; NOOLDPENTIUMCODE)
44 44;
45 name gvmatch 45; note : as far as I know, longest_match_686 is much faster than longest_match_7fff
46 .MODEL FLAT 46; on pentium Pro/II/III, faster (but less so) on P4, but it seems
47 47; longest_match_7fff can be faster (very very little) on AMD Athlon64/K8
48 48;
49 49; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2
50; all the +addstr offsets are due to the addition of pending_buf_size in zlib 1.04 50
51; and adding gzhead and gzindex in zlib 1.2.2.1 51;uInt longest_match_7fff(s, cur_match)
52; in the deflate_state structure since the asm code was first written 52; deflate_state *s;
53; (if you compile with zlib 1.0.4 or older, set addstr to 0). 53; IPos cur_match; /* current match */
54; (if you compile with zlib between 1.04 and 1.2.1, set addstr to 4) 54
55; Note : these values are good with an 8-byte boundary packed structure 55 NbStack equ 76
56 56 cur_match equ dword ptr[esp+NbStack-0]
57 addstr equ 4+8 57 str_s equ dword ptr[esp+NbStack-4]
58 dep_chain_length equ 70h+addstr 58; 5 dword on top (ret,ebp,esi,edi,ebx)
59 dep_window equ 2ch+addstr 59 adrret equ dword ptr[esp+NbStack-8]
60 dep_strstart equ 60h+addstr 60 pushebp equ dword ptr[esp+NbStack-12]
61 dep_prev_length equ 6ch+addstr 61 pushedi equ dword ptr[esp+NbStack-16]
62 dep_nice_match equ 84h+addstr 62 pushesi equ dword ptr[esp+NbStack-20]
63 dep_w_size equ 20h+addstr 63 pushebx equ dword ptr[esp+NbStack-24]
64 dep_prev equ 34h+addstr 64
65 dep_w_mask equ 28h+addstr 65 chain_length equ dword ptr [esp+NbStack-28]
66 dep_good_match equ 80h+addstr 66 limit equ dword ptr [esp+NbStack-32]
67 dep_match_start equ 64h+addstr 67 best_len equ dword ptr [esp+NbStack-36]
68 dep_lookahead equ 68h+addstr 68 window equ dword ptr [esp+NbStack-40]
69 69 prev equ dword ptr [esp+NbStack-44]
70 70 scan_start equ word ptr [esp+NbStack-48]
71_TEXT segment 71 wmask equ dword ptr [esp+NbStack-52]
72 72 match_start_ptr equ dword ptr [esp+NbStack-56]
73IFDEF NOUNDERLINE 73 nice_match equ dword ptr [esp+NbStack-60]
74 public longest_match_7fff 74 scan equ dword ptr [esp+NbStack-64]
75 public longest_match_686 75
76; public match_init 76 windowlen equ dword ptr [esp+NbStack-68]
77ELSE 77 match_start equ dword ptr [esp+NbStack-72]
78 public _longest_match_7fff 78 strend equ dword ptr [esp+NbStack-76]
79 public _longest_match_686 79 NbStackAdd equ (NbStack-24)
80; public _match_init 80
81ENDIF 81 .386p
82 82
83 MAX_MATCH equ 258 83 name gvmatch
84 MIN_MATCH equ 3 84 .MODEL FLAT
85 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 85
86 86
87 87
88 88; all the +zlib1222add offsets are due to the addition of fields
89IFDEF NOUNDERLINE 89; in zlib in the deflate_state structure since the asm code was first written
90;match_init proc near 90; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
91; ret 91; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
92;match_init endp 92; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
93ELSE 93
94;_match_init proc near 94 zlib1222add equ 8
95; ret 95
96;_match_init endp 96; Note : these values are good with an 8-byte boundary packed structure
97ENDIF 97 dep_chain_length equ 74h+zlib1222add
98 98 dep_window equ 30h+zlib1222add
99 99 dep_strstart equ 64h+zlib1222add
100IFDEF NOUNDERLINE 100 dep_prev_length equ 70h+zlib1222add
101longest_match_7fff proc near 101 dep_nice_match equ 88h+zlib1222add
102ELSE 102 dep_w_size equ 24h+zlib1222add
103_longest_match_7fff proc near 103 dep_prev equ 38h+zlib1222add
104ENDIF 104 dep_w_mask equ 2ch+zlib1222add
105 105 dep_good_match equ 84h+zlib1222add
106 mov edx,[esp+4] 106 dep_match_start equ 68h+zlib1222add
107 107 dep_lookahead equ 6ch+zlib1222add
108 108
109 109
110 push ebp 110_TEXT segment
111 push edi 111
112 push esi 112IFDEF NOUNDERLINE
113 push ebx 113 IFDEF NOOLDPENTIUMCODE
114 114 public longest_match
115 sub esp,NbStackAdd 115 public match_init
116 116 ELSE
117; initialize or check the variables used in match.asm. 117 public longest_match_7fff
118 mov ebp,edx 118 public cpudetect32
119 119 public longest_match_686
120; chain_length = s->max_chain_length 120 ENDIF
121; if (prev_length>=good_match) chain_length >>= 2 121ELSE
122 mov edx,[ebp+dep_chain_length] 122 IFDEF NOOLDPENTIUMCODE
123 mov ebx,[ebp+dep_prev_length] 123 public _longest_match
124 cmp [ebp+dep_good_match],ebx 124 public _match_init
125 ja noshr 125 ELSE
126 shr edx,2 126 public _longest_match_7fff
127noshr: 127 public _cpudetect32
128; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop 128 public _longest_match_686
129 inc edx 129 ENDIF
130 mov edi,[ebp+dep_nice_match] 130ENDIF
131 mov chain_length,edx 131
132 mov eax,[ebp+dep_lookahead] 132 MAX_MATCH equ 258
133 cmp eax,edi 133 MIN_MATCH equ 3
134; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 134 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
135 jae nolookaheadnicematch 135
136 mov edi,eax 136
137nolookaheadnicematch: 137
138; best_len = s->prev_length 138IFNDEF NOOLDPENTIUMCODE
139 mov best_len,ebx 139IFDEF NOUNDERLINE
140 140longest_match_7fff proc near
141; window = s->window 141ELSE
142 mov esi,[ebp+dep_window] 142_longest_match_7fff proc near
143 mov ecx,[ebp+dep_strstart] 143ENDIF
144 mov window,esi 144
145 145 mov edx,[esp+4]
146 mov nice_match,edi 146
147; scan = window + strstart 147
148 add esi,ecx 148
149 mov scan,esi 149 push ebp
150; dx = *scan (esi = window + strstart = scan) 150 push edi
151 mov dx,word ptr [esi] 151 push esi
152; bx = *(scan+best_len-1) 152 push ebx
153 mov bx,word ptr [esi+ebx-1] 153
154 add esi,MAX_MATCH-1 154 sub esp,NbStackAdd
155; scan_start = *scan 155
156 mov scan_start,dx 156; initialize or check the variables used in match.asm.
157; strend = scan + MAX_MATCH-1 157 mov ebp,edx
158 mov strend,esi 158
159; bx = scan_end = *(scan+best_len-1) 159; chain_length = s->max_chain_length
160 160; if (prev_length>=good_match) chain_length >>= 2
161; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 161 mov edx,[ebp+dep_chain_length]
162; s->strstart - (IPos)MAX_DIST(s) : NIL; 162 mov ebx,[ebp+dep_prev_length]
163 163 cmp [ebp+dep_good_match],ebx
164 mov esi,[ebp+dep_w_size] 164 ja noshr
165 sub esi,MIN_LOOKAHEAD 165 shr edx,2
166; here esi = MAX_DIST(s) 166noshr:
167 sub ecx,esi 167; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop
168 ja nodist 168 inc edx
169 xor ecx,ecx 169 mov edi,[ebp+dep_nice_match]
170nodist: 170 mov chain_length,edx
171 mov limit,ecx 171 mov eax,[ebp+dep_lookahead]
172 172 cmp eax,edi
173; prev = s->prev 173; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
174 mov edx,[ebp+dep_prev] 174 jae nolookaheadnicematch
175 mov prev,edx 175 mov edi,eax
176 176nolookaheadnicematch:
177; 177; best_len = s->prev_length
178 mov edx,dword ptr [ebp+dep_match_start] 178 mov best_len,ebx
179 mov bp,scan_start 179
180 mov eax,cur_match 180; window = s->window
181 mov match_start,edx 181 mov esi,[ebp+dep_window]
182 182 mov ecx,[ebp+dep_strstart]
183 mov edx,window 183 mov window,esi
184 mov edi,edx 184
185 add edi,best_len 185 mov nice_match,edi
186 mov esi,prev 186; scan = window + strstart
187 dec edi 187 add esi,ecx
188; windowlen = window + best_len -1 188 mov scan,esi
189 mov windowlen,edi 189; dx = *scan (esi = window + strstart = scan)
190 190 mov dx,word ptr [esi]
191 jmp beginloop2 191; bx = *(scan+best_len-1)
192 align 4 192 mov bx,word ptr [esi+ebx-1]
193 193 add esi,MAX_MATCH-1
194; here, in the loop 194; scan_start = *scan
195; eax = ax = cur_match 195 mov scan_start,dx
196; ecx = limit 196; strend = scan + MAX_MATCH-1
197; bx = scan_end 197 mov strend,esi
198; bp = scan_start 198; bx = scan_end = *(scan+best_len-1)
199; edi = windowlen (window + best_len -1) 199
200; esi = prev 200; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
201 201; s->strstart - (IPos)MAX_DIST(s) : NIL;
202 202
203;// here; chain_length <=16 203 mov esi,[ebp+dep_w_size]
204normalbeg0add16: 204 sub esi,MIN_LOOKAHEAD
205 add chain_length,16 205; here esi = MAX_DIST(s)
206 jz exitloop 206 sub ecx,esi
207normalbeg0: 207 ja nodist
208 cmp word ptr[edi+eax],bx 208 xor ecx,ecx
209 je normalbeg2noroll 209nodist:
210rcontlabnoroll: 210 mov limit,ecx
211; cur_match = prev[cur_match & wmask] 211
212 and eax,7fffh 212; prev = s->prev
213 mov ax,word ptr[esi+eax*2] 213 mov edx,[ebp+dep_prev]
214; if cur_match <= limit, go to exitloop 214 mov prev,edx
215 cmp ecx,eax 215
216 jnb exitloop 216;
217; if --chain_length != 0, loop again (normalbeg0); otherwise go to exitloop 217 mov edx,dword ptr [ebp+dep_match_start]
218 dec chain_length 218 mov bp,scan_start
219 jnz normalbeg0 219 mov eax,cur_match
220 jmp exitloop 220 mov match_start,edx
221 221
222normalbeg2noroll: 222 mov edx,window
223; if (scan_start==*(cur_match+window)) goto normalbeg2 223 mov edi,edx
224 cmp bp,word ptr[edx+eax] 224 add edi,best_len
225 jne rcontlabnoroll 225 mov esi,prev
226 jmp normalbeg2 226 dec edi
227 227; windowlen = window + best_len -1
228contloop3: 228 mov windowlen,edi
229 mov edi,windowlen 229
230 230 jmp beginloop2
231; cur_match = prev[cur_match & wmask] 231 align 4
232 and eax,7fffh 232
233 mov ax,word ptr[esi+eax*2] 233; here, in the loop
234; if cur_match <= limit, go to exitloop 234; eax = ax = cur_match
235 cmp ecx,eax 235; ecx = limit
236jnbexitloopshort1: 236; bx = scan_end
237 jnb exitloop 237; bp = scan_start
238; if --chain_length != 0, go to exitloop 238; edi = windowlen (window + best_len -1)
239 239; esi = prev
240 240
241; begin the main loop 241
242beginloop2: 242;// here; chain_length <=16
243 sub chain_length,16+1 243normalbeg0add16:
244; if chain_length <=16, don't use the unrolled loop 244 add chain_length,16
245 jna normalbeg0add16 245 jz exitloop
246 246normalbeg0:
247do16: 247 cmp word ptr[edi+eax],bx
248 cmp word ptr[edi+eax],bx 248 je normalbeg2noroll
249 je normalbeg2dc0 249rcontlabnoroll:
250 250; cur_match = prev[cur_match & wmask]
251maccn MACRO lab 251 and eax,7fffh
252 and eax,7fffh 252 mov ax,word ptr[esi+eax*2]
253 mov ax,word ptr[esi+eax*2] 253; if cur_match <= limit, go to exitloop
254 cmp ecx,eax 254 cmp ecx,eax
255 jnb exitloop 255 jnb exitloop
256 cmp word ptr[edi+eax],bx 256; if --chain_length != 0, loop again (normalbeg0); otherwise go to exitloop
257 je lab 257 dec chain_length
258 ENDM 258 jnz normalbeg0
259 259 jmp exitloop
260rcontloop0: 260
261 maccn normalbeg2dc1 261normalbeg2noroll:
262 262; if (scan_start==*(cur_match+window)) goto normalbeg2
263rcontloop1: 263 cmp bp,word ptr[edx+eax]
264 maccn normalbeg2dc2 264 jne rcontlabnoroll
265 265 jmp normalbeg2
266rcontloop2: 266
267 maccn normalbeg2dc3 267contloop3:
268 268 mov edi,windowlen
269rcontloop3: 269
270 maccn normalbeg2dc4 270; cur_match = prev[cur_match & wmask]
271 271 and eax,7fffh
272rcontloop4: 272 mov ax,word ptr[esi+eax*2]
273 maccn normalbeg2dc5 273; if cur_match <= limit, go to exitloop
274 274 cmp ecx,eax
275rcontloop5: 275jnbexitloopshort1:
276 maccn normalbeg2dc6 276 jnb exitloop
277 277; if --chain_length != 0, go to exitloop
278rcontloop6: 278
279 maccn normalbeg2dc7 279
280 280; begin the main loop
281rcontloop7: 281beginloop2:
282 maccn normalbeg2dc8 282 sub chain_length,16+1
283 283; if chain_length <=16, don't use the unrolled loop
284rcontloop8: 284 jna normalbeg0add16
285 maccn normalbeg2dc9 285
286 286do16:
287rcontloop9: 287 cmp word ptr[edi+eax],bx
288 maccn normalbeg2dc10 288 je normalbeg2dc0
289 289
290rcontloop10: 290maccn MACRO lab
291 maccn short normalbeg2dc11 291 and eax,7fffh
292 292 mov ax,word ptr[esi+eax*2]
293rcontloop11: 293 cmp ecx,eax
294 maccn short normalbeg2dc12 294 jnb exitloop
295 295 cmp word ptr[edi+eax],bx
296rcontloop12: 296 je lab
297 maccn short normalbeg2dc13 297 ENDM
298 298
299rcontloop13: 299rcontloop0:
300 maccn short normalbeg2dc14 300 maccn normalbeg2dc1
301 301
302rcontloop14: 302rcontloop1:
303 maccn short normalbeg2dc15 303 maccn normalbeg2dc2
304 304
305rcontloop15: 305rcontloop2:
306 and eax,7fffh 306 maccn normalbeg2dc3
307 mov ax,word ptr[esi+eax*2] 307
308 cmp ecx,eax 308rcontloop3:
309 jnb exitloop 309 maccn normalbeg2dc4
310 310
311 sub chain_length,16 311rcontloop4:
312 ja do16 312 maccn normalbeg2dc5
313 jmp normalbeg0add16 313
314 314rcontloop5:
315;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 315 maccn normalbeg2dc6
316 316
317normbeg MACRO rcontlab,valsub 317rcontloop6:
318; if we are here, we know that *(match+best_len-1) == scan_end 318 maccn normalbeg2dc7
319 cmp bp,word ptr[edx+eax] 319
320; if (match != scan_start) goto rcontlab 320rcontloop7:
321 jne rcontlab 321 maccn normalbeg2dc8
322; calculate the good chain_length, and we'll compare scan and match string 322
323 add chain_length,16-valsub 323rcontloop8:
324 jmp iseq 324 maccn normalbeg2dc9
325 ENDM 325
326 326rcontloop9:
327 327 maccn normalbeg2dc10
328normalbeg2dc11: 328
329 normbeg rcontloop11,11 329rcontloop10:
330 330 maccn short normalbeg2dc11
331normalbeg2dc12: 331
332 normbeg short rcontloop12,12 332rcontloop11:
333 333 maccn short normalbeg2dc12
334normalbeg2dc13: 334
335 normbeg short rcontloop13,13 335rcontloop12:
336 336 maccn short normalbeg2dc13
337normalbeg2dc14: 337
338 normbeg short rcontloop14,14 338rcontloop13:
339 339 maccn short normalbeg2dc14
340normalbeg2dc15: 340
341 normbeg short rcontloop15,15 341rcontloop14:
342 342 maccn short normalbeg2dc15
343normalbeg2dc10: 343
344 normbeg rcontloop10,10 344rcontloop15:
345 345 and eax,7fffh
346normalbeg2dc9: 346 mov ax,word ptr[esi+eax*2]
347 normbeg rcontloop9,9 347 cmp ecx,eax
348 348 jnb exitloop
349normalbeg2dc8: 349
350 normbeg rcontloop8,8 350 sub chain_length,16
351 351 ja do16
352normalbeg2dc7: 352 jmp normalbeg0add16
353 normbeg rcontloop7,7 353
354 354;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
355normalbeg2dc6: 355
356 normbeg rcontloop6,6 356normbeg MACRO rcontlab,valsub
357 357; if we are here, we know that *(match+best_len-1) == scan_end
358normalbeg2dc5: 358 cmp bp,word ptr[edx+eax]
359 normbeg rcontloop5,5 359; if (match != scan_start) goto rcontlab
360 360 jne rcontlab
361normalbeg2dc4: 361; calculate the good chain_length, and we'll compare scan and match string
362 normbeg rcontloop4,4 362 add chain_length,16-valsub
363 363 jmp iseq
364normalbeg2dc3: 364 ENDM
365 normbeg rcontloop3,3 365
366 366
367normalbeg2dc2: 367normalbeg2dc11:
368 normbeg rcontloop2,2 368 normbeg rcontloop11,11
369 369
370normalbeg2dc1: 370normalbeg2dc12:
371 normbeg rcontloop1,1 371 normbeg short rcontloop12,12
372 372
373normalbeg2dc0: 373normalbeg2dc13:
374 normbeg rcontloop0,0 374 normbeg short rcontloop13,13
375 375
376 376normalbeg2dc14:
377; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end 377 normbeg short rcontloop14,14
378 378
379normalbeg2: 379normalbeg2dc15:
380 mov edi,window 380 normbeg short rcontloop15,15
381 381
382 cmp bp,word ptr[edi+eax] 382normalbeg2dc10:
383 jne contloop3 ; if *(ushf*)match != scan_start, continue 383 normbeg rcontloop10,10
384 384
385iseq: 385normalbeg2dc9:
386; if we are here, we know that *(match+best_len-1) == scan_end 386 normbeg rcontloop9,9
387; and (match == scan_start) 387
388 388normalbeg2dc8:
389 mov edi,edx 389 normbeg rcontloop8,8
390 mov esi,scan ; esi = scan 390
391 add edi,eax ; edi = window + cur_match = match 391normalbeg2dc7:
392 392 normbeg rcontloop7,7
393 mov edx,[esi+3] ; compare manually dword at match+3 393
394 xor edx,[edi+3] ; and scan +3 394normalbeg2dc6:
395 395 normbeg rcontloop6,6
396 jz begincompare ; if equal, go to long compare 396
397 397normalbeg2dc5:
398; we will determine the unmatch byte and calculate len (in esi) 398 normbeg rcontloop5,5
399 or dl,dl 399
400 je eq1rr 400normalbeg2dc4:
401 mov esi,3 401 normbeg rcontloop4,4
402 jmp trfinval 402
403eq1rr: 403normalbeg2dc3:
404 or dx,dx 404 normbeg rcontloop3,3
405 je eq1 405
406 406normalbeg2dc2:
407 mov esi,4 407 normbeg rcontloop2,2
408 jmp trfinval 408
409eq1: 409normalbeg2dc1:
410 and edx,0ffffffh 410 normbeg rcontloop1,1
411 jz eq11 411
412 mov esi,5 412normalbeg2dc0:
413 jmp trfinval 413 normbeg rcontloop0,0
414eq11: 414
415 mov esi,6 415
416 jmp trfinval 416; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
417 417
418begincompare: 418normalbeg2:
419 ; here we know that scan and match begin the same 419 mov edi,window
420 add edi,6 420
421 add esi,6 421 cmp bp,word ptr[edi+eax]
422 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes 422 jne contloop3 ; if *(ushf*)match != scan_start, continue
423 repe cmpsd ; loop until mismatch 423
424 424iseq:
425 je trfin ; go to trfin if no mismatch was found 425; if we are here, we know that *(match+best_len-1) == scan_end
426; we determine the mismatching byte 426; and (match == scan_start)
427 sub esi,4 427
428 mov edx,[edi-4] 428 mov edi,edx
429 xor edx,[esi] 429 mov esi,scan ; esi = scan
430 430 add edi,eax ; edi = window + cur_match = match
431 or dl,dl 431
432 jnz trfin 432 mov edx,[esi+3] ; compare manually dword at match+3
433 inc esi 433 xor edx,[edi+3] ; and scan +3
434 434
435 or dx,dx 435 jz begincompare ; if equal, go to long compare
436 jnz trfin 436
437 inc esi 437; we will determine the unmatch byte and calculate len (in esi)
438 438 or dl,dl
439 and edx,0ffffffh 439 je eq1rr
440 jnz trfin 440 mov esi,3
441 inc esi 441 jmp trfinval
442 442eq1rr:
443trfin: 443 or dx,dx
444 sub esi,scan ; esi = len 444 je eq1
445trfinval: 445
446; here we have finished the compare, and esi contains the length of the equal string 446 mov esi,4
447 cmp esi,best_len ; if len > best_len, go newbestlen 447 jmp trfinval
448 ja short newbestlen 448eq1:
449; now we restore edx, ecx and esi, for the big loop 449 and edx,0ffffffh
450 mov esi,prev 450 jz eq11
451 mov ecx,limit 451 mov esi,5
452 mov edx,window 452 jmp trfinval
453 jmp contloop3 453eq11:
454 454 mov esi,6
455newbestlen: 455 jmp trfinval
456 mov best_len,esi ; len become best_len 456
457 457begincompare:
458 mov match_start,eax ; save new position as match_start 458 ; here we know that scan and match begin the same
459 cmp esi,nice_match ; if best_len >= nice_match, exit 459 add edi,6
460 jae exitloop 460 add esi,6
461 mov ecx,scan 461 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes
462 mov edx,window ; restore edx=window 462 repe cmpsd ; loop until mismatch
463 add ecx,esi 463
464 add esi,edx 464 je trfin ; go to trfin if no mismatch was found
465 465; we determine the mismatching byte
466 dec esi 466 sub esi,4
467 mov windowlen,esi ; windowlen = window + best_len-1 467 mov edx,[edi-4]
468 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end 468 xor edx,[esi]
469 469
470; now we restore ecx and esi, for the big loop : 470 or dl,dl
471 mov esi,prev 471 jnz trfin
472 mov ecx,limit 472 inc esi
473 jmp contloop3 473
474 474 or dx,dx
475exitloop: 475 jnz trfin
476; exit : s->match_start=match_start 476 inc esi
477 mov ebx,match_start 477
478 mov ebp,str_s 478 and edx,0ffffffh
479 mov ecx,best_len 479 jnz trfin
480 mov dword ptr [ebp+dep_match_start],ebx 480 inc esi
481 mov eax,dword ptr [ebp+dep_lookahead] 481
482 cmp ecx,eax 482trfin:
483 ja minexlo 483 sub esi,scan ; esi = len
484 mov eax,ecx 484trfinval:
485minexlo: 485; here we have finished the compare, and esi contains the length of the equal string
486; return min(best_len,s->lookahead) 486 cmp esi,best_len ; if len > best_len, go newbestlen
487 487 ja short newbestlen
488; restore stack and register ebx,esi,edi,ebp 488; now we restore edx, ecx and esi, for the big loop
489 add esp,NbStackAdd 489 mov esi,prev
490 490 mov ecx,limit
491 pop ebx 491 mov edx,window
492 pop esi 492 jmp contloop3
493 pop edi 493
494 pop ebp 494newbestlen:
495 ret 495 mov best_len,esi ; len become best_len
496InfoAuthor: 496
497; please don't remove this string ! 497 mov match_start,eax ; save new position as match_start
498; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary! 498 cmp esi,nice_match ; if best_len >= nice_match, exit
499 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah 499 jae exitloop
500 500 mov ecx,scan
501 501 mov edx,window ; restore edx=window
502 502 add ecx,esi
503IFDEF NOUNDERLINE 503 add esi,edx
504longest_match_7fff endp 504
505ELSE 505 dec esi
506_longest_match_7fff endp 506 mov windowlen,esi ; windowlen = window + best_len-1
507ENDIF 507 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end
508 508
509 509; now we restore ecx and esi, for the big loop :
510IFDEF NOUNDERLINE 510 mov esi,prev
511cpudetect32 proc near 511 mov ecx,limit
512ELSE 512 jmp contloop3
513_cpudetect32 proc near 513
514ENDIF 514exitloop:
515 515; exit : s->match_start=match_start
516 push ebx 516 mov ebx,match_start
517 517 mov ebp,str_s
518 pushfd ; push original EFLAGS 518 mov ecx,best_len
519 pop eax ; get original EFLAGS 519 mov dword ptr [ebp+dep_match_start],ebx
520 mov ecx, eax ; save original EFLAGS 520 mov eax,dword ptr [ebp+dep_lookahead]
521 xor eax, 40000h ; flip AC bit in EFLAGS 521 cmp ecx,eax
522 push eax ; save new EFLAGS value on stack 522 ja minexlo
523 popfd ; replace current EFLAGS value 523 mov eax,ecx
524 pushfd ; get new EFLAGS 524minexlo:
525 pop eax ; store new EFLAGS in EAX 525; return min(best_len,s->lookahead)
526 xor eax, ecx ; can’t toggle AC bit, processor=80386 526
527 jz end_cpu_is_386 ; jump if 80386 processor 527; restore stack and register ebx,esi,edi,ebp
528 push ecx 528 add esp,NbStackAdd
529 popfd ; restore AC bit in EFLAGS first 529
530 530 pop ebx
531 pushfd 531 pop esi
532 pushfd 532 pop edi
533 pop ecx 533 pop ebp
534 534 ret
535 mov eax, ecx ; get original EFLAGS 535InfoAuthor:
536 xor eax, 200000h ; flip ID bit in EFLAGS 536; please don't remove this string !
537 push eax ; save new EFLAGS value on stack 537; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary!
538 popfd ; replace current EFLAGS value 538 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
539 pushfd ; get new EFLAGS 539
540 pop eax ; store new EFLAGS in EAX 540
541 popfd ; restore original EFLAGS 541
542 xor eax, ecx ; can’t toggle ID bit, 542IFDEF NOUNDERLINE
543 je is_old_486 ; processor=old 543longest_match_7fff endp
544 544ELSE
545 mov eax,1 545_longest_match_7fff endp
546 db 0fh,0a2h ;CPUID 546ENDIF
547 547
548exitcpudetect: 548
549 pop ebx 549IFDEF NOUNDERLINE
550 ret 550cpudetect32 proc near
551 551ELSE
552end_cpu_is_386: 552_cpudetect32 proc near
553 mov eax,0300h 553ENDIF
554 jmp exitcpudetect 554
555 555 push ebx
556is_old_486: 556
557 mov eax,0400h 557 pushfd ; push original EFLAGS
558 jmp exitcpudetect 558 pop eax ; get original EFLAGS
559 559 mov ecx, eax ; save original EFLAGS
560IFDEF NOUNDERLINE 560 xor eax, 40000h ; flip AC bit in EFLAGS
561cpudetect32 endp 561 push eax ; save new EFLAGS value on stack
562ELSE 562 popfd ; replace current EFLAGS value
563_cpudetect32 endp 563 pushfd ; get new EFLAGS
564ENDIF 564 pop eax ; store new EFLAGS in EAX
565 565 xor eax, ecx ; can’t toggle AC bit, processor=80386
566 566 jz end_cpu_is_386 ; jump if 80386 processor
567 567 push ecx
568 568 popfd ; restore AC bit in EFLAGS first
569MAX_MATCH equ 258 569
570MIN_MATCH equ 3 570 pushfd
571MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) 571 pushfd
572MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) 572 pop ecx
573 573
574 574 mov eax, ecx ; get original EFLAGS
575;;; stack frame offsets 575 xor eax, 200000h ; flip ID bit in EFLAGS
576 576 push eax ; save new EFLAGS value on stack
577chainlenwmask equ esp + 0 ; high word: current chain len 577 popfd ; replace current EFLAGS value
578 ; low word: s->wmask 578 pushfd ; get new EFLAGS
579window equ esp + 4 ; local copy of s->window 579 pop eax ; store new EFLAGS in EAX
580windowbestlen equ esp + 8 ; s->window + bestlen 580 popfd ; restore original EFLAGS
581scanstart equ esp + 16 ; first two bytes of string 581 xor eax, ecx ; can’t toggle ID bit,
582scanend equ esp + 12 ; last two bytes of string 582 je is_old_486 ; processor=old
583scanalign equ esp + 20 ; dword-misalignment of string 583
584nicematch equ esp + 24 ; a good enough match size 584 mov eax,1
585bestlen equ esp + 28 ; size of best match so far 585 db 0fh,0a2h ;CPUID
586scan equ esp + 32 ; ptr to string wanting match 586
587 587exitcpudetect:
588LocalVarsSize equ 36 588 pop ebx
589; saved ebx byte esp + 36 589 ret
590; saved edi byte esp + 40 590
591; saved esi byte esp + 44 591end_cpu_is_386:
592; saved ebp byte esp + 48 592 mov eax,0300h
593; return address byte esp + 52 593 jmp exitcpudetect
594deflatestate equ esp + 56 ; the function arguments 594
595curmatch equ esp + 60 595is_old_486:
596 596 mov eax,0400h
597;;; Offsets for fields in the deflate_state structure. These numbers 597 jmp exitcpudetect
598;;; are calculated from the definition of deflate_state, with the 598
599;;; assumption that the compiler will dword-align the fields. (Thus, 599IFDEF NOUNDERLINE
600;;; changing the definition of deflate_state could easily cause this 600cpudetect32 endp
601;;; program to crash horribly, without so much as a warning at 601ELSE
602;;; compile time. Sigh.) 602_cpudetect32 endp
603 603ENDIF
604dsWSize equ 36+addstr-4 604ENDIF
605dsWMask equ 44+addstr-4 605
606dsWindow equ 48+addstr-4 606MAX_MATCH equ 258
607dsPrev equ 56+addstr-4 607MIN_MATCH equ 3
608dsMatchLen equ 88+addstr-4 608MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
609dsPrevMatch equ 92+addstr-4 609MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
610dsStrStart equ 100+addstr-4 610
611dsMatchStart equ 104+addstr-4 611
612dsLookahead equ 108+addstr-4 612;;; stack frame offsets
613dsPrevLen equ 112+addstr-4 613
614dsMaxChainLen equ 116+addstr-4 614chainlenwmask equ esp + 0 ; high word: current chain len
615dsGoodMatch equ 132+addstr-4 615 ; low word: s->wmask
616dsNiceMatch equ 136+addstr-4 616window equ esp + 4 ; local copy of s->window
617 617windowbestlen equ esp + 8 ; s->window + bestlen
618 618scanstart equ esp + 16 ; first two bytes of string
619;;; match.asm -- Pentium-Pro-optimized version of longest_match() 619scanend equ esp + 12 ; last two bytes of string
620;;; Written for zlib 1.1.2 620scanalign equ esp + 20 ; dword-misalignment of string
621;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 621nicematch equ esp + 24 ; a good enough match size
622;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html 622bestlen equ esp + 28 ; size of best match so far
623;;; 623scan equ esp + 32 ; ptr to string wanting match
624;;; This is free software; you can redistribute it and/or modify it 624
625;;; under the terms of the GNU General Public License. 625LocalVarsSize equ 36
626 626; saved ebx byte esp + 36
627;GLOBAL _longest_match, _match_init 627; saved edi byte esp + 40
628 628; saved esi byte esp + 44
629 629; saved ebp byte esp + 48
630;SECTION .text 630; return address byte esp + 52
631 631deflatestate equ esp + 56 ; the function arguments
632;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 632curmatch equ esp + 60
633 633
634;_longest_match: 634;;; Offsets for fields in the deflate_state structure. These numbers
635IFDEF NOUNDERLINE 635;;; are calculated from the definition of deflate_state, with the
636longest_match_686 proc near 636;;; assumption that the compiler will dword-align the fields. (Thus,
637ELSE 637;;; changing the definition of deflate_state could easily cause this
638_longest_match_686 proc near 638;;; program to crash horribly, without so much as a warning at
639ENDIF 639;;; compile time. Sigh.)
640 640
641 641dsWSize equ 36+zlib1222add
642;;; Save registers that the compiler may be using, and adjust esp to 642dsWMask equ 44+zlib1222add
643;;; make room for our stack frame. 643dsWindow equ 48+zlib1222add
644 644dsPrev equ 56+zlib1222add
645 push ebp 645dsMatchLen equ 88+zlib1222add
646 push edi 646dsPrevMatch equ 92+zlib1222add
647 push esi 647dsStrStart equ 100+zlib1222add
648 push ebx 648dsMatchStart equ 104+zlib1222add
649 sub esp, LocalVarsSize 649dsLookahead equ 108+zlib1222add
650 650dsPrevLen equ 112+zlib1222add
651;;; Retrieve the function arguments. ecx will hold cur_match 651dsMaxChainLen equ 116+zlib1222add
652;;; throughout the entire function. edx will hold the pointer to the 652dsGoodMatch equ 132+zlib1222add
653;;; deflate_state structure during the function's setup (before 653dsNiceMatch equ 136+zlib1222add
654;;; entering the main loop). 654
655 655
656 mov edx, [deflatestate] 656;;; match.asm -- Pentium-Pro-optimized version of longest_match()
657 mov ecx, [curmatch] 657;;; Written for zlib 1.1.2
658 658;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
659;;; uInt wmask = s->w_mask; 659;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
660;;; unsigned chain_length = s->max_chain_length; 660;;;
661;;; if (s->prev_length >= s->good_match) { 661;;; This is free software; you can redistribute it and/or modify it
662;;; chain_length >>= 2; 662;;; under the terms of the GNU General Public License.
663;;; } 663
664 664;GLOBAL _longest_match, _match_init
665 mov eax, [edx + dsPrevLen] 665
666 mov ebx, [edx + dsGoodMatch] 666
667 cmp eax, ebx 667;SECTION .text
668 mov eax, [edx + dsWMask] 668
669 mov ebx, [edx + dsMaxChainLen] 669;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
670 jl LastMatchGood 670
671 shr ebx, 2 671;_longest_match:
672LastMatchGood: 672IFDEF NOOLDPENTIUMCODE
673 673 IFDEF NOUNDERLINE
674;;; chainlen is decremented once beforehand so that the function can 674 longest_match proc near
675;;; use the sign flag instead of the zero flag for the exit test. 675 ELSE
676;;; It is then shifted into the high word, to make room for the wmask 676 _longest_match proc near
677;;; value, which it will always accompany. 677 ENDIF
678 678ELSE
679 dec ebx 679 IFDEF NOUNDERLINE
680 shl ebx, 16 680 longest_match_686 proc near
681 or ebx, eax 681 ELSE
682 mov [chainlenwmask], ebx 682 _longest_match_686 proc near
683 683 ENDIF
684;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 684ENDIF
685 685
686 mov eax, [edx + dsNiceMatch] 686;;; Save registers that the compiler may be using, and adjust esp to
687 mov ebx, [edx + dsLookahead] 687;;; make room for our stack frame.
688 cmp ebx, eax 688
689 jl LookaheadLess 689 push ebp
690 mov ebx, eax 690 push edi
691LookaheadLess: mov [nicematch], ebx 691 push esi
692 692 push ebx
693;;; register Bytef *scan = s->window + s->strstart; 693 sub esp, LocalVarsSize
694 694
695 mov esi, [edx + dsWindow] 695;;; Retrieve the function arguments. ecx will hold cur_match
696 mov [window], esi 696;;; throughout the entire function. edx will hold the pointer to the
697 mov ebp, [edx + dsStrStart] 697;;; deflate_state structure during the function's setup (before
698 lea edi, [esi + ebp] 698;;; entering the main loop).
699 mov [scan], edi 699
700 700 mov edx, [deflatestate]
701;;; Determine how many bytes the scan ptr is off from being 701 mov ecx, [curmatch]
702;;; dword-aligned. 702
703 703;;; uInt wmask = s->w_mask;
704 mov eax, edi 704;;; unsigned chain_length = s->max_chain_length;
705 neg eax 705;;; if (s->prev_length >= s->good_match) {
706 and eax, 3 706;;; chain_length >>= 2;
707 mov [scanalign], eax 707;;; }
708 708
709;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 709 mov eax, [edx + dsPrevLen]
710;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 710 mov ebx, [edx + dsGoodMatch]
711 711 cmp eax, ebx
712 mov eax, [edx + dsWSize] 712 mov eax, [edx + dsWMask]
713 sub eax, MIN_LOOKAHEAD 713 mov ebx, [edx + dsMaxChainLen]
714 sub ebp, eax 714 jl LastMatchGood
715 jg LimitPositive 715 shr ebx, 2
716 xor ebp, ebp 716LastMatchGood:
717LimitPositive: 717
718 718;;; chainlen is decremented once beforehand so that the function can
719;;; int best_len = s->prev_length; 719;;; use the sign flag instead of the zero flag for the exit test.
720 720;;; It is then shifted into the high word, to make room for the wmask
721 mov eax, [edx + dsPrevLen] 721;;; value, which it will always accompany.
722 mov [bestlen], eax 722
723 723 dec ebx
724;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi. 724 shl ebx, 16
725 725 or ebx, eax
726 add esi, eax 726 mov [chainlenwmask], ebx
727 mov [windowbestlen], esi 727
728 728;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
729;;; register ush scan_start = *(ushf*)scan; 729
730;;; register ush scan_end = *(ushf*)(scan+best_len-1); 730 mov eax, [edx + dsNiceMatch]
731;;; Posf *prev = s->prev; 731 mov ebx, [edx + dsLookahead]
732 732 cmp ebx, eax
733 movzx ebx, word ptr [edi] 733 jl LookaheadLess
734 mov [scanstart], ebx 734 mov ebx, eax
735 movzx ebx, word ptr [edi + eax - 1] 735LookaheadLess: mov [nicematch], ebx
736 mov [scanend], ebx 736
737 mov edi, [edx + dsPrev] 737;;; register Bytef *scan = s->window + s->strstart;
738 738
739;;; Jump into the main loop. 739 mov esi, [edx + dsWindow]
740 740 mov [window], esi
741 mov edx, [chainlenwmask] 741 mov ebp, [edx + dsStrStart]
742 jmp short LoopEntry 742 lea edi, [esi + ebp]
743 743 mov [scan], edi
744align 4 744
745 745;;; Determine how many bytes the scan ptr is off from being
746;;; do { 746;;; dword-aligned.
747;;; match = s->window + cur_match; 747
748;;; if (*(ushf*)(match+best_len-1) != scan_end || 748 mov eax, edi
749;;; *(ushf*)match != scan_start) continue; 749 neg eax
750;;; [...] 750 and eax, 3
751;;; } while ((cur_match = prev[cur_match & wmask]) > limit 751 mov [scanalign], eax
752;;; && --chain_length != 0); 752
753;;; 753;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
754;;; Here is the inner loop of the function. The function will spend the 754;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
755;;; majority of its time in this loop, and majority of that time will 755
756;;; be spent in the first ten instructions. 756 mov eax, [edx + dsWSize]
757;;; 757 sub eax, MIN_LOOKAHEAD
758;;; Within this loop: 758 sub ebp, eax
759;;; ebx = scanend 759 jg LimitPositive
760;;; ecx = curmatch 760 xor ebp, ebp
761;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 761LimitPositive:
762;;; esi = windowbestlen - i.e., (window + bestlen) 762
763;;; edi = prev 763;;; int best_len = s->prev_length;
764;;; ebp = limit 764
765 765 mov eax, [edx + dsPrevLen]
766LookupLoop: 766 mov [bestlen], eax
767 and ecx, edx 767
768 movzx ecx, word ptr [edi + ecx*2] 768;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi.
769 cmp ecx, ebp 769
770 jbe LeaveNow 770 add esi, eax
771 sub edx, 00010000h 771 mov [windowbestlen], esi
772 js LeaveNow 772
773LoopEntry: movzx eax, word ptr [esi + ecx - 1] 773;;; register ush scan_start = *(ushf*)scan;
774 cmp eax, ebx 774;;; register ush scan_end = *(ushf*)(scan+best_len-1);
775 jnz LookupLoop 775;;; Posf *prev = s->prev;
776 mov eax, [window] 776
777 movzx eax, word ptr [eax + ecx] 777 movzx ebx, word ptr [edi]
778 cmp eax, [scanstart] 778 mov [scanstart], ebx
779 jnz LookupLoop 779 movzx ebx, word ptr [edi + eax - 1]
780 780 mov [scanend], ebx
781;;; Store the current value of chainlen. 781 mov edi, [edx + dsPrev]
782 782
783 mov [chainlenwmask], edx 783;;; Jump into the main loop.
784 784
785;;; Point edi to the string under scrutiny, and esi to the string we 785 mov edx, [chainlenwmask]
786;;; are hoping to match it up with. In actuality, esi and edi are 786 jmp short LoopEntry
787;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 787
788;;; initialized to -(MAX_MATCH_8 - scanalign). 788align 4
789 789
790 mov esi, [window] 790;;; do {
791 mov edi, [scan] 791;;; match = s->window + cur_match;
792 add esi, ecx 792;;; if (*(ushf*)(match+best_len-1) != scan_end ||
793 mov eax, [scanalign] 793;;; *(ushf*)match != scan_start) continue;
794 mov edx, 0fffffef8h; -(MAX_MATCH_8) 794;;; [...]
795 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] 795;;; } while ((cur_match = prev[cur_match & wmask]) > limit
796 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] 796;;; && --chain_length != 0);
797 797;;;
798;;; Test the strings for equality, 8 bytes at a time. At the end, 798;;; Here is the inner loop of the function. The function will spend the
799;;; adjust edx so that it is offset to the exact byte that mismatched. 799;;; majority of its time in this loop, and majority of that time will
800;;; 800;;; be spent in the first ten instructions.
801;;; We already know at this point that the first three bytes of the 801;;;
802;;; strings match each other, and they can be safely passed over before 802;;; Within this loop:
803;;; starting the compare loop. So what this code does is skip over 0-3 803;;; ebx = scanend
804;;; bytes, as much as necessary in order to dword-align the edi 804;;; ecx = curmatch
805;;; pointer. (esi will still be misaligned three times out of four.) 805;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
806;;; 806;;; esi = windowbestlen - i.e., (window + bestlen)
807;;; It should be confessed that this loop usually does not represent 807;;; edi = prev
808;;; much of the total running time. Replacing it with a more 808;;; ebp = limit
809;;; straightforward "rep cmpsb" would not drastically degrade 809
810;;; performance. 810LookupLoop:
811 811 and ecx, edx
812LoopCmps: 812 movzx ecx, word ptr [edi + ecx*2]
813 mov eax, [esi + edx] 813 cmp ecx, ebp
814 xor eax, [edi + edx] 814 jbe LeaveNow
815 jnz LeaveLoopCmps 815 sub edx, 00010000h
816 mov eax, [esi + edx + 4] 816 js LeaveNow
817 xor eax, [edi + edx + 4] 817LoopEntry: movzx eax, word ptr [esi + ecx - 1]
818 jnz LeaveLoopCmps4 818 cmp eax, ebx
819 add edx, 8 819 jnz LookupLoop
820 jnz LoopCmps 820 mov eax, [window]
821 jmp short LenMaximum 821 movzx eax, word ptr [eax + ecx]
822LeaveLoopCmps4: add edx, 4 822 cmp eax, [scanstart]
823LeaveLoopCmps: test eax, 0000FFFFh 823 jnz LookupLoop
824 jnz LenLower 824
825 add edx, 2 825;;; Store the current value of chainlen.
826 shr eax, 16 826
827LenLower: sub al, 1 827 mov [chainlenwmask], edx
828 adc edx, 0 828
829 829;;; Point edi to the string under scrutiny, and esi to the string we
830;;; Calculate the length of the match. If it is longer than MAX_MATCH, 830;;; are hoping to match it up with. In actuality, esi and edi are
831;;; then automatically accept it as the best possible match and leave. 831;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
832 832;;; initialized to -(MAX_MATCH_8 - scanalign).
833 lea eax, [edi + edx] 833
834 mov edi, [scan] 834 mov esi, [window]
835 sub eax, edi 835 mov edi, [scan]
836 cmp eax, MAX_MATCH 836 add esi, ecx
837 jge LenMaximum 837 mov eax, [scanalign]
838 838 mov edx, 0fffffef8h; -(MAX_MATCH_8)
839;;; If the length of the match is not longer than the best match we 839 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
840;;; have so far, then forget it and return to the lookup loop. 840 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
841 841
842 mov edx, [deflatestate] 842;;; Test the strings for equality, 8 bytes at a time. At the end,
843 mov ebx, [bestlen] 843;;; adjust edx so that it is offset to the exact byte that mismatched.
844 cmp eax, ebx 844;;;
845 jg LongerMatch 845;;; We already know at this point that the first three bytes of the
846 mov esi, [windowbestlen] 846;;; strings match each other, and they can be safely passed over before
847 mov edi, [edx + dsPrev] 847;;; starting the compare loop. So what this code does is skip over 0-3
848 mov ebx, [scanend] 848;;; bytes, as much as necessary in order to dword-align the edi
849 mov edx, [chainlenwmask] 849;;; pointer. (esi will still be misaligned three times out of four.)
850 jmp LookupLoop 850;;;
851 851;;; It should be confessed that this loop usually does not represent
852;;; s->match_start = cur_match; 852;;; much of the total running time. Replacing it with a more
853;;; best_len = len; 853;;; straightforward "rep cmpsb" would not drastically degrade
854;;; if (len >= nice_match) break; 854;;; performance.
855;;; scan_end = *(ushf*)(scan+best_len-1); 855
856 856LoopCmps:
857LongerMatch: mov ebx, [nicematch] 857 mov eax, [esi + edx]
858 mov [bestlen], eax 858 xor eax, [edi + edx]
859 mov [edx + dsMatchStart], ecx 859 jnz LeaveLoopCmps
860 cmp eax, ebx 860 mov eax, [esi + edx + 4]
861 jge LeaveNow 861 xor eax, [edi + edx + 4]
862 mov esi, [window] 862 jnz LeaveLoopCmps4
863 add esi, eax 863 add edx, 8
864 mov [windowbestlen], esi 864 jnz LoopCmps
865 movzx ebx, word ptr [edi + eax - 1] 865 jmp short LenMaximum
866 mov edi, [edx + dsPrev] 866LeaveLoopCmps4: add edx, 4
867 mov [scanend], ebx 867LeaveLoopCmps: test eax, 0000FFFFh
868 mov edx, [chainlenwmask] 868 jnz LenLower
869 jmp LookupLoop 869 add edx, 2
870 870 shr eax, 16
871;;; Accept the current string, with the maximum possible length. 871LenLower: sub al, 1
872 872 adc edx, 0
873LenMaximum: mov edx, [deflatestate] 873
874 mov dword ptr [bestlen], MAX_MATCH 874;;; Calculate the length of the match. If it is longer than MAX_MATCH,
875 mov [edx + dsMatchStart], ecx 875;;; then automatically accept it as the best possible match and leave.
876 876
877;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 877 lea eax, [edi + edx]
878;;; return s->lookahead; 878 mov edi, [scan]
879 879 sub eax, edi
880LeaveNow: 880 cmp eax, MAX_MATCH
881 mov edx, [deflatestate] 881 jge LenMaximum
882 mov ebx, [bestlen] 882
883 mov eax, [edx + dsLookahead] 883;;; If the length of the match is not longer than the best match we
884 cmp ebx, eax 884;;; have so far, then forget it and return to the lookup loop.
885 jg LookaheadRet 885
886 mov eax, ebx 886 mov edx, [deflatestate]
887LookaheadRet: 887 mov ebx, [bestlen]
888 888 cmp eax, ebx
889;;; Restore the stack and return from whence we came. 889 jg LongerMatch
890 890 mov esi, [windowbestlen]
891 add esp, LocalVarsSize 891 mov edi, [edx + dsPrev]
892 pop ebx 892 mov ebx, [scanend]
893 pop esi 893 mov edx, [chainlenwmask]
894 pop edi 894 jmp LookupLoop
895 pop ebp 895
896 896;;; s->match_start = cur_match;
897 ret 897;;; best_len = len;
898; please don't remove this string ! 898;;; if (len >= nice_match) break;
899; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! 899;;; scan_end = *(ushf*)(scan+best_len-1);
900 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah 900
901 901LongerMatch: mov ebx, [nicematch]
902IFDEF NOUNDERLINE 902 mov [bestlen], eax
903longest_match_686 endp 903 mov [edx + dsMatchStart], ecx
904ELSE 904 cmp eax, ebx
905_longest_match_686 endp 905 jge LeaveNow
906ENDIF 906 mov esi, [window]
907 907 add esi, eax
908_TEXT ends 908 mov [windowbestlen], esi
909end 909 movzx ebx, word ptr [edi + eax - 1]
910 mov edi, [edx + dsPrev]
911 mov [scanend], ebx
912 mov edx, [chainlenwmask]
913 jmp LookupLoop
914
915;;; Accept the current string, with the maximum possible length.
916
917LenMaximum: mov edx, [deflatestate]
918 mov dword ptr [bestlen], MAX_MATCH
919 mov [edx + dsMatchStart], ecx
920
921;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
922;;; return s->lookahead;
923
924LeaveNow:
925 mov edx, [deflatestate]
926 mov ebx, [bestlen]
927 mov eax, [edx + dsLookahead]
928 cmp ebx, eax
929 jg LookaheadRet
930 mov eax, ebx
931LookaheadRet:
932
933;;; Restore the stack and return from whence we came.
934
935 add esp, LocalVarsSize
936 pop ebx
937 pop esi
938 pop edi
939 pop ebp
940
941 ret
942; please don't remove this string !
943; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
944 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
945
946
947IFDEF NOOLDPENTIUMCODE
948 IFDEF NOUNDERLINE
949 longest_match endp
950 ELSE
951 _longest_match endp
952 ENDIF
953
954 IFDEF NOUNDERLINE
955 match_init proc near
956 ret
957 match_init endp
958 ELSE
959 _match_init proc near
960 ret
961 _match_init endp
962 ENDIF
963ELSE
964 IFDEF NOUNDERLINE
965 longest_match_686 endp
966 ELSE
967 _longest_match_686 endp
968 ENDIF
969ENDIF
970
971_TEXT ends
972end