summaryrefslogtreecommitdiff
path: root/contrib/masmx86/gvmat32.asm
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:22:37 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:22:37 -0700
commit4b5a43a219d51066c01ff2ab86af18b967f2d0dd (patch)
tree4dcaf0cd18751d04cf638a9a6ec521990d4f2e90 /contrib/masmx86/gvmat32.asm
parent086e982175da84b3db958191031380794315f95f (diff)
downloadzlib-1.2.0.5.tar.gz
zlib-1.2.0.5.tar.bz2
zlib-1.2.0.5.zip
zlib 1.2.0.5v1.2.0.5
Diffstat (limited to '')
-rw-r--r--contrib/masmx86/gvmat32.asm (renamed from contrib/vstudio/vc70_32/gvmat32.asm)1810
1 files changed, 905 insertions, 905 deletions
diff --git a/contrib/vstudio/vc70_32/gvmat32.asm b/contrib/masmx86/gvmat32.asm
index 320348f..ec360e6 100644
--- a/contrib/vstudio/vc70_32/gvmat32.asm
+++ b/contrib/masmx86/gvmat32.asm
@@ -1,905 +1,905 @@
1; 1;
2; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 2; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
3; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. 3; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
4; File written by Gilles Vollant, by modifying the longest_match 4; File written by Gilles Vollant, by modifying the longest_match
5; from Jean-loup Gailly in deflate.c 5; from Jean-loup Gailly in deflate.c
6; It needs wmask == 0x7fff 6; It needs wmask == 0x7fff
7; (assembly code is faster with a fixed wmask) 7; (assembly code is faster with a fixed wmask)
8; 8;
9; For Visual C++ 4.2 and ML 6.11c (version in directory \MASM611C of Win95 DDK) 9; For Visual C++ 4.2 and ML 6.11c (version in directory \MASM611C of Win95 DDK)
10; I compile with : "ml /coff /Zi /c gvmat32.asm" 10; I compile with : "ml /coff /Zi /c gvmat32.asm"
11; 11;
12 12
13;uInt longest_match_7fff(s, cur_match) 13;uInt longest_match_7fff(s, cur_match)
14; deflate_state *s; 14; deflate_state *s;
15; IPos cur_match; /* current match */ 15; IPos cur_match; /* current match */
16 16
; Stack frame of longest_match_7fff, addressed relative to esp AFTER the
; prologue (four register pushes followed by "sub esp,NbStackAdd").
; NbStack is the distance from that esp to the second argument (cur_match):
; NbStackAdd bytes of locals + 4 saved registers + return address + s.
17 NbStack equ 76 17 NbStack equ 76
18 cur_match equ dword ptr[esp+NbStack-0] 18 cur_match equ dword ptr[esp+NbStack-0]
19 str_s equ dword ptr[esp+NbStack-4] 19 str_s equ dword ptr[esp+NbStack-4]
20; 5 dwords on top: return address, then saved ebp, edi, esi, ebx (push order) 20; 5 dwords on top: return address, then saved ebp, edi, esi, ebx (push order)
21 adrret equ dword ptr[esp+NbStack-8] 21 adrret equ dword ptr[esp+NbStack-8]
22 pushebp equ dword ptr[esp+NbStack-12] 22 pushebp equ dword ptr[esp+NbStack-12]
23 pushedi equ dword ptr[esp+NbStack-16] 23 pushedi equ dword ptr[esp+NbStack-16]
24 pushesi equ dword ptr[esp+NbStack-20] 24 pushesi equ dword ptr[esp+NbStack-20]
25 pushebx equ dword ptr[esp+NbStack-24] 25 pushebx equ dword ptr[esp+NbStack-24]
26 26
; Local variables (one dword slot each; scan_start only uses a word of its slot).
27 chain_length equ dword ptr [esp+NbStack-28] 27 chain_length equ dword ptr [esp+NbStack-28]
28 limit equ dword ptr [esp+NbStack-32] 28 limit equ dword ptr [esp+NbStack-32]
29 best_len equ dword ptr [esp+NbStack-36] 29 best_len equ dword ptr [esp+NbStack-36]
30 window equ dword ptr [esp+NbStack-40] 30 window equ dword ptr [esp+NbStack-40]
31 prev equ dword ptr [esp+NbStack-44] 31 prev equ dword ptr [esp+NbStack-44]
32 scan_start equ word ptr [esp+NbStack-48] 32 scan_start equ word ptr [esp+NbStack-48]
33 wmask equ dword ptr [esp+NbStack-52] 33 wmask equ dword ptr [esp+NbStack-52]
34 match_start_ptr equ dword ptr [esp+NbStack-56] 34 match_start_ptr equ dword ptr [esp+NbStack-56]
35 nice_match equ dword ptr [esp+NbStack-60] 35 nice_match equ dword ptr [esp+NbStack-60]
36 scan equ dword ptr [esp+NbStack-64] 36 scan equ dword ptr [esp+NbStack-64]
37 37
38 windowlen equ dword ptr [esp+NbStack-68] 38 windowlen equ dword ptr [esp+NbStack-68]
39 match_start equ dword ptr [esp+NbStack-72] 39 match_start equ dword ptr [esp+NbStack-72]
40 strend equ dword ptr [esp+NbStack-76] 40 strend equ dword ptr [esp+NbStack-76]
; NbStackAdd = bytes of locals reserved by the prologue (NbStack minus the
; 24 bytes taken by the two arguments, return address offset and saved registers).
41 NbStackAdd equ (NbStack-24) 41 NbStackAdd equ (NbStack-24)
42 42
43 .386p 43 .386p
44 44
45 name gvmatch 45 name gvmatch
46 .MODEL FLAT 46 .MODEL FLAT
47 47
48 48
49 49
50; all the +4 offsets are due to the addition of pending_buf_size (in zlib) 50; all the +4 offsets are due to the addition of pending_buf_size (in zlib)
51; to the deflate_state structure since the asm code was first written 51; to the deflate_state structure since the asm code was first written
52; (if you compile with zlib 1.0.4 or older, remove the +4). 52; (if you compile with zlib 1.0.4 or older, remove the +4).
53; Note: these values are valid when the structure is packed on 8-byte boundaries 53; Note: these values are valid when the structure is packed on 8-byte boundaries
54 dep_chain_length equ 70h+4 54 dep_chain_length equ 70h+4
55 dep_window equ 2ch+4 55 dep_window equ 2ch+4
56 dep_strstart equ 60h+4 56 dep_strstart equ 60h+4
57 dep_prev_length equ 6ch+4 57 dep_prev_length equ 6ch+4
58 dep_nice_match equ 84h+4 58 dep_nice_match equ 84h+4
59 dep_w_size equ 20h+4 59 dep_w_size equ 20h+4
60 dep_prev equ 34h+4 60 dep_prev equ 34h+4
61 dep_w_mask equ 28h+4 61 dep_w_mask equ 28h+4
62 dep_good_match equ 80h+4 62 dep_good_match equ 80h+4
63 dep_match_start equ 64h+4 63 dep_match_start equ 64h+4
64 dep_lookahead equ 68h+4 64 dep_lookahead equ 68h+4
65 65
66 66
67_TEXT segment 67_TEXT segment
68 68
69IFDEF NOUNDERLINE 69IFDEF NOUNDERLINE
70 public longest_match_7fff 70 public longest_match_7fff
71 public longest_match_686 71 public longest_match_686
72; public match_init 72; public match_init
73ELSE 73ELSE
74 public _longest_match_7fff 74 public _longest_match_7fff
75 public _longest_match_686 75 public _longest_match_686
76; public _match_init 76; public _match_init
77ENDIF 77ENDIF
78 78
79 MAX_MATCH equ 258 79 MAX_MATCH equ 258
80 MIN_MATCH equ 3 80 MIN_MATCH equ 3
81 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 81 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
82 82
83 83
84 84
85IFDEF NOUNDERLINE 85IFDEF NOUNDERLINE
86;match_init proc near 86;match_init proc near
87; ret 87; ret
88;match_init endp 88;match_init endp
89ELSE 89ELSE
90;_match_init proc near 90;_match_init proc near
91; ret 91; ret
92;_match_init endp 92;_match_init endp
93ENDIF 93ENDIF
94 94
95 95
; longest_match_7fff -- entry point and one-time setup.
; Stack arguments (see the str_s / cur_match equates): s, cur_match.
; This part saves ebp/edi/esi/ebx, reserves NbStackAdd bytes of locals,
; copies the deflate_state fields it needs into locals and registers, and
; jumps to beginloop2 with:
;   eax = cur_match   ecx = limit      edx = window    esi = prev
;   edi = windowlen   bx  = scan_end   bp  = scan_start
96IFDEF NOUNDERLINE 96IFDEF NOUNDERLINE
97longest_match_7fff proc near 97longest_match_7fff proc near
98ELSE 98ELSE
99_longest_match_7fff proc near 99_longest_match_7fff proc near
100ENDIF 100ENDIF
101 101
; edx = s (first stack argument, read before the pushes move esp)
102 mov edx,[esp+4] 102 mov edx,[esp+4]
103 103
104 104
105 105
106 push ebp 106 push ebp
107 push edi 107 push edi
108 push esi 108 push esi
109 push ebx 109 push ebx
110 110
111 sub esp,NbStackAdd 111 sub esp,NbStackAdd
112 112
113; initialize or check the variables used in match.asm. 113; initialize or check the variables used in match.asm.
114 mov ebp,edx 114 mov ebp,edx
115 115
116; chain_length = s->max_chain_length 116; chain_length = s->max_chain_length
117; if (prev_length>=good_match) chain_length >>= 2 117; if (prev_length>=good_match) chain_length >>= 2
118 mov edx,[ebp+dep_chain_length] 118 mov edx,[ebp+dep_chain_length]
119 mov ebx,[ebp+dep_prev_length] 119 mov ebx,[ebp+dep_prev_length]
; unsigned test: skip the shift while good_match > prev_length
120 cmp [ebp+dep_good_match],ebx 120 cmp [ebp+dep_good_match],ebx
121 ja noshr 121 ja noshr
122 shr edx,2 122 shr edx,2
123noshr: 123noshr:
124; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop 124; we increment chain_length because in the asm, the --chain_length is at the beginning of the loop
125 inc edx 125 inc edx
126 mov edi,[ebp+dep_nice_match] 126 mov edi,[ebp+dep_nice_match]
127 mov chain_length,edx 127 mov chain_length,edx
128 mov eax,[ebp+dep_lookahead] 128 mov eax,[ebp+dep_lookahead]
129 cmp eax,edi 129 cmp eax,edi
130; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 130; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
131 jae nolookaheadnicematch 131 jae nolookaheadnicematch
132 mov edi,eax 132 mov edi,eax
133nolookaheadnicematch: 133nolookaheadnicematch:
134; best_len = s->prev_length 134; best_len = s->prev_length
135 mov best_len,ebx 135 mov best_len,ebx
136 136
137; window = s->window 137; window = s->window
138 mov esi,[ebp+dep_window] 138 mov esi,[ebp+dep_window]
139 mov ecx,[ebp+dep_strstart] 139 mov ecx,[ebp+dep_strstart]
140 mov window,esi 140 mov window,esi
141 141
142 mov nice_match,edi 142 mov nice_match,edi
143; scan = window + strstart 143; scan = window + strstart
144 add esi,ecx 144 add esi,ecx
145 mov scan,esi 145 mov scan,esi
146; dx = first word at scan (esi = scan here) 146; dx = first word at scan (esi = scan here)
147 mov dx,word ptr [esi] 147 mov dx,word ptr [esi]
148; bx = word at scan+best_len-1 (ebx = best_len here) 148; bx = word at scan+best_len-1 (ebx = best_len here)
149 mov bx,word ptr [esi+ebx-1] 149 mov bx,word ptr [esi+ebx-1]
150 add esi,MAX_MATCH-1 150 add esi,MAX_MATCH-1
151; scan_start = *scan 151; scan_start = *scan
152 mov scan_start,dx 152 mov scan_start,dx
153; strend = scan + MAX_MATCH-1 153; strend = scan + MAX_MATCH-1
154 mov strend,esi 154 mov strend,esi
155; bx = scan_end = word at scan+best_len-1 155; bx = scan_end = word at scan+best_len-1
156 156
157; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 157; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
158; s->strstart - (IPos)MAX_DIST(s) : NIL; 158; s->strstart - (IPos)MAX_DIST(s) : NIL;
159 159
160 mov esi,[ebp+dep_w_size] 160 mov esi,[ebp+dep_w_size]
161 sub esi,MIN_LOOKAHEAD 161 sub esi,MIN_LOOKAHEAD
162; here esi = MAX_DIST(s) 162; here esi = MAX_DIST(s)
; ecx = strstart - MAX_DIST; "ja" keeps it only when strstart > MAX_DIST (unsigned),
; otherwise limit is forced to 0
163 sub ecx,esi 163 sub ecx,esi
164 ja nodist 164 ja nodist
165 xor ecx,ecx 165 xor ecx,ecx
166nodist: 166nodist:
167 mov limit,ecx 167 mov limit,ecx
168 168
169; prev = s->prev 169; prev = s->prev
170 mov edx,[ebp+dep_prev] 170 mov edx,[ebp+dep_prev]
171 mov prev,edx 171 mov prev,edx
172 172
173; 173;
; Last read through ebp as s: the "mov bp" below overwrites the low word of
; ebp, so ebp no longer points at s (exitloop reloads it from str_s).
174 mov edx,dword ptr [ebp+dep_match_start] 174 mov edx,dword ptr [ebp+dep_match_start]
175 mov bp,scan_start 175 mov bp,scan_start
176 mov eax,cur_match 176 mov eax,cur_match
177 mov match_start,edx 177 mov match_start,edx
178 178
179 mov edx,window 179 mov edx,window
180 mov edi,edx 180 mov edi,edx
181 add edi,best_len 181 add edi,best_len
182 mov esi,prev 182 mov esi,prev
183 dec edi 183 dec edi
184; windowlen = window + best_len -1 184; windowlen = window + best_len -1
185 mov windowlen,edi 185 mov windowlen,edi
186 186
187 jmp beginloop2 187 jmp beginloop2
; Rolled (one candidate per iteration) search loop, used when few chain
; steps remain, plus contloop3, the re-entry point after a full compare.
188 align 4 188 align 4
189 189
190; here, in the loop 190; here, in the loop
191; eax = ax = cur_match 191; eax = ax = cur_match
192; ecx = limit 192; ecx = limit
193; bx = scan_end 193; bx = scan_end
194; bp = scan_start 194; bp = scan_start
195; edi = windowlen (window + best_len -1) 195; edi = windowlen (window + best_len -1)
196; esi = prev 196; esi = prev
; edx = window
197 197
198 198
199;// here; chain_length <=16 199;// here; chain_length <=16
; undo the 16 subtracted at beginloop2 / rcontloop15; zero means the chain budget is spent
200normalbeg0add16: 200normalbeg0add16:
201 add chain_length,16 201 add chain_length,16
202 jz exitloop 202 jz exitloop
203normalbeg0: 203normalbeg0:
; cheap filter: word at match+best_len-1 must equal scan_end
204 cmp word ptr[edi+eax],bx 204 cmp word ptr[edi+eax],bx
205 je normalbeg2noroll 205 je normalbeg2noroll
206rcontlabnoroll: 206rcontlabnoroll:
207; cur_match = prev[cur_match & wmask] 207; cur_match = prev[cur_match & wmask]
; (the and clears the upper half of eax, so the 16-bit load leaves eax zero-extended)
208 and eax,7fffh 208 and eax,7fffh
209 mov ax,word ptr[esi+eax*2] 209 mov ax,word ptr[esi+eax*2]
210; if cur_match <= limit (unsigned), go to exitloop 210; if cur_match <= limit (unsigned), go to exitloop
211 cmp ecx,eax 211 cmp ecx,eax
212 jnb exitloop 212 jnb exitloop
213; if --chain_length != 0, loop to normalbeg0, else go to exitloop 213; if --chain_length != 0, loop to normalbeg0, else go to exitloop
214 dec chain_length 214 dec chain_length
215 jnz normalbeg0 215 jnz normalbeg0
216 jmp exitloop 216 jmp exitloop
217 217
218normalbeg2noroll: 218normalbeg2noroll:
219; if (scan_start==*(cur_match+window)) goto normalbeg2 219; if (scan_start==*(cur_match+window)) goto normalbeg2
220 cmp bp,word ptr[edx+eax] 220 cmp bp,word ptr[edx+eax]
221 jne rcontlabnoroll 221 jne rcontlabnoroll
222 jmp normalbeg2 222 jmp normalbeg2
223 223
; contloop3: resume the chain walk after a candidate was fully compared
; (edi was used as the match pointer there, so windowlen is reloaded)
224contloop3: 224contloop3:
225 mov edi,windowlen 225 mov edi,windowlen
226 226
227; cur_match = prev[cur_match & wmask] 227; cur_match = prev[cur_match & wmask]
228 and eax,7fffh 228 and eax,7fffh
229 mov ax,word ptr[esi+eax*2] 229 mov ax,word ptr[esi+eax*2]
230; if cur_match <= limit (unsigned), go to exitloop 230; if cur_match <= limit (unsigned), go to exitloop
231 cmp ecx,eax 231 cmp ecx,eax
232jnbexitloopshort1: 232jnbexitloopshort1:
233 jnb exitloop 233 jnb exitloop
234; falls through to beginloop2, whose "sub chain_length,16+1" includes the --chain_length 234; falls through to beginloop2, whose "sub chain_length,16+1" includes the --chain_length
235 235
236 236
237; begin the main loop 237; begin the main loop
; beginloop2 charges one chain step plus 16 in advance; if that leaves
; nothing (borrow or zero) the rolled loop at normalbeg0add16 is used instead,
; which adds the 16 back.
238beginloop2: 238beginloop2:
239 sub chain_length,16+1 239 sub chain_length,16+1
240; if chain_length <=16, don't use the unrolled loop 240; if chain_length <=16, don't use the unrolled loop
241 jna normalbeg0add16 241 jna normalbeg0add16
242 242
; do16: 16-way unrolled chain walk; step k jumps to normalbeg2dc<k> when the
; word at windowlen+cur_match equals scan_end
243do16: 243do16:
244 cmp word ptr[edi+eax],bx 244 cmp word ptr[edi+eax],bx
245 je normalbeg2dc0 245 je normalbeg2dc0
246 246
; maccn lab: one unrolled step --
;   cur_match = prev[cur_match & 7fffh]; exit if cur_match <= limit;
;   if the word at windowlen+cur_match == scan_end, jump to lab
; ("lab" may carry a "short" prefix, which becomes part of the je operand)
247maccn MACRO lab 247maccn MACRO lab
248 and eax,7fffh 248 and eax,7fffh
249 mov ax,word ptr[esi+eax*2] 249 mov ax,word ptr[esi+eax*2]
250 cmp ecx,eax 250 cmp ecx,eax
251 jnb exitloop 251 jnb exitloop
252 cmp word ptr[edi+eax],bx 252 cmp word ptr[edi+eax],bx
253 je lab 253 je lab
254 ENDM 254 ENDM
255 255
256rcontloop0: 256rcontloop0:
257 maccn normalbeg2dc1 257 maccn normalbeg2dc1
258 258
259rcontloop1: 259rcontloop1:
260 maccn normalbeg2dc2 260 maccn normalbeg2dc2
261 261
262rcontloop2: 262rcontloop2:
263 maccn normalbeg2dc3 263 maccn normalbeg2dc3
264 264
265rcontloop3: 265rcontloop3:
266 maccn normalbeg2dc4 266 maccn normalbeg2dc4
267 267
268rcontloop4: 268rcontloop4:
269 maccn normalbeg2dc5 269 maccn normalbeg2dc5
270 270
271rcontloop5: 271rcontloop5:
272 maccn normalbeg2dc6 272 maccn normalbeg2dc6
273 273
274rcontloop6: 274rcontloop6:
275 maccn normalbeg2dc7 275 maccn normalbeg2dc7
276 276
277rcontloop7: 277rcontloop7:
278 maccn normalbeg2dc8 278 maccn normalbeg2dc8
279 279
280rcontloop8: 280rcontloop8:
281 maccn normalbeg2dc9 281 maccn normalbeg2dc9
282 282
283rcontloop9: 283rcontloop9:
284 maccn normalbeg2dc10 284 maccn normalbeg2dc10
285 285
286rcontloop10: 286rcontloop10:
287 maccn short normalbeg2dc11 287 maccn short normalbeg2dc11
288 288
289rcontloop11: 289rcontloop11:
290 maccn short normalbeg2dc12 290 maccn short normalbeg2dc12
291 291
292rcontloop12: 292rcontloop12:
293 maccn short normalbeg2dc13 293 maccn short normalbeg2dc13
294 294
295rcontloop13: 295rcontloop13:
296 maccn short normalbeg2dc14 296 maccn short normalbeg2dc14
297 297
298rcontloop14: 298rcontloop14:
299 maccn short normalbeg2dc15 299 maccn short normalbeg2dc15
300 300
; last step of the unrolled group: advance along the chain only; the compare
; is done at do16 (or normalbeg0) on the next pass
301rcontloop15: 301rcontloop15:
302 and eax,7fffh 302 and eax,7fffh
303 mov ax,word ptr[esi+eax*2] 303 mov ax,word ptr[esi+eax*2]
304 cmp ecx,eax 304 cmp ecx,eax
305 jnb exitloop 305 jnb exitloop
306 306
; charge the next group of 16; continue unrolled while the budget stays above zero
307 sub chain_length,16 307 sub chain_length,16
308 ja do16 308 ja do16
309 jmp normalbeg0add16 309 jmp normalbeg0add16
310 310
311;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 311;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
312 312
; normbeg rcontlab,valsub: landing pad for a scan_end hit at unrolled step
; "valsub". If the first word of the candidate differs from scan_start it
; resumes the unrolled loop at rcontlab; otherwise it gives back the part of
; the pre-charged 16 steps that was not consumed and jumps to iseq.
313normbeg MACRO rcontlab,valsub 313normbeg MACRO rcontlab,valsub
314; if we are here, we know that *(match+best_len-1) == scan_end 314; if we are here, we know that *(match+best_len-1) == scan_end
315 cmp bp,word ptr[edx+eax] 315 cmp bp,word ptr[edx+eax]
316; if (match != scan_start) goto rcontlab 316; if (match != scan_start) goto rcontlab
317 jne rcontlab 317 jne rcontlab
318; calculate the good chain_length, and we'll compare scan and match string 318; calculate the good chain_length, and we'll compare scan and match string
319 add chain_length,16-valsub 319 add chain_length,16-valsub
320 jmp iseq 320 jmp iseq
321 ENDM 321 ENDM
322 322
323 323
324normalbeg2dc11: 324normalbeg2dc11:
325 normbeg rcontloop11,11 325 normbeg rcontloop11,11
326 326
327normalbeg2dc12: 327normalbeg2dc12:
328 normbeg short rcontloop12,12 328 normbeg short rcontloop12,12
329 329
330normalbeg2dc13: 330normalbeg2dc13:
331 normbeg short rcontloop13,13 331 normbeg short rcontloop13,13
332 332
333normalbeg2dc14: 333normalbeg2dc14:
334 normbeg short rcontloop14,14 334 normbeg short rcontloop14,14
335 335
336normalbeg2dc15: 336normalbeg2dc15:
337 normbeg short rcontloop15,15 337 normbeg short rcontloop15,15
338 338
339normalbeg2dc10: 339normalbeg2dc10:
340 normbeg rcontloop10,10 340 normbeg rcontloop10,10
341 341
342normalbeg2dc9: 342normalbeg2dc9:
343 normbeg rcontloop9,9 343 normbeg rcontloop9,9
344 344
345normalbeg2dc8: 345normalbeg2dc8:
346 normbeg rcontloop8,8 346 normbeg rcontloop8,8
347 347
348normalbeg2dc7: 348normalbeg2dc7:
349 normbeg rcontloop7,7 349 normbeg rcontloop7,7
350 350
351normalbeg2dc6: 351normalbeg2dc6:
352 normbeg rcontloop6,6 352 normbeg rcontloop6,6
353 353
354normalbeg2dc5: 354normalbeg2dc5:
355 normbeg rcontloop5,5 355 normbeg rcontloop5,5
356 356
357normalbeg2dc4: 357normalbeg2dc4:
358 normbeg rcontloop4,4 358 normbeg rcontloop4,4
359 359
360normalbeg2dc3: 360normalbeg2dc3:
361 normbeg rcontloop3,3 361 normbeg rcontloop3,3
362 362
363normalbeg2dc2: 363normalbeg2dc2:
364 normbeg rcontloop2,2 364 normbeg rcontloop2,2
365 365
366normalbeg2dc1: 366normalbeg2dc1:
367 normbeg rcontloop1,1 367 normbeg rcontloop1,1
368 368
369normalbeg2dc0: 369normalbeg2dc0:
370 normbeg rcontloop0,0 370 normbeg rcontloop0,0
371 371
372 372
373; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end 373; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
374 374
; entry from the rolled loop (normalbeg2noroll); edi is reloaded with window
; and windowlen is restored later at contloop3
375normalbeg2: 375normalbeg2:
376 mov edi,window 376 mov edi,window
377 377
378 cmp bp,word ptr[edi+eax] 378 cmp bp,word ptr[edi+eax]
379 jne contloop3 ; if *(ushf*)match != scan_start, continue 379 jne contloop3 ; if *(ushf*)match != scan_start, continue
380 380
; iseq: full comparison of the candidate (window+cur_match) against scan.
; On entry eax = cur_match and edx = window. The code computes the match
; length in esi, updates best_len/match_start/windowlen/scan_end when the
; candidate is longer, and rejoins the chain walk at contloop3 (or leaves
; through exitloop once best_len >= nice_match). edx, ecx, esi and edi are
; used as scratch here and reloaded before jumping back.
381iseq: 381iseq:
382; if we are here, we know that *(match+best_len-1) == scan_end 382; if we are here, we know that *(match+best_len-1) == scan_end
383; and (match == scan_start) 383; and (match == scan_start)
384 384
385 mov edi,edx 385 mov edi,edx
386 mov esi,scan ; esi = scan 386 mov esi,scan ; esi = scan
387 add edi,eax ; edi = window + cur_match = match 387 add edi,eax ; edi = window + cur_match = match
388 388
; edx = (dword at scan+3) xor (dword at match+3): zero means bytes 3..6 are equal
389 mov edx,[esi+3] ; compare manually dword at match+3 389 mov edx,[esi+3] ; compare manually dword at match+3
390 xor edx,[edi+3] ; and scan +3 390 xor edx,[edi+3] ; and scan +3
391 391
392 jz begincompare ; if equal, go to long compare 392 jz begincompare ; if equal, go to long compare
393 393
394; we will determine the unmatch byte and calculate len (in esi) 394; we will determine the unmatch byte and calculate len (in esi)
; lowest non-zero byte of edx marks the first differing offset (3, 4, 5 or 6)
395 or dl,dl 395 or dl,dl
396 je eq1rr 396 je eq1rr
397 mov esi,3 397 mov esi,3
398 jmp trfinval 398 jmp trfinval
399eq1rr: 399eq1rr:
400 or dx,dx 400 or dx,dx
401 je eq1 401 je eq1
402 402
403 mov esi,4 403 mov esi,4
404 jmp trfinval 404 jmp trfinval
405eq1: 405eq1:
406 and edx,0ffffffh 406 and edx,0ffffffh
407 jz eq11 407 jz eq11
408 mov esi,5 408 mov esi,5
409 jmp trfinval 409 jmp trfinval
410eq11: 410eq11:
411 mov esi,6 411 mov esi,6
412 jmp trfinval 412 jmp trfinval
413 413
414begincompare: 414begincompare:
415 ; here we know scan and match begin same 415 ; here we know scan and match begin same
; continue at offset 6, comparing up to (MAX_MATCH-6)/4 dwords with repe cmpsd
416 add edi,6 416 add edi,6
417 add esi,6 417 add esi,6
418 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes 418 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes
419 repe cmpsd ; loop until mismatch 419 repe cmpsd ; loop until mismatch
420 420
421 je trfin ; go to trfin if no mismatch was found 421 je trfin ; go to trfin if no mismatch was found
422; we determine the unmatch byte 422; we determine the unmatch byte
; cmpsd has already stepped past the differing dword: back esi up onto it,
; then advance one byte for each leading byte that still matches
423 sub esi,4 423 sub esi,4
424 mov edx,[edi-4] 424 mov edx,[edi-4]
425 xor edx,[esi] 425 xor edx,[esi]
426 426
427 or dl,dl 427 or dl,dl
428 jnz trfin 428 jnz trfin
429 inc esi 429 inc esi
430 430
431 or dx,dx 431 or dx,dx
432 jnz trfin 432 jnz trfin
433 inc esi 433 inc esi
434 434
435 and edx,0ffffffh 435 and edx,0ffffffh
436 jnz trfin 436 jnz trfin
437 inc esi 437 inc esi
438 438
439trfin: 439trfin:
440 sub esi,scan ; esi = len 440 sub esi,scan ; esi = len
441trfinval: 441trfinval:
442; here we have finished the compare, and esi contains the length of the equal string 442; here we have finished the compare, and esi contains the length of the equal string
443 cmp esi,best_len ; if len > best_len, go newbestlen 443 cmp esi,best_len ; if len > best_len, go newbestlen
444 ja short newbestlen 444 ja short newbestlen
445; now we restore edx, ecx and esi, for the big loop 445; now we restore edx, ecx and esi, for the big loop
446 mov esi,prev 446 mov esi,prev
447 mov ecx,limit 447 mov ecx,limit
448 mov edx,window 448 mov edx,window
449 jmp contloop3 449 jmp contloop3
450 450
451newbestlen: 451newbestlen:
452 mov best_len,esi ; len become best_len 452 mov best_len,esi ; len become best_len
453 453
454 mov match_start,eax ; save new position as match_start 454 mov match_start,eax ; save new position as match_start
455 cmp esi,nice_match ; if best_len >= nice_match, exit 455 cmp esi,nice_match ; if best_len >= nice_match, exit
456 jae exitloop 456 jae exitloop
; recompute the values that depend on best_len: windowlen and scan_end (bx)
457 mov ecx,scan 457 mov ecx,scan
458 mov edx,window ; restore edx=window 458 mov edx,window ; restore edx=window
459 add ecx,esi 459 add ecx,esi
460 add esi,edx 460 add esi,edx
461 461
462 dec esi 462 dec esi
463 mov windowlen,esi ; windowlen = window + best_len-1 463 mov windowlen,esi ; windowlen = window + best_len-1
464 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end 464 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end
465 465
466; now we restore ecx and esi, for the big loop : 466; now we restore ecx and esi, for the big loop :
467 mov esi,prev 467 mov esi,prev
468 mov ecx,limit 468 mov ecx,limit
469 jmp contloop3 469 jmp contloop3
470 470
; exitloop: common exit. Stores the local match_start back into the
; deflate_state, returns min(best_len, s->lookahead) in eax, then releases
; the locals and restores ebx/esi/edi/ebp in reverse push order.
471exitloop: 471exitloop:
472; exit : s->match_start=match_start 472; exit : s->match_start=match_start
473 mov ebx,match_start 473 mov ebx,match_start
; ebp is reloaded with s from the stack argument (its low word held scan_start during the loop)
474 mov ebp,str_s 474 mov ebp,str_s
475 mov ecx,best_len 475 mov ecx,best_len
476 mov dword ptr [ebp+dep_match_start],ebx 476 mov dword ptr [ebp+dep_match_start],ebx
477 mov eax,dword ptr [ebp+dep_lookahead] 477 mov eax,dword ptr [ebp+dep_lookahead]
; eax = lookahead; keep it when best_len > lookahead (unsigned), else return best_len
478 cmp ecx,eax 478 cmp ecx,eax
479 ja minexlo 479 ja minexlo
480 mov eax,ecx 480 mov eax,ecx
481minexlo: 481minexlo:
482; return min(best_len,s->lookahead) 482; return min(best_len,s->lookahead)
483 483
484; restore stack and register ebx,esi,edi,ebp 484; restore stack and register ebx,esi,edi,ebp
485 add esp,NbStackAdd 485 add esp,NbStackAdd
486 486
487 pop ebx 487 pop ebx
488 pop esi 488 pop esi
489 pop edi 489 pop edi
490 pop ebp 490 pop ebp
491 ret 491 ret
; The bytes below are data placed after the ret; control never falls into them.
492InfoAuthor: 492InfoAuthor:
493; please don't remove this string ! 493; please don't remove this string !
494; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary! 494; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary!
495 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah 495 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
496 496
497 497
498 498
499IFDEF NOUNDERLINE 499IFDEF NOUNDERLINE
500longest_match_7fff endp 500longest_match_7fff endp
501ELSE 501ELSE
502_longest_match_7fff endp 502_longest_match_7fff endp
503ENDIF 503ENDIF
504 504
505 505
; cpudetect32 -- classify the CPU by probing EFLAGS bits, then CPUID.
; Takes no arguments. Result in eax:
;   0300h  the AC bit (40000h) of EFLAGS cannot be toggled  (treated as 80386)
;   0400h  the ID bit (200000h) of EFLAGS cannot be toggled (treated as old 486)
;   otherwise the eax value returned by CPUID executed with eax = 1
; ebx is saved and restored; ecx is used as scratch. CPUID also overwrites
; ebx, ecx and edx. EFLAGS is put back to its original value on the paths
; where a bit was successfully flipped.
; NOTE(review): no PUBLIC declaration for this symbol is visible in this part
; of the file -- confirm it is exported if it is called from C.
506IFDEF NOUNDERLINE 506IFDEF NOUNDERLINE
507cpudetect32 proc near 507cpudetect32 proc near
508ELSE 508ELSE
509_cpudetect32 proc near 509_cpudetect32 proc near
510ENDIF 510ENDIF
511 511
512 push ebx 512 push ebx
513 513
514 pushfd ; push original EFLAGS 514 pushfd ; push original EFLAGS
515 pop eax ; get original EFLAGS 515 pop eax ; get original EFLAGS
516 mov ecx, eax ; save original EFLAGS 516 mov ecx, eax ; save original EFLAGS
517 xor eax, 40000h ; flip AC bit in EFLAGS 517 xor eax, 40000h ; flip AC bit in EFLAGS
518 push eax ; save new EFLAGS value on stack 518 push eax ; save new EFLAGS value on stack
519 popfd ; replace current EFLAGS value 519 popfd ; replace current EFLAGS value
520 pushfd ; get new EFLAGS 520 pushfd ; get new EFLAGS
521 pop eax ; store new EFLAGS in EAX 521 pop eax ; store new EFLAGS in EAX
522 xor eax, ecx ; cannot toggle AC bit, processor=80386 522 xor eax, ecx ; cannot toggle AC bit, processor=80386
523 jz end_cpu_is_386 ; jump if 80386 processor 523 jz end_cpu_is_386 ; jump if 80386 processor
524 push ecx 524 push ecx
525 popfd ; restore AC bit in EFLAGS first 525 popfd ; restore AC bit in EFLAGS first
526 526
; two copies of EFLAGS are pushed: one is popped into ecx right away, the
; other stays on the stack and is consumed by the "popfd ; restore original
; EFLAGS" below
527 pushfd 527 pushfd
528 pushfd 528 pushfd
529 pop ecx 529 pop ecx
530 530
531 mov eax, ecx ; get original EFLAGS 531 mov eax, ecx ; get original EFLAGS
532 xor eax, 200000h ; flip ID bit in EFLAGS 532 xor eax, 200000h ; flip ID bit in EFLAGS
533 push eax ; save new EFLAGS value on stack 533 push eax ; save new EFLAGS value on stack
534 popfd ; replace current EFLAGS value 534 popfd ; replace current EFLAGS value
535 pushfd ; get new EFLAGS 535 pushfd ; get new EFLAGS
536 pop eax ; store new EFLAGS in EAX 536 pop eax ; store new EFLAGS in EAX
537 popfd ; restore original EFLAGS 537 popfd ; restore original EFLAGS
538 xor eax, ecx ; cannot toggle ID bit, 538 xor eax, ecx ; cannot toggle ID bit,
539 je is_old_486 ; processor=old 539 je is_old_486 ; processor=old
540 540
; CPUID is emitted as raw opcode bytes (0Fh 0A2h) rather than by mnemonic
541 mov eax,1 541 mov eax,1
542 db 0fh,0a2h ;CPUID 542 db 0fh,0a2h ;CPUID
543 543
544exitcpudetect: 544exitcpudetect:
545 pop ebx 545 pop ebx
546 ret 546 ret
547 547
548end_cpu_is_386: 548end_cpu_is_386:
549 mov eax,0300h 549 mov eax,0300h
550 jmp exitcpudetect 550 jmp exitcpudetect
551 551
552is_old_486: 552is_old_486:
553 mov eax,0400h 553 mov eax,0400h
554 jmp exitcpudetect 554 jmp exitcpudetect
555 555
556IFDEF NOUNDERLINE 556IFDEF NOUNDERLINE
557cpudetect32 endp 557cpudetect32 endp
558ELSE 558ELSE
559_cpudetect32 endp 559_cpudetect32 endp
560ENDIF 560ENDIF
561 561
562 562
563 563
564 564
; Constants, stack-frame offsets and deflate_state field offsets used by
; longest_match_686.
; NOTE(review): (MAX_MATCH + 7) AND 0FFF0h evaluates to 256; rounding 258 up
; to a multiple of 8 would need the mask 0FFF8h (giving 264). Check how
; MAX_MATCH_8_ is used before relying on it.
565MAX_MATCH equ 258 565MAX_MATCH equ 258
566MIN_MATCH equ 3 566MIN_MATCH equ 3
567MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) 567MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
568MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) 568MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
569 569
570 570
571;;; stack frame offsets 571;;; stack frame offsets
; These expand to bare address expressions and are used inside brackets,
; e.g. [window]. Offsets are relative to esp after "sub esp, LocalVarsSize".
; NOTE(review): "window" and "scan" are also defined earlier in this file with
; different expansions -- confirm the assembler accepts the redefinition.
572 572
573chainlenwmask equ esp + 0 ; high word: current chain len 573chainlenwmask equ esp + 0 ; high word: current chain len
574 ; low word: s->wmask 574 ; low word: s->wmask
575window equ esp + 4 ; local copy of s->window 575window equ esp + 4 ; local copy of s->window
576windowbestlen equ esp + 8 ; s->window + bestlen 576windowbestlen equ esp + 8 ; s->window + bestlen
577scanstart equ esp + 16 ; first two bytes of string 577scanstart equ esp + 16 ; first two bytes of string
578scanend equ esp + 12 ; last two bytes of string 578scanend equ esp + 12 ; last two bytes of string
579scanalign equ esp + 20 ; dword-misalignment of string 579scanalign equ esp + 20 ; dword-misalignment of string
580nicematch equ esp + 24 ; a good enough match size 580nicematch equ esp + 24 ; a good enough match size
581bestlen equ esp + 28 ; size of best match so far 581bestlen equ esp + 28 ; size of best match so far
582scan equ esp + 32 ; ptr to string wanting match 582scan equ esp + 32 ; ptr to string wanting match
583 583
584LocalVarsSize equ 36 584LocalVarsSize equ 36
585; saved ebx byte esp + 36 585; saved ebx byte esp + 36
586; saved edi byte esp + 40 586; saved edi byte esp + 40
587; saved esi byte esp + 44 587; saved esi byte esp + 44
588; saved ebp byte esp + 48 588; saved ebp byte esp + 48
589; return address byte esp + 52 589; return address byte esp + 52
590deflatestate equ esp + 56 ; the function arguments 590deflatestate equ esp + 56 ; the function arguments
591curmatch equ esp + 60 591curmatch equ esp + 60
592 592
593;;; Offsets for fields in the deflate_state structure. These numbers 593;;; Offsets for fields in the deflate_state structure. These numbers
594;;; are calculated from the definition of deflate_state, with the 594;;; are calculated from the definition of deflate_state, with the
595;;; assumption that the compiler will dword-align the fields. (Thus, 595;;; assumption that the compiler will dword-align the fields. (Thus,
596;;; changing the definition of deflate_state could easily cause this 596;;; changing the definition of deflate_state could easily cause this
597;;; program to crash horribly, without so much as a warning at 597;;; program to crash horribly, without so much as a warning at
598;;; compile time. Sigh.) 598;;; compile time. Sigh.)
599 599
600dsWSize equ 36 600dsWSize equ 36
601dsWMask equ 44 601dsWMask equ 44
602dsWindow equ 48 602dsWindow equ 48
603dsPrev equ 56 603dsPrev equ 56
604dsMatchLen equ 88 604dsMatchLen equ 88
605dsPrevMatch equ 92 605dsPrevMatch equ 92
606dsStrStart equ 100 606dsStrStart equ 100
607dsMatchStart equ 104 607dsMatchStart equ 104
608dsLookahead equ 108 608dsLookahead equ 108
609dsPrevLen equ 112 609dsPrevLen equ 112
610dsMaxChainLen equ 116 610dsMaxChainLen equ 116
611dsGoodMatch equ 132 611dsGoodMatch equ 132
612dsNiceMatch equ 136 612dsNiceMatch equ 136
613 613
614 614
615;;; match.asm -- Pentium-Pro-optimized version of longest_match() 615;;; match.asm -- Pentium-Pro-optimized version of longest_match()
616;;; Written for zlib 1.1.2 616;;; Written for zlib 1.1.2
617;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 617;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
618;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html 618;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
619;;; 619;;;
620;;; This is free software; you can redistribute it and/or modify it 620;;; This is free software; you can redistribute it and/or modify it
621;;; under the terms of the GNU General Public License. 621;;; under the terms of the GNU General Public License.
622 622
623;GLOBAL _longest_match, _match_init 623;GLOBAL _longest_match, _match_init
624 624
625 625
626;SECTION .text 626;SECTION .text
627 627
628;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 628;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
629 629
630;_longest_match: 630;_longest_match:
631IFDEF NOUNDERLINE 631IFDEF NOUNDERLINE
632longest_match_686 proc near 632longest_match_686 proc near
633ELSE 633ELSE
634_longest_match_686 proc near 634_longest_match_686 proc near
635ENDIF 635ENDIF
636 636
637 637
638;;; Entry: save ebp, edi, esi and ebx (popped again before ret), and lower 638;;; Entry: save ebp, edi, esi and ebx (popped again before ret), and lower
639;;; esp by LocalVarsSize to make room for our stack frame. 639;;; esp by LocalVarsSize to make room for our stack frame.
640 640
641 push ebp 641 push ebp
642 push edi 642 push edi
643 push esi 643 push esi
644 push ebx 644 push ebx
645 sub esp, LocalVarsSize 645 sub esp, LocalVarsSize
646 646
647;;; Retrieve the function arguments. ecx will hold cur_match 647;;; Retrieve the function arguments. ecx will hold cur_match
648;;; throughout the entire function. edx will hold the pointer to the 648;;; throughout the entire function. edx will hold the pointer to the
649;;; deflate_state structure during the function's setup (before 649;;; deflate_state structure during the function's setup (before
650;;; entering the main loop). 650;;; entering the main loop).
651 651
652 mov edx, [deflatestate] 652 mov edx, [deflatestate]
653 mov ecx, [curmatch] 653 mov ecx, [curmatch]
654 654
655;;; uInt wmask = s->w_mask; 655;;; uInt wmask = s->w_mask;
656;;; unsigned chain_length = s->max_chain_length; 656;;; unsigned chain_length = s->max_chain_length;
657;;; if (s->prev_length >= s->good_match) { 657;;; if (s->prev_length >= s->good_match) {
658;;; chain_length >>= 2; 658;;; chain_length >>= 2;
659;;; } 659;;; }
660 660
661 mov eax, [edx + dsPrevLen] 661 mov eax, [edx + dsPrevLen]
662 mov ebx, [edx + dsGoodMatch] 662 mov ebx, [edx + dsGoodMatch]
663 cmp eax, ebx 663 cmp eax, ebx
664 mov eax, [edx + dsWMask] 664 mov eax, [edx + dsWMask]
665 mov ebx, [edx + dsMaxChainLen] 665 mov ebx, [edx + dsMaxChainLen]
666 jl LastMatchGood 666 jl LastMatchGood
667 shr ebx, 2 667 shr ebx, 2
668LastMatchGood: 668LastMatchGood:
669 669
670;;; chainlen is decremented once beforehand so that the function can 670;;; chainlen is decremented once beforehand so that the function can
671;;; use the sign flag instead of the zero flag for the exit test. 671;;; use the sign flag instead of the zero flag for the exit test.
672;;; It is then shifted into the high word, to make room for the wmask 672;;; It is then shifted into the high word, to make room for the wmask
673;;; value, which it will always accompany. 673;;; value, which it will always accompany.
674 674
675 dec ebx 675 dec ebx
676 shl ebx, 16 676 shl ebx, 16
677 or ebx, eax 677 or ebx, eax
678 mov [chainlenwmask], ebx 678 mov [chainlenwmask], ebx
679 679
680;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; (NOTE(review): the jl below compares as signed) 680;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; (NOTE(review): the jl below compares as signed)
681 681
682 mov eax, [edx + dsNiceMatch] 682 mov eax, [edx + dsNiceMatch]
683 mov ebx, [edx + dsLookahead] 683 mov ebx, [edx + dsLookahead]
684 cmp ebx, eax 684 cmp ebx, eax
685 jl LookaheadLess 685 jl LookaheadLess
686 mov ebx, eax 686 mov ebx, eax
687LookaheadLess: mov [nicematch], ebx 687LookaheadLess: mov [nicematch], ebx
688 688
689;;; register Bytef *scan = s->window + s->strstart; 689;;; register Bytef *scan = s->window + s->strstart;
690 690
691 mov esi, [edx + dsWindow] 691 mov esi, [edx + dsWindow]
692 mov [window], esi 692 mov [window], esi
693 mov ebp, [edx + dsStrStart] 693 mov ebp, [edx + dsStrStart]
694 lea edi, [esi + ebp] 694 lea edi, [esi + ebp]
695 mov [scan], edi 695 mov [scan], edi
696 696
697;;; Determine how many bytes the scan ptr is off from being 697;;; Determine how many bytes the scan ptr is off from being
698;;; dword-aligned: scanalign = (-scan) & 3. 698;;; dword-aligned: scanalign = (-scan) & 3.
699 699
700 mov eax, edi 700 mov eax, edi
701 neg eax 701 neg eax
702 and eax, 3 702 and eax, 3
703 mov [scanalign], eax 703 mov [scanalign], eax
704 704
705;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 705;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
706;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 706;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
707 707
708 mov eax, [edx + dsWSize] 708 mov eax, [edx + dsWSize]
709 sub eax, MIN_LOOKAHEAD 709 sub eax, MIN_LOOKAHEAD
710 sub ebp, eax 710 sub ebp, eax
711 jg LimitPositive 711 jg LimitPositive
712 xor ebp, ebp 712 xor ebp, ebp
713LimitPositive: 713LimitPositive:
714 714
715;;; int best_len = s->prev_length; 715;;; int best_len = s->prev_length;
716 716
717 mov eax, [edx + dsPrevLen] 717 mov eax, [edx + dsPrevLen]
718 mov [bestlen], eax 718 mov [bestlen], eax
719 719
720;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi. 720;;; Store the sum of s->window + best_len in windowbestlen locally, and in esi.
721 721
722 add esi, eax 722 add esi, eax
723 mov [windowbestlen], esi 723 mov [windowbestlen], esi
724 724
725;;; register ush scan_start = *(ushf*)scan; 725;;; register ush scan_start = *(ushf*)scan;
726;;; register ush scan_end = *(ushf*)(scan+best_len-1); 726;;; register ush scan_end = *(ushf*)(scan+best_len-1);
727;;; Posf *prev = s->prev; 727;;; Posf *prev = s->prev;
728 728
729 movzx ebx, word ptr [edi] 729 movzx ebx, word ptr [edi]
730 mov [scanstart], ebx 730 mov [scanstart], ebx
731 movzx ebx, word ptr [edi + eax - 1] 731 movzx ebx, word ptr [edi + eax - 1]
732 mov [scanend], ebx 732 mov [scanend], ebx
733 mov edi, [edx + dsPrev] 733 mov edi, [edx + dsPrev]
734 734
735;;; Jump into the main loop. 735;;; Jump into the main loop.
736 736
737 mov edx, [chainlenwmask] 737 mov edx, [chainlenwmask]
738 jmp short LoopEntry 738 jmp short LoopEntry
739 739
740align 4 740align 4
741 741
742;;; do { 742;;; do {
743;;; match = s->window + cur_match; 743;;; match = s->window + cur_match;
744;;; if (*(ushf*)(match+best_len-1) != scan_end || 744;;; if (*(ushf*)(match+best_len-1) != scan_end ||
745;;; *(ushf*)match != scan_start) continue; 745;;; *(ushf*)match != scan_start) continue;
746;;; [...] 746;;; [...]
747;;; } while ((cur_match = prev[cur_match & wmask]) > limit 747;;; } while ((cur_match = prev[cur_match & wmask]) > limit
748;;; && --chain_length != 0); 748;;; && --chain_length != 0);
749;;; 749;;;
750;;; Here is the inner loop of the function. The function will spend the 750;;; Here is the inner loop of the function. The function will spend the
751;;; majority of its time in this loop, and majority of that time will 751;;; majority of its time in this loop, and majority of that time will
752;;; be spent in the first ten instructions. 752;;; be spent in the first ten instructions.
753;;; 753;;;
754;;; Within this loop: 754;;; Within this loop:
755;;; ebx = scanend 755;;; ebx = scanend
756;;; ecx = curmatch 756;;; ecx = curmatch
757;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 757;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
758;;; esi = windowbestlen - i.e., (window + bestlen) 758;;; esi = windowbestlen - i.e., (window + bestlen)
759;;; edi = prev 759;;; edi = prev
760;;; ebp = limit 760;;; ebp = limit
761 761
762LookupLoop: 762LookupLoop:
763 and ecx, edx 763 and ecx, edx
764 movzx ecx, word ptr [edi + ecx*2] 764 movzx ecx, word ptr [edi + ecx*2]
765 cmp ecx, ebp 765 cmp ecx, ebp
766 jbe LeaveNow 766 jbe LeaveNow
767 sub edx, 00010000h 767 sub edx, 00010000h
768 js LeaveNow 768 js LeaveNow
769LoopEntry: movzx eax, word ptr [esi + ecx - 1] 769LoopEntry: movzx eax, word ptr [esi + ecx - 1]
770 cmp eax, ebx 770 cmp eax, ebx
771 jnz LookupLoop 771 jnz LookupLoop
772 mov eax, [window] 772 mov eax, [window]
773 movzx eax, word ptr [eax + ecx] 773 movzx eax, word ptr [eax + ecx]
774 cmp eax, [scanstart] 774 cmp eax, [scanstart]
775 jnz LookupLoop 775 jnz LookupLoop
776 776
777;;; Store the current value of chainlen (edx, with wmask in its low word). 777;;; Store the current value of chainlen (edx, with wmask in its low word).
778 778
779 mov [chainlenwmask], edx 779 mov [chainlenwmask], edx
780 780
781;;; Point edi to the string under scrutiny, and esi to the string we 781;;; Point edi to the string under scrutiny, and esi to the string we
782;;; are hoping to match it up with. In actuality, esi and edi are 782;;; are hoping to match it up with. In actuality, esi and edi are
783;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is 783;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is
784;;; initialized to -(MAX_MATCH_8). 784;;; initialized to -(MAX_MATCH_8).
785 785
786 mov esi, [window] 786 mov esi, [window]
787 mov edi, [scan] 787 mov edi, [scan]
788 add esi, ecx 788 add esi, ecx
789 mov eax, [scanalign] 789 mov eax, [scanalign]
790 mov edx, 0fffffef8h; -(MAX_MATCH_8) 790 mov edx, 0fffffef8h; -(MAX_MATCH_8)
791 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] 791 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
792 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] 792 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
793 793
794;;; Test the strings for equality, 8 bytes at a time. At the end, 794;;; Test the strings for equality, 8 bytes at a time. At the end,
795;;; adjust edx so that it is offset to the exact byte that mismatched. 795;;; adjust edx so that it is offset to the exact byte that mismatched.
796;;; 796;;;
797;;; We already know at this point that the first three bytes of the 797;;; We already know at this point that the first three bytes of the
798;;; strings match each other, and they can be safely passed over before 798;;; strings match each other, and they can be safely passed over before
799;;; starting the compare loop. So what this code does is skip over 0-3 799;;; starting the compare loop. So what this code does is skip over 0-3
800;;; bytes, as much as necessary in order to dword-align the edi 800;;; bytes, as much as necessary in order to dword-align the edi
801;;; pointer. (esi will still be misaligned three times out of four.) 801;;; pointer. (esi will still be misaligned three times out of four.)
802;;; 802;;;
803;;; It should be confessed that this loop usually does not represent 803;;; It should be confessed that this loop usually does not represent
804;;; much of the total running time. Replacing it with a more 804;;; much of the total running time. Replacing it with a more
805;;; straightforward "rep cmpsb" would not drastically degrade 805;;; straightforward "rep cmpsb" would not drastically degrade
806;;; performance. 806;;; performance.
807 807
808LoopCmps: 808LoopCmps:
809 mov eax, [esi + edx] 809 mov eax, [esi + edx]
810 xor eax, [edi + edx] 810 xor eax, [edi + edx]
811 jnz LeaveLoopCmps 811 jnz LeaveLoopCmps
812 mov eax, [esi + edx + 4] 812 mov eax, [esi + edx + 4]
813 xor eax, [edi + edx + 4] 813 xor eax, [edi + edx + 4]
814 jnz LeaveLoopCmps4 814 jnz LeaveLoopCmps4
815 add edx, 8 815 add edx, 8
816 jnz LoopCmps 816 jnz LoopCmps
817 jmp short LenMaximum 817 jmp short LenMaximum
818LeaveLoopCmps4: add edx, 4 818LeaveLoopCmps4: add edx, 4
819LeaveLoopCmps: test eax, 0000FFFFh 819LeaveLoopCmps: test eax, 0000FFFFh
820 jnz LenLower 820 jnz LenLower
821 add edx, 2 821 add edx, 2
822 shr eax, 16 822 shr eax, 16
823LenLower: sub al, 1 823LenLower: sub al, 1
824 adc edx, 0 824 adc edx, 0
825 825
826;;; Calculate the length of the match. If it is MAX_MATCH or longer, 826;;; Calculate the length of the match. If it is MAX_MATCH or longer,
827;;; then automatically accept it as the best possible match and leave. 827;;; then automatically accept it as the best possible match and leave.
828 828
829 lea eax, [edi + edx] 829 lea eax, [edi + edx]
830 mov edi, [scan] 830 mov edi, [scan]
831 sub eax, edi 831 sub eax, edi
832 cmp eax, MAX_MATCH 832 cmp eax, MAX_MATCH
833 jge LenMaximum 833 jge LenMaximum
834 834
835;;; If the length of the match is not longer than the best match we 835;;; If the length of the match is not longer than the best match we
836;;; have so far, then forget it and return to the lookup loop. 836;;; have so far, then forget it and return to the lookup loop.
837 837
838 mov edx, [deflatestate] 838 mov edx, [deflatestate]
839 mov ebx, [bestlen] 839 mov ebx, [bestlen]
840 cmp eax, ebx 840 cmp eax, ebx
841 jg LongerMatch 841 jg LongerMatch
842 mov esi, [windowbestlen] 842 mov esi, [windowbestlen]
843 mov edi, [edx + dsPrev] 843 mov edi, [edx + dsPrev]
844 mov ebx, [scanend] 844 mov ebx, [scanend]
845 mov edx, [chainlenwmask] 845 mov edx, [chainlenwmask]
846 jmp LookupLoop 846 jmp LookupLoop
847 847
848;;; s->match_start = cur_match; 848;;; s->match_start = cur_match;
849;;; best_len = len; 849;;; best_len = len;
850;;; if (len >= nice_match) break; 850;;; if (len >= nice_match) break;
851;;; scan_end = *(ushf*)(scan+best_len-1); 851;;; scan_end = *(ushf*)(scan+best_len-1);
852 852
853LongerMatch: mov ebx, [nicematch] 853LongerMatch: mov ebx, [nicematch]
854 mov [bestlen], eax 854 mov [bestlen], eax
855 mov [edx + dsMatchStart], ecx 855 mov [edx + dsMatchStart], ecx
856 cmp eax, ebx 856 cmp eax, ebx
857 jge LeaveNow 857 jge LeaveNow
858 mov esi, [window] 858 mov esi, [window]
859 add esi, eax 859 add esi, eax
860 mov [windowbestlen], esi 860 mov [windowbestlen], esi
861 movzx ebx, word ptr [edi + eax - 1] 861 movzx ebx, word ptr [edi + eax - 1]
862 mov edi, [edx + dsPrev] 862 mov edi, [edx + dsPrev]
863 mov [scanend], ebx 863 mov [scanend], ebx
864 mov edx, [chainlenwmask] 864 mov edx, [chainlenwmask]
865 jmp LookupLoop 865 jmp LookupLoop
866 866
867;;; Accept the current string, with the maximum possible length. 867;;; Accept the current string, with the maximum possible length.
868 868
869LenMaximum: mov edx, [deflatestate] 869LenMaximum: mov edx, [deflatestate]
870 mov dword ptr [bestlen], MAX_MATCH 870 mov dword ptr [bestlen], MAX_MATCH
871 mov [edx + dsMatchStart], ecx 871 mov [edx + dsMatchStart], ecx
872 872
873;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; (NOTE(review): the jg below compares as signed) 873;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; (NOTE(review): the jg below compares as signed)
874;;; return s->lookahead; 874;;; return s->lookahead;
875 875
876LeaveNow: 876LeaveNow:
877 mov edx, [deflatestate] 877 mov edx, [deflatestate]
878 mov ebx, [bestlen] 878 mov ebx, [bestlen]
879 mov eax, [edx + dsLookahead] 879 mov eax, [edx + dsLookahead]
880 cmp ebx, eax 880 cmp ebx, eax
881 jg LookaheadRet 881 jg LookaheadRet
882 mov eax, ebx 882 mov eax, ebx
883LookaheadRet: 883LookaheadRet:
884 884
885;;; Restore the stack and return from whence we came (result in eax). 885;;; Restore the stack and return from whence we came (result in eax).
886 886
887 add esp, LocalVarsSize 887 add esp, LocalVarsSize
888 pop ebx 888 pop ebx
889 pop esi 889 pop esi
890 pop edi 890 pop edi
891 pop ebp 891 pop ebp
892 892
893 ret 893 ret
894; please don't remove this string ! 894; please don't remove this string !
895; You are free to use gvmat32 in any free or commercial apps if you don't remove the string in the binary! 895; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
896 db 0dh,0ah,"asm686 with masm, code optimised assembly code from Brian Raiter, written 1998",0dh,0ah 896 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
897 897
898IFDEF NOUNDERLINE 898IFDEF NOUNDERLINE
899longest_match_686 endp 899longest_match_686 endp
900ELSE 900ELSE
901_longest_match_686 endp 901_longest_match_686 endp
902ENDIF 902ENDIF
903 903
904_TEXT ends 904_TEXT ends
905end 905end