diff options
| author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:22:37 -0700 |
|---|---|---|
| committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:22:37 -0700 |
| commit | 4b5a43a219d51066c01ff2ab86af18b967f2d0dd (patch) | |
| tree | 4dcaf0cd18751d04cf638a9a6ec521990d4f2e90 /contrib/masm686 | |
| parent | 086e982175da84b3db958191031380794315f95f (diff) | |
| download | zlib-1.2.0.5.tar.gz zlib-1.2.0.5.tar.bz2 zlib-1.2.0.5.zip | |
zlib 1.2.0.5v1.2.0.5
Diffstat (limited to 'contrib/masm686')
| -rw-r--r-- | contrib/masm686/match.asm | 408 |
1 files changed, 408 insertions, 0 deletions
diff --git a/contrib/masm686/match.asm b/contrib/masm686/match.asm new file mode 100644 index 00000000..2287804d --- /dev/null +++ b/contrib/masm686/match.asm | |||
| @@ -0,0 +1,408 @@ | |||
| 1 | |||
| 2 | ; match.asm -- Pentium-Pro optimized version of longest_match() | ||
| 3 | ; | ||
| 4 | ; Updated for zlib 1.1.3 and converted to MASM 6.1x | ||
| 5 | ; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com> | ||
| 6 | ; and Chuck Walbourn <chuckw@kinesoft.com> | ||
| 7 | ; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro> | ||
| 8 | ; | ||
| 9 | ; This is free software; you can redistribute it and/or modify it | ||
| 10 | ; under the terms of the GNU General Public License. | ||
| 11 | |||
| 12 | ; Based on match.S | ||
| 13 | ; Written for zlib 1.1.2 | ||
| 14 | ; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> | ||
| 15 | |||
| 16 | .686P | ||
| 17 | .MODEL FLAT | ||
| 18 | |||
| 19 | ;=========================================================================== | ||
| 20 | ; EQUATES | ||
| 21 | ;=========================================================================== | ||
| 22 | |||
| 23 | MAX_MATCH EQU 258 | ||
| 24 | MIN_MATCH EQU 3 | ||
| 25 | MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) | ||
| 26 | MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) | ||
| 27 | |||
| 28 | ;=========================================================================== | ||
| 29 | ; STRUCTURES | ||
| 30 | ;=========================================================================== | ||
| 31 | |||
| 32 | ; This STRUCT assumes a 4-byte alignment | ||
| 33 | |||
| 34 | DEFLATE_STATE STRUCT | ||
| 35 | ds_strm dd ? | ||
| 36 | ds_status dd ? | ||
| 37 | ds_pending_buf dd ? | ||
| 38 | ds_pending_buf_size dd ? | ||
| 39 | ds_pending_out dd ? | ||
| 40 | ds_pending dd ? | ||
| 41 | ds_wrap dd ? | ||
| 42 | ds_data_type db ? | ||
| 43 | ds_method db ? | ||
| 44 | db ? ; padding | ||
| 45 | db ? ; padding | ||
| 46 | ds_last_flush dd ? | ||
| 47 | ds_w_size dd ? ; used | ||
| 48 | ds_w_bits dd ? | ||
| 49 | ds_w_mask dd ? ; used | ||
| 50 | ds_window dd ? ; used | ||
| 51 | ds_window_size dd ? | ||
| 52 | ds_prev dd ? ; used | ||
| 53 | ds_head dd ? | ||
| 54 | ds_ins_h dd ? | ||
| 55 | ds_hash_size dd ? | ||
| 56 | ds_hash_bits dd ? | ||
| 57 | ds_hash_mask dd ? | ||
| 58 | ds_hash_shift dd ? | ||
| 59 | ds_block_start dd ? | ||
| 60 | ds_match_length dd ? ; used | ||
| 61 | ds_prev_match dd ? ; used | ||
| 62 | ds_match_available dd ? | ||
| 63 | ds_strstart dd ? ; used | ||
| 64 | ds_match_start dd ? ; used | ||
| 65 | ds_lookahead dd ? ; used | ||
| 66 | ds_prev_length dd ? ; used | ||
| 67 | ds_max_chain_length dd ? ; used | ||
| 68 | ds_max_laxy_match dd ? | ||
| 69 | ds_level dd ? | ||
| 70 | ds_strategy dd ? | ||
| 71 | ds_good_match dd ? ; used | ||
| 72 | ds_nice_match dd ? ; used | ||
| 73 | |||
| 74 | ; Don't need anymore of the struct for match | ||
| 75 | DEFLATE_STATE ENDS | ||
| 76 | |||
| 77 | ;=========================================================================== | ||
| 78 | ; CODE | ||
| 79 | ;=========================================================================== | ||
| 80 | _TEXT SEGMENT | ||
| 81 | |||
| 82 | ;--------------------------------------------------------------------------- | ||
| 83 | ; match_init | ||
| 84 | ;--------------------------------------------------------------------------- | ||
| 85 | ALIGN 4 | ||
| 86 | PUBLIC _match_init | ||
| 87 | _match_init PROC | ||
| 88 | ; no initialization needed | ||
| 89 | ret | ||
| 90 | _match_init ENDP | ||
| 91 | |||
| 92 | ;--------------------------------------------------------------------------- | ||
| 93 | ; uInt longest_match(deflate_state *deflatestate, IPos curmatch) | ||
| 94 | ;--------------------------------------------------------------------------- | ||
| 95 | ALIGN 4 | ||
| 96 | |||
| 97 | PUBLIC _longest_match | ||
| 98 | _longest_match PROC | ||
| 99 | |||
| 100 | ; Since this code uses EBP for a scratch register, the stack frame must | ||
| 101 | ; be manually constructed and referenced relative to the ESP register. | ||
| 102 | |||
| 103 | ; Stack image | ||
| 104 | ; Variables | ||
| 105 | chainlenwmask = 0 ; high word: current chain len | ||
| 106 | ; low word: s->wmask | ||
| 107 | window = 4 ; local copy of s->window | ||
| 108 | windowbestlen = 8 ; s->window + bestlen | ||
| 109 | scanend = 12 ; last two bytes of string | ||
| 110 | scanstart = 16 ; first two bytes of string | ||
| 111 | scanalign = 20 ; dword-misalignment of string | ||
| 112 | nicematch = 24 ; a good enough match size | ||
| 113 | bestlen = 28 ; size of best match so far | ||
| 114 | scan = 32 ; ptr to string wanting match | ||
| 115 | varsize = 36 ; number of bytes (also offset to last saved register) | ||
| 116 | |||
| 117 | ; Saved Registers (actually pushed into place) | ||
| 118 | ebx_save = 36 | ||
| 119 | edi_save = 40 | ||
| 120 | esi_save = 44 | ||
| 121 | ebp_save = 48 | ||
| 122 | |||
| 123 | ; Parameters | ||
| 124 | retaddr = 52 | ||
| 125 | deflatestate = 56 | ||
| 126 | curmatch = 60 | ||
| 127 | |||
| 128 | ; Save registers that the compiler may be using | ||
| 129 | push ebp | ||
| 130 | push edi | ||
| 131 | push esi | ||
| 132 | push ebx | ||
| 133 | |||
| 134 | ; Allocate local variable space | ||
| 135 | sub esp,varsize | ||
| 136 | |||
| 137 | ; Retrieve the function arguments. ecx will hold cur_match | ||
| 138 | ; throughout the entire function. edx will hold the pointer to the | ||
| 139 | ; deflate_state structure during the function's setup (before | ||
| 140 | ; entering the main loop). | ||
| 141 | |||
| 142 | mov edx, [esp+deflatestate] | ||
| 143 | ASSUME edx:PTR DEFLATE_STATE | ||
| 144 | |||
| 145 | mov ecx, [esp+curmatch] | ||
| 146 | |||
| 147 | ; uInt wmask = s->w_mask; | ||
| 148 | ; unsigned chain_length = s->max_chain_length; | ||
| 149 | ; if (s->prev_length >= s->good_match) { | ||
| 150 | ; chain_length >>= 2; | ||
| 151 | ; } | ||
| 152 | |||
| 153 | mov eax, [edx].ds_prev_length | ||
| 154 | mov ebx, [edx].ds_good_match | ||
| 155 | cmp eax, ebx | ||
| 156 | mov eax, [edx].ds_w_mask | ||
| 157 | mov ebx, [edx].ds_max_chain_length | ||
| 158 | jl SHORT LastMatchGood | ||
| 159 | shr ebx, 2 | ||
| 160 | LastMatchGood: | ||
| 161 | |||
| 162 | ; chainlen is decremented once beforehand so that the function can | ||
| 163 | ; use the sign flag instead of the zero flag for the exit test. | ||
| 164 | ; It is then shifted into the high word, to make room for the wmask | ||
| 165 | ; value, which it will always accompany. | ||
| 166 | |||
| 167 | dec ebx | ||
| 168 | shl ebx, 16 | ||
| 169 | or ebx, eax | ||
| 170 | mov [esp+chainlenwmask], ebx | ||
| 171 | |||
| 172 | ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; | ||
| 173 | |||
| 174 | mov eax, [edx].ds_nice_match | ||
| 175 | mov ebx, [edx].ds_lookahead | ||
| 176 | cmp ebx, eax | ||
| 177 | jl SHORT LookaheadLess | ||
| 178 | mov ebx, eax | ||
| 179 | LookaheadLess: | ||
| 180 | mov [esp+nicematch], ebx | ||
| 181 | |||
| 182 | ;/* register Bytef *scan = s->window + s->strstart; */ | ||
| 183 | |||
| 184 | mov esi, [edx].ds_window | ||
| 185 | mov [esp+window], esi | ||
| 186 | mov ebp, [edx].ds_strstart | ||
| 187 | lea edi, [esi+ebp] | ||
| 188 | mov [esp+scan],edi | ||
| 189 | |||
| 190 | ;/* Determine how many bytes the scan ptr is off from being */ | ||
| 191 | ;/* dword-aligned. */ | ||
| 192 | |||
| 193 | mov eax, edi | ||
| 194 | neg eax | ||
| 195 | and eax, 3 | ||
| 196 | mov [esp+scanalign], eax | ||
| 197 | |||
| 198 | ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ | ||
| 199 | ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ | ||
| 200 | |||
| 201 | mov eax, [edx].ds_w_size | ||
| 202 | sub eax, MIN_LOOKAHEAD | ||
| 203 | sub ebp, eax | ||
| 204 | jg SHORT LimitPositive | ||
| 205 | xor ebp, ebp | ||
| 206 | LimitPositive: | ||
| 207 | |||
| 208 | ;/* int best_len = s->prev_length; */ | ||
| 209 | |||
| 210 | mov eax, [edx].ds_prev_length | ||
| 211 | mov [esp+bestlen], eax | ||
| 212 | |||
| 213 | ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ | ||
| 214 | |||
| 215 | add esi, eax | ||
| 216 | mov [esp+windowbestlen], esi | ||
| 217 | |||
| 218 | ;/* register ush scan_start = *(ushf*)scan; */ | ||
| 219 | ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ | ||
| 220 | ;/* Posf *prev = s->prev; */ | ||
| 221 | |||
| 222 | movzx ebx, WORD PTR[edi] | ||
| 223 | mov [esp+scanstart], ebx | ||
| 224 | movzx ebx, WORD PTR[eax+edi-1] | ||
| 225 | mov [esp+scanend], ebx | ||
| 226 | mov edi, [edx].ds_prev | ||
| 227 | |||
| 228 | ;/* Jump into the main loop. */ | ||
| 229 | |||
| 230 | mov edx, [esp+chainlenwmask] | ||
| 231 | jmp SHORT LoopEntry | ||
| 232 | |||
| 233 | ;/* do { | ||
| 234 | ; * match = s->window + cur_match; | ||
| 235 | ; * if (*(ushf*)(match+best_len-1) != scan_end || | ||
| 236 | ; * *(ushf*)match != scan_start) continue; | ||
| 237 | ; * [...] | ||
| 238 | ; * } while ((cur_match = prev[cur_match & wmask]) > limit | ||
| 239 | ; * && --chain_length != 0); | ||
| 240 | ; * | ||
| 241 | ; * Here is the inner loop of the function. The function will spend the | ||
| 242 | ; * majority of its time in this loop, and majority of that time will | ||
| 243 | ; * be spent in the first ten instructions. | ||
| 244 | ; * | ||
| 245 | ; * Within this loop: | ||
| 246 | ; * %ebx = scanend | ||
| 247 | ; * %ecx = curmatch | ||
| 248 | ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) | ||
| 249 | ; * %esi = windowbestlen - i.e., (window + bestlen) | ||
| 250 | ; * %edi = prev | ||
| 251 | ; * %ebp = limit | ||
| 252 | ; */ | ||
| 253 | |||
| 254 | ALIGN 4 | ||
| 255 | LookupLoop: | ||
| 256 | and ecx, edx | ||
| 257 | movzx ecx, WORD PTR[edi+ecx*2] | ||
| 258 | cmp ecx, ebp | ||
| 259 | jbe LeaveNow | ||
| 260 | sub edx, 000010000H | ||
| 261 | js LeaveNow | ||
| 262 | |||
| 263 | LoopEntry: | ||
| 264 | movzx eax, WORD PTR[esi+ecx-1] | ||
| 265 | cmp eax, ebx | ||
| 266 | jnz SHORT LookupLoop | ||
| 267 | |||
| 268 | mov eax, [esp+window] | ||
| 269 | movzx eax, WORD PTR[eax+ecx] | ||
| 270 | cmp eax, [esp+scanstart] | ||
| 271 | jnz SHORT LookupLoop | ||
| 272 | |||
| 273 | ;/* Store the current value of chainlen. */ | ||
| 274 | |||
| 275 | mov [esp+chainlenwmask], edx | ||
| 276 | |||
| 277 | ;/* Point %edi to the string under scrutiny, and %esi to the string we */ | ||
| 278 | ;/* are hoping to match it up with. In actuality, %esi and %edi are */ | ||
| 279 | ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ | ||
| 280 | ;/* initialized to -(MAX_MATCH_8 - scanalign). */ | ||
| 281 | |||
| 282 | mov esi, [esp+window] | ||
| 283 | mov edi, [esp+scan] | ||
| 284 | add esi, ecx | ||
| 285 | mov eax, [esp+scanalign] | ||
| 286 | mov edx, -MAX_MATCH_8 | ||
| 287 | lea edi, [edi+eax+MAX_MATCH_8] | ||
| 288 | lea esi, [esi+eax+MAX_MATCH_8] | ||
| 289 | |||
| 290 | ;/* Test the strings for equality, 8 bytes at a time. At the end, | ||
| 291 | ; * adjust %edx so that it is offset to the exact byte that mismatched. | ||
| 292 | ; * | ||
| 293 | ; * We already know at this point that the first three bytes of the | ||
| 294 | ; * strings match each other, and they can be safely passed over before | ||
| 295 | ; * starting the compare loop. So what this code does is skip over 0-3 | ||
| 296 | ; * bytes, as much as necessary in order to dword-align the %edi | ||
| 297 | ; * pointer. (%esi will still be misaligned three times out of four.) | ||
| 298 | ; * | ||
| 299 | ; * It should be confessed that this loop usually does not represent | ||
| 300 | ; * much of the total running time. Replacing it with a more | ||
| 301 | ; * straightforward "rep cmpsb" would not drastically degrade | ||
| 302 | ; * performance. | ||
| 303 | ; */ | ||
| 304 | |||
| 305 | LoopCmps: | ||
| 306 | mov eax, DWORD PTR[esi+edx] | ||
| 307 | xor eax, DWORD PTR[edi+edx] | ||
| 308 | jnz SHORT LeaveLoopCmps | ||
| 309 | |||
| 310 | mov eax, DWORD PTR[esi+edx+4] | ||
| 311 | xor eax, DWORD PTR[edi+edx+4] | ||
| 312 | jnz SHORT LeaveLoopCmps4 | ||
| 313 | |||
| 314 | add edx, 8 | ||
| 315 | jnz SHORT LoopCmps | ||
| 316 | jmp LenMaximum | ||
| 317 | ALIGN 4 | ||
| 318 | |||
| 319 | LeaveLoopCmps4: | ||
| 320 | add edx, 4 | ||
| 321 | |||
| 322 | LeaveLoopCmps: | ||
| 323 | test eax, 00000FFFFH | ||
| 324 | jnz SHORT LenLower | ||
| 325 | |||
| 326 | add edx, 2 | ||
| 327 | shr eax, 16 | ||
| 328 | |||
| 329 | LenLower: | ||
| 330 | sub al, 1 | ||
| 331 | adc edx, 0 | ||
| 332 | |||
| 333 | ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ | ||
| 334 | ;/* then automatically accept it as the best possible match and leave. */ | ||
| 335 | |||
| 336 | lea eax, [edi+edx] | ||
| 337 | mov edi, [esp+scan] | ||
| 338 | sub eax, edi | ||
| 339 | cmp eax, MAX_MATCH | ||
| 340 | jge SHORT LenMaximum | ||
| 341 | |||
| 342 | ;/* If the length of the match is not longer than the best match we */ | ||
| 343 | ;/* have so far, then forget it and return to the lookup loop. */ | ||
| 344 | |||
| 345 | mov edx, [esp+deflatestate] | ||
| 346 | mov ebx, [esp+bestlen] | ||
| 347 | cmp eax, ebx | ||
| 348 | jg SHORT LongerMatch | ||
| 349 | mov esi, [esp+windowbestlen] | ||
| 350 | mov edi, [edx].ds_prev | ||
| 351 | mov ebx, [esp+scanend] | ||
| 352 | mov edx, [esp+chainlenwmask] | ||
| 353 | jmp LookupLoop | ||
| 354 | ALIGN 4 | ||
| 355 | |||
| 356 | ;/* s->match_start = cur_match; */ | ||
| 357 | ;/* best_len = len; */ | ||
| 358 | ;/* if (len >= nice_match) break; */ | ||
| 359 | ;/* scan_end = *(ushf*)(scan+best_len-1); */ | ||
| 360 | |||
| 361 | LongerMatch: | ||
| 362 | mov ebx, [esp+nicematch] | ||
| 363 | mov [esp+bestlen], eax | ||
| 364 | mov [edx].ds_match_start, ecx | ||
| 365 | cmp eax, ebx | ||
| 366 | jge SHORT LeaveNow | ||
| 367 | mov esi, [esp+window] | ||
| 368 | add esi, eax | ||
| 369 | mov [esp+windowbestlen], esi | ||
| 370 | movzx ebx, WORD PTR[edi+eax-1] | ||
| 371 | mov edi, [edx].ds_prev | ||
| 372 | mov [esp+scanend], ebx | ||
| 373 | mov edx, [esp+chainlenwmask] | ||
| 374 | jmp LookupLoop | ||
| 375 | ALIGN 4 | ||
| 376 | |||
| 377 | ;/* Accept the current string, with the maximum possible length. */ | ||
| 378 | |||
| 379 | LenMaximum: | ||
| 380 | mov edx, [esp+deflatestate] | ||
| 381 | mov DWORD PTR[esp+bestlen], MAX_MATCH | ||
| 382 | mov [edx].ds_match_start, ecx | ||
| 383 | |||
| 384 | ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ | ||
| 385 | ;/* return s->lookahead; */ | ||
| 386 | |||
| 387 | LeaveNow: | ||
| 388 | mov edx, [esp+deflatestate] | ||
| 389 | mov ebx, [esp+bestlen] | ||
| 390 | mov eax, [edx].ds_lookahead | ||
| 391 | cmp ebx, eax | ||
| 392 | jg SHORT LookaheadRet | ||
| 393 | mov eax, ebx | ||
| 394 | LookaheadRet: | ||
| 395 | |||
| 396 | ; Restore the stack and return from whence we came. | ||
| 397 | |||
| 398 | add esp, varsize | ||
| 399 | pop ebx | ||
| 400 | pop esi | ||
| 401 | pop edi | ||
| 402 | pop ebp | ||
| 403 | ret | ||
| 404 | |||
| 405 | _longest_match ENDP | ||
| 406 | |||
| 407 | _TEXT ENDS | ||
| 408 | END | ||
