diff options
Diffstat (limited to 'Asm/x86/LzFindOpt.asm')
-rw-r--r-- | Asm/x86/LzFindOpt.asm | 513 |
1 files changed, 513 insertions, 0 deletions
diff --git a/Asm/x86/LzFindOpt.asm b/Asm/x86/LzFindOpt.asm new file mode 100644 index 0000000..42e10bd --- /dev/null +++ b/Asm/x86/LzFindOpt.asm | |||
@@ -0,0 +1,513 @@ | |||
1 | ; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function | ||
2 | ; 2021-07-21: Igor Pavlov : Public domain | ||
3 | ; | ||
4 | |||
5 | ifndef x64 | ||
6 | ; x64=1 | ||
7 | ; .err <x64_IS_REQUIRED> | ||
8 | endif | ||
9 | |||
10 | include 7zAsm.asm | ||
11 | |||
12 | MY_ASM_START | ||
13 | |||
14 | _TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE' | ||
15 | |||
16 | MY_ALIGN macro num:req | ||
17 | align num | ||
18 | endm | ||
19 | |||
20 | MY_ALIGN_32 macro | ||
21 | MY_ALIGN 32 | ||
22 | endm | ||
23 | |||
24 | MY_ALIGN_64 macro | ||
25 | MY_ALIGN 64 | ||
26 | endm | ||
27 | |||
28 | |||
29 | t0_L equ x0_L | ||
30 | t0_x equ x0 | ||
31 | t0 equ r0 | ||
32 | t1_x equ x3 | ||
33 | t1 equ r3 | ||
34 | |||
35 | cp_x equ t1_x | ||
36 | cp_r equ t1 | ||
37 | m equ x5 | ||
38 | m_r equ r5 | ||
39 | len_x equ x6 | ||
40 | len equ r6 | ||
41 | diff_x equ x7 | ||
42 | diff equ r7 | ||
43 | len0 equ r10 | ||
44 | len1_x equ x11 | ||
45 | len1 equ r11 | ||
46 | maxLen_x equ x12 | ||
47 | maxLen equ r12 | ||
48 | d equ r13 | ||
49 | ptr0 equ r14 | ||
50 | ptr1 equ r15 | ||
51 | |||
52 | d_lim equ m_r | ||
53 | cycSize equ len_x | ||
54 | hash_lim equ len0 | ||
55 | delta1_x equ len1_x | ||
56 | delta1_r equ len1 | ||
57 | delta_x equ maxLen_x | ||
58 | delta_r equ maxLen | ||
59 | hash equ ptr0 | ||
60 | src equ ptr1 | ||
61 | |||
62 | |||
63 | |||
64 | if (IS_LINUX gt 0) | ||
65 | |||
66 | ; r1 r2 r8 r9 : win32 | ||
67 | ; r7 r6 r2 r1 r8 r9 : linux | ||
68 | |||
69 | lenLimit equ r8 | ||
70 | lenLimit_x equ x8 | ||
71 | ; pos_r equ r2 | ||
72 | pos equ x2 | ||
73 | cur equ r1 | ||
74 | son equ r9 | ||
75 | |||
76 | else | ||
77 | |||
78 | lenLimit equ REG_ABI_PARAM_2 | ||
79 | lenLimit_x equ REG_ABI_PARAM_2_x | ||
80 | pos equ REG_ABI_PARAM_1_x | ||
81 | cur equ REG_ABI_PARAM_0 | ||
82 | son equ REG_ABI_PARAM_3 | ||
83 | |||
84 | endif | ||
85 | |||
86 | |||
87 | if (IS_LINUX gt 0) | ||
88 | maxLen_OFFS equ (REG_SIZE * (6 + 1)) | ||
89 | else | ||
90 | cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4)) | ||
91 | d_OFFS equ (REG_SIZE + cutValue_OFFS) | ||
92 | maxLen_OFFS equ (REG_SIZE + d_OFFS) | ||
93 | endif | ||
94 | hash_OFFS equ (REG_SIZE + maxLen_OFFS) | ||
95 | limit_OFFS equ (REG_SIZE + hash_OFFS) | ||
96 | size_OFFS equ (REG_SIZE + limit_OFFS) | ||
97 | cycPos_OFFS equ (REG_SIZE + size_OFFS) | ||
98 | cycSize_OFFS equ (REG_SIZE + cycPos_OFFS) | ||
99 | posRes_OFFS equ (REG_SIZE + cycSize_OFFS) | ||
100 | |||
101 | if (IS_LINUX gt 0) | ||
102 | else | ||
103 | cutValue_PAR equ [r0 + cutValue_OFFS] | ||
104 | d_PAR equ [r0 + d_OFFS] | ||
105 | endif | ||
106 | maxLen_PAR equ [r0 + maxLen_OFFS] | ||
107 | hash_PAR equ [r0 + hash_OFFS] | ||
108 | limit_PAR equ [r0 + limit_OFFS] | ||
109 | size_PAR equ [r0 + size_OFFS] | ||
110 | cycPos_PAR equ [r0 + cycPos_OFFS] | ||
111 | cycSize_PAR equ [r0 + cycSize_OFFS] | ||
112 | posRes_PAR equ [r0 + posRes_OFFS] | ||
113 | |||
114 | |||
115 | cutValue_VAR equ DWORD PTR [r4 + 8 * 0] | ||
116 | cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4] | ||
117 | cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0] | ||
118 | cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4] | ||
119 | hash_VAR equ QWORD PTR [r4 + 8 * 2] | ||
120 | limit_VAR equ QWORD PTR [r4 + 8 * 3] | ||
121 | size_VAR equ QWORD PTR [r4 + 8 * 4] | ||
122 | distances equ QWORD PTR [r4 + 8 * 5] | ||
123 | maxLen_VAR equ QWORD PTR [r4 + 8 * 6] | ||
124 | |||
125 | Old_RSP equ QWORD PTR [r4 + 8 * 7] | ||
126 | LOCAL_SIZE equ 8 * 8 | ||
127 | |||
128 | COPY_VAR_32 macro dest_var, src_var | ||
129 | mov x3, src_var | ||
130 | mov dest_var, x3 | ||
131 | endm | ||
132 | |||
133 | COPY_VAR_64 macro dest_var, src_var | ||
134 | mov r3, src_var | ||
135 | mov dest_var, r3 | ||
136 | endm | ||
137 | |||
138 | |||
139 | ; MY_ALIGN_64 | ||
140 | MY_PROC GetMatchesSpecN_2, 13 | ||
141 | MY_PUSH_PRESERVED_ABI_REGS | ||
142 | mov r0, RSP | ||
143 | lea r3, [r0 - LOCAL_SIZE] | ||
144 | and r3, -64 | ||
145 | mov RSP, r3 | ||
146 | mov Old_RSP, r0 | ||
147 | |||
148 | if (IS_LINUX gt 0) | ||
149 | mov d, REG_ABI_PARAM_5 ; r13 = r9 | ||
150 | mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8 | ||
151 | mov son, REG_ABI_PARAM_3 ; r9 = r1 | ||
152 | mov r8, REG_ABI_PARAM_2 ; r8 = r2 | ||
153 | mov pos, REG_ABI_PARAM_1_x ; r2 = x6 | ||
154 | mov r1, REG_ABI_PARAM_0 ; r1 = r7 | ||
155 | else | ||
156 | COPY_VAR_32 cutValue_VAR, cutValue_PAR | ||
157 | mov d, d_PAR | ||
158 | endif | ||
159 | |||
160 | COPY_VAR_64 limit_VAR, limit_PAR | ||
161 | |||
162 | mov hash_lim, size_PAR | ||
163 | mov size_VAR, hash_lim | ||
164 | |||
165 | mov cp_x, cycPos_PAR | ||
166 | mov hash, hash_PAR | ||
167 | |||
168 | mov cycSize, cycSize_PAR | ||
169 | mov cycSize_VAR, cycSize | ||
170 | |||
171 | ; we want cur in (rcx). So we change the cur and lenLimit variables | ||
172 | sub lenLimit, cur | ||
173 | neg lenLimit_x | ||
174 | inc lenLimit_x | ||
175 | |||
176 | mov t0_x, maxLen_PAR | ||
177 | sub t0, lenLimit | ||
178 | mov maxLen_VAR, t0 | ||
179 | |||
180 | jmp main_loop | ||
181 | |||
182 | MY_ALIGN_64 | ||
183 | fill_empty: | ||
184 | ; ptr0 = *ptr1 = kEmptyHashValue; | ||
185 | mov QWORD PTR [ptr1], 0 | ||
186 | inc pos | ||
187 | inc cp_x | ||
188 | mov DWORD PTR [d - 4], 0 | ||
189 | cmp d, limit_VAR | ||
190 | jae fin | ||
191 | cmp hash, hash_lim | ||
192 | je fin | ||
193 | |||
194 | ; MY_ALIGN_64 | ||
195 | main_loop: | ||
196 | ; UInt32 delta = *hash++; | ||
197 | mov diff_x, [hash] ; delta | ||
198 | add hash, 4 | ||
199 | ; mov cycPos_VAR, cp_x | ||
200 | |||
201 | inc cur | ||
202 | add d, 4 | ||
203 | mov m, pos | ||
204 | sub m, diff_x; ; matchPos | ||
205 | |||
206 | ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; | ||
207 | lea ptr1, [son + 8 * cp_r] | ||
208 | ; mov cycSize, cycSize_VAR | ||
209 | cmp pos, cycSize | ||
210 | jb directMode ; if (pos < cycSize_VAR) | ||
211 | |||
212 | ; CYC MODE | ||
213 | |||
214 | cmp diff_x, cycSize | ||
215 | jae fill_empty ; if (delta >= cycSize_VAR) | ||
216 | |||
217 | xor t0_x, t0_x | ||
218 | mov cycPos_VAR, cp_x | ||
219 | sub cp_x, diff_x | ||
220 | ; jae prepare_for_tree_loop | ||
221 | ; add cp_x, cycSize | ||
222 | cmovb t0_x, cycSize | ||
223 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) | ||
224 | jmp prepare_for_tree_loop | ||
225 | |||
226 | |||
227 | directMode: | ||
228 | cmp diff_x, pos | ||
229 | je fill_empty ; if (delta == pos) | ||
230 | jae fin_error ; if (delta >= pos) | ||
231 | |||
232 | mov cycPos_VAR, cp_x | ||
233 | mov cp_x, m | ||
234 | |||
235 | prepare_for_tree_loop: | ||
236 | mov len0, lenLimit | ||
237 | mov hash_VAR, hash | ||
238 | ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; | ||
239 | lea ptr0, [ptr1 + 4] | ||
240 | ; UInt32 *_distances = ++d; | ||
241 | mov distances, d | ||
242 | |||
243 | neg len0 | ||
244 | mov len1, len0 | ||
245 | |||
246 | mov t0_x, cutValue_VAR | ||
247 | mov maxLen, maxLen_VAR | ||
248 | mov cutValueCur_VAR, t0_x | ||
249 | |||
250 | MY_ALIGN_32 | ||
251 | tree_loop: | ||
252 | neg diff | ||
253 | mov len, len0 | ||
254 | cmp len1, len0 | ||
255 | cmovb len, len1 ; len = (len1 < len0 ? len1 : len0); | ||
256 | add diff, cur | ||
257 | |||
258 | mov t0_x, [son + cp_r * 8] ; prefetch | ||
259 | movzx t0_x, BYTE PTR [diff + 1 * len] | ||
260 | lea cp_r, [son + cp_r * 8] | ||
261 | cmp [cur + 1 * len], t0_L | ||
262 | je matched_1 | ||
263 | |||
264 | jb left_0 | ||
265 | |||
266 | mov [ptr1], m | ||
267 | mov m, [cp_r + 4] | ||
268 | lea ptr1, [cp_r + 4] | ||
269 | sub diff, cur ; FIX32 | ||
270 | jmp next_node | ||
271 | |||
272 | MY_ALIGN_32 | ||
273 | left_0: | ||
274 | mov [ptr0], m | ||
275 | mov m, [cp_r] | ||
276 | mov ptr0, cp_r | ||
277 | sub diff, cur ; FIX32 | ||
278 | ; jmp next_node | ||
279 | |||
280 | ; ------------ NEXT NODE ------------ | ||
281 | ; MY_ALIGN_32 | ||
282 | next_node: | ||
283 | mov cycSize, cycSize_VAR | ||
284 | dec cutValueCur_VAR | ||
285 | je finish_tree | ||
286 | |||
287 | add diff_x, pos ; prev_match = pos + diff | ||
288 | cmp m, diff_x | ||
289 | jae fin_error ; if (new_match >= prev_match) | ||
290 | |||
291 | mov diff_x, pos | ||
292 | sub diff_x, m ; delta = pos - new_match | ||
293 | cmp pos, cycSize | ||
294 | jae cyc_mode_2 ; if (pos >= cycSize) | ||
295 | |||
296 | mov cp_x, m | ||
297 | test m, m | ||
298 | jne tree_loop ; if (m != 0) | ||
299 | |||
300 | finish_tree: | ||
301 | ; ptr0 = *ptr1 = kEmptyHashValue; | ||
302 | mov DWORD PTR [ptr0], 0 | ||
303 | mov DWORD PTR [ptr1], 0 | ||
304 | |||
305 | inc pos | ||
306 | |||
307 | ; _distances[-1] = (UInt32)(d - _distances); | ||
308 | mov t0, distances | ||
309 | mov t1, d | ||
310 | sub t1, t0 | ||
311 | shr t1_x, 2 | ||
312 | mov [t0 - 4], t1_x | ||
313 | |||
314 | cmp d, limit_VAR | ||
315 | jae fin ; if (d >= limit) | ||
316 | |||
317 | mov cp_x, cycPos_VAR | ||
318 | mov hash, hash_VAR | ||
319 | mov hash_lim, size_VAR | ||
320 | inc cp_x | ||
321 | cmp hash, hash_lim | ||
322 | jne main_loop ; if (hash != size) | ||
323 | jmp fin | ||
324 | |||
325 | |||
326 | MY_ALIGN_32 | ||
327 | cyc_mode_2: | ||
328 | cmp diff_x, cycSize | ||
329 | jae finish_tree ; if (delta >= cycSize) | ||
330 | |||
331 | mov cp_x, cycPos_VAR | ||
332 | xor t0_x, t0_x | ||
333 | sub cp_x, diff_x ; cp_x = cycPos - delta | ||
334 | cmovb t0_x, cycSize | ||
335 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) | ||
336 | jmp tree_loop | ||
337 | |||
338 | |||
339 | MY_ALIGN_32 | ||
340 | matched_1: | ||
341 | |||
342 | inc len | ||
343 | ; cmp len_x, lenLimit_x | ||
344 | je short lenLimit_reach | ||
345 | movzx t0_x, BYTE PTR [diff + 1 * len] | ||
346 | cmp [cur + 1 * len], t0_L | ||
347 | jne mismatch | ||
348 | |||
349 | |||
350 | MY_ALIGN_32 | ||
351 | match_loop: | ||
352 | ; while (++len != lenLimit) (len[diff] != len[0]) ; | ||
353 | |||
354 | inc len | ||
355 | ; cmp len_x, lenLimit_x | ||
356 | je short lenLimit_reach | ||
357 | movzx t0_x, BYTE PTR [diff + 1 * len] | ||
358 | cmp BYTE PTR [cur + 1 * len], t0_L | ||
359 | je match_loop | ||
360 | |||
361 | mismatch: | ||
362 | jb left_2 | ||
363 | |||
364 | mov [ptr1], m | ||
365 | mov m, [cp_r + 4] | ||
366 | lea ptr1, [cp_r + 4] | ||
367 | mov len1, len | ||
368 | |||
369 | jmp max_update | ||
370 | |||
371 | MY_ALIGN_32 | ||
372 | left_2: | ||
373 | mov [ptr0], m | ||
374 | mov m, [cp_r] | ||
375 | mov ptr0, cp_r | ||
376 | mov len0, len | ||
377 | |||
378 | max_update: | ||
379 | sub diff, cur ; restore diff | ||
380 | |||
381 | cmp maxLen, len | ||
382 | jae next_node | ||
383 | |||
384 | mov maxLen, len | ||
385 | add len, lenLimit | ||
386 | mov [d], len_x | ||
387 | mov t0_x, diff_x | ||
388 | not t0_x | ||
389 | mov [d + 4], t0_x | ||
390 | add d, 8 | ||
391 | |||
392 | jmp next_node | ||
393 | |||
394 | |||
395 | |||
396 | MY_ALIGN_32 | ||
397 | lenLimit_reach: | ||
398 | |||
399 | mov delta_r, cur | ||
400 | sub delta_r, diff | ||
401 | lea delta1_r, [delta_r - 1] | ||
402 | |||
403 | mov t0_x, [cp_r] | ||
404 | mov [ptr1], t0_x | ||
405 | mov t0_x, [cp_r + 4] | ||
406 | mov [ptr0], t0_x | ||
407 | |||
408 | mov [d], lenLimit_x | ||
409 | mov [d + 4], delta1_x | ||
410 | add d, 8 | ||
411 | |||
412 | ; _distances[-1] = (UInt32)(d - _distances); | ||
413 | mov t0, distances | ||
414 | mov t1, d | ||
415 | sub t1, t0 | ||
416 | shr t1_x, 2 | ||
417 | mov [t0 - 4], t1_x | ||
418 | |||
419 | mov hash, hash_VAR | ||
420 | mov hash_lim, size_VAR | ||
421 | |||
422 | inc pos | ||
423 | mov cp_x, cycPos_VAR | ||
424 | inc cp_x | ||
425 | |||
426 | mov d_lim, limit_VAR | ||
427 | mov cycSize, cycSize_VAR | ||
428 | ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) | ||
429 | ; break; | ||
430 | cmp hash, hash_lim | ||
431 | je fin | ||
432 | cmp d, d_lim | ||
433 | jae fin | ||
434 | cmp delta_x, [hash] | ||
435 | jne main_loop | ||
436 | movzx t0_x, BYTE PTR [diff] | ||
437 | cmp [cur], t0_L | ||
438 | jne main_loop | ||
439 | |||
440 | ; jmp main_loop ; bypass for debug | ||
441 | |||
442 | mov cycPos_VAR, cp_x | ||
443 | shl len, 3 ; cycSize * 8 | ||
444 | sub diff, cur ; restore diff | ||
445 | xor t0_x, t0_x | ||
446 | cmp cp_x, delta_x ; cmp (cycPos_VAR, delta) | ||
447 | lea cp_r, [son + 8 * cp_r] ; dest | ||
448 | lea src, [cp_r + 8 * diff] | ||
449 | cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0) | ||
450 | add src, t0 | ||
451 | add len, son ; len = son + cycSize * 8 | ||
452 | |||
453 | |||
454 | MY_ALIGN_32 | ||
455 | long_loop: | ||
456 | add hash, 4 | ||
457 | |||
458 | ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; | ||
459 | |||
460 | mov t0, [src] | ||
461 | add src, 8 | ||
462 | mov [cp_r], t0 | ||
463 | add cp_r, 8 | ||
464 | cmp src, len | ||
465 | cmove src, son ; if end of (son) buffer is reached, we wrap to begin | ||
466 | |||
467 | mov DWORD PTR [d], 2 | ||
468 | mov [d + 4], lenLimit_x | ||
469 | mov [d + 8], delta1_x | ||
470 | add d, 12 | ||
471 | |||
472 | inc cur | ||
473 | |||
474 | cmp hash, hash_lim | ||
475 | je long_footer | ||
476 | cmp delta_x, [hash] | ||
477 | jne long_footer | ||
478 | movzx t0_x, BYTE PTR [diff + 1 * cur] | ||
479 | cmp [cur], t0_L | ||
480 | jne long_footer | ||
481 | cmp d, d_lim | ||
482 | jb long_loop | ||
483 | |||
484 | long_footer: | ||
485 | sub cp_r, son | ||
486 | shr cp_r, 3 | ||
487 | add pos, cp_x | ||
488 | sub pos, cycPos_VAR | ||
489 | mov cycSize, cycSize_VAR | ||
490 | |||
491 | cmp d, d_lim | ||
492 | jae fin | ||
493 | cmp hash, hash_lim | ||
494 | jne main_loop | ||
495 | jmp fin | ||
496 | |||
497 | |||
498 | |||
499 | fin_error: | ||
500 | xor d, d | ||
501 | |||
502 | fin: | ||
503 | mov RSP, Old_RSP | ||
504 | mov t0, [r4 + posRes_OFFS] | ||
505 | mov [t0], pos | ||
506 | mov r0, d | ||
507 | |||
508 | MY_POP_PRESERVED_ABI_REGS | ||
509 | MY_ENDP | ||
510 | |||
511 | _TEXT$LZFINDOPT ENDS | ||
512 | |||
513 | end | ||