aboutsummaryrefslogtreecommitdiff
path: root/Asm/x86/LzFindOpt.asm
blob: 42e10bda2ee5cc3f1d7aae2a927966b78d3df047 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
; 2021-07-21: Igor Pavlov : Public domain
;

ifndef x64
; x64=1
; .err <x64_IS_REQUIRED>
endif

include 7zAsm.asm

MY_ASM_START

_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'

MY_ALIGN macro num:req
        align  num
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm


t0_L    equ x0_L
t0_x    equ x0
t0      equ r0
t1_x    equ x3
t1      equ r3

cp_x    equ t1_x
cp_r    equ t1
m       equ x5
m_r     equ r5
len_x   equ x6
len     equ r6
diff_x  equ x7
diff    equ r7
len0    equ r10
len1_x  equ x11
len1    equ r11
maxLen_x equ x12
maxLen  equ r12
d       equ r13
ptr0    equ r14
ptr1    equ r15

d_lim       equ m_r
cycSize     equ len_x
hash_lim    equ len0
delta1_x    equ len1_x
delta1_r    equ len1
delta_x     equ maxLen_x
delta_r     equ maxLen
hash        equ ptr0
src         equ ptr1



if (IS_LINUX gt 0)

; r1 r2  r8 r9        : win32
; r7 r6  r2 r1  r8 r9 : linux

lenLimit        equ r8
lenLimit_x      equ x8
; pos_r           equ r2
pos             equ x2
cur             equ r1
son             equ r9

else

lenLimit        equ REG_ABI_PARAM_2
lenLimit_x      equ REG_ABI_PARAM_2_x
pos             equ REG_ABI_PARAM_1_x
cur             equ REG_ABI_PARAM_0
son             equ REG_ABI_PARAM_3

endif


if (IS_LINUX gt 0)
    maxLen_OFFS         equ  (REG_SIZE * (6 + 1))
else
    cutValue_OFFS       equ  (REG_SIZE * (8 + 1 + 4))
    d_OFFS              equ  (REG_SIZE + cutValue_OFFS)
    maxLen_OFFS         equ  (REG_SIZE + d_OFFS)
endif
    hash_OFFS           equ  (REG_SIZE + maxLen_OFFS)
    limit_OFFS          equ  (REG_SIZE + hash_OFFS)
    size_OFFS           equ  (REG_SIZE + limit_OFFS)
    cycPos_OFFS         equ  (REG_SIZE + size_OFFS)
    cycSize_OFFS        equ  (REG_SIZE + cycPos_OFFS)
    posRes_OFFS         equ  (REG_SIZE + cycSize_OFFS)
    
if (IS_LINUX gt 0)
else
    cutValue_PAR        equ  [r0 + cutValue_OFFS]
    d_PAR               equ  [r0 + d_OFFS]
endif
    maxLen_PAR          equ  [r0 + maxLen_OFFS]
    hash_PAR            equ  [r0 + hash_OFFS]
    limit_PAR           equ  [r0 + limit_OFFS]
    size_PAR            equ  [r0 + size_OFFS]
    cycPos_PAR          equ  [r0 + cycPos_OFFS]
    cycSize_PAR         equ  [r0 + cycSize_OFFS]
    posRes_PAR          equ  [r0 + posRes_OFFS]


    cutValue_VAR        equ  DWORD PTR [r4 + 8 * 0]
    cutValueCur_VAR     equ  DWORD PTR [r4 + 8 * 0 + 4]
    cycPos_VAR          equ  DWORD PTR [r4 + 8 * 1 + 0]
    cycSize_VAR         equ  DWORD PTR [r4 + 8 * 1 + 4]
    hash_VAR            equ  QWORD PTR [r4 + 8 * 2]
    limit_VAR           equ  QWORD PTR [r4 + 8 * 3]
    size_VAR            equ  QWORD PTR [r4 + 8 * 4]
    distances           equ  QWORD PTR [r4 + 8 * 5]
    maxLen_VAR          equ  QWORD PTR [r4 + 8 * 6]

    Old_RSP             equ  QWORD PTR [r4 + 8 * 7]
    LOCAL_SIZE          equ  8 * 8

COPY_VAR_32 macro dest_var, src_var
        mov     x3, src_var
        mov     dest_var, x3
endm

COPY_VAR_64 macro dest_var, src_var
        mov     r3, src_var
        mov     dest_var, r3
endm


; MY_ALIGN_64
MY_PROC GetMatchesSpecN_2, 13
MY_PUSH_PRESERVED_ABI_REGS
        mov     r0, RSP
        lea     r3, [r0 - LOCAL_SIZE]
        and     r3, -64
        mov     RSP, r3
        mov     Old_RSP, r0

if (IS_LINUX gt 0)
        mov     d,            REG_ABI_PARAM_5       ; r13 = r9
        mov     cutValue_VAR, REG_ABI_PARAM_4_x     ;     = r8
        mov     son,          REG_ABI_PARAM_3       ;  r9 = r1
        mov     r8,           REG_ABI_PARAM_2       ;  r8 = r2
        mov     pos,          REG_ABI_PARAM_1_x     ;  r2 = x6
        mov     r1,           REG_ABI_PARAM_0       ;  r1 = r7
else
        COPY_VAR_32 cutValue_VAR, cutValue_PAR
        mov     d, d_PAR
endif

        COPY_VAR_64 limit_VAR, limit_PAR
        
        mov     hash_lim, size_PAR
        mov     size_VAR, hash_lim
        
        mov     cp_x, cycPos_PAR
        mov     hash, hash_PAR

        mov     cycSize, cycSize_PAR
        mov     cycSize_VAR, cycSize
        
        ; we want cur in (rcx). So we change the cur and lenLimit variables
        sub     lenLimit, cur
        neg     lenLimit_x
        inc     lenLimit_x
        
        mov     t0_x, maxLen_PAR
        sub     t0, lenLimit
        mov     maxLen_VAR, t0

        jmp     main_loop

MY_ALIGN_64
fill_empty:
        ; ptr0 = *ptr1 = kEmptyHashValue;
        mov     QWORD PTR [ptr1], 0
        inc     pos
        inc     cp_x
        mov     DWORD PTR [d - 4], 0
        cmp     d, limit_VAR
        jae     fin
        cmp     hash, hash_lim
        je      fin

; MY_ALIGN_64
main_loop:
        ; UInt32 delta = *hash++;
        mov     diff_x, [hash]  ; delta
        add     hash, 4
        ; mov     cycPos_VAR, cp_x
       
        inc     cur
        add     d, 4
        mov     m, pos
        sub     m, diff_x;      ; matchPos
        
        ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
        lea     ptr1, [son + 8 * cp_r]
        ; mov     cycSize, cycSize_VAR
        cmp     pos, cycSize
        jb      directMode      ; if (pos < cycSize_VAR)
        
        ; CYC MODE

        cmp     diff_x, cycSize
        jae     fill_empty      ; if (delta >= cycSize_VAR)
        
        xor     t0_x, t0_x
        mov     cycPos_VAR, cp_x
        sub     cp_x, diff_x
        ; jae     prepare_for_tree_loop
        ; add     cp_x, cycSize
        cmovb   t0_x, cycSize
        add     cp_x, t0_x      ; cp_x +=  (cycPos < delta ? cycSize : 0)
        jmp     prepare_for_tree_loop
        
        
directMode:
        cmp     diff_x,  pos
        je      fill_empty      ; if (delta == pos)
        jae     fin_error       ; if (delta >= pos)
        
        mov     cycPos_VAR, cp_x
        mov     cp_x, m
        
prepare_for_tree_loop:
        mov     len0, lenLimit
        mov     hash_VAR, hash
        ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
        lea     ptr0, [ptr1 + 4]
        ; UInt32 *_distances = ++d;
        mov     distances, d

        neg     len0
        mov     len1, len0

        mov     t0_x, cutValue_VAR
        mov     maxLen, maxLen_VAR
        mov     cutValueCur_VAR, t0_x

MY_ALIGN_32
tree_loop:
        neg     diff
        mov     len, len0
        cmp     len1, len0
        cmovb   len, len1       ; len = (len1 < len0 ? len1 : len0);
        add     diff, cur

        mov     t0_x, [son + cp_r * 8]  ; prefetch
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        lea     cp_r, [son + cp_r * 8]
        cmp     [cur + 1 * len], t0_L
        je      matched_1
        
        jb      left_0

        mov     [ptr1], m
        mov        m, [cp_r + 4]
        lea     ptr1, [cp_r + 4]
        sub     diff, cur ; FIX32
        jmp     next_node

MY_ALIGN_32
left_0:
        mov     [ptr0], m
        mov        m, [cp_r]
        mov     ptr0, cp_r
        sub     diff, cur ; FIX32
        ; jmp     next_node

; ------------ NEXT NODE ------------
; MY_ALIGN_32
next_node:
        mov     cycSize, cycSize_VAR
        dec     cutValueCur_VAR
        je      finish_tree
        
        add     diff_x, pos     ; prev_match = pos + diff
        cmp     m, diff_x
        jae     fin_error       ; if (new_match >= prev_match)
        
        mov     diff_x, pos
        sub     diff_x, m       ; delta = pos - new_match
        cmp     pos, cycSize
        jae     cyc_mode_2      ; if (pos >= cycSize)

        mov     cp_x, m
        test    m, m
        jne     tree_loop       ; if (m != 0)
        
finish_tree:
        ; ptr0 = *ptr1 = kEmptyHashValue;
        mov     DWORD PTR [ptr0], 0
        mov     DWORD PTR [ptr1], 0

        inc     pos
        
        ; _distances[-1] = (UInt32)(d - _distances);
        mov     t0, distances
        mov     t1, d
        sub     t1, t0
        shr     t1_x, 2
        mov     [t0 - 4], t1_x

        cmp     d, limit_VAR
        jae     fin             ; if (d >= limit)
   
        mov     cp_x, cycPos_VAR
        mov     hash, hash_VAR
        mov     hash_lim, size_VAR
        inc     cp_x
        cmp     hash, hash_lim
        jne     main_loop       ; if (hash != size)
        jmp     fin
        

MY_ALIGN_32
cyc_mode_2:
        cmp     diff_x, cycSize
        jae     finish_tree     ; if (delta >= cycSize)

        mov     cp_x, cycPos_VAR
        xor     t0_x, t0_x
        sub     cp_x, diff_x    ; cp_x = cycPos - delta
        cmovb   t0_x, cycSize
        add     cp_x, t0_x      ; cp_x += (cycPos < delta ? cycSize : 0)
        jmp     tree_loop

        
MY_ALIGN_32
matched_1:

        inc     len
        ; cmp     len_x, lenLimit_x
        je      short lenLimit_reach
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        cmp     [cur + 1 * len], t0_L
        jne     mismatch

        
MY_ALIGN_32
match_loop:
        ;  while (++len != lenLimit)  (len[diff] != len[0]) ;

        inc     len
        ; cmp     len_x, lenLimit_x
        je      short lenLimit_reach
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        cmp     BYTE PTR [cur + 1 * len], t0_L
        je      match_loop

mismatch:
        jb      left_2

        mov     [ptr1], m
        mov        m, [cp_r + 4]
        lea     ptr1, [cp_r + 4]
        mov     len1, len

        jmp     max_update
        
MY_ALIGN_32
left_2:
        mov     [ptr0], m
        mov        m, [cp_r]
        mov     ptr0, cp_r
        mov     len0, len

max_update:
        sub     diff, cur       ; restore diff

        cmp     maxLen, len
        jae     next_node
        
        mov     maxLen, len
        add     len, lenLimit
        mov     [d], len_x
        mov     t0_x, diff_x
        not     t0_x
        mov     [d + 4], t0_x
        add     d, 8
       
        jmp     next_node


        
MY_ALIGN_32
lenLimit_reach:

        mov     delta_r, cur
        sub     delta_r, diff
        lea     delta1_r, [delta_r - 1]

        mov     t0_x, [cp_r]
        mov     [ptr1], t0_x
        mov     t0_x, [cp_r + 4]
        mov     [ptr0], t0_x

        mov     [d], lenLimit_x
        mov     [d + 4], delta1_x
        add     d, 8

        ; _distances[-1] = (UInt32)(d - _distances);
        mov     t0, distances
        mov     t1, d
        sub     t1, t0
        shr     t1_x, 2
        mov     [t0 - 4], t1_x

        mov     hash, hash_VAR
        mov     hash_lim, size_VAR

        inc     pos
        mov     cp_x, cycPos_VAR
        inc     cp_x

        mov     d_lim, limit_VAR
        mov     cycSize, cycSize_VAR
        ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
        ;    break;
        cmp     hash, hash_lim
        je      fin
        cmp     d, d_lim
        jae     fin
        cmp     delta_x, [hash]
        jne     main_loop
        movzx   t0_x, BYTE PTR [diff]
        cmp     [cur], t0_L
        jne     main_loop

        ; jmp     main_loop     ; bypass for debug
        
        mov     cycPos_VAR, cp_x
        shl     len, 3          ; cycSize * 8
        sub     diff, cur       ; restore diff
        xor     t0_x, t0_x
        cmp     cp_x, delta_x   ; cmp (cycPos_VAR, delta)
        lea     cp_r, [son + 8 * cp_r]  ; dest
        lea     src, [cp_r + 8 * diff]
        cmovb   t0, len         ; t0 =  (cycPos_VAR < delta ? cycSize * 8 : 0)
        add     src, t0
        add     len, son        ; len = son + cycSize * 8

       
MY_ALIGN_32
long_loop:
        add     hash, 4
        
        ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
        
        mov     t0, [src]
        add     src, 8
        mov     [cp_r], t0
        add     cp_r, 8
        cmp     src, len
        cmove   src, son       ; if end of (son) buffer is reached, we wrap to begin

        mov     DWORD PTR [d], 2
        mov     [d + 4], lenLimit_x
        mov     [d + 8], delta1_x
        add     d, 12

        inc     cur

        cmp     hash, hash_lim
        je      long_footer
        cmp     delta_x, [hash]
        jne     long_footer
        movzx   t0_x, BYTE PTR [diff + 1 * cur]
        cmp     [cur], t0_L
        jne     long_footer
        cmp     d, d_lim
        jb      long_loop

long_footer:
        sub     cp_r, son
        shr     cp_r, 3
        add     pos, cp_x
        sub     pos, cycPos_VAR
        mov     cycSize, cycSize_VAR
        
        cmp     d, d_lim
        jae     fin
        cmp     hash, hash_lim
        jne     main_loop
        jmp     fin



fin_error:
        xor     d, d
        
fin:
        mov     RSP, Old_RSP
        mov     t0, [r4 + posRes_OFFS]
        mov     [t0], pos
        mov     r0, d

MY_POP_PRESERVED_ABI_REGS
MY_ENDP

_TEXT$LZFINDOPT ENDS

end