; SortTest.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain

include ../../../../Asm/x86/7zAsm.asm

MY_ASM_START

ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
        ; Z7_SORT_ASM_USE_SEGMENT equ 1
else
        ; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
        align   num
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned to 16 bytes,
        ; so we don't need large alignment inside our function.
        align   16
endm
endif

MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm


ifdef x64

NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-byte line (good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-byte lines (better for big arrays)

acc         equ x0
k           equ r0
k_x         equ x0
p           equ r1
s           equ r2
s_x         equ x2
a0          equ x3
t0          equ a0
a3          equ x5
qq          equ a3
a1          equ x6
t1          equ a1
t1_r        equ r6
a2          equ x7
t2          equ a2
i           equ r8
e0          equ x8
e1          equ x9
num_last    equ r10
num_last_x  equ x10
next4_lim   equ r11
pref_lim    equ r12

SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
        mov     temp_reg, b0
        cmp     b0, b1
        cmovae  b0, b1          ; min
        cmovae  b1, temp_reg    ; max
endm

SORT macro b0, b1
        SORT_2_WITH_TEMP_REG b0, b1, acc
endm

LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm

if (NUM_PREFETCH_LEVELS gt 3)
  num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
  num_prefetches equ 1
endif

PREFETCH_OP macro offs
  cur_offset = 7 * 4    ; it's the average offset within a 64-byte cache line.
  ; cur_offset = 0      ; we can use zero offset, if we are sure that the array is aligned to 64 bytes.
  rept num_prefetches
    if 1
        prefetcht0 byte ptr [p + offs + cur_offset]
    else
        mov     pref_x, dword ptr [p + offs + cur_offset]
    endif
    cur_offset = cur_offset + 64
  endm
endm

PREFETCH_MY macro
  if 1
    if 1
        shl     k, NUM_PREFETCH_LEVELS + 3
    else
        ; we delay the prefetch instruction to improve the main loads
        shl     k, NUM_PREFETCH_LEVELS
        shl     k, 3
        ; shl   k, 0
    endif
        PREFETCH_OP k
  elseif 1
        shl     k, 3
        PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
  endif
endm

STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1        ; set 1 for cmov, but it's slower in some cases
                        ; set 0 for LOAD after adc s, 0
        cmp     t0, t1
  if use_cmov_1
        cmovb   t0, t1
        ; STORE t0, k
  endif
        adc     s, 0
  if use_cmov_1 eq 0
        LOAD    t0, s
  endif
        cmp     qq, t0
        jae     exit_label
  if 1 ; use_cmov_1 eq 0
        STORE   t0, k
  endif
        prefetch_macro
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        mov     k, s
        add     s, s                    ; slower for some cpus
        ; lea   s, dword ptr [s + s]    ; slower for some cpus
        ; shl   s, 1                    ; faster for some cpus
        ; lea   s, dword ptr [s * 2]    ; faster for some cpus
  rept 0 ; 1000 for debug : 0 for normal
        ; number of calls in generate_stage : ~0.6x the number of items
        shl     k, 0
  endm
endm

STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0        ; set 1 for cmov, but it's slower in some cases
                        ; set 0 for LOAD after adc s, 0
        cmp     t0, t1
  if use_cmov_2
        mov     t2, t0
        cmovb   t2, t1
        ; STORE t2, k
  endif
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        cmovb   t0, [p + s * 8 + 8]
        cmovb   t1, [p + s * 8 + 12]
        adc     s, 0
  if use_cmov_2 eq 0
        LOAD    t2, s
  endif
        cmp     qq, t2
        jae     exit_label
  if 1 ; use_cmov_2 eq 0
        STORE   t2, k
  endif
        prefetch_macro
        mov     k, s
        ; add   s, s
        ; lea   s, [s + s]
        shl     s, 1
        ; lea   s, [s * 2]
endm

MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
        LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes
        ; s  == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     k, next4_lim
        jae     leaves
  rept num_unrolls
        STEP    exit_2
        cmp     k, next4_lim
        jae     leaves
  endm
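        ; What one STEP iteration does, as a hedged C sketch (illustration
        ; only -- variable names follow the register equates above; the C
        ; form is an assumption, not the source this file was derived from):
        ;
        ;   // k : current node, s == k * 2 : its left child,
        ;   // qq : value being sifted down, p : array of UInt32 items
        ;   s += (p[s + 1] > p[s]);                 // pick the larger child (cmp + adc/cmov)
        ;   if (qq >= p[s]) { p[k] = qq; return; }  // the exit_2 path below
        ;   p[k] = p[s];                            // move the larger child up one level
        ;   k = s;  s = k * 2;                      // descend
        ;
        ; The prefetch variant additionally touches the cache lines that this
        ; walk is expected to reach several levels further down, since the
        ; child index roughly doubles at each level.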
  if use_prefetch
    prefetch_macro equ PREFETCH_MY
    pref_lim_2 equ pref_lim
        ; lea   pref_lim, dword ptr [num_last + 1]
        ; shr   pref_lim, NUM_PREFETCH_LEVELS + 1
        cmp     k, pref_lim_2
        jae     last_nodes
  else
    prefetch_macro equ
    pref_lim_2 equ next4_lim
  endif

MY_ALIGN_16
opt_loop:
        STEP    exit_2, prefetch_macro
        cmp     k, pref_lim_2
        jb      opt_loop

last_nodes:
        ; k >= pref_lim_2
        ; 2 cases are possible:
        ;   case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
        ;   case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
        ;           next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
  if use_prefetch
    yyy = NUM_PREFETCH_LEVELS - 1
    while yyy
      yyy = yyy - 1
        STEP    exit_2
      if yyy
        cmp     k, next4_lim
        jae     leaves
      endif
    endm
  endif

leaves:
        ; k >= next4_lim == (num_last + 1) / 4 must be provided by previous code.
        ; we have 2 nodes in (s) level : always
        ; we can have some nodes in (s * 2) level : low probability case
        ; we have no nodes in (s * 4) level
        ; s  == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
        STORE   t0, k
        ; t0 == (p)[s]
        ; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        ; we have 3 possible cases here:
        ;   s * 2 >  num_last : (s) node has no children
        ;   s * 2 == num_last : (s) node has 1 leaf child that is the last item of the array
        ;   s * 2 <  num_last : (s) node has 2 leaf children. We provide (s * 4 > num_last)
        ; we check for (s * 2 > num_last) before the "cmp qq, t0" check, because
        ; we will replace the conditional jump with a cmov instruction later.
        lea     t1_r, dword ptr [s + s]
        cmp     t1_r, num_last
        ja      exit_1
        ; if (s * 2 > num_last), we have no children : it's the high probability branch (taken above)

        ; fall-through is the low probability branch:
        ; s * 2 <= num_last
        cmp     qq, t0
        jae     exit_2
        ; qq < t0, so we go to the next level
        ; we check 1 or 2 children in the next level
        mov     t0, [p + s * 8]
        mov     k, s
        mov     s, t1_r
        cmp     t1_r, num_last
        je      @F
        ; (s == num_last) means that we have a single child in the tree
        ; (s < num_last) : so we must read both children and select the max of them.
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
@@:
        STORE   t0, k
exit_1:
        ; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        cmp     qq, t0
        cmovb   k, s
exit_2:
        STORE   qq, k
endm


ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
        MY_ALIGN_16
endif

MY_PROC HeapSort, 2
  if (IS_LINUX gt 0)
        mov     p, REG_ABI_PARAM_0          ; r1 <- r7 : linux
  endif
        mov     num_last, REG_ABI_PARAM_1   ; r10 <- r6 : linux
                                            ; r10 <- r2 : win64
        cmp     num_last, 2
        jb      end_1
        ; MY_PUSH_PRESERVED_ABI_REGS
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        push    r12
        cmp     num_last, 4
        ja      sort_5
        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2
        LOAD    a2, 2
        je      sort_3
        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2
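; For orientation: the rest of this procedure implements the classic heapsort
; shape sketched below in hedged C (HeapSortRef / SiftDown are illustrative
; names, not the C source this asm was generated from).  The code below
; specializes this shape heavily: the top of the heap is cached in registers
; (e0, e1), the leaf levels get dedicated loops, and prefetch is added.
;
;   static void SiftDown(UInt32 *p, size_t k, size_t size)  // p is 1-based
;   {
;     const UInt32 v = p[k];
;     for (;;)
;     {
;       size_t s = k * 2;
;       if (s > size) break;                     // node k has no children
;       if (s < size && p[s + 1] > p[s]) s++;    // pick the larger child
;       if (v >= p[s]) break;                    // v fits at node k
;       p[k] = p[s];                             // move the child up
;       k = s;
;     }
;     p[k] = v;
;   }
;
;   void HeapSortRef(UInt32 *p, size_t size)
;   {
;     size_t i;
;     if (size < 2) return;
;     p--;                                       // switch to 1-based indices
;     for (i = size / 2; i != 0; i--)            // build the max-heap
;       SiftDown(p, i, size);
;     for (; size > 1; size--)                   // extract max, shrink heap
;     {
;       const UInt32 t = p[size];
;       p[size] = p[1];
;       p[1] = t;
;       SiftDown(p, 1, size - 1);
;     }
;   }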
sort_5:
        ; (num_last > 4) is required here
        ; if (num_last >= 6) : we will use the optimized loop_down_1 loop for leaf nodes
        mov     next4_lim, num_last
        shr     next4_lim, 2
        dec     num_last
        mov     k, num_last
        shr     k, 1
        mov     i, num_last
        shr     i, 2
        test    num_last, 1
        jnz     size_even
        ; ODD number of items, so we compare the parent with its single child:
        LOAD    t1, num_last
        LOAD    t0, k
        SORT_2_WITH_TEMP_REG t1, t0, t2
        STORE   t1, num_last
        STORE   t0, k
        dec     k
size_even:
        cmp     k, i
        jbe     loop_down       ; jump for the num_last == 4 case

  if 0 ; 1 for debug
        mov     r15, k
        mov     r14d, 1 ; 100
loop_benchmark:
  endif

        ; optimized loop for leaf nodes:
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]
MY_ALIGN_16
loop_down_1:
        ; we compare the parent with the max of its children:
        ; lea   s, dword ptr [2 * k]
        mov     s, k
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, s
        LOAD    t2, k
        STORE   t0, k
        cmp     t2, t0
        cmovae  s, k
        dec     k
        ; we preload the next items before the STORE operation to the calculated address
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]
        STORE   t2, s
        cmp     k, i
        jne     loop_down_1

  if 0 ; 1 for debug
        mov     k, r15
        dec     r14d
        jnz     loop_benchmark
        ; jmp   end_debug
  endif

MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 8]
        mov     t1, [p + i * 8 + 4]
        LOAD    qq, i
        mov     k, i
        lea     s, dword ptr [i + i]
        ; jmp   end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls  equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
        sub     i, 1
        jnb     loop_down
        ; jmp   end_debug

        LOAD    e0, 0
        LOAD    e1, 1

LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 can also work
        cmp     num_last, LEVEL_3_LIMIT + 1
        jb      main_loop_sort_5

MY_ALIGN_16
main_loop_sort:
        ; num_last > LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1
        mov     next4_lim, num_last
        shr     next4_lim, 2
        mov     pref_lim, num_last
        shr     pref_lim, NUM_PREFETCH_LEVELS + 1
        dec     num_last
  if 0 ; 1 for debug
        ; this optional optimization can improve performance, if there are identical items in the array:
        ;   3x improvement    : if all items in the array are identical
        ;   20% improvement   : if items differ in 1 bit only
        ;   1-10% improvement : if items differ in (2+) bits
        ;   no gain           : if all items are different
        cmp     qq, e1
        jae     next_iter_main
  endif
        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        mov     t0, [p + 4 * (4 + 0)]
        mov     t1, [p + 4 * (4 + 1)]
        cmovb   t0, [p + 4 * (4 + 2)]
        cmovb   t1, [p + 4 * (4 + 3)]
        adc     k_x, 0
        ; (qq <= e1), because the tree is correctly sorted
        ; also here we could check (qq >= e1) or (qq == e1) for faster exit
        lea     s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls  equ 0
        MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls
next_iter_main:
        cmp     num_last, LEVEL_3_LIMIT
        jne     main_loop_sort

        ; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
        ; 4 <= num_last <= LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1
        dec     num_last_x
        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        adc     k_x, 0
        lea     s_x, dword ptr [k * 2]
        cmp     s_x, num_last_x
        ja      exit_2
        mov     t0, [p + k * 8]
        je      exit_1
        ; s < num_last
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s_x, 0
exit_1:
        STORE   t0, k
        cmp     qq, t0
        cmovb   k_x, s_x
exit_2:
        STORE   qq, k
        cmp     num_last_x, 3
        jne     main_loop_sort_5

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        STORE   e1, 2
        STORE   e0, 3
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
; end_debug:
        ; MY_POP_PRESERVED_ABI_REGS
        pop     r12
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP

else
; ------------ x86 32-bit ------------

ifdef x64
IS_CDECL = 0
endif

acc equ x0
k   equ r0
k_x equ acc
p   equ r1
num_last   equ r2
num_last_x equ x2
a0  equ x3
t0  equ a0
a3  equ x5
i   equ r5
e0  equ a3
a1  equ x6
qq  equ a1
a2  equ x7
s   equ r7
s_x equ a2

SORT macro b0, b1
        cmp     b1, b0
        jae     @F
  if 1
        xchg    b0, b1
  else
        mov     acc, b0
        mov     b0, b1  ; min
        mov     b1, acc ; max
  endif
@@:
endm

LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm
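; Two flavors of the inner sift-down step follow.  STEP_1 selects the larger
; child without a branch: the carry flag from the child comparison is folded
; into the child index with adc, and the store to p[k] is issued before the
; exit test, since continuing is the most likely path (the exit path stores
; qq over it, so the early store is harmless).  STEP_BRANCH selects the child
; with a conditional jump instead.  Hedged C sketch of one step (illustrative,
; with the same meaning of k, s, qq as in the macros):
;
;   s += (p[2 * k + 1] > p[2 * k]);   // carry folded in via adc
;   if (qq >= p[s]) goto exit_label;  // qq belongs at node k
;   p[k] = p[s];                      // the larger child moves up
;   k = s;  s = k * 2;                // descend one level
;
; Note that in the MOVE_SMALLEST_UP macro further below, the unrolled
; prologue always uses STEP_1, while the main loop uses the STEP parameter.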
STEP_1 macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        adc     s, 0
        LOAD    t0, s
        STORE   t0, k   ; we store ahead of the exit check, for the most expected branch
        cmp     qq, t0
        jae     exit_label
        ; STORE t0, k   ; use this, if the look-ahead STORE above is removed
        mov     k, s
        add     s, s
        ; lea   s, dword ptr [s + s]
        ; shl   s, 1
        ; lea   s, dword ptr [s * 2]
endm

STEP_BRANCH macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        jae     @F
        inc     s
        mov     t0, [p + k * 8 + 4]
@@:
        cmp     qq, t0
        jae     exit_label
        STORE   t0, k
        mov     k, s
        add     s, s
endm

MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
        LOCAL leaves, opt_loop, single
        ; s == k * 2
  rept num_unrolls
        cmp     s, num_last
        jae     leaves
        STEP_1  exit_2
  endm
        cmp     s, num_last
        jb      opt_loop
leaves:
        ; (s >= num_last)
        jne     exit_2
single:
        ; (s == num_last)
        mov     t0, [p + k * 8]
        cmp     qq, t0
        jae     exit_2
        STORE   t0, k
        mov     k, s
        jmp     exit_2
MY_ALIGN_16
opt_loop:
        STEP    exit_2
        cmp     s, num_last
        jb      opt_loop
        je      single
exit_2:
        STORE   qq, k
endm


ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
        MY_ALIGN_16
endif

MY_PROC HeapSort, 2
ifdef x64
  if (IS_LINUX gt 0)
        mov     num_last, REG_ABI_PARAM_1   ; r2 <- r6 : linux
        mov     p, REG_ABI_PARAM_0          ; r1 <- r7 : linux
  endif
elseif (IS_CDECL gt 0)
        mov     num_last, [r4 + REG_SIZE * 2]
        mov     p, [r4 + REG_SIZE * 1]
endif
        cmp     num_last, 2
        jb      end_1
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        cmp     num_last, 4
        ja      sort_5
        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2
        LOAD    a2, 2
        je      sort_3
        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2

sort_5:
        ; num_last > 4
        lea     i, dword ptr [num_last - 2]
        dec     num_last
        test    i, 1
        jz      loop_down
        ; single child
        mov     t0, [p + num_last * 4]
        mov     qq, [p + num_last * 2]
        dec     i
        cmp     qq, t0
        jae     loop_down
        mov     [p + num_last * 2], t0
        mov     [p + num_last * 4], qq

MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 4]
        cmp     t0, [p + i * 4 + 4]
        mov     k, i
        mov     qq, [p + i * 2]
        adc     k, 0
        LOAD    t0, k
        cmp     qq, t0
        jae     down_next
        mov     [p + i * 2], t0
        lea     s, dword ptr [k + k]
DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
        sub     i, 2
        jnb     loop_down
        ; jmp   end_debug

        LOAD    e0, 0

MY_ALIGN_16
main_loop_sort:
        ; num_last > 3
        mov     t0, [p + 2 * 4]
        cmp     t0, [p + 3 * 4]
        LOAD    qq, num_last
        STORE   e0, num_last
        LOAD    e0, 1
        mov     s_x, 2
        mov     k_x, 1
        adc     s, 0
        LOAD    t0, s
        dec     num_last
        cmp     qq, t0
        jae     main_exit_label
        STORE   t0, 1
        mov     k, s
        add     s, s
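        ; The switch below picks the step flavor and unroll count for the
        ; extraction loop.  Unrolling the branchy step 10x removes most of
        ; the loop overhead and keeps several independent loads in flight.
        ; The (1 << 13) items threshold quoted below is plausibly the point
        ; where the 4-byte array (32 KB) stops fitting in a typical L1 data
        ; cache, so load latency starts to dominate -- an assumption for
        ; orientation, not a measurement from this file.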
  if 1
        ; for branch data prefetch mode :
        ; it's faster for large arrays : larger than (1 << 13) items.
MAIN_num_unrolls equ 10
STEP_LOOP equ STEP_BRANCH
  else
MAIN_num_unrolls equ 0
STEP_LOOP equ STEP_1
  endif
        MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label
        ; jmp   end_debug
        cmp     num_last, 3
        jne     main_loop_sort

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        LOAD    a2, 1
        STORE   e0, 3   ; e0 is an alias for a3
        STORE   a2, 2
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
; end_debug:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP

endif


ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT ENDS
endif


if 0

; Instruction timing notes (Lat = latency, TP = reciprocal throughput,
; followed by ports and microarchitecture):

LEA_IS_D8 (R64) [R2 * 4 + 16]
        Lat : TP
        2   : 1   : adl-e
        2   : 3 p056 adl-p
        1   : 2   : p15 hsw-rocket
        1   : 2   : p01 snb-ivb
        1   : 1   : p1 conroe-wsm
        1   : 4   : zen3,zen4
        2   : 4   : zen1,zen2

LEA_B_IS (R64) [R2 + R3 * 4]
        Lat : TP
        1   : 1   : adl-e
        2   : 3 p056 adl-p
        1   : 2   : p15 hsw-rocket
        1   : 2   : p01 snb-ivb
        1   : 1   : p1 nhm-wsm
        1   : 1   : p0 conroe-wsm
        1   : 4   : zen3,zen4
        2   : 2,4 : zen1,zen2

LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16]
        Lat : TP
        2   : 1   : adl-e
        2   : 3 p056 adl-p
        1   : 2   : p15 ice-rocket
        3   : 1   : p1/p15 hsw-rocket
        3   : 1   : p01 snb-ivb
        1   : 1   : p1 nhm-wsm
        1   : 1   : p0 conroe-wsm
        2,1 : 2   : zen3,zen4
        2   : 2   : zen1,zen2

CMOVB (R64, R64)
        Lat : TP
        1,2 : 2   : adl-e
        1   : 2 p06 adl-p
        1   : 2   : p06 bwd-rocket
        1,2 : 2   : p0156+p06 hsw
        1,2 : 1.5 : p015+p05 snb-ivb
        1,2 : 1   : p015+p05 nhm
        1   : 1   : 2*p015 conroe
        1   : 2   : zen3,zen4
        1   : 4   : zen1,zen2

ADC (R64, 0)
        Lat : TP
        1,2 : 2   : adl-e
        1   : 2 p06 adl-p
        1   : 2   : p06 bwd-rocket
        1   : 1.5 : p0156+p06 hsw
        1   : 1.5 : p015+p05 snb-ivb
        2   : 1   : 2*p015 conroe-wsm
        1   : 2   : zen1,zen2,zen3,zen4

PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution:
        L1 : Pentium3
        L2 : NetBurst
        L1, not L2 : Core Duo, Core 2, Atom processors
        L1, not L2, may fetch into L3 with fast replacement : Nehalem, Westmere, Sandy Bridge, ...
            NEHALEM : fills L1/L3, L1 LRU is not updated
        L3 with fast replacement : Xeon processors based on Nehalem, Westmere, Sandy Bridge, ...

PREFETCHT0 : fetch data into all cache levels.
PREFETCHT1 : fetch data into L2 and L3.

endif

end