diff options
Diffstat (limited to 'Asm')
-rw-r--r-- | Asm/x86/Sort.asm | 860 |
1 files changed, 860 insertions, 0 deletions
diff --git a/Asm/x86/Sort.asm b/Asm/x86/Sort.asm new file mode 100644 index 0000000..517c615 --- /dev/null +++ b/Asm/x86/Sort.asm | |||
@@ -0,0 +1,860 @@ | |||
; Sort.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain

; 7zAsm.asm provides the portable register names (r0/x0...), MY_PROC/MY_ENDP,
; ABI helper macros and the IS_LINUX / x64 configuration symbols used below.
include ../../../../Asm/x86/7zAsm.asm

MY_ASM_START
7 | |||
; Optionally place the code in a dedicated 64-byte-aligned segment.
; Both branches below are commented out, so Z7_SORT_ASM_USE_SEGMENT stays
; undefined unless it is supplied on the assembler command line.
ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
  ; Z7_SORT_ASM_USE_SEGMENT equ 1
else
  ; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
; segment is 64-byte aligned, so any requested alignment can be honored
MY_ALIGN macro num:req
        align num
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned for 16-bytes.
        ; So we don't need large alignment inside our function.
        align 16
endm
endif
28 | |||
29 | |||
; Convenience wrappers over MY_ALIGN; with Z7_SORT_ASM_USE_SEGMENT undefined
; all three collapse to 16-byte alignment (see MY_ALIGN above).
MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm
41 | |||
ifdef x64
; ------------ x86 64-bit version ------------

NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-bytes line (is good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-bytes lines (better for big arrays)

; Register role aliases (r*/x* are 64/32-bit views from 7zAsm.asm).
; The heap is 1-based: children of node k are nodes 2k and 2k+1.

acc equ x0          ; scratch accumulator for SORT
k equ r0            ; current parent node index
k_x equ x0          ; 32-bit view of k

p equ r1            ; pointer to the array of 32-bit items

s equ r2            ; child node index (s == 2*k, +1 after selecting right child)
s_x equ x2          ; 32-bit view of s

a0 equ x3
t0 equ a0           ; left-child value (p)[s]

a3 equ x5
qq equ a3           ; value being sifted down the heap

a1 equ x6
t1 equ a1           ; right-child value (p)[s + 1]
t1_r equ r6         ; 64-bit scratch overlapping t1

a2 equ x7
t2 equ a2           ; extra temporary (STEP_2, build phase)

i equ r8            ; node index in the heap-build loops
e0 equ x8           ; cached (p)[0]

e1 equ x9           ; cached (p)[1]

num_last equ r10    ; index of the last item
num_last_x equ x10  ; 32-bit view of num_last

next4_lim equ r11   ; (num_last + 1) / 4 : first k whose grandchildren may not exist
pref_lim equ r12    ; num_last >> (NUM_PREFETCH_LEVELS + 1) : prefetch cutoff
79 | |||
80 | |||
81 | |||
; Sort a pair of registers ascending: b0 <- min, b1 <- max (branch-free).
; CF from the cmp drives both cmov's; temp_reg is clobbered.
SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
        mov temp_reg, b0
        cmp b0, b1
        cmovae b0, b1 ; min
        cmovae b1, temp_reg ; max
endm

; Same pair-sort using acc (x0) as the temporary.
SORT macro b0, b1
        SORT_2_WITH_TEMP_REG b0, b1, acc
endm

; Array items are dwords: element (index) lives at [p + 4 * index].
LOAD macro dest:req, index:req
        mov dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov [p + 4 * index], reg
endm
100 | |||
101 | |||
; One 64-byte line is prefetched per level for NUM_PREFETCH_LEVELS <= 3;
; deeper lookahead needs proportionally more lines.
if (NUM_PREFETCH_LEVELS gt 3)
num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
num_prefetches equ 1
endif

; Prefetch the cache line(s) that will be touched when the sift-down
; reaches (p + offs). Does not change registers or flags.
PREFETCH_OP macro offs
cur_offset = 7 * 4 ; it's average offset in 64-bytes cache line.
; cur_offset = 0 ; we can use zero offset, if we are sure that array is aligned for 64-bytes.
rept num_prefetches
if 1
        prefetcht0 byte ptr [p + offs + cur_offset]
else
        ; disabled alternative: plain load instead of prefetch
        ; NOTE(review): pref_x is not defined anywhere in this file -- this
        ; branch would not assemble as-is; dead code under "if 1".
        mov pref_x, dword ptr [p + offs + cur_offset]
endif
cur_offset = cur_offset + 64
endm
endm
120 | |||
; Prefetch the descendant of node k that is NUM_PREFETCH_LEVELS levels down.
; Descendants of node k at level L start at index k << L; the extra "shl 3"
; converts a dword index times 2 into the byte offset used by PREFETCH_OP.
; NOTE: k is destroyed here; callers (STEP_1/STEP_2) reload k from s right
; after the prefetch.
PREFETCH_MY macro
if 1
if 1
        shl k, NUM_PREFETCH_LEVELS + 3
else
        ; we delay prefetch instruction to improve main loads
        shl k, NUM_PREFETCH_LEVELS
        shl k, 3
        ; shl k, 0
endif
        PREFETCH_OP k
elseif 1
        ; disabled alternative: keep the scaling inside the address expression
        shl k, 3
        PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm
137 | |||
138 | |||
; One sift-down level (heap-build flavor).
; Entry invariant: s == 2*k, t0 == (p)[s], t1 == (p)[s + 1], qq = sifted value.
; CF from "cmp t0, t1" selects the larger child: cmovb takes t1,
; and "adc s, 0" bumps s to the right child's index.
; Jumps to exit_label when qq >= max child (heap property restored);
; exit_label code must finish with STORE qq, k.
STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after adc s, 0
        cmp t0, t1
if use_cmov_1
        cmovb t0, t1
        ; STORE t0, k
endif
        adc s, 0
if use_cmov_1 eq 0
        LOAD t0, s
endif
        cmp qq, t0
        jae exit_label
if 1 ; use_cmov_1 eq 0
        ; move the winning child up into the parent slot
        STORE t0, k
endif
        prefetch_macro
        ; preload both children of node s for the next iteration:
        ; [p + s * 8] == (p)[2*s], [p + s * 8 + 4] == (p)[2*s + 1]
        mov t0, [p + s * 8]
        mov t1, [p + s * 8 + 4]
        mov k, s
        add s, s ; slower for some cpus
        ; lea s, dword ptr [s + s] ; slower for some cpus
        ; shl s, 1 ; faster for some cpus
        ; lea s, dword ptr [s * 2] ; faster for some cpus
rept 0 ; 1000 for debug : 0 for normal
        ; number of calls in generate_stage : ~0.6 of number of items
        shl k, 0
endm
endm
169 | |||
170 | |||
; One sift-down level (main sorting-loop flavor).
; Same contract as STEP_1 (s == 2*k, t0/t1 = children, qq = sifted value),
; but the next level's children are loaded for BOTH candidate subtrees and
; cmov-selected, hiding the load latency behind the child comparison.
; The plain mov's do not touch flags, so CF from "cmp t0, t1" is still
; live for the cmovb pair and the "adc s, 0".
STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after adc s, 0
        cmp t0, t1
if use_cmov_2
        mov t2, t0
        cmovb t2, t1
        ; STORE t2, k
endif
        ; children of the left subtree (node 2s) ...
        mov t0, [p + s * 8]
        mov t1, [p + s * 8 + 4]
        ; ... replaced by children of the right subtree (node 2s + 2) if t0 < t1
        cmovb t0, [p + s * 8 + 8]
        cmovb t1, [p + s * 8 + 12]
        adc s, 0
if use_cmov_2 eq 0
        LOAD t2, s ; t2 = larger child value
endif
        cmp qq, t2
        jae exit_label
if 1 ; use_cmov_2 eq 0
        STORE t2, k
endif
        prefetch_macro
        mov k, s
        ; add s, s
        ; lea s, [s + s]
        shl s, 1
        ; lea s, [s * 2]
endm
200 | |||
201 | |||
; Sift qq down from node k until the max-heap property holds, then store it.
; STEP         : STEP_1 or STEP_2 (one level of descent)
; use_prefetch : 1 -> prefetch NUM_PREFETCH_LEVELS levels ahead via PREFETCH_MY
; num_unrolls  : number of unrolled STEP iterations before the main loop
; Entry: k = start node, s == k * 2, t0 == (p)[s], t1 == (p)[s + 1],
;        qq = value being sifted, next4_lim/pref_lim set by the caller.
MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
        LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes

        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp k, next4_lim
        jae leaves

rept num_unrolls
        STEP exit_2
        cmp k, next4_lim
        jae leaves
endm

if use_prefetch
        ; main loop runs with prefetching until k reaches pref_lim,
        ; i.e. while nodes NUM_PREFETCH_LEVELS below k still exist
prefetch_macro equ PREFETCH_MY
pref_lim_2 equ pref_lim
        ; lea pref_lim, dword ptr [num_last + 1]
        ; shr pref_lim, NUM_PREFETCH_LEVELS + 1
        cmp k, pref_lim_2
        jae last_nodes
else
prefetch_macro equ
pref_lim_2 equ next4_lim
endif

MY_ALIGN_16
opt_loop:
        STEP exit_2, prefetch_macro
        cmp k, pref_lim_2
        jb opt_loop

last_nodes:
        ; k >= pref_lim_2
        ; 2 cases are possible:
        ;   case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
        ;   case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
        ;           next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
if use_prefetch
        ; finish the remaining (at most NUM_PREFETCH_LEVELS - 1) levels
        ; without prefetching
yyy = NUM_PREFETCH_LEVELS - 1
while yyy
yyy = yyy - 1
        STEP exit_2
if yyy
        cmp k, next4_lim
        jae leaves
endif
endm
endif

leaves:
        ; k >= next4_lim == (num_last + 1) / 4 must be provided by previous code.
        ; we have 2 nodes in (s) level : always
        ; we can have some nodes in (s * 2) level : low probability case
        ; we have no nodes in (s * 4) level
        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp t0, t1
        cmovb t0, t1
        adc s, 0
        STORE t0, k

        ; t0 == (p)[s]
        ; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        ; we have 3 possible cases here:
        ;   s * 2 > num_last  : (s) node has no childs
        ;   s * 2 == num_last : (s) node has 1 leaf child that is last item of array
        ;   s * 2 < num_last  : (s) node has 2 leaf childs. We provide (s * 4 > num_last)
        ; we check for (s * 2 > num_last) before "cmp qq, t0" check, because
        ; we will replace conditional jump with cmov instruction later.
        lea t1_r, dword ptr [s + s]
        cmp t1_r, num_last
        ja exit_1 ; if (s * 2 > num_last), we have no childs : it's high probability branch

        ; it's low probability branch
        ; s * 2 <= num_last
        cmp qq, t0
        jae exit_2

        ; qq < t0, so we go to next level
        ; we check 1 or 2 childs in next level
        mov t0, [p + s * 8]
        mov k, s
        mov s, t1_r
        cmp t1_r, num_last
        je @F ; (s == num_last) means that we have single child in tree

        ; (s < num_last) : so we must read both childs and select max of them.
        mov t1, [p + k * 8 + 4]
        cmp t0, t1
        cmovb t0, t1
        adc s, 0
@@:
        STORE t0, k
exit_1:
        ; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
        cmp qq, t0
        cmovb k, s ; descend once more (branch-free) if qq < child
exit_2:
        STORE qq, k ; final resting slot for the sifted value
endm
305 | |||
306 | |||
307 | |||
308 | |||
ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
MY_ALIGN_16
endif

; HeapSort(p, size) -- sorts an array of 32-bit items in ascending order.
; 2 arguments:
;   REG_ABI_PARAM_0 = p    (pointer to array of dwords)
;   REG_ABI_PARAM_1 = size (number of items)
; Uses a 1-based implicit max-heap: children of node k are 2k and 2k+1.
; Presumably matches C prototype HeapSort(UInt32 *p, size_t size) -- TODO
; confirm against the C header.
MY_PROC HeapSort, 2

if (IS_LINUX gt 0)
        mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
        ; on win64 p (r1 == rcx) already holds param 0, only size is moved
        mov num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
                                      ; r10 <- r2 : win64
        cmp num_last, 2
        jb end_1 ; 0 or 1 items : nothing to sort, no registers pushed yet

        ; MY_PUSH_PRESERVED_ABI_REGS
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        push r12 ; pref_lim lives in r12

        cmp num_last, 4
        ja sort_5

        ; ---- sizes 2..4 : branch-free sorting network ----
        LOAD a0, 0
        LOAD a1, 1
        SORT a0, a1
        cmp num_last, 3
        jb end_2 ; size == 2

        LOAD a2, 2
        je sort_3 ; size == 3

        ; size == 4
        LOAD a3, 3
        SORT a2, a3
        SORT a1, a3
        STORE a3, 3
sort_3:
        SORT a0, a2
        SORT a1, a2
        STORE a2, 2
        jmp end_2

sort_5:
        ; (num_last > 4) is required here
        ; if (num_last >= 6) : we will use optimized loop for leaf nodes loop_down_1
        mov next4_lim, num_last
        shr next4_lim, 2

        ; ---- heap-build phase ----
        dec num_last        ; num_last = index of last item
        mov k, num_last
        shr k, 1            ; k = last parent node
        mov i, num_last
        shr i, 2            ; i = last node that still has grandchildren
        test num_last, 1
        jnz size_even

        ; ODD number of items. So we compare parent with single child
        LOAD t1, num_last
        LOAD t0, k
        SORT_2_WITH_TEMP_REG t1, t0, t2
        STORE t1, num_last
        STORE t0, k
        dec k

size_even:
        cmp k, i
        jbe loop_down ; jump for num_last == 4 case

if 0 ; 1 for debug
        mov r15, k
        mov r14d, 1 ; 100
loop_benchmark:
endif
        ; optimized loop for leaf nodes:
        ; children of node k preloaded : (p)[2k], (p)[2k + 1]
        mov t0, [p + k * 8]
        mov t1, [p + k * 8 + 4]

MY_ALIGN_16
loop_down_1:
        ; we compare parent with max of childs:
        ; lea s, dword ptr [2 * k]
        mov s, k
        cmp t0, t1
        cmovb t0, t1
        adc s, s ; s = index of max child (2k or 2k + 1, via CF)
        LOAD t2, k ; t2 = parent value
        STORE t0, k ; tentatively move max child up
        cmp t2, t0
        cmovae s, k ; parent already >= child : write parent back to itself
        dec k
        ; we preload next items before STORE operation for calculated address
        mov t0, [p + k * 8]
        mov t1, [p + k * 8 + 4]
        STORE t2, s
        cmp k, i
        jne loop_down_1

if 0 ; 1 for debug
        mov k, r15
        dec r14d
        jnz loop_benchmark
        ; jmp end_debug
endif

MY_ALIGN_16
loop_down:
        ; full sift-down for nodes that have grandchildren
        mov t0, [p + i * 8]
        mov t1, [p + i * 8 + 4]
        LOAD qq, i
        mov k, i
        lea s, dword ptr [i + i]
        ; jmp end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
        sub i, 1
        jnb loop_down ; loop until i wraps below 0

        ; ---- extraction phase : heap is built now ----
        ; jmp end_debug
        LOAD e0, 0 ; cache current max
        LOAD e1, 1
 
        ; below this size the simple main_loop_sort_5 path is used
LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 also can work

        cmp num_last, LEVEL_3_LIMIT + 1
        jb main_loop_sort_5

MY_ALIGN_16
main_loop_sort:
        ; num_last > LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD qq, num_last   ; qq = last item, to be sifted from the root
        STORE e0, num_last  ; place current max at the end
        mov e0, e1

        ; refresh the per-iteration sift limits
        mov next4_lim, num_last
        shr next4_lim, 2
        mov pref_lim, num_last
        shr pref_lim, NUM_PREFETCH_LEVELS + 1

        dec num_last
if 0 ; 1 for debug
        ; that optional optimization can improve the performance, if there are identical items in array
        ;   3 times improvement : if all items in array are identical
        ;   20% improvement : if items are different for 1 bit only
        ;   1-10% improvement : if items are different for (2+) bits
        ;   no gain : if items are different
        cmp qq, e1
        jae next_iter_main
endif
        ; resolve level 2 (nodes 2,3) and preload level 3 by hand,
        ; then let MOVE_SMALLEST_UP/STEP_2 continue from node k
        LOAD e1, 2
        LOAD t0, 3
        mov k_x, 2
        cmp e1, t0
        cmovb e1, t0
        mov t0, [p + 4 * (4 + 0)]
        mov t1, [p + 4 * (4 + 1)]
        cmovb t0, [p + 4 * (4 + 2)]
        cmovb t1, [p + 4 * (4 + 3)]
        adc k_x, 0
        ; (qq <= e1), because the tree is correctly sorted
        ; also here we could check (qq >= e1) or (qq == e1) for faster exit
        lea s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls

next_iter_main:
        cmp num_last, LEVEL_3_LIMIT
        jne main_loop_sort

        ; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
        ; 4 <= num_last <= LEVEL_3_LIMIT : tree has at most 3 levels,
        ; so the whole sift fits in the straight-line code below
        ; p[size--] = p[0];
        LOAD qq, num_last
        STORE e0, num_last
        mov e0, e1
        dec num_last_x

        LOAD e1, 2
        LOAD t0, 3
        mov k_x, 2
        cmp e1, t0
        cmovb e1, t0
        adc k_x, 0 ; k = index of max of nodes 2, 3

        lea s_x, dword ptr [k * 2]
        cmp s_x, num_last_x
        ja exit_2 ; node k has no children

        mov t0, [p + k * 8]
        je exit_1 ; s == num_last : single child

        ; s < num_last : two children, pick the max
        mov t1, [p + k * 8 + 4]
        cmp t0, t1
        cmovb t0, t1
        adc s_x, 0
exit_1:
        STORE t0, k
        cmp qq, t0
        cmovb k_x, s_x
exit_2:
        STORE qq, k
        cmp num_last_x, 3
        jne main_loop_sort_5

        ; num_last == 3 (real_size == 4) : final 4 items
        LOAD a0, 2
        LOAD a1, 3
        STORE e1, 2
        STORE e0, 3
        SORT a0, a1
end_2:
        STORE a0, 0
        STORE a1, 1
; end_debug:
        ; MY_POP_PRESERVED_ABI_REGS
        pop r12
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
532 | |||
533 | |||
534 | |||
else
; ------------ x86 32-bit ------------

ifdef x64
IS_CDECL = 0
endif

; Register role aliases for the 32-bit build.
; NOTE(review): in this version several indices appear pre-scaled
; (s == 2 * node, i == 2 * node), so byte addresses use [p + idx * 2]
; for the node value and [p + idx * 4] for its children -- see loop_down.

acc equ x0          ; scratch accumulator
k equ r0            ; current parent node index
k_x equ acc

p equ r1            ; pointer to the array of 32-bit items

num_last equ r2     ; index of the last item
num_last_x equ x2

a0 equ x3
t0 equ a0           ; child value temporary

a3 equ x5
i equ r5            ; heap-build loop counter
e0 equ a3           ; cached (p)[0]

a1 equ x6
qq equ a1           ; value being sifted down

a2 equ x7
s equ r7            ; child index (pre-scaled)
s_x equ a2
564 | |||
565 | |||
; Sort a pair of registers ascending: b0 <- min, b1 <- max.
; Branchy version (32-bit build has too few registers for the cmov form);
; xchg is used instead of a 3-mov swap through acc.
SORT macro b0, b1
        cmp b1, b0
        jae @F
if 1
        xchg b0, b1
else
        mov acc, b0
        mov b0, b1 ; min
        mov b1, acc ; max
endif
@@:
endm

; Array items are dwords: element (index) lives at [p + 4 * index].
LOAD macro dest:req, index:req
        mov dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov [p + 4 * index], reg
endm
586 | |||
587 | |||
; One sift-down level, branch-free child selection.
; Entry invariant: s == 2*k, so (p)[s] is at [p + k * 8] (dword items).
; CF from the child comparison bumps s to the larger child via adc.
; Jumps to exit_label when qq >= max child; exit_label must STORE qq, k.
STEP_1 macro exit_label
        mov t0, [p + k * 8]         ; t0 = (p)[2k]
        cmp t0, [p + k * 8 + 4]     ; compare against (p)[2k + 1]
        adc s, 0                    ; s = index of larger child
        LOAD t0, s                  ; t0 = larger child value
        STORE t0, k ; we look-ahead the store for the most expected branch
        cmp qq, t0
        jae exit_label
        ; STORE t0, k ; use if the look-ahead store above is removed
        mov k, s
        add s, s
        ; lea s, dword ptr [s + s]
        ; shl s, 1
        ; lea s, dword ptr [s * 2]
endm

; Same step using a conditional branch instead of adc/cmov child selection;
; faster for large arrays (branch predictor prefetches the taken path).
STEP_BRANCH macro exit_label
        mov t0, [p + k * 8]
        cmp t0, [p + k * 8 + 4]
        jae @F
        inc s
        mov t0, [p + k * 8 + 4]
@@:
        cmp qq, t0
        jae exit_label
        STORE t0, k
        mov k, s
        add s, s
endm
617 | |||
618 | |||
619 | |||
; Sift qq down from node k and store it at its final slot (32-bit build).
; STEP        : STEP_1 or STEP_BRANCH (one level of descent)
; num_unrolls : unrolled iterations before the aligned main loop
; exit_2      : caller-supplied label name for the common exit; it is
;               DEFINED here (bottom of the macro) so the caller's prologue
;               code can jump to it too.
; Entry: s == k * 2, qq = sifted value.
MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
        LOCAL leaves, opt_loop, single

        ; s == k * 2
rept num_unrolls
        cmp s, num_last
        jae leaves
        STEP_1 exit_2
endm
        cmp s, num_last
        jb opt_loop

leaves:
        ; (s >= num_last)
        jne exit_2 ; s > num_last : node k has no children
single:
        ; (s == num_last) : single child, it is the last item
        mov t0, [p + k * 8] ; t0 = (p)[s] (s == 2k)
        cmp qq, t0
        jae exit_2
        STORE t0, k
        mov k, s
        jmp exit_2

MY_ALIGN_16
opt_loop:
        STEP exit_2
        cmp s, num_last
        jb opt_loop
        je single
exit_2:
        STORE qq, k
endm
653 | |||
654 | |||
655 | |||
656 | |||
ifdef Z7_SORT_ASM_USE_SEGMENT
        ; MY_ALIGN_64
else
MY_ALIGN_16
endif

; HeapSort(p, size) -- 32-bit build; sorts an array of dwords ascending.
; 2 arguments: p = array pointer, size = number of items.
; Arguments arrive in registers (fastcall-style) unless IS_CDECL, in which
; case they are read from the stack relative to r4 (esp).
MY_PROC HeapSort, 2
ifdef x64
if (IS_LINUX gt 0)
        mov num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
        mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
elseif (IS_CDECL gt 0)
        mov num_last, [r4 + REG_SIZE * 2]
        mov p, [r4 + REG_SIZE * 1]
endif
        cmp num_last, 2
        jb end_1 ; 0 or 1 items : nothing to sort
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

        cmp num_last, 4
        ja sort_5

        ; ---- sizes 2..4 : small sorting network ----
        LOAD a0, 0
        LOAD a1, 1
        SORT a0, a1
        cmp num_last, 3
        jb end_2 ; size == 2

        LOAD a2, 2
        je sort_3 ; size == 3

        ; size == 4
        LOAD a3, 3
        SORT a2, a3
        SORT a1, a3
        STORE a3, 3
sort_3:
        SORT a0, a2
        SORT a1, a2
        STORE a2, 2
        jmp end_2

sort_5:
        ; num_last > 4
        ; i runs pre-scaled (i == 2 * node) and steps by 2 per parent
        lea i, dword ptr [num_last - 2]
        dec num_last
        test i, 1
        jz loop_down

        ; single child : compare last parent with its only (last) child
        ; [p + num_last * 4] == child, [p + num_last * 2] == parent (scaled)
        mov t0, [p + num_last * 4]
        mov qq, [p + num_last * 2]
        dec i
        cmp qq, t0
        jae loop_down

        ; swap parent and child
        mov [p + num_last * 2], t0
        mov [p + num_last * 4], qq

MY_ALIGN_16
loop_down:
        ; ---- heap-build phase ----
        ; [p + i * 4] / [p + i * 4 + 4] = children, [p + i * 2] = parent
        mov t0, [p + i * 4]
        cmp t0, [p + i * 4 + 4]
        mov k, i
        mov qq, [p + i * 2]
        adc k, 0 ; k = index of larger child (pre-scaled, via CF)
        LOAD t0, k
        cmp qq, t0
        jae down_next ; parent already >= both children
        mov [p + i * 2], t0 ; move larger child up
        lea s, dword ptr [k + k]

DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
        sub i, 2
        jnb loop_down
        ; jmp end_debug

        ; ---- extraction phase : heap is built now ----
        LOAD e0, 0 ; cache current max

MY_ALIGN_16
main_loop_sort:
        ; num_last > 3
        ; resolve the first level (nodes 2, 3) inline, then sift
        mov t0, [p + 2 * 4]
        cmp t0, [p + 3 * 4]
        LOAD qq, num_last   ; qq = last item, to be sifted from the root
        STORE e0, num_last  ; place current max at the end
        LOAD e0, 1
        mov s_x, 2
        mov k_x, 1
        adc s, 0 ; s = index of max of nodes 2, 3 (CF from cmp above)
        LOAD t0, s
        dec num_last
        cmp qq, t0
        jae main_exit_label
        STORE t0, 1
        mov k, s
        add s, s
if 1
        ; for branch data prefetch mode :
        ; it's faster for large arrays : larger than (1 << 13) items.
MAIN_num_unrolls equ 10
STEP_LOOP equ STEP_BRANCH
else
MAIN_num_unrolls equ 0
STEP_LOOP equ STEP_1
endif

        MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label

        ; jmp end_debug
        cmp num_last, 3
        jne main_loop_sort

        ; num_last == 3 (real_size == 4) : final 4 items
        LOAD a0, 2
        LOAD a1, 3
        LOAD a2, 1
        STORE e0, 3 ; e0 is alias for a3
        STORE a2, 2
        SORT a0, a1
end_2:
        STORE a0, 0
        STORE a1, 1
; end_debug:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
786 | |||
787 | endif | ||
788 | |||
789 | ifdef Z7_SORT_ASM_USE_SEGMENT | ||
790 | _TEXT$Z7_SORT ENDS | ||
791 | endif | ||
792 | |||
793 | if 0 | ||
794 | LEA_IS_D8 (R64) [R2 * 4 + 16] | ||
795 | Lat : TP | ||
796 | 2 : 1 : adl-e | ||
797 | 2 : 3 p056 adl-p | ||
798 | 1 : 2 : p15 hsw-rocket | ||
799 | 1 : 2 : p01 snb-ivb | ||
800 | 1 : 1 : p1 conroe-wsm | ||
801 | 1 : 4 : zen3,zen4 | ||
802 | 2 : 4 : zen1,zen2 | ||
803 | |||
804 | LEA_B_IS (R64) [R2 + R3 * 4] | ||
805 | Lat : TP | ||
806 | 1 : 1 : adl-e | ||
807 | 2 : 3 p056 adl-p | ||
808 | 1 : 2 : p15 hsw-rocket | ||
809 | 1 : 2 : p01 snb-ivb | ||
810 | 1 : 1 : p1 nhm-wsm | ||
811 | 1 : 1 : p0 conroe-wsm | ||
812 | 1 : 4 : zen3,zen4 | ||
813 | 2 :2,4 : zen1,zen2 | ||
814 | |||
815 | LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16] | ||
816 | Lat : TP | ||
817 | 2 : 1 : adl-e | ||
818 | 2 : 3 p056 adl-p | ||
819 | 1 : 2 : p15 ice-rocket | ||
820 | 3 : 1 : p1/p15 hsw-rocket | ||
821 | 3 : 1 : p01 snb-ivb | ||
822 | 1 : 1 : p1 nhm-wsm | ||
823 | 1 : 1 : p0 conroe-wsm | ||
824 | 2,1 : 2 : zen3,zen4 | ||
825 | 2 : 2 : zen1,zen2 | ||
826 | |||
827 | CMOVB (R64, R64) | ||
828 | Lat : TP | ||
829 | 1,2 : 2 : adl-e | ||
830 | 1 : 2 p06 adl-p | ||
831 | 1 : 2 : p06 bwd-rocket | ||
832 | 1,2 : 2 : p0156+p06 hsw | ||
833 | 1,2 :1.5 : p015+p05 snb-ivb | ||
834 | 1,2 : 1 : p015+p05 nhm | ||
835 | 1 : 1 : 2*p015 conroe | ||
836 | 1 : 2 : zen3,zen4 | ||
837 | 1 : 4 : zen1,zen2 | ||
838 | |||
839 | ADC (R64, 0) | ||
840 | Lat : TP | ||
841 | 1,2 : 2 : adl-e | ||
842 | 1 : 2 p06 adl-p | ||
843 | 1 : 2 : p06 bwd-rocket | ||
844 | 1 :1.5 : p0156+p06 hsw | ||
845 | 1 :1.5 : p015+p05 snb-ivb | ||
2 : 1 : 2*p015 conroe-wsm
847 | 1 : 2 : zen1,zen2,zen3,zen4 | ||
848 | |||
849 | PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution. | ||
850 | L1 : Pentium3 | ||
851 | L2 : NetBurst | ||
852 | L1, not L2: Core duo, Core 2, Atom processors | ||
853 | L1, not L2, may fetch into L3 with fast replacement: Nehalem, Westmere, Sandy Bridge, ... | ||
854 | NEHALEM: Fills L1/L3, L1 LRU is not updated | ||
855 | L3 with fast replacement: Xeon Processors based on Nehalem, Westmere, Sandy Bridge, ... | ||
856 | PREFETCHT0 : fetch data into all cache levels. | ||
857 | PREFETCHT1 : fetch data into L2 and L3 | ||
858 | endif | ||
859 | |||
860 | end | ||