Diffstat
-rw-r--r-- | C/Sort.c | 355 |
1 file changed, 241 insertions, 114 deletions
@@ -1,141 +1,268 @@ | |||
1 | /* Sort.c -- Sort functions | 1 | /* Sort.c -- Sort functions |
2 | 2014-04-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "Sort.h" | 6 | #include "Sort.h" |
7 | #include "CpuArch.h" | ||
7 | 8 | ||
8 | #define HeapSortDown(p, k, size, temp) \ | 9 | #if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ |
9 | { for (;;) { \ | 10 | || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ |
10 | size_t s = (k << 1); \ | 11 | ) |
11 | if (s > size) break; \ | 12 | // the code with prefetch is slow for small arrays on x86. |
12 | if (s < size && p[s + 1] > p[s]) s++; \ | 13 | // So we disable prefetch for x86. |
13 | if (temp >= p[s]) break; \ | 14 | #ifndef MY_CPU_X86 |
14 | p[k] = p[s]; k = s; \ | 15 | // #pragma message("Z7_PREFETCH : __builtin_prefetch") |
15 | } p[k] = temp; } | 16 | #define Z7_PREFETCH(a) __builtin_prefetch((a)) |
17 | #endif | ||
16 | 18 | ||
17 | void HeapSort(UInt32 *p, size_t size) | 19 | #elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) |
18 | { | 20 | |
19 | if (size <= 1) | 21 | #include "7zWindows.h" |
20 | return; | 22 | |
21 | p--; | 23 | // NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. |
22 | { | 24 | // For example, clang-cl can generate "prefetcht2" instruction for |
23 | size_t i = size / 2; | 25 | // PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. |
24 | do | 26 | // But we want to generate "prefetcht0" instruction. |
25 | { | 27 | // So for CLANG/GCC we must use __builtin_prefetch() in code branch above |
26 | UInt32 temp = p[i]; | 28 | // instead of PreFetchCacheLine() / _mm_prefetch(). |
27 | size_t k = i; | 29 | |
28 | HeapSortDown(p, k, size, temp) | 30 | // New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. |
29 | } | 31 | // But old x86 cpus don't support "prefetcht0". |
30 | while (--i != 0); | 32 | // So we will use PreFetchCacheLine(), only if we are sure that |
31 | } | 33 | // generated instruction is supported by all cpus of that isa. |
32 | /* | 34 | #if defined(MY_CPU_AMD64) \ |
33 | do | 35 | || defined(MY_CPU_ARM64) \ |
34 | { | 36 | || defined(MY_CPU_IA64) |
35 | size_t k = 1; | 37 | // we need to use additional parentheses around (a) in the PreFetchCacheLine call, because |
36 | UInt32 temp = p[size]; | 38 | // the PreFetchCacheLine macro doesn't parenthesize its arguments: |
37 | p[size--] = p[1]; | 39 | // #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) |
38 | HeapSortDown(p, k, size, temp) | 40 | // #pragma message("Z7_PREFETCH : PreFetchCacheLine") |
39 | } | 41 | #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) |
40 | while (size > 1); | 42 | #endif |
41 | */ | 43 | |
42 | while (size > 3) | 44 | #endif // _WIN32 |
43 | { | 45 | |
44 | UInt32 temp = p[size]; | 46 | |
45 | size_t k = (p[3] > p[2]) ? 3 : 2; | 47 | #define PREFETCH_NO(p,k,s,size) |
46 | p[size--] = p[1]; | 48 | |
47 | p[1] = p[k]; | 49 | #ifndef Z7_PREFETCH |
48 | HeapSortDown(p, k, size, temp) | 50 | #define SORT_PREFETCH(p,k,s,size) |
49 | } | 51 | #else |
50 | { | 52 | |
51 | UInt32 temp = p[size]; | 53 | // #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes |
52 | p[size] = p[1]; | 54 | #define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) |
53 | if (size > 2 && p[2] < temp) | 55 | // #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) |
54 | { | 56 | |
55 | p[1] = p[2]; | 57 | #if PREFETCH_LEVEL == 0 |
56 | p[2] = temp; | 58 | |
57 | } | 59 | #define SORT_PREFETCH(p,k,s,size) |
58 | else | 60 | |
59 | p[1] = temp; | 61 | #else // PREFETCH_LEVEL != 0 |
60 | } | 62 | |
63 | /* | ||
64 | if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
65 | we prefetch one value per cache line. | ||
66 | Use it if the array is aligned to the cache line size (64 bytes) | ||
67 | or if the array is small (less than the L1 cache size). | ||
68 | |||
69 | if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
70 | we prefetch all cache lines that can be required. | ||
71 | It can be faster for big unaligned arrays. | ||
72 | */ | ||
73 | #define USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
74 | |||
75 | // s == k * 2 | ||
76 | #if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) | ||
77 | // x86 supports (lea r1*8+offset) | ||
78 | #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) | ||
79 | #else | ||
80 | #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) | ||
81 | #endif | ||
82 | |||
83 | #if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
84 | #define PREFETCH_ADD_OFFSET 0 | ||
85 | #else | ||
86 | // last offset that can be required in a PREFETCH_LEVEL step: | ||
87 | #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) | ||
88 | #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 | ||
89 | #endif | ||
90 | |||
91 | #if PREFETCH_LEVEL <= 3 | ||
92 | |||
93 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
94 | #define SORT_PREFETCH(p,k,s,size) \ | ||
95 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ | ||
96 | if (s2 <= size) { \ | ||
97 | Z7_PREFETCH((p + s2)); \ | ||
98 | }} | ||
99 | #else /* for unaligned array */ | ||
100 | #define SORT_PREFETCH(p,k,s,size) \ | ||
101 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
102 | if (s2 <= size) { \ | ||
103 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
104 | Z7_PREFETCH((p + s2)); \ | ||
105 | }} | ||
106 | #endif | ||
107 | |||
108 | #else // PREFETCH_LEVEL > 3 | ||
109 | |||
110 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
111 | #define SORT_PREFETCH(p,k,s,size) \ | ||
112 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ | ||
113 | if (s2 <= size) { \ | ||
114 | Z7_PREFETCH((p + s2 - 16)); \ | ||
115 | Z7_PREFETCH((p + s2)); \ | ||
116 | }} | ||
117 | #else /* for unaligned array */ | ||
118 | #define SORT_PREFETCH(p,k,s,size) \ | ||
119 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
120 | if (s2 <= size) { \ | ||
121 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
122 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ | ||
123 | Z7_PREFETCH((p + s2)); \ | ||
124 | }} | ||
125 | #endif | ||
126 | |||
127 | #endif // PREFETCH_LEVEL > 3 | ||
128 | #endif // PREFETCH_LEVEL != 0 | ||
129 | #endif // Z7_PREFETCH | ||
130 | |||
131 | |||
132 | #if defined(MY_CPU_ARM64) \ | ||
133 | /* || defined(MY_CPU_AMD64) */ \ | ||
134 | /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ | ||
135 | // we want to use cmov, if cmov is very fast: | ||
136 | // - this cmov version is slower for clang-x64. | ||
137 | // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. | ||
138 | #define Z7_FAST_CMOV_SUPPORTED | ||
139 | #endif | ||
140 | |||
141 | #ifdef Z7_FAST_CMOV_SUPPORTED | ||
142 | // we want to use cmov here, if cmov is fast (new arm64 cpus): | ||
143 | // i.e. we want the compiler to emit a conditional move for this branch | ||
144 | #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; | ||
145 | #else | ||
146 | // use this branch if the cpu doesn't support fast conditional move; | ||
147 | // it re-reads the max value from the array instead: | ||
148 | #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; | ||
149 | #endif | ||
150 | |||
151 | #define HeapSortDown(p, k, size, temp, macro_prefetch) \ | ||
152 | { \ | ||
153 | for (;;) { \ | ||
154 | UInt32 n0, n1; \ | ||
155 | size_t s = k * 2; \ | ||
156 | if (s >= size) { \ | ||
157 | if (s == size) { \ | ||
158 | n0 = p[s]; \ | ||
159 | p[k] = n0; \ | ||
160 | if (temp < n0) k = s; \ | ||
161 | } \ | ||
162 | break; \ | ||
163 | } \ | ||
164 | n0 = p[k * 2]; \ | ||
165 | n1 = p[k * 2 + 1]; \ | ||
166 | s += n0 < n1; \ | ||
167 | GET_MAX_VAL(n0, n1, p[s]) \ | ||
168 | if (temp >= n0) break; \ | ||
169 | macro_prefetch(p, k, s, size) \ | ||
170 | p[k] = n0; \ | ||
171 | k = s; \ | ||
172 | } \ | ||
173 | p[k] = temp; \ | ||
61 | } | 174 | } |
62 | 175 | ||
63 | void HeapSort64(UInt64 *p, size_t size) | 176 | |
177 | /* | ||
178 | stage-1 : O(n) : | ||
179 | we generate an intermediate, partially sorted binary tree: | ||
180 | p[0] : an additional item for better alignment of the tree structure in memory. | ||
181 | p[1] | ||
182 | p[2] p[3] | ||
183 | p[4] p[5] p[6] p[7] | ||
184 | ... | ||
185 | p[x] >= p[x * 2] | ||
186 | p[x] >= p[x * 2 + 1] | ||
187 | |||
188 | stage-2 : O(n)*log2(n) : | ||
189 | we move the largest item p[0] from the head of the tree to the end of the array | ||
190 | and insert the last item into the sorted binary tree. | ||
191 | */ | ||
192 | |||
193 | // (p) must be aligned to the cache line size (64 bytes) for best performance | ||
194 | |||
195 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) | ||
64 | { | 196 | { |
65 | if (size <= 1) | 197 | if (size < 2) |
66 | return; | 198 | return; |
67 | p--; | 199 | if (size == 2) |
68 | { | ||
69 | size_t i = size / 2; | ||
70 | do | ||
71 | { | ||
72 | UInt64 temp = p[i]; | ||
73 | size_t k = i; | ||
74 | HeapSortDown(p, k, size, temp) | ||
75 | } | ||
76 | while (--i != 0); | ||
77 | } | ||
78 | /* | ||
79 | do | ||
80 | { | 200 | { |
81 | size_t k = 1; | 201 | const UInt32 a0 = p[0]; |
82 | UInt64 temp = p[size]; | 202 | const UInt32 a1 = p[1]; |
83 | p[size--] = p[1]; | 203 | const unsigned k = a1 < a0; |
84 | HeapSortDown(p, k, size, temp) | 204 | p[k] = a0; |
85 | } | 205 | p[k ^ 1] = a1; |
86 | while (size > 1); | 206 | return; |
87 | */ | ||
88 | while (size > 3) | ||
89 | { | ||
90 | UInt64 temp = p[size]; | ||
91 | size_t k = (p[3] > p[2]) ? 3 : 2; | ||
92 | p[size--] = p[1]; | ||
93 | p[1] = p[k]; | ||
94 | HeapSortDown(p, k, size, temp) | ||
95 | } | 207 | } |
96 | { | 208 | { |
97 | UInt64 temp = p[size]; | 209 | // stage-1 : O(n) |
98 | p[size] = p[1]; | 210 | // we transform the array into a partially sorted binary tree. |
99 | if (size > 2 && p[2] < temp) | 211 | size_t i = --size / 2; |
212 | // (size) is now the index of the last item in the tree | ||
213 | // if (i) | ||
100 | { | 214 | { |
101 | p[1] = p[2]; | 215 | do |
102 | p[2] = temp; | 216 | { |
217 | const UInt32 temp = p[i]; | ||
218 | size_t k = i; | ||
219 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
220 | } | ||
221 | while (--i); | ||
222 | } | ||
223 | { | ||
224 | const UInt32 temp = p[0]; | ||
225 | const UInt32 a1 = p[1]; | ||
226 | if (temp < a1) | ||
227 | { | ||
228 | size_t k = 1; | ||
229 | p[0] = a1; | ||
230 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
231 | } | ||
103 | } | 232 | } |
104 | else | ||
105 | p[1] = temp; | ||
106 | } | 233 | } |
107 | } | ||
108 | 234 | ||
109 | /* | 235 | if (size < 3) |
110 | #define HeapSortRefDown(p, vals, n, size, temp) \ | 236 | { |
111 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | 237 | // size == 2 |
112 | size_t s = (k << 1); \ | 238 | const UInt32 a0 = p[0]; |
113 | if (s > size) break; \ | 239 | p[0] = p[2]; |
114 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | 240 | p[2] = a0; |
115 | if (val >= vals[p[s]]) break; \ | ||
116 | p[k] = p[s]; k = s; \ | ||
117 | } p[k] = temp; } | ||
118 | |||
119 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
120 | { | ||
121 | if (size <= 1) | ||
122 | return; | 241 | return; |
123 | p--; | 242 | } |
243 | if (size != 3) | ||
124 | { | 244 | { |
125 | size_t i = size / 2; | 245 | // stage-2 : O(size) * log2(size): |
246 | // we move the largest item p[0] from the head to the end of the array, | ||
247 | // and insert the last item into the sorted binary tree. | ||
126 | do | 248 | do |
127 | { | 249 | { |
128 | UInt32 temp = p[i]; | 250 | const UInt32 temp = p[size]; |
129 | HeapSortRefDown(p, vals, i, size, temp); | 251 | size_t k = p[2] < p[3] ? 3 : 2; |
252 | p[size--] = p[0]; | ||
253 | p[0] = p[1]; | ||
254 | p[1] = p[k]; | ||
255 | HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO | ||
130 | } | 256 | } |
131 | while (--i != 0); | 257 | while (size != 3); |
132 | } | 258 | } |
133 | do | ||
134 | { | 259 | { |
135 | UInt32 temp = p[size]; | 260 | const UInt32 a2 = p[2]; |
136 | p[size--] = p[1]; | 261 | const UInt32 a3 = p[3]; |
137 | HeapSortRefDown(p, vals, 1, size, temp); | 262 | const size_t k = a2 < a3; |
263 | p[2] = p[1]; | ||
264 | p[3] = p[0]; | ||
265 | p[k] = a3; | ||
266 | p[k ^ 1] = a2; | ||
138 | } | 267 | } |
139 | while (size > 1); | ||
140 | } | 268 | } |
141 | */ | ||
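The compiler dispatch at the top of the new file picks a prefetch intrinsic per toolchain. Below is a minimal stand-alone sketch of the same idea; MY_PREFETCH is a hypothetical stand-in for Z7_PREFETCH, and the checks are simplified (the real code uses CpuArch.h macros such as MY_CPU_X86 and Z7_has_builtin, disables prefetch entirely on x86, and deliberately prefers __builtin_prefetch over PreFetchCacheLine under clang, since the diff notes that hint constants like _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1 can map to different prefetch instructions under clang-cl).

/* Minimal sketch of the prefetch dispatch in the diff above.
   MY_PREFETCH is an illustrative stand-in for Z7_PREFETCH. */
#if defined(__GNUC__) || defined(__clang__)
  /* GCC/Clang: defaults to a read prefetch with maximum temporal
     locality, which is "prefetcht0" on x86. */
  #define MY_PREFETCH(a) __builtin_prefetch(a)
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  #include <intrin.h>
  /* MSVC: _MM_HINT_T0 also requests the data in all cache levels. */
  #define MY_PREFETCH(a) _mm_prefetch((const char *)(a), _MM_HINT_T0)
#else
  #define MY_PREFETCH(a) /* no-op on unknown compilers */
#endif

A prefetch is only a hint, so the no-op fallback changes nothing about correctness; the diff's restriction to AMD64/ARM64/IA64 in the Windows branch exists only because "prefetcht0" is not supported by all old x86 cpus.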
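The PREFETCH_OFFSET / PREFETCH_RANGE arithmetic is easier to see with concrete numbers. With PREFETCH_LEVEL == 3 and the aligned-array variant, a sift step at node k prefetches the block of descendants four levels further down: those indices span [k*16, k*16 + 15], and 16 UInt32 values are exactly one 64-byte cache line when the array is 64-byte aligned (the unaligned variant touches the first and last index of the range, covering both lines it can straddle). The worked example below only prints those ranges; it is illustrative, not part of the SDK.

#include <stdio.h>

/* Worked example for PREFETCH_LEVEL == 3 (aligned-array variant):
   from node k, four more sift steps land on some index in
   [16*k, 16*k + 15]; with 4-byte items that is one 64-byte line. */
#define PREFETCH_LEVEL 3
#define PREFETCH_OFFSET(k) ((k) << (PREFETCH_LEVEL + 1))
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)

int main(void)
{
  size_t k;
  for (k = 1; k <= 4; k++)
  {
    const size_t first = PREFETCH_OFFSET(k);     /* leftmost descendant, 4 levels down */
    const size_t last = first + PREFETCH_RANGE;  /* rightmost descendant, 4 levels down */
    printf("k=%zu : descendants 4 levels down = p[%zu..%zu] (%zu bytes)\n",
           k, first, last, (last - first + 1) * sizeof(unsigned));
  }
  return 0;
}

This also explains the prefetch distance: the hint is issued four levels before the sift can reach that line, which gives the memory system time to fill it.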
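The HeapSortDown macro is the core sift-down of the new code. Below is a plain-function transcription of it, with the prefetch hook removed and GET_MAX_VAL expanded to its cmov-friendly form; SiftDown is my name, and uint32_t stands in for the SDK's UInt32.

#include <stddef.h>
#include <stdint.h>

/* Plain-function transcription of the HeapSortDown macro above.
   Convention: children of node x are p[2*x] and p[2*x + 1];
   (size) is the index of the last item in the tree. */
static void SiftDown(uint32_t *p, size_t k, size_t size, uint32_t temp)
{
  for (;;)
  {
    uint32_t n0, n1;
    size_t s = k * 2;
    if (s >= size)
    {
      if (s == size)        /* node k has exactly one child */
      {
        n0 = p[s];
        p[k] = n0;          /* harmless if not promoted: overwritten below */
        if (temp < n0)
          k = s;
      }
      break;
    }
    n0 = p[s];
    n1 = p[s + 1];
    s += (n0 < n1);         /* index of the larger child */
    if (n0 < n1)            /* GET_MAX_VAL: a cmov/csel on fast-cmov cpus */
      n0 = n1;
    if (temp >= n0)
      break;                /* temp already dominates both children */
    p[k] = n0;              /* promote the larger child and descend */
    k = s;
  }
  p[k] = temp;              /* temp lands at its final position */
}

The non-cmov GET_MAX_VAL variant would replace the `if (n0 < n1) n0 = n1;` line with a reload `n0 = p[s];`, trading a conditional move for a dependent memory read, which the diff's comments say is faster on cpus without fast cmov.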
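The size == 2 fast path (and, in mirrored form, the 4-item epilogue) sorts a pair without a conditional branch: the comparison result is used directly as a store index. A self-contained version, with the hypothetical name SortPair:

#include <assert.h>

/* Branchless two-element sort, as in the size == 2 path above:
   k is 1 exactly when the pair is out of order, so a0 is stored
   last exactly when it is the larger value. */
static void SortPair(unsigned *p)
{
  const unsigned a0 = p[0];
  const unsigned a1 = p[1];
  const unsigned k = (a1 < a0);
  p[k] = a0;
  p[k ^ 1] = a1;
}

int main(void)
{
  unsigned a[2] = { 5, 3 };
  unsigned b[2] = { 3, 5 };
  SortPair(a);
  SortPair(b);
  assert(a[0] == 3 && a[1] == 5);
  assert(b[0] == 3 && b[1] == 5);
  return 0;
}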
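Finally, a hedged usage sketch: the rewrite keeps the old entry point, so a quick differential test against the C library's qsort exercises both stages (heap construction and extraction). This assumes the 7-Zip C sources are on the include path; "Sort.h" declares void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) per the diff, and the seed and multiplier below are arbitrary.

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "Sort.h"

static int CmpUInt32(const void *a, const void *b)
{
  const UInt32 x = *(const UInt32 *)a;
  const UInt32 y = *(const UInt32 *)b;
  return (x > y) - (x < y);
}

int main(void)
{
  enum { N = 1000 };
  UInt32 a[N], b[N];
  size_t i;
  srand(12345);
  for (i = 0; i < N; i++)
    a[i] = b[i] = (UInt32)rand() * 2654435761u;  /* scramble rand()'s limited range */
  HeapSort(a, N);
  qsort(b, N, sizeof(b[0]), CmpUInt32);
  assert(memcmp(a, b, sizeof(a)) == 0);          /* both orders must agree */
  return 0;
}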