path: root/C/Sort.c
Diffstat:
-rw-r--r--  C/Sort.c | 355
1 file changed, 241 insertions(+), 114 deletions(-)
diff --git a/C/Sort.c b/C/Sort.c
index e1097e3..20e3e69 100644
--- a/C/Sort.c
+++ b/C/Sort.c
@@ -1,141 +1,268 @@
 /* Sort.c -- Sort functions
-2014-04-05 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
 #include "Sort.h"
+#include "CpuArch.h"
 
-#define HeapSortDown(p, k, size, temp) \
-  { for (;;) { \
-    size_t s = (k << 1); \
-    if (s > size) break; \
-    if (s < size && p[s + 1] > p[s]) s++; \
-    if (temp >= p[s]) break; \
-    p[k] = p[s]; k = s; \
-  } p[k] = temp; }
+#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
+   )
+// the code with prefetch is slow for small arrays on x86.
+// So we disable prefetch for x86.
+#ifndef MY_CPU_X86
+  // #pragma message("Z7_PREFETCH : __builtin_prefetch")
+  #define Z7_PREFETCH(a) __builtin_prefetch((a))
+#endif
 
-void HeapSort(UInt32 *p, size_t size)
-{
-  if (size <= 1)
-    return;
-  p--;
-  {
-    size_t i = size / 2;
-    do
-    {
-      UInt32 temp = p[i];
-      size_t k = i;
-      HeapSortDown(p, k, size, temp)
-    }
-    while (--i != 0);
-  }
-  /*
-  do
-  {
-    size_t k = 1;
-    UInt32 temp = p[size];
-    p[size--] = p[1];
-    HeapSortDown(p, k, size, temp)
-  }
-  while (size > 1);
-  */
-  while (size > 3)
+#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
+
+#include "7zWindows.h"
+
+// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
+// For example, clang-cl can generate a "prefetcht2" instruction for a
+// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
+// But we want to generate a "prefetcht0" instruction.
+// So for CLANG/GCC we must use __builtin_prefetch() in the code branch above
+// instead of PreFetchCacheLine() / _mm_prefetch().
+
+// The new msvc-x86 compiler generates a "prefetcht0" instruction for a PreFetchCacheLine() call.
+// But old x86 cpus don't support "prefetcht0".
+// So we will use PreFetchCacheLine() only if we are sure that the
+// generated instruction is supported by all cpus of that isa.
+#if defined(MY_CPU_AMD64) \
+    || defined(MY_CPU_ARM64) \
+    || defined(MY_CPU_IA64)
+// we need additional braces for (a) in the PreFetchCacheLine call, because
+// the PreFetchCacheLine macro doesn't use braces:
+// #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
+  // #pragma message("Z7_PREFETCH : PreFetchCacheLine")
+  #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
+#endif
+
+#endif // _WIN32
-  {
-    UInt32 temp = p[size];
-    size_t k = (p[3] > p[2]) ? 3 : 2;
-    p[size--] = p[1];
-    p[1] = p[k];
-    HeapSortDown(p, k, size, temp)
-  }
-  {
-    UInt32 temp = p[size];
-    p[size] = p[1];
-    if (size > 2 && p[2] < temp)
-    {
-      p[1] = p[2];
-      p[2] = temp;
-    }
-    else
-      p[1] = temp;
-  }
+
+
+#define PREFETCH_NO(p,k,s,size)
+
+#ifndef Z7_PREFETCH
+  #define SORT_PREFETCH(p,k,s,size)
+#else
+
+// #define PREFETCH_LEVEL 2 // use it if the cache line is 32 bytes
+#define PREFETCH_LEVEL 3 // fast for most cases (64-byte cache line prefetch)
+// #define PREFETCH_LEVEL 4 // can be faster for big arrays (128-byte prefetch)
+
+#if PREFETCH_LEVEL == 0
+
+  #define SORT_PREFETCH(p,k,s,size)
+
+#else // PREFETCH_LEVEL != 0
+
+/*
+if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
+  we prefetch one value per cache line.
+  Use it if the array is aligned for the cache line size (64 bytes)
+  or if the array is small (less than the L1 cache size).
+
+if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
+  we prefetch all cache lines that can be required.
+  It can be faster for big unaligned arrays.
+*/
+  #define USE_PREFETCH_FOR_ALIGNED_ARRAY
+
+// s == k * 2
+#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
+  // x86 supports (lea r1*8+offset)
+  #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
+#else
+  #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
+#endif
+
+#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
+  #define PREFETCH_ADD_OFFSET 0
+#else
+  // last offset that can be required in a PREFETCH_LEVEL step:
+  #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
+  #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
+#endif
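
A quick numeric check of the shift arithmetic above (standalone sketch, not part of the commit; LEVEL stands in for PREFETCH_LEVEL): after exactly LEVEL + 1 further sift-down steps from node k, every reachable index lies in [k << (LEVEL + 1), k << (LEVEL + 1) + (2 << LEVEL) - 1], i.e. PREFETCH_OFFSET(k,s) plus at most PREFETCH_RANGE:

/* Standalone check of the PREFETCH_OFFSET / PREFETCH_RANGE bound.
   Illustration only; LEVEL mirrors PREFETCH_LEVEL from the patch. */
#include <stdio.h>

#define LEVEL 3

int main(void)
{
  unsigned k;
  for (k = 1; k < 1000; k++)
  {
    unsigned lo = ~0u, hi = 0, path;
    /* enumerate every child path of length LEVEL + 1 starting at node k */
    for (path = 0; path < (2u << LEVEL); path++)
    {
      unsigned i = k, step;
      for (step = 0; step <= LEVEL; step++)
        i = i * 2 + ((path >> step) & 1);  /* descend to child 2i or 2i+1 */
      if (i < lo) lo = i;
      if (i > hi) hi = i;
    }
    /* reachable indices span [k << (LEVEL+1), k << (LEVEL+1) + PREFETCH_RANGE] */
    if (lo != (k << (LEVEL + 1)) || hi - lo != (2u << LEVEL) - 1)
    {
      printf("mismatch at k=%u\n", k);
      return 1;
    }
  }
  printf("bound holds for k in [1, 999]\n");
  return 0;
}

For LEVEL == 3 that window is 16 UInt32 values, i.e. exactly one 64-byte cache line when the array is aligned, which is why the aligned variant below can issue a single prefetch.
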
+
+#if PREFETCH_LEVEL <= 3
+
+#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
+  #define SORT_PREFETCH(p,k,s,size) \
+  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
+    if (s2 <= size) { \
+      Z7_PREFETCH((p + s2)); \
+  }}
+#else /* for unaligned array */
+  #define SORT_PREFETCH(p,k,s,size) \
+  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
+    if (s2 <= size) { \
+      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
+      Z7_PREFETCH((p + s2)); \
+  }}
+#endif
+
+#else // PREFETCH_LEVEL > 3
+
+#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
+  #define SORT_PREFETCH(p,k,s,size) \
+  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
+    if (s2 <= size) { \
+      Z7_PREFETCH((p + s2 - 16)); \
+      Z7_PREFETCH((p + s2)); \
+  }}
+#else /* for unaligned array */
+  #define SORT_PREFETCH(p,k,s,size) \
+  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
+    if (s2 <= size) { \
+      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
+      Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
+      Z7_PREFETCH((p + s2)); \
+  }}
+#endif
+
+#endif // PREFETCH_LEVEL > 3
+#endif // PREFETCH_LEVEL != 0
+#endif // Z7_PREFETCH
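
With the defaults selected above (PREFETCH_LEVEL == 3, USE_PREFETCH_FOR_ALIGNED_ARRAY defined, so PREFETCH_ADD_OFFSET == 0), SORT_PREFETCH reduces to one guarded prefetch. Approximately, after macro expansion (illustration only; __builtin_prefetch is what Z7_PREFETCH maps to on the gcc/clang branch):

#include <stddef.h>

/* approximate expansion of SORT_PREFETCH(p, k, s, size) under the defaults
   (PREFETCH_LEVEL == 3, PREFETCH_ADD_OFFSET == 0); illustration only */
static void sort_prefetch_expanded(const unsigned *p, size_t k, size_t size)
{
  const size_t s2 = k << 4;        /* PREFETCH_OFFSET(k,s) = k << (3 + 1) */
  if (s2 <= size)
    __builtin_prefetch(p + s2);    /* one 64-byte line = 16 UInt32 values */
}

s2 indexes the descendants of node k four levels down; by the time the sift-down loop reaches them, the line should already sit in L1.
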
+
+
+#if defined(MY_CPU_ARM64) \
+    /* || defined(MY_CPU_AMD64) */ \
+    /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
+  // we want to use cmov, if cmov is very fast:
+  // - this cmov version is slower for clang-x64.
+  // - this cmov version is faster for gcc-arm64 on some fast arm64 cpus.
+  #define Z7_FAST_CMOV_SUPPORTED
+#endif
+
+#ifdef Z7_FAST_CMOV_SUPPORTED
+  // use this branch, if cmov is fast: new arm64 cpus.
+  // we want the compiler to use a conditional move here:
+  #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
+#else
+  // use this branch, if the cpu doesn't support fast conditional move.
+  // it re-reads the value from the array instead:
+  #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
+#endif
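
Both GET_MAX_VAL variants compute max(p[2k], p[2k+1]); they differ only in how the value reaches the register. Written out as functions (illustration only, uint32_t standing in for UInt32):

#include <stdint.h>
#include <stddef.h>

/* cmov-friendly variant: max of two already-loaded values */
static uint32_t max_cmov(uint32_t n0, uint32_t n1)
{
  if (n0 < n1)
    n0 = n1;        /* compiles to csel/cmovcc, no extra memory access */
  return n0;
}

/* fallback variant: re-read the larger child from memory;
   s has already been adjusted by "s += n0 < n1" to index it */
static uint32_t max_reload(const uint32_t *p, size_t s)
{
  return p[s];
}
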
+
+#define HeapSortDown(p, k, size, temp, macro_prefetch) \
+{ \
+  for (;;) { \
+    UInt32 n0, n1; \
+    size_t s = k * 2; \
+    if (s >= size) { \
+      if (s == size) { \
+        n0 = p[s]; \
+        p[k] = n0; \
+        if (temp < n0) k = s; \
+      } \
+      break; \
+    } \
+    n0 = p[k * 2]; \
+    n1 = p[k * 2 + 1]; \
+    s += n0 < n1; \
+    GET_MAX_VAL(n0, n1, p[s]) \
+    if (temp >= n0) break; \
+    macro_prefetch(p, k, s, size) \
+    p[k] = n0; \
+    k = s; \
+  } \
+  p[k] = temp; \
 }
 
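
For readability, here is the same sift-down written as a plain function rather than a macro, with the prefetch hook dropped and uint32_t standing in for UInt32 (a sketch, not the shipped code; size is the index of the last valid item, and the children of node k sit at 2k and 2k + 1):

#include <stdint.h>
#include <stddef.h>

/* plain-C equivalent of the new HeapSortDown(p, k, size, temp, ...) macro,
   without the prefetch hook (illustration only) */
static void sift_down(uint32_t *p, size_t k, size_t size, uint32_t temp)
{
  for (;;)
  {
    size_t s = k * 2;
    if (s >= size)
    {
      if (s == size)            /* node k has a single child p[s] */
      {
        const uint32_t n0 = p[s];
        p[k] = n0;              /* tentatively promote the child */
        if (temp < n0)
          k = s;                /* temp sinks into the child slot */
      }
      break;
    }
    {
      uint32_t n0 = p[s];
      const uint32_t n1 = p[s + 1];
      s += (n0 < n1);           /* index of the larger child */
      if (n0 < n1)
        n0 = n1;                /* n0 = max(children), as GET_MAX_VAL */
      if (temp >= n0)
        break;                  /* heap property already restored */
      p[k] = n0;                /* promote larger child and descend */
      k = s;
    }
  }
  p[k] = temp;
}
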
-void HeapSort64(UInt64 *p, size_t size)
+
+/*
+stage-1 : O(n) :
+  we generate an intermediate, partially sorted binary tree:
+  p[0] : an additional item for better alignment of the tree structure in memory.
+  p[1]
+  p[2] p[3]
+  p[4] p[5] p[6] p[7]
+  ...
+  p[x] >= p[x * 2]
+  p[x] >= p[x * 2 + 1]
+
+stage-2 : O(n * log2(n)) :
+  we move the largest item p[0] from the head of the tree to the end of the array
+  and insert the last item into the sorted binary tree.
+*/
+
+// (p) must be aligned to the cache line size (64 bytes) for best performance
+
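
The stage-1 invariant described above can be spot-checked in a few lines; is_partially_sorted_tree below is a hypothetical helper, not part of the commit (last is the index of the final item, and p[0] holds the overall maximum once stage-1 finishes, as the code after the heapify loop ensures):

#include <stdint.h>
#include <stddef.h>

/* returns 1 if p[0..last] satisfies the tree invariant described above
   (hypothetical checker for the stage-1 output) */
static int is_partially_sorted_tree(const uint32_t *p, size_t last)
{
  size_t x;
  if (last >= 1 && p[0] < p[1])          /* p[0] holds the maximum */
    return 0;
  for (x = 1; x * 2 <= last; x++)
  {
    if (p[x] < p[x * 2])                 /* parent >= left child */
      return 0;
    if (x * 2 + 1 <= last && p[x] < p[x * 2 + 1])  /* parent >= right child */
      return 0;
  }
  return 1;
}
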
+void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
 {
-  if (size <= 1)
+  if (size < 2)
     return;
-  p--;
-  {
-    size_t i = size / 2;
-    do
-    {
-      UInt64 temp = p[i];
-      size_t k = i;
-      HeapSortDown(p, k, size, temp)
-    }
-    while (--i != 0);
-  }
-  /*
-  do
+  if (size == 2)
   {
-    size_t k = 1;
-    UInt64 temp = p[size];
-    p[size--] = p[1];
-    HeapSortDown(p, k, size, temp)
-  }
-  while (size > 1);
-  */
-  while (size > 3)
-  {
-    UInt64 temp = p[size];
-    size_t k = (p[3] > p[2]) ? 3 : 2;
-    p[size--] = p[1];
-    p[1] = p[k];
-    HeapSortDown(p, k, size, temp)
+    const UInt32 a0 = p[0];
+    const UInt32 a1 = p[1];
+    const unsigned k = a1 < a0;
+    p[k] = a0;
+    p[k ^ 1] = a1;
+    return;
   }
   {
-    UInt64 temp = p[size];
-    p[size] = p[1];
-    if (size > 2 && p[2] < temp)
+    // stage-1 : O(n)
+    // we transform the array into a partially sorted binary tree.
+    size_t i = --size / 2;
+    // (size) now is the index of the last item in the tree.
+    // if (i)
     {
-      p[1] = p[2];
-      p[2] = temp;
+      do
+      {
+        const UInt32 temp = p[i];
+        size_t k = i;
+        HeapSortDown(p, k, size, temp, PREFETCH_NO)
+      }
+      while (--i);
+    }
+    {
+      const UInt32 temp = p[0];
+      const UInt32 a1 = p[1];
+      if (temp < a1)
+      {
+        size_t k = 1;
+        p[0] = a1;
+        HeapSortDown(p, k, size, temp, PREFETCH_NO)
+      }
     }
-    else
-      p[1] = temp;
   }
-}
 
-/*
-#define HeapSortRefDown(p, vals, n, size, temp) \
-  { size_t k = n; UInt32 val = vals[temp]; for (;;) { \
-    size_t s = (k << 1); \
-    if (s > size) break; \
-    if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
-    if (val >= vals[p[s]]) break; \
-    p[k] = p[s]; k = s; \
-  } p[k] = temp; }
-
-void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
-{
-  if (size <= 1)
+  if (size < 3)
+  {
+    // size == 2
+    const UInt32 a0 = p[0];
+    p[0] = p[2];
+    p[2] = a0;
     return;
-  p--;
+  }
+  if (size != 3)
   {
-    size_t i = size / 2;
+    // stage-2 : O(size) * log2(size) :
+    // we move the largest item p[0] from the head to the end of the array,
+    // and insert the last item into the sorted binary tree.
     do
     {
-      UInt32 temp = p[i];
-      HeapSortRefDown(p, vals, i, size, temp);
+      const UInt32 temp = p[size];
+      size_t k = p[2] < p[3] ? 3 : 2;
+      p[size--] = p[0];
+      p[0] = p[1];
+      p[1] = p[k];
+      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
     }
-    while (--i != 0);
+    while (size != 3);
   }
-  do
   {
-    UInt32 temp = p[size];
-    p[size--] = p[1];
-    HeapSortRefDown(p, vals, 1, size, temp);
+    const UInt32 a2 = p[2];
+    const UInt32 a3 = p[3];
+    const size_t k = a2 < a3;
+    p[2] = p[1];
+    p[3] = p[0];
+    p[k] = a3;
+    p[k ^ 1] = a2;
   }
-  while (size > 1);
 }
-*/
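
For reference, the call site is unchanged apart from the Z7_FASTCALL annotation; assuming Sort.h declares the function as in this commit (UInt32 comes in via 7zTypes.h):

#include <stdio.h>
#include "Sort.h"   /* HeapSort(), UInt32 via 7zTypes.h */

int main(void)
{
  UInt32 a[8] = { 5, 1, 4, 1, 5, 9, 2, 6 };
  unsigned i;
  HeapSort(a, 8);   /* in-place ascending sort */
  for (i = 0; i < 8; i++)
    printf("%u ", (unsigned)a[i]);
  printf("\n");
  return 0;
}
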