Diffstat
-rw-r--r-- | C/Sort.c | 355 |
1 file changed, 241 insertions, 114 deletions
@@ -1,141 +1,268 @@ | |||
1 | /* Sort.c -- Sort functions | 1 | /* Sort.c -- Sort functions |
2 | 2014-04-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "Sort.h" | 6 | #include "Sort.h" |
7 | #include "CpuArch.h" | ||
7 | 8 | ||
8 | #define HeapSortDown(p, k, size, temp) \ | 9 | #if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ |
9 | { for (;;) { \ | 10 | || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ |
10 | size_t s = (k << 1); \ | 11 | ) |
11 | if (s > size) break; \ | 12 | // the code with prefetch is slow for small arrays on x86. |
12 | if (s < size && p[s + 1] > p[s]) s++; \ | 13 | // So we disable prefetch for x86. |
13 | if (temp >= p[s]) break; \ | 14 | #ifndef MY_CPU_X86 |
14 | p[k] = p[s]; k = s; \ | 15 | // #pragma message("Z7_PREFETCH : __builtin_prefetch") |
15 | } p[k] = temp; } | 16 | #define Z7_PREFETCH(a) __builtin_prefetch((a)) |
17 | #endif | ||
16 | 18 | ||
17 | void HeapSort(UInt32 *p, size_t size) | 19 | #elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) |
18 | { | 20 | |
19 | if (size <= 1) | 21 | #include "7zWindows.h" |
20 | return; | 22 | |
21 | p--; | 23 | // NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. |
22 | { | 24 | // For example, clang-cl can generate "prefetcht2" instruction for |
23 | size_t i = size / 2; | 25 | // PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. |
24 | do | 26 | // But we want to generate "prefetcht0" instruction. |
25 | { | 27 | // So for CLANG/GCC we must use __builtin_prefetch() in code branch above |
26 | UInt32 temp = p[i]; | 28 | // instead of PreFetchCacheLine() / _mm_prefetch(). |
27 | size_t k = i; | 29 | |
28 | HeapSortDown(p, k, size, temp) | 30 | // New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. |
29 | } | 31 | // But old x86 cpus don't support "prefetcht0". |
30 | while (--i != 0); | 32 | // So we will use PreFetchCacheLine(), only if we are sure that |
31 | } | 33 | // generated instruction is supported by all cpus of that isa. |
32 | /* | 34 | #if defined(MY_CPU_AMD64) \ |
33 | do | 35 | || defined(MY_CPU_ARM64) \ |
34 | { | 36 | || defined(MY_CPU_IA64) |
35 | size_t k = 1; | 37 | // we need to use additional parentheses around (a) in the PreFetchCacheLine call, because |
36 | UInt32 temp = p[size]; | 38 | // the PreFetchCacheLine macro doesn't parenthesize its arguments: |
37 | p[size--] = p[1]; | 39 | // #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) |
38 | HeapSortDown(p, k, size, temp) | 40 | // #pragma message("Z7_PREFETCH : PreFetchCacheLine") |
39 | } | 41 | #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) |
40 | while (size > 1); | 42 | #endif |
41 | */ | 43 | |
42 | while (size > 3) | 44 | #endif // _WIN32 |
43 | { | 45 | |
44 | UInt32 temp = p[size]; | 46 | |
45 | size_t k = (p[3] > p[2]) ? 3 : 2; | 47 | #define PREFETCH_NO(p,k,s,size) |
46 | p[size--] = p[1]; | 48 | |
47 | p[1] = p[k]; | 49 | #ifndef Z7_PREFETCH |
48 | HeapSortDown(p, k, size, temp) | 50 | #define SORT_PREFETCH(p,k,s,size) |
49 | } | 51 | #else |
50 | { | 52 | |
51 | UInt32 temp = p[size]; | 53 | // #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes |
52 | p[size] = p[1]; | 54 | #define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) |
53 | if (size > 2 && p[2] < temp) | 55 | // #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) |
54 | { | 56 | |
55 | p[1] = p[2]; | 57 | #if PREFETCH_LEVEL == 0 |
56 | p[2] = temp; | 58 | |
57 | } | 59 | #define SORT_PREFETCH(p,k,s,size) |
58 | else | 60 | |
59 | p[1] = temp; | 61 | #else // PREFETCH_LEVEL != 0 |
60 | } | 62 | |
63 | /* | ||
64 | if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
65 | we prefetch one value per cache line. | ||
66 | Use it if the array is aligned to the cache line size (64 bytes) | ||
67 | or if the array is small (less than the L1 cache size). | ||
68 | |||
69 | if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
70 | we prefetch all cache lines that can be required. | ||
71 | It can be faster for big unaligned arrays. | ||
72 | */ | ||
73 | #define USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
74 | |||
75 | // s == k * 2 | ||
76 | #if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) | ||
77 | // x86 supports (lea r1*8+offset) | ||
78 | #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) | ||
79 | #else | ||
80 | #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) | ||
81 | #endif | ||
82 | |||
83 | #if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
84 | #define PREFETCH_ADD_OFFSET 0 | ||
85 | #else | ||
86 | // last offset that can be required in a PREFETCH_LEVEL step: | ||
87 | #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) | ||
88 | #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 | ||
89 | #endif | ||
90 | |||
91 | #if PREFETCH_LEVEL <= 3 | ||
92 | |||
93 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
94 | #define SORT_PREFETCH(p,k,s,size) \ | ||
95 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ | ||
96 | if (s2 <= size) { \ | ||
97 | Z7_PREFETCH((p + s2)); \ | ||
98 | }} | ||
99 | #else /* for unaligned array */ | ||
100 | #define SORT_PREFETCH(p,k,s,size) \ | ||
101 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
102 | if (s2 <= size) { \ | ||
103 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
104 | Z7_PREFETCH((p + s2)); \ | ||
105 | }} | ||
106 | #endif | ||
107 | |||
108 | #else // PREFETCH_LEVEL > 3 | ||
109 | |||
110 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
111 | #define SORT_PREFETCH(p,k,s,size) \ | ||
112 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ | ||
113 | if (s2 <= size) { \ | ||
114 | Z7_PREFETCH((p + s2 - 16)); \ | ||
115 | Z7_PREFETCH((p + s2)); \ | ||
116 | }} | ||
117 | #else /* for unaligned array */ | ||
118 | #define SORT_PREFETCH(p,k,s,size) \ | ||
119 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
120 | if (s2 <= size) { \ | ||
121 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
122 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ | ||
123 | Z7_PREFETCH((p + s2)); \ | ||
124 | }} | ||
125 | #endif | ||
126 | |||
127 | #endif // PREFETCH_LEVEL > 3 | ||
128 | #endif // PREFETCH_LEVEL != 0 | ||
129 | #endif // Z7_PREFETCH | ||
130 | |||
131 | |||
132 | #if defined(MY_CPU_ARM64) \ | ||
133 | /* || defined(MY_CPU_AMD64) */ \ | ||
134 | /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ | ||
135 | // we want to use cmov, if cmov is very fast: | ||
136 | // - this cmov version is slower for clang-x64. | ||
137 | // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. | ||
138 | #define Z7_FAST_CMOV_SUPPORTED | ||
139 | #endif | ||
140 | |||
141 | #ifdef Z7_FAST_CMOV_SUPPORTED | ||
142 | // we want to use cmov here, if cmov is fast (new arm64 cpus): | ||
143 | // i.e. we want the compiler to emit a conditional move for this branch | ||
144 | #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; | ||
145 | #else | ||
146 | // use this branch if the cpu doesn't support fast conditional move; | ||
147 | // it re-reads the max value from the array instead: | ||
148 | #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; | ||
149 | #endif | ||
150 | |||
151 | #define HeapSortDown(p, k, size, temp, macro_prefetch) \ | ||
152 | { \ | ||
153 | for (;;) { \ | ||
154 | UInt32 n0, n1; \ | ||
155 | size_t s = k * 2; \ | ||
156 | if (s >= size) { \ | ||
157 | if (s == size) { \ | ||
158 | n0 = p[s]; \ | ||
159 | p[k] = n0; \ | ||
160 | if (temp < n0) k = s; \ | ||
161 | } \ | ||
162 | break; \ | ||
163 | } \ | ||
164 | n0 = p[k * 2]; \ | ||
165 | n1 = p[k * 2 + 1]; \ | ||
166 | s += n0 < n1; \ | ||
167 | GET_MAX_VAL(n0, n1, p[s]) \ | ||
168 | if (temp >= n0) break; \ | ||
169 | macro_prefetch(p, k, s, size) \ | ||
170 | p[k] = n0; \ | ||
171 | k = s; \ | ||
172 | } \ | ||
173 | p[k] = temp; \ | ||
61 | } | 174 | } |
62 | 175 | ||
63 | void HeapSort64(UInt64 *p, size_t size) | 176 | |
177 | /* | ||
178 | stage-1 : O(n) : | ||
179 | we generate an intermediate, partially sorted binary tree: | ||
180 | p[0] : an additional item for better alignment of the tree structure in memory. | ||
181 | p[1] | ||
182 | p[2] p[3] | ||
183 | p[4] p[5] p[6] p[7] | ||
184 | ... | ||
185 | p[x] >= p[x * 2] | ||
186 | p[x] >= p[x * 2 + 1] | ||
187 | |||
188 | stage-2 : O(n)*log2(n) : | ||
189 | we move the largest item p[0] from the head of the tree to the end of the array | ||
190 | and insert the last item into the sorted binary tree. | ||
191 | */ | ||
192 | |||
193 | // (p) must be aligned to the cache line size (64 bytes) for best performance | ||
194 | |||
195 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) | ||
64 | { | 196 | { |
65 | if (size <= 1) | 197 | if (size < 2) |
66 | return; | 198 | return; |
67 | p--; | 199 | if (size == 2) |
68 | { | ||
69 | size_t i = size / 2; | ||
70 | do | ||
71 | { | ||
72 | UInt64 temp = p[i]; | ||
73 | size_t k = i; | ||
74 | HeapSortDown(p, k, size, temp) | ||
75 | } | ||
76 | while (--i != 0); | ||
77 | } | ||
78 | /* | ||
79 | do | ||
80 | { | 200 | { |
81 | size_t k = 1; | 201 | const UInt32 a0 = p[0]; |
82 | UInt64 temp = p[size]; | 202 | const UInt32 a1 = p[1]; |
83 | p[size--] = p[1]; | 203 | const unsigned k = a1 < a0; |
84 | HeapSortDown(p, k, size, temp) | 204 | p[k] = a0; |
85 | } | 205 | p[k ^ 1] = a1; |
86 | while (size > 1); | 206 | return; |
87 | */ | ||
88 | while (size > 3) | ||
89 | { | ||
90 | UInt64 temp = p[size]; | ||
91 | size_t k = (p[3] > p[2]) ? 3 : 2; | ||
92 | p[size--] = p[1]; | ||
93 | p[1] = p[k]; | ||
94 | HeapSortDown(p, k, size, temp) | ||
95 | } | 207 | } |
96 | { | 208 | { |
97 | UInt64 temp = p[size]; | 209 | // stage-1 : O(n) |
98 | p[size] = p[1]; | 210 | // we transform the array into a partially sorted binary tree. |
99 | if (size > 2 && p[2] < temp) | 211 | size_t i = --size / 2; |
212 | // (size) is now the index of the last item in the tree | ||
213 | // if (i) | ||
100 | { | 214 | { |
101 | p[1] = p[2]; | 215 | do |
102 | p[2] = temp; | 216 | { |
217 | const UInt32 temp = p[i]; | ||
218 | size_t k = i; | ||
219 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
220 | } | ||
221 | while (--i); | ||
222 | } | ||
223 | { | ||
224 | const UInt32 temp = p[0]; | ||
225 | const UInt32 a1 = p[1]; | ||
226 | if (temp < a1) | ||
227 | { | ||
228 | size_t k = 1; | ||
229 | p[0] = a1; | ||
230 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
231 | } | ||
103 | } | 232 | } |
104 | else | ||
105 | p[1] = temp; | ||
106 | } | 233 | } |
107 | } | ||
108 | 234 | ||
109 | /* | 235 | if (size < 3) |
110 | #define HeapSortRefDown(p, vals, n, size, temp) \ | 236 | { |
111 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | 237 | // size == 2 |
112 | size_t s = (k << 1); \ | 238 | const UInt32 a0 = p[0]; |
113 | if (s > size) break; \ | 239 | p[0] = p[2]; |
114 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | 240 | p[2] = a0; |
115 | if (val >= vals[p[s]]) break; \ | ||
116 | p[k] = p[s]; k = s; \ | ||
117 | } p[k] = temp; } | ||
118 | |||
119 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
120 | { | ||
121 | if (size <= 1) | ||
122 | return; | 241 | return; |
123 | p--; | 242 | } |
243 | if (size != 3) | ||
124 | { | 244 | { |
125 | size_t i = size / 2; | 245 | // stage-2 : O(size) * log2(size): |
246 | // we move the largest item p[0] from the head to the end of the array, | ||
247 | // and insert the last item into the sorted binary tree. | ||
126 | do | 248 | do |
127 | { | 249 | { |
128 | UInt32 temp = p[i]; | 250 | const UInt32 temp = p[size]; |
129 | HeapSortRefDown(p, vals, i, size, temp); | 251 | size_t k = p[2] < p[3] ? 3 : 2; |
252 | p[size--] = p[0]; | ||
253 | p[0] = p[1]; | ||
254 | p[1] = p[k]; | ||
255 | HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO | ||
130 | } | 256 | } |
131 | while (--i != 0); | 257 | while (size != 3); |
132 | } | 258 | } |
133 | do | ||
134 | { | 259 | { |
135 | UInt32 temp = p[size]; | 260 | const UInt32 a2 = p[2]; |
136 | p[size--] = p[1]; | 261 | const UInt32 a3 = p[3]; |
137 | HeapSortRefDown(p, vals, 1, size, temp); | 262 | const size_t k = a2 < a3; |
263 | p[2] = p[1]; | ||
264 | p[3] = p[0]; | ||
265 | p[k] = a3; | ||
266 | p[k ^ 1] = a2; | ||
138 | } | 267 | } |
139 | while (size > 1); | ||
140 | } | 268 | } |
141 | */ | ||
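The compiler dispatch at the top of the new file picks a prefetch intrinsic per toolchain. Below is a minimal stand-alone sketch of the same idea; MY_PREFETCH is a hypothetical stand-in for Z7_PREFETCH, and the checks are simplified (the real code uses CpuArch.h macros such as MY_CPU_X86 and Z7_has_builtin, disables prefetch entirely on x86, and deliberately prefers __builtin_prefetch over PreFetchCacheLine under clang, since the diff notes that hint constants like _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1 can map to different prefetch instructions under clang-cl).

/* Minimal sketch of the prefetch dispatch in the diff above.
   MY_PREFETCH is an illustrative stand-in for Z7_PREFETCH. */
#if defined(__GNUC__) || defined(__clang__)
  /* GCC/Clang: defaults to a read prefetch with maximum temporal
     locality, which is "prefetcht0" on x86. */
  #define MY_PREFETCH(a) __builtin_prefetch(a)
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  #include <intrin.h>
  /* MSVC: _MM_HINT_T0 also requests the data in all cache levels. */
  #define MY_PREFETCH(a) _mm_prefetch((const char *)(a), _MM_HINT_T0)
#else
  #define MY_PREFETCH(a) /* no-op on unknown compilers */
#endif

A prefetch is only a hint, so the no-op fallback changes nothing about correctness; the diff's restriction to AMD64/ARM64/IA64 in the Windows branch exists only because "prefetcht0" is not supported by all old x86 cpus.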
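The PREFETCH_OFFSET / PREFETCH_RANGE arithmetic is easier to see with concrete numbers. With PREFETCH_LEVEL == 3 and the aligned-array variant, a sift step at node k prefetches the block of descendants four levels further down: those indices span [k*16, k*16 + 15], and 16 UInt32 values are exactly one 64-byte cache line when the array is 64-byte aligned (the unaligned variant touches the first and last index of the range, covering both lines it can straddle). The worked example below only prints those ranges; it is illustrative, not part of the SDK.

#include <stdio.h>

/* Worked example for PREFETCH_LEVEL == 3 (aligned-array variant):
   from node k, four more sift steps land on some index in
   [16*k, 16*k + 15]; with 4-byte items that is one 64-byte line. */
#define PREFETCH_LEVEL 3
#define PREFETCH_OFFSET(k) ((k) << (PREFETCH_LEVEL + 1))
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)

int main(void)
{
  size_t k;
  for (k = 1; k <= 4; k++)
  {
    const size_t first = PREFETCH_OFFSET(k);     /* leftmost descendant, 4 levels down */
    const size_t last = first + PREFETCH_RANGE;  /* rightmost descendant, 4 levels down */
    printf("k=%zu : descendants 4 levels down = p[%zu..%zu] (%zu bytes)\n",
           k, first, last, (last - first + 1) * sizeof(unsigned));
  }
  return 0;
}

This also explains the prefetch distance: the hint is issued four levels before the sift can reach that line, which gives the memory system time to fill it.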
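The HeapSortDown macro is the core sift-down of the new code. Below is a plain-function transcription of it, with the prefetch hook removed and GET_MAX_VAL expanded to its cmov-friendly form; SiftDown is my name, and uint32_t stands in for the SDK's UInt32.

#include <stddef.h>
#include <stdint.h>

/* Plain-function transcription of the HeapSortDown macro above.
   Convention: children of node x are p[2*x] and p[2*x + 1];
   (size) is the index of the last item in the tree. */
static void SiftDown(uint32_t *p, size_t k, size_t size, uint32_t temp)
{
  for (;;)
  {
    uint32_t n0, n1;
    size_t s = k * 2;
    if (s >= size)
    {
      if (s == size)        /* node k has exactly one child */
      {
        n0 = p[s];
        p[k] = n0;          /* harmless if not promoted: overwritten below */
        if (temp < n0)
          k = s;
      }
      break;
    }
    n0 = p[s];
    n1 = p[s + 1];
    s += (n0 < n1);         /* index of the larger child */
    if (n0 < n1)            /* GET_MAX_VAL: a cmov/csel on fast-cmov cpus */
      n0 = n1;
    if (temp >= n0)
      break;                /* temp already dominates both children */
    p[k] = n0;              /* promote the larger child and descend */
    k = s;
  }
  p[k] = temp;              /* temp lands at its final position */
}

The non-cmov GET_MAX_VAL variant would replace the `if (n0 < n1) n0 = n1;` line with a reload `n0 = p[s];`, trading a conditional move for a dependent memory read, which the diff's comments say is faster on cpus without fast cmov.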
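The size == 2 fast path (and, in mirrored form, the 4-item epilogue) sorts a pair without a conditional branch: the comparison result is used directly as a store index. A self-contained version, with the hypothetical name SortPair:

#include <assert.h>

/* Branchless two-element sort, as in the size == 2 path above:
   k is 1 exactly when the pair is out of order, so a0 is stored
   last exactly when it is the larger value. */
static void SortPair(unsigned *p)
{
  const unsigned a0 = p[0];
  const unsigned a1 = p[1];
  const unsigned k = (a1 < a0);
  p[k] = a0;
  p[k ^ 1] = a1;
}

int main(void)
{
  unsigned a[2] = { 5, 3 };
  unsigned b[2] = { 3, 5 };
  SortPair(a);
  SortPair(b);
  assert(a[0] == 3 && a[1] == 5);
  assert(b[0] == 3 && b[1] == 5);
  return 0;
}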
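Finally, a hedged usage sketch: the rewrite keeps the old entry point, so a quick differential test against the C library's qsort exercises both stages (heap construction and extraction). This assumes the 7-Zip C sources are on the include path; "Sort.h" declares void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) per the diff, and the seed and multiplier below are arbitrary.

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "Sort.h"

static int CmpUInt32(const void *a, const void *b)
{
  const UInt32 x = *(const UInt32 *)a;
  const UInt32 y = *(const UInt32 *)b;
  return (x > y) - (x < y);
}

int main(void)
{
  enum { N = 1000 };
  UInt32 a[N], b[N];
  size_t i;
  srand(12345);
  for (i = 0; i < N; i++)
    a[i] = b[i] = (UInt32)rand() * 2654435761u;  /* scramble rand()'s limited range */
  HeapSort(a, N);
  qsort(b, N, sizeof(b[0]), CmpUInt32);
  assert(memcmp(a, b, sizeof(a)) == 0);          /* both orders must agree */
  return 0;
}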