Diffstat (limited to 'C/Sort.c')
| -rw-r--r-- | C/Sort.c | 355 |
1 file changed, 241 insertions, 114 deletions
| @@ -1,141 +1,268 @@ | |||
| 1 | /* Sort.c -- Sort functions | 1 | /* Sort.c -- Sort functions |
| 2 | 2014-04-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "Sort.h" | 6 | #include "Sort.h" |
| 7 | #include "CpuArch.h" | ||
| 7 | 8 | ||
| 8 | #define HeapSortDown(p, k, size, temp) \ | 9 | #if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ |
| 9 | { for (;;) { \ | 10 | || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ |
| 10 | size_t s = (k << 1); \ | 11 | ) |
| 11 | if (s > size) break; \ | 12 | // the code with prefetch is slow for small arrays on x86. |
| 12 | if (s < size && p[s + 1] > p[s]) s++; \ | 13 | // So we disable prefetch for x86. |
| 13 | if (temp >= p[s]) break; \ | 14 | #ifndef MY_CPU_X86 |
| 14 | p[k] = p[s]; k = s; \ | 15 | // #pragma message("Z7_PREFETCH : __builtin_prefetch") |
| 15 | } p[k] = temp; } | 16 | #define Z7_PREFETCH(a) __builtin_prefetch((a)) |
| 17 | #endif | ||
| 16 | 18 | ||
| 17 | void HeapSort(UInt32 *p, size_t size) | 19 | #elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) |
| 18 | { | 20 | |
| 19 | if (size <= 1) | 21 | #include "7zWindows.h" |
| 20 | return; | 22 | |
| 21 | p--; | 23 | // NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. |
| 22 | { | 24 | // For example, clang-cl can generate "prefetcht2" instruction for |
| 23 | size_t i = size / 2; | 25 | // PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. |
| 24 | do | 26 | // But we want to generate "prefetcht0" instruction. |
| 25 | { | 27 | // So for CLANG/GCC we must use __builtin_prefetch() in code branch above |
| 26 | UInt32 temp = p[i]; | 28 | // instead of PreFetchCacheLine() / _mm_prefetch(). |
| 27 | size_t k = i; | 29 | |
| 28 | HeapSortDown(p, k, size, temp) | 30 | // New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. |
| 29 | } | 31 | // But old x86 cpus don't support "prefetcht0". |
| 30 | while (--i != 0); | 32 | // So we will use PreFetchCacheLine(), only if we are sure that |
| 31 | } | 33 | // generated instruction is supported by all cpus of that isa. |
| 32 | /* | 34 | #if defined(MY_CPU_AMD64) \ |
| 33 | do | 35 | || defined(MY_CPU_ARM64) \ |
| 34 | { | 36 | || defined(MY_CPU_IA64) |
| 35 | size_t k = 1; | 37 | // we need to use additional parentheses around (a) in the PreFetchCacheLine call, because |
| 36 | UInt32 temp = p[size]; | 38 | // the PreFetchCacheLine macro doesn't parenthesize its argument: |
| 37 | p[size--] = p[1]; | 39 | // #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) |
| 38 | HeapSortDown(p, k, size, temp) | 40 | // #pragma message("Z7_PREFETCH : PreFetchCacheLine") |
| 39 | } | 41 | #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) |
| 40 | while (size > 1); | 42 | #endif |
| 41 | */ | 43 | |
| 42 | while (size > 3) | 44 | #endif // _WIN32 |
| 43 | { | 45 | |
| 44 | UInt32 temp = p[size]; | 46 | |
| 45 | size_t k = (p[3] > p[2]) ? 3 : 2; | 47 | #define PREFETCH_NO(p,k,s,size) |
| 46 | p[size--] = p[1]; | 48 | |
| 47 | p[1] = p[k]; | 49 | #ifndef Z7_PREFETCH |
| 48 | HeapSortDown(p, k, size, temp) | 50 | #define SORT_PREFETCH(p,k,s,size) |
| 49 | } | 51 | #else |
| 50 | { | 52 | |
| 51 | UInt32 temp = p[size]; | 53 | // #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes |
| 52 | p[size] = p[1]; | 54 | #define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) |
| 53 | if (size > 2 && p[2] < temp) | 55 | // #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) |
| 54 | { | 56 | |
| 55 | p[1] = p[2]; | 57 | #if PREFETCH_LEVEL == 0 |
| 56 | p[2] = temp; | 58 | |
| 57 | } | 59 | #define SORT_PREFETCH(p,k,s,size) |
| 58 | else | 60 | |
| 59 | p[1] = temp; | 61 | #else // PREFETCH_LEVEL != 0 |
| 60 | } | 62 | |
| 63 | /* | ||
| 64 | if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 65 | we prefetch one value per cache line. | ||
| 66 | Use it if array is aligned for cache line size (64 bytes) | ||
| 67 | or if array is small (less than L1 cache size). | ||
| 68 | |||
| 69 | if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 70 | we prefetch all cache lines that can be required. | ||
| 71 | it can be faster for big unaligned arrays. | ||
| 72 | */ | ||
| 73 | #define USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 74 | |||
| 75 | // s == k * 2 | ||
| 76 | #if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) | ||
| 77 | // x86 supports (lea r1*8+offset) | ||
| 78 | #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) | ||
| 79 | #else | ||
| 80 | #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) | ||
| 81 | #endif | ||
| 82 | |||
| 83 | #if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 84 | #define PREFETCH_ADD_OFFSET 0 | ||
| 85 | #else | ||
| 86 | // last offset that can be required in a PREFETCH_LEVEL step: | ||
| 87 | #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) | ||
| 88 | #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 | ||
| 89 | #endif | ||
| 90 | |||
| 91 | #if PREFETCH_LEVEL <= 3 | ||
| 92 | |||
| 93 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 94 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 95 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ | ||
| 96 | if (s2 <= size) { \ | ||
| 97 | Z7_PREFETCH((p + s2)); \ | ||
| 98 | }} | ||
| 99 | #else /* for unaligned array */ | ||
| 100 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 101 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
| 102 | if (s2 <= size) { \ | ||
| 103 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
| 104 | Z7_PREFETCH((p + s2)); \ | ||
| 105 | }} | ||
| 106 | #endif | ||
| 107 | |||
| 108 | #else // PREFETCH_LEVEL > 3 | ||
| 109 | |||
| 110 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 111 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 112 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ | ||
| 113 | if (s2 <= size) { \ | ||
| 114 | Z7_PREFETCH((p + s2 - 16)); \ | ||
| 115 | Z7_PREFETCH((p + s2)); \ | ||
| 116 | }} | ||
| 117 | #else /* for unaligned array */ | ||
| 118 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 119 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
| 120 | if (s2 <= size) { \ | ||
| 121 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
| 122 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ | ||
| 123 | Z7_PREFETCH((p + s2)); \ | ||
| 124 | }} | ||
| 125 | #endif | ||
| 126 | |||
| 127 | #endif // PREFETCH_LEVEL > 3 | ||
| 128 | #endif // PREFETCH_LEVEL != 0 | ||
| 129 | #endif // Z7_PREFETCH | ||
| 130 | |||
| 131 | |||
| 132 | #if defined(MY_CPU_ARM64) \ | ||
| 133 | /* || defined(MY_CPU_AMD64) */ \ | ||
| 134 | /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ | ||
| 135 | // we want to use cmov, if cmov is very fast: | ||
| 136 | // - this cmov version is slower for clang-x64. | ||
| 137 | // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. | ||
| 138 | #define Z7_FAST_CMOV_SUPPORTED | ||
| 139 | #endif | ||
| 140 | |||
| 141 | #ifdef Z7_FAST_CMOV_SUPPORTED | ||
| 142 | // we want to use cmov here, if cmov is fast: new arm64 cpus. | ||
| 143 | // we want the compiler to use conditional move for this branch | ||
| 144 | #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; | ||
| 145 | #else | ||
| 146 | // use this branch if the cpu doesn't support a fast conditional move. | ||
| 147 | // it uses a slower read from the array instead: | ||
| 148 | #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; | ||
| 149 | #endif | ||
| 150 | |||
| 151 | #define HeapSortDown(p, k, size, temp, macro_prefetch) \ | ||
| 152 | { \ | ||
| 153 | for (;;) { \ | ||
| 154 | UInt32 n0, n1; \ | ||
| 155 | size_t s = k * 2; \ | ||
| 156 | if (s >= size) { \ | ||
| 157 | if (s == size) { \ | ||
| 158 | n0 = p[s]; \ | ||
| 159 | p[k] = n0; \ | ||
| 160 | if (temp < n0) k = s; \ | ||
| 161 | } \ | ||
| 162 | break; \ | ||
| 163 | } \ | ||
| 164 | n0 = p[k * 2]; \ | ||
| 165 | n1 = p[k * 2 + 1]; \ | ||
| 166 | s += n0 < n1; \ | ||
| 167 | GET_MAX_VAL(n0, n1, p[s]) \ | ||
| 168 | if (temp >= n0) break; \ | ||
| 169 | macro_prefetch(p, k, s, size) \ | ||
| 170 | p[k] = n0; \ | ||
| 171 | k = s; \ | ||
| 172 | } \ | ||
| 173 | p[k] = temp; \ | ||
| 61 | } | 174 | } |
| 62 | 175 | ||
| 63 | void HeapSort64(UInt64 *p, size_t size) | 176 | |
| 177 | /* | ||
| 178 | stage-1 : O(n) : | ||
| 179 | we generate an intermediate, partially sorted binary tree: | ||
| 180 | p[0] : an additional item for better alignment of the tree structure in memory. | ||
| 181 | p[1] | ||
| 182 | p[2] p[3] | ||
| 183 | p[4] p[5] p[6] p[7] | ||
| 184 | ... | ||
| 185 | p[x] >= p[x * 2] | ||
| 186 | p[x] >= p[x * 2 + 1] | ||
| 187 | |||
| 188 | stage-2 : O(n * log2(n)): | ||
| 189 | we move the largest item p[0] from the head of the tree to the end of the array | ||
| 190 | and insert the last item into the sorted binary tree. | ||
| 191 | */ | ||
| 192 | |||
| 193 | // for best performance, (p) should be aligned to the cache line size (64 bytes) | ||
| 194 | |||
| 195 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) | ||
| 64 | { | 196 | { |
| 65 | if (size <= 1) | 197 | if (size < 2) |
| 66 | return; | 198 | return; |
| 67 | p--; | 199 | if (size == 2) |
| 68 | { | ||
| 69 | size_t i = size / 2; | ||
| 70 | do | ||
| 71 | { | ||
| 72 | UInt64 temp = p[i]; | ||
| 73 | size_t k = i; | ||
| 74 | HeapSortDown(p, k, size, temp) | ||
| 75 | } | ||
| 76 | while (--i != 0); | ||
| 77 | } | ||
| 78 | /* | ||
| 79 | do | ||
| 80 | { | 200 | { |
| 81 | size_t k = 1; | 201 | const UInt32 a0 = p[0]; |
| 82 | UInt64 temp = p[size]; | 202 | const UInt32 a1 = p[1]; |
| 83 | p[size--] = p[1]; | 203 | const unsigned k = a1 < a0; |
| 84 | HeapSortDown(p, k, size, temp) | 204 | p[k] = a0; |
| 85 | } | 205 | p[k ^ 1] = a1; |
| 86 | while (size > 1); | 206 | return; |
| 87 | */ | ||
| 88 | while (size > 3) | ||
| 89 | { | ||
| 90 | UInt64 temp = p[size]; | ||
| 91 | size_t k = (p[3] > p[2]) ? 3 : 2; | ||
| 92 | p[size--] = p[1]; | ||
| 93 | p[1] = p[k]; | ||
| 94 | HeapSortDown(p, k, size, temp) | ||
| 95 | } | 207 | } |
| 96 | { | 208 | { |
| 97 | UInt64 temp = p[size]; | 209 | // stage-1 : O(n) |
| 98 | p[size] = p[1]; | 210 | // we transform the array into a partially sorted binary tree. |
| 99 | if (size > 2 && p[2] < temp) | 211 | size_t i = --size / 2; |
| 212 | // (size) now is the index of the last item in tree, | ||
| 213 | // if (i) | ||
| 100 | { | 214 | { |
| 101 | p[1] = p[2]; | 215 | do |
| 102 | p[2] = temp; | 216 | { |
| 217 | const UInt32 temp = p[i]; | ||
| 218 | size_t k = i; | ||
| 219 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
| 220 | } | ||
| 221 | while (--i); | ||
| 222 | } | ||
| 223 | { | ||
| 224 | const UInt32 temp = p[0]; | ||
| 225 | const UInt32 a1 = p[1]; | ||
| 226 | if (temp < a1) | ||
| 227 | { | ||
| 228 | size_t k = 1; | ||
| 229 | p[0] = a1; | ||
| 230 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
| 231 | } | ||
| 103 | } | 232 | } |
| 104 | else | ||
| 105 | p[1] = temp; | ||
| 106 | } | 233 | } |
| 107 | } | ||
| 108 | 234 | ||
| 109 | /* | 235 | if (size < 3) |
| 110 | #define HeapSortRefDown(p, vals, n, size, temp) \ | 236 | { |
| 111 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | 237 | // size == 2 |
| 112 | size_t s = (k << 1); \ | 238 | const UInt32 a0 = p[0]; |
| 113 | if (s > size) break; \ | 239 | p[0] = p[2]; |
| 114 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | 240 | p[2] = a0; |
| 115 | if (val >= vals[p[s]]) break; \ | ||
| 116 | p[k] = p[s]; k = s; \ | ||
| 117 | } p[k] = temp; } | ||
| 118 | |||
| 119 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
| 120 | { | ||
| 121 | if (size <= 1) | ||
| 122 | return; | 241 | return; |
| 123 | p--; | 242 | } |
| 243 | if (size != 3) | ||
| 124 | { | 244 | { |
| 125 | size_t i = size / 2; | 245 | // stage-2 : O(size) * log2(size): |
| 246 | // we move the largest item p[0] from the head to the end of the array, | ||
| 247 | // and insert the last item into the sorted binary tree. | ||
| 126 | do | 248 | do |
| 127 | { | 249 | { |
| 128 | UInt32 temp = p[i]; | 250 | const UInt32 temp = p[size]; |
| 129 | HeapSortRefDown(p, vals, i, size, temp); | 251 | size_t k = p[2] < p[3] ? 3 : 2; |
| 252 | p[size--] = p[0]; | ||
| 253 | p[0] = p[1]; | ||
| 254 | p[1] = p[k]; | ||
| 255 | HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO | ||
| 130 | } | 256 | } |
| 131 | while (--i != 0); | 257 | while (size != 3); |
| 132 | } | 258 | } |
| 133 | do | ||
| 134 | { | 259 | { |
| 135 | UInt32 temp = p[size]; | 260 | const UInt32 a2 = p[2]; |
| 136 | p[size--] = p[1]; | 261 | const UInt32 a3 = p[3]; |
| 137 | HeapSortRefDown(p, vals, 1, size, temp); | 262 | const size_t k = a2 < a3; |
| 263 | p[2] = p[1]; | ||
| 264 | p[3] = p[0]; | ||
| 265 | p[k] = a3; | ||
| 266 | p[k ^ 1] = a2; | ||
| 138 | } | 267 | } |
| 139 | while (size > 1); | ||
| 140 | } | 268 | } |
| 141 | */ | ||
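
Editor's note: for readers skimming the new HeapSortDown macro in the diff above, here is a minimal plain-function sketch of the same sift-down step for a max-heap in the classic layout (children of node k are at indices 2*k and 2*k + 1). It deliberately omits the p[0] alignment slot, the SORT_PREFETCH hints and the GET_MAX_VAL/cmov trick from the real code; the function name SiftDownSketch and the standalone-function form are illustrative only, not part of the 7-Zip sources.

#include "7zTypes.h"   /* UInt32, size_t (7-Zip header) */

static void SiftDownSketch(UInt32 *p, size_t k, size_t last)
{
  const UInt32 temp = p[k];      /* value being sifted down              */
  for (;;)
  {
    size_t s = k * 2;            /* left child of k                      */
    if (s > last)
      break;                     /* k is a leaf                          */
    if (s < last && p[s] < p[s + 1])
      s++;                       /* pick the larger of the two children  */
    if (temp >= p[s])
      break;                     /* heap property already satisfied      */
    p[k] = p[s];                 /* pull the larger child up             */
    k = s;
  }
  p[k] = temp;                   /* drop the saved value into place      */
}

Stage-1 applies this kind of step to every internal node from the middle of the array down to the root; stage-2 repeatedly moves the current maximum to the end and re-sifts the displaced last element, which is where the prefetch and cmov micro-optimizations in the diff pay off.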
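The SORT_PREFETCH arithmetic is easiest to see with the defaults chosen in the diff (PREFETCH_LEVEL == 3, USE_PREFETCH_FOR_ALIGNED_ARRAY, 4-byte UInt32 elements): the descendants of node k that lie four levels deeper occupy indices k*16 .. k*16 + 15, i.e. sixteen UInt32 values or exactly one 64-byte cache line when (p) is cache-line aligned. Below is a hypothetical helper (PrefetchSketch is not a 7-Zip name) mirroring what the aligned branch of the macro does in that configuration; it assumes a GCC/clang-style __builtin_prefetch and degrades to a no-op elsewhere.

#include "7zTypes.h"   /* UInt32, size_t (7-Zip header) */

static void PrefetchSketch(const UInt32 *p, size_t k, size_t size)
{
#if defined(__GNUC__) || defined(__clang__)
  const size_t s2 = k << 4;      /* (k) << (PREFETCH_LEVEL + 1) with PREFETCH_LEVEL == 3 */
  if (s2 <= size)                /* stay inside the heap                                 */
    __builtin_prefetch(p + s2);  /* warm the line the loop may need ~4 iterations later  */
#else
  (void)p; (void)k; (void)size;  /* prefetching is only a hint; skipping it is correct   */
#endif
}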
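Finally, a minimal usage sketch of the public entry point, assuming the surrounding 7-Zip build provides Sort.h (which pulls in 7zTypes.h); the sort is ascending and in place.

#include <stdio.h>
#include "Sort.h"      /* declares HeapSort(UInt32 *p, size_t size) */

int main(void)
{
  UInt32 a[8] = { 5, 1, 9, 3, 9, 0, 7, 2 };
  size_t i;
  HeapSort(a, 8);                 /* ascending, in place */
  for (i = 0; i < 8; i++)
    printf("%u ", (unsigned)a[i]);
  printf("\n");
  return 0;
}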
