diff options
Diffstat (limited to 'blocksort.c')
-rw-r--r-- | blocksort.c | 1242 |
1 files changed, 799 insertions, 443 deletions
diff --git a/blocksort.c b/blocksort.c index d8bb26a..85a02de 100644 --- a/blocksort.c +++ b/blocksort.c | |||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-1998 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-1999 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -41,9 +41,9 @@ | |||
41 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 41 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Guildford, Surrey, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@acm.org |
46 | bzip2/libbzip2 version 0.9.0c of 18 October 1998 | 46 | bzip2/libbzip2 version 0.9.5 of 24 May 1999 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
49 | Mike Burrows | 49 | Mike Burrows |
@@ -62,106 +62,404 @@ | |||
62 | #include "bzlib_private.h" | 62 | #include "bzlib_private.h" |
63 | 63 | ||
64 | /*---------------------------------------------*/ | 64 | /*---------------------------------------------*/ |
65 | /*-- | 65 | /*--- Fallback O(N log(N)^2) sorting ---*/ |
66 | Compare two strings in block. We assume (see | 66 | /*--- algorithm, for repetitive blocks ---*/ |
67 | discussion above) that i1 and i2 have a max | 67 | /*---------------------------------------------*/ |
68 | offset of 10 on entry, and that the first | 68 | |
69 | bytes of both block and quadrant have been | 69 | /*---------------------------------------------*/ |
70 | copied into the "overshoot area", ie | 70 | static |
71 | into the subscript range | 71 | __inline__ |
72 | [nblock .. nblock+NUM_OVERSHOOT_BYTES-1]. | 72 | void fallbackSimpleSort ( UInt32* fmap, |
73 | --*/ | 73 | UInt32* eclass, |
74 | static __inline__ Bool fullGtU ( UChar* block, | 74 | Int32 lo, |
75 | UInt16* quadrant, | 75 | Int32 hi ) |
76 | UInt32 nblock, | 76 | { |
77 | Int32* workDone, | 77 | Int32 i, j, tmp; |
78 | Int32 i1, | 78 | UInt32 ec_tmp; |
79 | Int32 i2 | 79 | |
80 | ) | 80 | if (lo == hi) return; |
81 | |||
82 | if (hi - lo > 3) { | ||
83 | for ( i = hi-4; i >= lo; i-- ) { | ||
84 | tmp = fmap[i]; | ||
85 | ec_tmp = eclass[tmp]; | ||
86 | for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) | ||
87 | fmap[j-4] = fmap[j]; | ||
88 | fmap[j-4] = tmp; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | for ( i = hi-1; i >= lo; i-- ) { | ||
93 | tmp = fmap[i]; | ||
94 | ec_tmp = eclass[tmp]; | ||
95 | for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) | ||
96 | fmap[j-1] = fmap[j]; | ||
97 | fmap[j-1] = tmp; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | |||
102 | /*---------------------------------------------*/ | ||
103 | #define fswap(zz1, zz2) \ | ||
104 | { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } | ||
105 | |||
106 | #define fvswap(zzp1, zzp2, zzn) \ | ||
107 | { \ | ||
108 | Int32 yyp1 = (zzp1); \ | ||
109 | Int32 yyp2 = (zzp2); \ | ||
110 | Int32 yyn = (zzn); \ | ||
111 | while (yyn > 0) { \ | ||
112 | fswap(fmap[yyp1], fmap[yyp2]); \ | ||
113 | yyp1++; yyp2++; yyn--; \ | ||
114 | } \ | ||
115 | } | ||
116 | |||
117 | |||
118 | #define fmin(a,b) ((a) < (b)) ? (a) : (b) | ||
119 | |||
120 | #define fpush(lz,hz) { stackLo[sp] = lz; \ | ||
121 | stackHi[sp] = hz; \ | ||
122 | sp++; } | ||
123 | |||
124 | #define fpop(lz,hz) { sp--; \ | ||
125 | lz = stackLo[sp]; \ | ||
126 | hz = stackHi[sp]; } | ||
127 | |||
128 | #define FALLBACK_QSORT_SMALL_THRESH 10 | ||
129 | #define FALLBACK_QSORT_STACK_SIZE 100 | ||
130 | |||
131 | |||
132 | static | ||
133 | void fallbackQSort3 ( UInt32* fmap, | ||
134 | UInt32* eclass, | ||
135 | Int32 loSt, | ||
136 | Int32 hiSt ) | ||
137 | { | ||
138 | Int32 unLo, unHi, ltLo, gtHi, n, m; | ||
139 | Int32 sp, lo, hi; | ||
140 | UInt32 med, r, r3; | ||
141 | Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; | ||
142 | Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; | ||
143 | |||
144 | r = 0; | ||
145 | |||
146 | sp = 0; | ||
147 | fpush ( loSt, hiSt ); | ||
148 | |||
149 | while (sp > 0) { | ||
150 | |||
151 | AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 ); | ||
152 | |||
153 | fpop ( lo, hi ); | ||
154 | if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { | ||
155 | fallbackSimpleSort ( fmap, eclass, lo, hi ); | ||
156 | continue; | ||
157 | } | ||
158 | |||
159 | /* Random partitioning. Median of 3 sometimes fails to | ||
160 | avoid bad cases. Median of 9 seems to help but | ||
161 | looks rather expensive. This too seems to work but | ||
162 | is cheaper. Guidance for the magic constants | ||
163 | 7621 and 32768 is taken from Sedgewick's algorithms | ||
164 | book, chapter 35. | ||
165 | */ | ||
166 | r = ((r * 7621) + 1) % 32768; | ||
167 | r3 = r % 3; | ||
168 | if (r3 == 0) med = eclass[fmap[lo]]; else | ||
169 | if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else | ||
170 | med = eclass[fmap[hi]]; | ||
171 | |||
172 | unLo = ltLo = lo; | ||
173 | unHi = gtHi = hi; | ||
174 | |||
175 | while (1) { | ||
176 | while (1) { | ||
177 | if (unLo > unHi) break; | ||
178 | n = (Int32)eclass[fmap[unLo]] - (Int32)med; | ||
179 | if (n == 0) { | ||
180 | fswap(fmap[unLo], fmap[ltLo]); | ||
181 | ltLo++; unLo++; | ||
182 | continue; | ||
183 | }; | ||
184 | if (n > 0) break; | ||
185 | unLo++; | ||
186 | } | ||
187 | while (1) { | ||
188 | if (unLo > unHi) break; | ||
189 | n = (Int32)eclass[fmap[unHi]] - (Int32)med; | ||
190 | if (n == 0) { | ||
191 | fswap(fmap[unHi], fmap[gtHi]); | ||
192 | gtHi--; unHi--; | ||
193 | continue; | ||
194 | }; | ||
195 | if (n < 0) break; | ||
196 | unHi--; | ||
197 | } | ||
198 | if (unLo > unHi) break; | ||
199 | fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; | ||
200 | } | ||
201 | |||
202 | AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); | ||
203 | |||
204 | if (gtHi < ltLo) continue; | ||
205 | |||
206 | n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); | ||
207 | m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); | ||
208 | |||
209 | n = lo + unLo - ltLo - 1; | ||
210 | m = hi - (gtHi - unHi) + 1; | ||
211 | |||
212 | if (n - lo > hi - m) { | ||
213 | fpush ( lo, n ); | ||
214 | fpush ( m, hi ); | ||
215 | } else { | ||
216 | fpush ( m, hi ); | ||
217 | fpush ( lo, n ); | ||
218 | } | ||
219 | } | ||
220 | } | ||
221 | |||
222 | #undef fmin | ||
223 | #undef fpush | ||
224 | #undef fpop | ||
225 | #undef fswap | ||
226 | #undef fvswap | ||
227 | #undef FALLBACK_QSORT_SMALL_THRESH | ||
228 | #undef FALLBACK_QSORT_STACK_SIZE | ||
229 | |||
230 | |||
231 | /*---------------------------------------------*/ | ||
232 | /* Pre: | ||
233 | nblock > 0 | ||
234 | eclass exists for [0 .. nblock-1] | ||
235 | ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block | ||
236 | ptr exists for [0 .. nblock-1] | ||
237 | |||
238 | Post: | ||
239 | ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block | ||
240 | All other areas of eclass destroyed | ||
241 | fmap [0 .. nblock-1] holds sorted order | ||
242 | bhtab [ 0 .. 2+(nblock/32) ] destroyed | ||
243 | */ | ||
244 | |||
245 | #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) | ||
246 | #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) | ||
247 | #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) | ||
248 | #define WORD_BH(zz) bhtab[(zz) >> 5] | ||
249 | #define UNALIGNED_BH(zz) ((zz) & 0x01f) | ||
250 | |||
251 | static | ||
252 | void fallbackSort ( UInt32* fmap, | ||
253 | UInt32* eclass, | ||
254 | UInt32* bhtab, | ||
255 | Int32 nblock, | ||
256 | Int32 verb ) | ||
257 | { | ||
258 | Int32 ftab[257]; | ||
259 | Int32 ftabCopy[256]; | ||
260 | Int32 H, i, j, k, l, r, cc, cc1; | ||
261 | Int32 nNotDone; | ||
262 | Int32 nBhtab; | ||
263 | UInt16* eclass16 = (UInt16*)eclass; | ||
264 | |||
265 | /*-- | ||
266 | Initial 1-char radix sort to generate | ||
267 | initial fmap and initial BH bits. | ||
268 | --*/ | ||
269 | if (verb >= 4) | ||
270 | VPrintf0 ( " bucket sorting ...\n" ); | ||
271 | for (i = 0; i < 257; i++) ftab[i] = 0; | ||
272 | for (i = 0; i < nblock; i++) ftab[eclass16[i] >> 8]++; | ||
273 | for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; | ||
274 | for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; | ||
275 | |||
276 | for (i = 0; i < nblock; i++) { | ||
277 | j = eclass16[i] >> 8; | ||
278 | k = ftab[j] - 1; | ||
279 | ftab[j] = k; | ||
280 | fmap[k] = i; | ||
281 | } | ||
282 | |||
283 | nBhtab = 2 + (nblock / 32); | ||
284 | for (i = 0; i < nBhtab; i++) bhtab[i] = 0; | ||
285 | for (i = 0; i < 256; i++) SET_BH(ftab[i]); | ||
286 | |||
287 | /*-- | ||
288 | Inductively refine the buckets. Kind-of an | ||
289 | "exponential radix sort" (!), inspired by the | ||
290 | Manber-Myers suffix array construction algorithm. | ||
291 | --*/ | ||
292 | |||
293 | /*-- set sentinel bits for block-end detection --*/ | ||
294 | for (i = 0; i < 32; i++) { | ||
295 | SET_BH(nblock + 2*i); | ||
296 | CLEAR_BH(nblock + 2*i + 1); | ||
297 | } | ||
298 | |||
299 | /*-- the log(N) loop --*/ | ||
300 | H = 1; | ||
301 | while (1) { | ||
302 | |||
303 | if (verb >= 4) | ||
304 | VPrintf1 ( " depth %6d has ", H ); | ||
305 | |||
306 | j = 0; | ||
307 | for (i = 0; i < nblock; i++) { | ||
308 | if (ISSET_BH(i)) j = i; | ||
309 | k = fmap[i] - H; if (k < 0) k += nblock; | ||
310 | eclass[k] = j; | ||
311 | } | ||
312 | |||
313 | nNotDone = 0; | ||
314 | r = -1; | ||
315 | while (1) { | ||
316 | |||
317 | /*-- find the next non-singleton bucket --*/ | ||
318 | k = r + 1; | ||
319 | while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; | ||
320 | if (ISSET_BH(k)) { | ||
321 | while (WORD_BH(k) == 0xffffffff) k += 32; | ||
322 | while (ISSET_BH(k)) k++; | ||
323 | } | ||
324 | l = k - 1; | ||
325 | if (l >= nblock) break; | ||
326 | while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; | ||
327 | if (!ISSET_BH(k)) { | ||
328 | while (WORD_BH(k) == 0x00000000) k += 32; | ||
329 | while (!ISSET_BH(k)) k++; | ||
330 | } | ||
331 | r = k - 1; | ||
332 | if (r >= nblock) break; | ||
333 | |||
334 | /*-- now [l, r] bracket current bucket --*/ | ||
335 | if (r > l) { | ||
336 | nNotDone += (r - l + 1); | ||
337 | fallbackQSort3 ( fmap, eclass, l, r ); | ||
338 | |||
339 | /*-- scan bucket and generate header bits-- */ | ||
340 | cc = -1; | ||
341 | for (i = l; i <= r; i++) { | ||
342 | cc1 = eclass[fmap[i]]; | ||
343 | if (cc != cc1) { SET_BH(i); cc = cc1; }; | ||
344 | } | ||
345 | } | ||
346 | } | ||
347 | |||
348 | if (verb >= 4) | ||
349 | VPrintf1 ( "%6d unresolved strings\n", nNotDone ); | ||
350 | |||
351 | H *= 2; | ||
352 | if (H > nblock || nNotDone == 0) break; | ||
353 | } | ||
354 | |||
355 | /*-- | ||
356 | Reconstruct the original block in | ||
357 | eclass16 [0 .. nblock-1] [15:8], since the | ||
358 | previous phase destroyed it. | ||
359 | --*/ | ||
360 | if (verb >= 4) | ||
361 | VPrintf0 ( " reconstructing block ...\n" ); | ||
362 | j = 0; | ||
363 | for (i = 0; i < nblock; i++) { | ||
364 | while (ftabCopy[j] == 0) j++; | ||
365 | ftabCopy[j]--; | ||
366 | eclass16[fmap[i]] = j << 8; | ||
367 | } | ||
368 | AssertH ( j < 256, 1005 ); | ||
369 | } | ||
370 | |||
371 | #undef SET_BH | ||
372 | #undef CLEAR_BH | ||
373 | #undef ISSET_BH | ||
374 | #undef WORD_BH | ||
375 | #undef UNALIGNED_BH | ||
376 | |||
377 | |||
378 | /*---------------------------------------------*/ | ||
379 | /*--- The main, O(N^2 log(N)) sorting ---*/ | ||
380 | /*--- algorithm. Faster for "normal" ---*/ | ||
381 | /*--- non-repetitive blocks. ---*/ | ||
382 | /*---------------------------------------------*/ | ||
383 | |||
384 | /*---------------------------------------------*/ | ||
385 | static | ||
386 | __inline__ | ||
387 | Bool mainGtU ( UInt32 i1, | ||
388 | UInt32 i2, | ||
389 | UInt16* block, | ||
390 | UInt16* quadrant, | ||
391 | UInt32 nblock, | ||
392 | Int32* budget ) | ||
81 | { | 393 | { |
82 | Int32 k; | 394 | Int32 k; |
83 | UChar c1, c2; | ||
84 | UInt16 s1, s2; | 395 | UInt16 s1, s2; |
85 | 396 | ||
86 | AssertD ( i1 != i2, "fullGtU(1)" ); | 397 | AssertD ( i1 != i2, "mainGtU" ); |
87 | 398 | ||
88 | c1 = block[i1]; | 399 | s1 = block[i1]; s2 = block[i2]; |
89 | c2 = block[i2]; | 400 | if (s1 != s2) return (s1 > s2); |
90 | if (c1 != c2) return (c1 > c2); | 401 | i1 += 2; i2 += 2; |
91 | i1++; i2++; | ||
92 | 402 | ||
93 | c1 = block[i1]; | 403 | s1 = block[i1]; s2 = block[i2]; |
94 | c2 = block[i2]; | 404 | if (s1 != s2) return (s1 > s2); |
95 | if (c1 != c2) return (c1 > c2); | 405 | i1 += 2; i2 += 2; |
96 | i1++; i2++; | ||
97 | 406 | ||
98 | c1 = block[i1]; | 407 | s1 = block[i1]; s2 = block[i2]; |
99 | c2 = block[i2]; | 408 | if (s1 != s2) return (s1 > s2); |
100 | if (c1 != c2) return (c1 > c2); | 409 | i1 += 2; i2 += 2; |
101 | i1++; i2++; | ||
102 | 410 | ||
103 | c1 = block[i1]; | 411 | s1 = block[i1]; s2 = block[i2]; |
104 | c2 = block[i2]; | 412 | if (s1 != s2) return (s1 > s2); |
105 | if (c1 != c2) return (c1 > c2); | 413 | i1 += 2; i2 += 2; |
106 | i1++; i2++; | ||
107 | 414 | ||
108 | c1 = block[i1]; | 415 | s1 = block[i1]; s2 = block[i2]; |
109 | c2 = block[i2]; | 416 | if (s1 != s2) return (s1 > s2); |
110 | if (c1 != c2) return (c1 > c2); | 417 | i1 += 2; i2 += 2; |
111 | i1++; i2++; | ||
112 | 418 | ||
113 | c1 = block[i1]; | 419 | s1 = block[i1]; s2 = block[i2]; |
114 | c2 = block[i2]; | 420 | if (s1 != s2) return (s1 > s2); |
115 | if (c1 != c2) return (c1 > c2); | 421 | i1 += 2; i2 += 2; |
116 | i1++; i2++; | ||
117 | 422 | ||
118 | k = nblock; | 423 | k = nblock + 8; |
119 | 424 | ||
120 | do { | 425 | do { |
121 | 426 | ||
122 | c1 = block[i1]; | 427 | s1 = block[i1]; s2 = block[i2]; |
123 | c2 = block[i2]; | ||
124 | if (c1 != c2) return (c1 > c2); | ||
125 | s1 = quadrant[i1]; | ||
126 | s2 = quadrant[i2]; | ||
127 | if (s1 != s2) return (s1 > s2); | 428 | if (s1 != s2) return (s1 > s2); |
128 | i1++; i2++; | 429 | s1 = quadrant[i1]; s2 = quadrant[i2]; |
430 | if (s1 != s2) return (s1 > s2); | ||
431 | i1 += 2; i2 += 2; | ||
129 | 432 | ||
130 | c1 = block[i1]; | 433 | s1 = block[i1]; s2 = block[i2]; |
131 | c2 = block[i2]; | 434 | if (s1 != s2) return (s1 > s2); |
132 | if (c1 != c2) return (c1 > c2); | 435 | s1 = quadrant[i1]; s2 = quadrant[i2]; |
133 | s1 = quadrant[i1]; | ||
134 | s2 = quadrant[i2]; | ||
135 | if (s1 != s2) return (s1 > s2); | 436 | if (s1 != s2) return (s1 > s2); |
136 | i1++; i2++; | 437 | i1 += 2; i2 += 2; |
137 | 438 | ||
138 | c1 = block[i1]; | 439 | s1 = block[i1]; s2 = block[i2]; |
139 | c2 = block[i2]; | ||
140 | if (c1 != c2) return (c1 > c2); | ||
141 | s1 = quadrant[i1]; | ||
142 | s2 = quadrant[i2]; | ||
143 | if (s1 != s2) return (s1 > s2); | 440 | if (s1 != s2) return (s1 > s2); |
144 | i1++; i2++; | 441 | s1 = quadrant[i1]; s2 = quadrant[i2]; |
442 | if (s1 != s2) return (s1 > s2); | ||
443 | i1 += 2; i2 += 2; | ||
145 | 444 | ||
146 | c1 = block[i1]; | 445 | s1 = block[i1]; s2 = block[i2]; |
147 | c2 = block[i2]; | 446 | if (s1 != s2) return (s1 > s2); |
148 | if (c1 != c2) return (c1 > c2); | 447 | s1 = quadrant[i1]; s2 = quadrant[i2]; |
149 | s1 = quadrant[i1]; | ||
150 | s2 = quadrant[i2]; | ||
151 | if (s1 != s2) return (s1 > s2); | 448 | if (s1 != s2) return (s1 > s2); |
152 | i1++; i2++; | 449 | i1 += 2; i2 += 2; |
153 | 450 | ||
154 | if (i1 >= nblock) i1 -= nblock; | 451 | if (i1 >= nblock) i1 -= nblock; |
155 | if (i2 >= nblock) i2 -= nblock; | 452 | if (i2 >= nblock) i2 -= nblock; |
156 | 453 | ||
157 | k -= 4; | 454 | k -= 8; |
158 | (*workDone)++; | 455 | (*budget)--; |
159 | } | 456 | } |
160 | while (k >= 0); | 457 | while (k >= 0); |
161 | 458 | ||
162 | return False; | 459 | return False; |
163 | } | 460 | } |
164 | 461 | ||
462 | |||
165 | /*---------------------------------------------*/ | 463 | /*---------------------------------------------*/ |
166 | /*-- | 464 | /*-- |
167 | Knuth's increments seem to work better | 465 | Knuth's increments seem to work better |
@@ -169,22 +467,22 @@ static __inline__ Bool fullGtU ( UChar* block, | |||
169 | because the number of elems to sort is | 467 | because the number of elems to sort is |
170 | usually small, typically <= 20. | 468 | usually small, typically <= 20. |
171 | --*/ | 469 | --*/ |
172 | static Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, | 470 | Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, |
173 | 9841, 29524, 88573, 265720, | 471 | 9841, 29524, 88573, 265720, |
174 | 797161, 2391484 }; | 472 | 797161, 2391484 }; |
175 | 473 | ||
176 | static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d ) | 474 | static |
475 | void mainSimpleSort ( UInt32* ptr, | ||
476 | UInt16* block, | ||
477 | UInt16* quadrant, | ||
478 | Int32 nblock, | ||
479 | Int32 lo, | ||
480 | Int32 hi, | ||
481 | Int32 d, | ||
482 | Int32* budget ) | ||
177 | { | 483 | { |
178 | Int32 i, j, h, bigN, hp; | 484 | Int32 i, j, h, bigN, hp; |
179 | Int32 v; | 485 | UInt32 v; |
180 | |||
181 | UChar* block = s->block; | ||
182 | UInt32* zptr = s->zptr; | ||
183 | UInt16* quadrant = s->quadrant; | ||
184 | Int32* workDone = &(s->workDone); | ||
185 | Int32 nblock = s->nblock; | ||
186 | Int32 workLimit = s->workLimit; | ||
187 | Bool firstAttempt = s->firstAttempt; | ||
188 | 486 | ||
189 | bigN = hi - lo + 1; | 487 | bigN = hi - lo + 1; |
190 | if (bigN < 2) return; | 488 | if (bigN < 2) return; |
@@ -195,49 +493,53 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d ) | |||
195 | 493 | ||
196 | for (; hp >= 0; hp--) { | 494 | for (; hp >= 0; hp--) { |
197 | h = incs[hp]; | 495 | h = incs[hp]; |
496 | |||
198 | i = lo + h; | 497 | i = lo + h; |
199 | while (True) { | 498 | while (True) { |
200 | 499 | ||
201 | /*-- copy 1 --*/ | 500 | /*-- copy 1 --*/ |
202 | if (i > hi) break; | 501 | if (i > hi) break; |
203 | v = zptr[i]; | 502 | v = ptr[i]; |
204 | j = i; | 503 | j = i; |
205 | while ( fullGtU ( block, quadrant, nblock, workDone, | 504 | while ( mainGtU ( |
206 | zptr[j-h]+d, v+d ) ) { | 505 | ptr[j-h]+d, v+d, block, quadrant, nblock, budget |
207 | zptr[j] = zptr[j-h]; | 506 | ) ) { |
507 | ptr[j] = ptr[j-h]; | ||
208 | j = j - h; | 508 | j = j - h; |
209 | if (j <= (lo + h - 1)) break; | 509 | if (j <= (lo + h - 1)) break; |
210 | } | 510 | } |
211 | zptr[j] = v; | 511 | ptr[j] = v; |
212 | i++; | 512 | i++; |
213 | 513 | ||
214 | /*-- copy 2 --*/ | 514 | /*-- copy 2 --*/ |
215 | if (i > hi) break; | 515 | if (i > hi) break; |
216 | v = zptr[i]; | 516 | v = ptr[i]; |
217 | j = i; | 517 | j = i; |
218 | while ( fullGtU ( block, quadrant, nblock, workDone, | 518 | while ( mainGtU ( |
219 | zptr[j-h]+d, v+d ) ) { | 519 | ptr[j-h]+d, v+d, block, quadrant, nblock, budget |
220 | zptr[j] = zptr[j-h]; | 520 | ) ) { |
521 | ptr[j] = ptr[j-h]; | ||
221 | j = j - h; | 522 | j = j - h; |
222 | if (j <= (lo + h - 1)) break; | 523 | if (j <= (lo + h - 1)) break; |
223 | } | 524 | } |
224 | zptr[j] = v; | 525 | ptr[j] = v; |
225 | i++; | 526 | i++; |
226 | 527 | ||
227 | /*-- copy 3 --*/ | 528 | /*-- copy 3 --*/ |
228 | if (i > hi) break; | 529 | if (i > hi) break; |
229 | v = zptr[i]; | 530 | v = ptr[i]; |
230 | j = i; | 531 | j = i; |
231 | while ( fullGtU ( block, quadrant, nblock, workDone, | 532 | while ( mainGtU ( |
232 | zptr[j-h]+d, v+d ) ) { | 533 | ptr[j-h]+d, v+d, block, quadrant, nblock, budget |
233 | zptr[j] = zptr[j-h]; | 534 | ) ) { |
535 | ptr[j] = ptr[j-h]; | ||
234 | j = j - h; | 536 | j = j - h; |
235 | if (j <= (lo + h - 1)) break; | 537 | if (j <= (lo + h - 1)) break; |
236 | } | 538 | } |
237 | zptr[j] = v; | 539 | ptr[j] = v; |
238 | i++; | 540 | i++; |
239 | 541 | ||
240 | if (*workDone > workLimit && firstAttempt) return; | 542 | if (*budget < 0) return; |
241 | } | 543 | } |
242 | } | 544 | } |
243 | } | 545 | } |
@@ -252,20 +554,26 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d ) | |||
252 | Sedgewick and Jon L. Bentley. | 554 | Sedgewick and Jon L. Bentley. |
253 | --*/ | 555 | --*/ |
254 | 556 | ||
255 | #define swap(lv1, lv2) \ | 557 | #define mswap(zz1, zz2) \ |
256 | { Int32 tmp = lv1; lv1 = lv2; lv2 = tmp; } | 558 | { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } |
257 | 559 | ||
258 | static void vswap ( UInt32* zptr, Int32 p1, Int32 p2, Int32 n ) | 560 | #define mvswap(zzp1, zzp2, zzn) \ |
259 | { | 561 | { \ |
260 | while (n > 0) { | 562 | Int32 yyp1 = (zzp1); \ |
261 | swap(zptr[p1], zptr[p2]); | 563 | Int32 yyp2 = (zzp2); \ |
262 | p1++; p2++; n--; | 564 | Int32 yyn = (zzn); \ |
263 | } | 565 | while (yyn > 0) { \ |
566 | mswap(ptr[yyp1], ptr[yyp2]); \ | ||
567 | yyp1++; yyp2++; yyn--; \ | ||
568 | } \ | ||
264 | } | 569 | } |
265 | 570 | ||
266 | static UChar med3 ( UChar a, UChar b, UChar c ) | 571 | |
572 | static | ||
573 | __inline__ | ||
574 | UInt16 mmed3 ( UInt16 a, UInt16 b, UInt16 c ) | ||
267 | { | 575 | { |
268 | UChar t; | 576 | UInt16 t; |
269 | if (a > b) { t = a; a = b; b = t; }; | 577 | if (a > b) { t = a; a = b; b = t; }; |
270 | if (b > c) { t = b; b = c; c = t; }; | 578 | if (b > c) { t = b; b = c; c = t; }; |
271 | if (a > b) b = a; | 579 | if (a > b) b = a; |
@@ -273,66 +581,72 @@ static UChar med3 ( UChar a, UChar b, UChar c ) | |||
273 | } | 581 | } |
274 | 582 | ||
275 | 583 | ||
276 | #define min(a,b) ((a) < (b)) ? (a) : (b) | 584 | #define mmin(a,b) ((a) < (b)) ? (a) : (b) |
277 | 585 | ||
278 | typedef | 586 | #define mpush(lz,hz,dz) { stackLo[sp] = lz; \ |
279 | struct { Int32 ll; Int32 hh; Int32 dd; } | 587 | stackHi[sp] = hz; \ |
280 | StackElem; | 588 | stackD [sp] = dz; \ |
589 | sp++; } | ||
281 | 590 | ||
282 | #define push(lz,hz,dz) { stack[sp].ll = lz; \ | 591 | #define mpop(lz,hz,dz) { sp--; \ |
283 | stack[sp].hh = hz; \ | 592 | lz = stackLo[sp]; \ |
284 | stack[sp].dd = dz; \ | 593 | hz = stackHi[sp]; \ |
285 | sp++; } | 594 | dz = stackD [sp]; } |
286 | 595 | ||
287 | #define pop(lz,hz,dz) { sp--; \ | ||
288 | lz = stack[sp].ll; \ | ||
289 | hz = stack[sp].hh; \ | ||
290 | dz = stack[sp].dd; } | ||
291 | 596 | ||
292 | #define SMALL_THRESH 20 | 597 | #define mnextsize(az) (nextHi[az]-nextLo[az]) |
293 | #define DEPTH_THRESH 10 | 598 | |
599 | #define mnextswap(az,bz) \ | ||
600 | { Int32 tz; \ | ||
601 | tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ | ||
602 | tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ | ||
603 | tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } | ||
294 | 604 | ||
295 | /*-- | ||
296 | If you are ever unlucky/improbable enough | ||
297 | to get a stack overflow whilst sorting, | ||
298 | increase the following constant and try | ||
299 | again. In practice I have never seen the | ||
300 | stack go above 27 elems, so the following | ||
301 | limit seems very generous. | ||
302 | --*/ | ||
303 | #define QSORT_STACK_SIZE 1000 | ||
304 | 605 | ||
606 | #define MAIN_QSORT_SMALL_THRESH 20 | ||
607 | #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) | ||
608 | #define MAIN_QSORT_STACK_SIZE 100 | ||
305 | 609 | ||
306 | static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt ) | 610 | static |
611 | void mainQSort3 ( UInt32* ptr, | ||
612 | UInt16* block, | ||
613 | UInt16* quadrant, | ||
614 | Int32 nblock, | ||
615 | Int32 loSt, | ||
616 | Int32 hiSt, | ||
617 | Int32 dSt, | ||
618 | Int32* budget ) | ||
307 | { | 619 | { |
308 | Int32 unLo, unHi, ltLo, gtHi, med, n, m; | 620 | Int32 unLo, unHi, ltLo, gtHi, n, m, med; |
309 | Int32 sp, lo, hi, d; | 621 | Int32 sp, lo, hi, d; |
310 | StackElem stack[QSORT_STACK_SIZE]; | ||
311 | 622 | ||
312 | UChar* block = s->block; | 623 | Int32 stackLo[MAIN_QSORT_STACK_SIZE]; |
313 | UInt32* zptr = s->zptr; | 624 | Int32 stackHi[MAIN_QSORT_STACK_SIZE]; |
314 | Int32* workDone = &(s->workDone); | 625 | Int32 stackD [MAIN_QSORT_STACK_SIZE]; |
315 | Int32 workLimit = s->workLimit; | 626 | |
316 | Bool firstAttempt = s->firstAttempt; | 627 | Int32 nextLo[3]; |
628 | Int32 nextHi[3]; | ||
629 | Int32 nextD [3]; | ||
317 | 630 | ||
318 | sp = 0; | 631 | sp = 0; |
319 | push ( loSt, hiSt, dSt ); | 632 | mpush ( loSt, hiSt, dSt ); |
320 | 633 | ||
321 | while (sp > 0) { | 634 | while (sp > 0) { |
322 | 635 | ||
323 | AssertH ( sp < QSORT_STACK_SIZE, 1001 ); | 636 | AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 ); |
324 | 637 | ||
325 | pop ( lo, hi, d ); | 638 | mpop ( lo, hi, d ); |
326 | 639 | if (hi - lo < MAIN_QSORT_SMALL_THRESH || | |
327 | if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) { | 640 | d > MAIN_QSORT_DEPTH_THRESH) { |
328 | simpleSort ( s, lo, hi, d ); | 641 | mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); |
329 | if (*workDone > workLimit && firstAttempt) return; | 642 | if (*budget < 0) return; |
330 | continue; | 643 | continue; |
331 | } | 644 | } |
332 | 645 | ||
333 | med = med3 ( block[zptr[ lo ]+d], | 646 | med = (Int32) |
334 | block[zptr[ hi ]+d], | 647 | mmed3 ( block[ptr[ lo ]+d], |
335 | block[zptr[ (lo+hi)>>1 ]+d] ); | 648 | block[ptr[ hi ]+d], |
649 | block[ptr[ (lo+hi)>>1 ]+d] ); | ||
336 | 650 | ||
337 | unLo = ltLo = lo; | 651 | unLo = ltLo = lo; |
338 | unHi = gtHi = hi; | 652 | unHi = gtHi = hi; |
@@ -340,370 +654,412 @@ static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt ) | |||
340 | while (True) { | 654 | while (True) { |
341 | while (True) { | 655 | while (True) { |
342 | if (unLo > unHi) break; | 656 | if (unLo > unHi) break; |
343 | n = ((Int32)block[zptr[unLo]+d]) - med; | 657 | n = ((Int32)block[ptr[unLo]+d]) - med; |
344 | if (n == 0) { swap(zptr[unLo], zptr[ltLo]); ltLo++; unLo++; continue; }; | 658 | if (n == 0) { |
659 | mswap(ptr[unLo], ptr[ltLo]); | ||
660 | ltLo++; unLo++; continue; | ||
661 | }; | ||
345 | if (n > 0) break; | 662 | if (n > 0) break; |
346 | unLo++; | 663 | unLo++; |
347 | } | 664 | } |
348 | while (True) { | 665 | while (True) { |
349 | if (unLo > unHi) break; | 666 | if (unLo > unHi) break; |
350 | n = ((Int32)block[zptr[unHi]+d]) - med; | 667 | n = ((Int32)block[ptr[unHi]+d]) - med; |
351 | if (n == 0) { swap(zptr[unHi], zptr[gtHi]); gtHi--; unHi--; continue; }; | 668 | if (n == 0) { |
669 | mswap(ptr[unHi], ptr[gtHi]); | ||
670 | gtHi--; unHi--; continue; | ||
671 | }; | ||
352 | if (n < 0) break; | 672 | if (n < 0) break; |
353 | unHi--; | 673 | unHi--; |
354 | } | 674 | } |
355 | if (unLo > unHi) break; | 675 | if (unLo > unHi) break; |
356 | swap(zptr[unLo], zptr[unHi]); unLo++; unHi--; | 676 | mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; |
357 | } | 677 | } |
358 | 678 | ||
359 | AssertD ( unHi == unLo-1, "bad termination in qSort3" ); | 679 | AssertD ( unHi == unLo-1, "mainQSort3(2)" ); |
360 | 680 | ||
361 | if (gtHi < ltLo) { | 681 | if (gtHi < ltLo) { |
362 | push(lo, hi, d+1 ); | 682 | mpush(lo, hi, d+2 ); |
363 | continue; | 683 | continue; |
364 | } | 684 | } |
365 | 685 | ||
366 | n = min(ltLo-lo, unLo-ltLo); vswap(zptr, lo, unLo-n, n); | 686 | n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); |
367 | m = min(hi-gtHi, gtHi-unHi); vswap(zptr, unLo, hi-m+1, m); | 687 | m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); |
368 | 688 | ||
369 | n = lo + unLo - ltLo - 1; | 689 | n = lo + unLo - ltLo - 1; |
370 | m = hi - (gtHi - unHi) + 1; | 690 | m = hi - (gtHi - unHi) + 1; |
371 | 691 | ||
372 | push ( lo, n, d ); | 692 | nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; |
373 | push ( n+1, m-1, d+1 ); | 693 | nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; |
374 | push ( m, hi, d ); | 694 | nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+2; |
695 | |||
696 | if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); | ||
697 | if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); | ||
698 | if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); | ||
699 | |||
700 | AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); | ||
701 | AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); | ||
702 | |||
703 | mpush (nextLo[0], nextHi[0], nextD[0]); | ||
704 | mpush (nextLo[1], nextHi[1], nextD[1]); | ||
705 | mpush (nextLo[2], nextHi[2], nextD[2]); | ||
375 | } | 706 | } |
376 | } | 707 | } |
377 | 708 | ||
709 | #undef mswap | ||
710 | #undef mvswap | ||
711 | #undef mpush | ||
712 | #undef mpop | ||
713 | #undef mmin | ||
714 | #undef mnextsize | ||
715 | #undef mnextswap | ||
716 | #undef MAIN_QSORT_SMALL_THRESH | ||
717 | #undef MAIN_QSORT_DEPTH_THRESH | ||
718 | #undef MAIN_QSORT_STACK_SIZE | ||
719 | |||
378 | 720 | ||
379 | /*---------------------------------------------*/ | 721 | /*---------------------------------------------*/ |
722 | /* Pre: | ||
723 | nblock > N_OVERSHOOT | ||
724 | block32 exists for [0 .. nblock-1 +N_OVERSHOOT] | ||
725 | ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block | ||
726 | ptr exists for [0 .. nblock-1] | ||
727 | |||
728 | Post: | ||
729 | ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block | ||
730 | All other areas of block32 destroyed | ||
731 | ftab [0 .. 65536 ] destroyed | ||
732 | ptr [0 .. nblock-1] holds sorted order | ||
733 | if (*budget < 0), sorting was abandoned | ||
734 | */ | ||
380 | 735 | ||
381 | #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) | 736 | #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) |
382 | |||
383 | #define SETMASK (1 << 21) | 737 | #define SETMASK (1 << 21) |
384 | #define CLEARMASK (~(SETMASK)) | 738 | #define CLEARMASK (~(SETMASK)) |
385 | 739 | ||
386 | static void sortMain ( EState* s ) | 740 | static |
741 | void mainSort ( UInt32* ptr, | ||
742 | UInt16* block, | ||
743 | UInt16* quadrant, | ||
744 | UInt32* ftab, | ||
745 | Int32 nblock, | ||
746 | Int32 verb, | ||
747 | Int32* budget ) | ||
387 | { | 748 | { |
388 | Int32 i, j, k, ss, sb; | 749 | Int32 i, j, k, m, ss, sb; |
389 | Int32 runningOrder[256]; | 750 | Int32 runningOrder[256]; |
390 | Int32 copy[256]; | 751 | Int32 copy[256]; |
391 | Bool bigDone[256]; | 752 | Bool bigDone[256]; |
392 | UChar c1, c2; | 753 | UChar c1; |
393 | Int32 numQSorted; | 754 | Int32 numQSorted; |
394 | 755 | Int32 biggestSoFar; | |
395 | UChar* block = s->block; | 756 | UInt16 s; |
396 | UInt32* zptr = s->zptr; | 757 | |
397 | UInt16* quadrant = s->quadrant; | 758 | if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); |
398 | Int32* ftab = s->ftab; | 759 | |
399 | Int32* workDone = &(s->workDone); | 760 | /*-- Stripe the block data into 16 bits, and at the |
400 | Int32 nblock = s->nblock; | 761 | same time set up the 2-byte frequency table |
401 | Int32 workLimit = s->workLimit; | ||
402 | Bool firstAttempt = s->firstAttempt; | ||
403 | |||
404 | /*-- | ||
405 | In the various block-sized structures, live data runs | ||
406 | from 0 to last+NUM_OVERSHOOT_BYTES inclusive. First, | ||
407 | set up the overshoot area for block. | ||
408 | --*/ | 762 | --*/ |
763 | for (i = 65536; i >= 0; i--) ftab[i] = 0; | ||
764 | |||
765 | s = block[0]; | ||
766 | for (i = 1; i < nblock; i++) { | ||
767 | quadrant[i] = 0; | ||
768 | s = (s << 8) | block[i]; | ||
769 | block[i-1] = s; | ||
770 | ftab[s]++; | ||
771 | } | ||
772 | quadrant[0] = 0; | ||
773 | s = (s << 8) | (block[0] >> 8); | ||
774 | block[nblock-1] = s; | ||
775 | ftab[s]++; | ||
776 | |||
777 | /*-- (emphasises close relationship of block & quadrant) --*/ | ||
778 | for (i = 0; i < BZ_N_OVERSHOOT; i++) { | ||
779 | block [nblock+i] = block[i]; | ||
780 | quadrant[nblock+i] = 0; | ||
781 | } | ||
409 | 782 | ||
410 | if (s->verbosity >= 4) | 783 | if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); |
411 | VPrintf0( " sort initialise ...\n" ); | ||
412 | |||
413 | for (i = 0; i < BZ_NUM_OVERSHOOT_BYTES; i++) | ||
414 | block[nblock+i] = block[i % nblock]; | ||
415 | for (i = 0; i < nblock+BZ_NUM_OVERSHOOT_BYTES; i++) | ||
416 | quadrant[i] = 0; | ||
417 | |||
418 | |||
419 | if (nblock <= 4000) { | ||
420 | |||
421 | /*-- | ||
422 | Use simpleSort(), since the full sorting mechanism | ||
423 | has quite a large constant overhead. | ||
424 | --*/ | ||
425 | if (s->verbosity >= 4) VPrintf0( " simpleSort ...\n" ); | ||
426 | for (i = 0; i < nblock; i++) zptr[i] = i; | ||
427 | firstAttempt = False; | ||
428 | *workDone = workLimit = 0; | ||
429 | simpleSort ( s, 0, nblock-1, 0 ); | ||
430 | if (s->verbosity >= 4) VPrintf0( " simpleSort done.\n" ); | ||
431 | 784 | ||
432 | } else { | 785 | /*-- Complete the initial radix sort --*/ |
786 | for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; | ||
433 | 787 | ||
434 | numQSorted = 0; | 788 | for (i = 0; i < nblock; i++) { |
435 | for (i = 0; i <= 255; i++) bigDone[i] = False; | 789 | s = block[i]; |
790 | j = ftab[s] - 1; | ||
791 | ftab[s] = j; | ||
792 | ptr[j] = i; | ||
793 | } | ||
436 | 794 | ||
437 | if (s->verbosity >= 4) VPrintf0( " bucket sorting ...\n" ); | 795 | /*-- |
796 | Now ftab contains the first loc of every small bucket. | ||
797 | Calculate the running order, from smallest to largest | ||
798 | big bucket. | ||
799 | --*/ | ||
800 | for (i = 0; i <= 255; i++) { | ||
801 | bigDone [i] = False; | ||
802 | runningOrder[i] = i; | ||
803 | } | ||
438 | 804 | ||
439 | for (i = 0; i <= 65536; i++) ftab[i] = 0; | 805 | { |
806 | Int32 vv; | ||
807 | Int32 h = 1; | ||
808 | do h = 3 * h + 1; while (h <= 256); | ||
809 | do { | ||
810 | h = h / 3; | ||
811 | for (i = h; i <= 255; i++) { | ||
812 | vv = runningOrder[i]; | ||
813 | j = i; | ||
814 | while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { | ||
815 | runningOrder[j] = runningOrder[j-h]; | ||
816 | j = j - h; | ||
817 | if (j <= (h - 1)) goto zero; | ||
818 | } | ||
819 | zero: | ||
820 | runningOrder[j] = vv; | ||
821 | } | ||
822 | } while (h != 1); | ||
823 | } | ||
440 | 824 | ||
441 | c1 = block[nblock-1]; | 825 | /*-- |
442 | for (i = 0; i < nblock; i++) { | 826 | The main sorting loop. |
443 | c2 = block[i]; | 827 | --*/ |
444 | ftab[(c1 << 8) + c2]++; | ||
445 | c1 = c2; | ||
446 | } | ||
447 | 828 | ||
448 | for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; | 829 | biggestSoFar = numQSorted = 0; |
449 | 830 | ||
450 | c1 = block[0]; | 831 | for (i = 0; i <= 255; i++) { |
451 | for (i = 0; i < nblock-1; i++) { | ||
452 | c2 = block[i+1]; | ||
453 | j = (c1 << 8) + c2; | ||
454 | c1 = c2; | ||
455 | ftab[j]--; | ||
456 | zptr[ftab[j]] = i; | ||
457 | } | ||
458 | j = (block[nblock-1] << 8) + block[0]; | ||
459 | ftab[j]--; | ||
460 | zptr[ftab[j]] = nblock-1; | ||
461 | 832 | ||
462 | /*-- | 833 | /*-- |
463 | Now ftab contains the first loc of every small bucket. | 834 | Process big buckets, starting with the least full. |
464 | Calculate the running order, from smallest to largest | 835 | Basically this is a 4-step process in which we call |
465 | big bucket. | 836 | mainQSort3 to sort the small buckets [ss, j], but |
837 | also make a big effort to avoid the calls if we can. | ||
466 | --*/ | 838 | --*/ |
839 | ss = runningOrder[i]; | ||
467 | 840 | ||
468 | for (i = 0; i <= 255; i++) runningOrder[i] = i; | 841 | /*-- |
469 | 842 | Step 1: | |
470 | { | 843 | Complete the big bucket [ss] by quicksorting |
471 | Int32 vv; | 844 | any unsorted small buckets [ss, j], for j != ss. |
472 | Int32 h = 1; | 845 | Hopefully previous pointer-scanning phases have already |
473 | do h = 3 * h + 1; while (h <= 256); | 846 | completed many of the small buckets [ss, j], so |
474 | do { | 847 | we don't have to sort them at all. |
475 | h = h / 3; | 848 | --*/ |
476 | for (i = h; i <= 255; i++) { | 849 | for (j = 0; j <= 255; j++) { |
477 | vv = runningOrder[i]; | 850 | if (j != ss) { |
478 | j = i; | 851 | sb = (ss << 8) + j; |
479 | while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { | 852 | if ( ! (ftab[sb] & SETMASK) ) { |
480 | runningOrder[j] = runningOrder[j-h]; | 853 | Int32 lo = ftab[sb] & CLEARMASK; |
481 | j = j - h; | 854 | Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; |
482 | if (j <= (h - 1)) goto zero; | 855 | if (hi > lo) { |
856 | if (verb >= 4) | ||
857 | VPrintf4 ( " qsort [0x%x, 0x%x] " | ||
858 | "done %d this %d\n", | ||
859 | ss, j, numQSorted, hi - lo + 1 ); | ||
860 | mainQSort3 ( | ||
861 | ptr, block, quadrant, nblock, | ||
862 | lo, hi, BZ_N_RADIX, budget | ||
863 | ); | ||
864 | numQSorted += (hi - lo + 1); | ||
865 | if (*budget < 0) return; | ||
483 | } | 866 | } |
484 | zero: | ||
485 | runningOrder[j] = vv; | ||
486 | } | 867 | } |
487 | } while (h != 1); | 868 | ftab[sb] |= SETMASK; |
869 | } | ||
488 | } | 870 | } |
489 | 871 | ||
490 | /*-- | 872 | /*-- |
491 | The main sorting loop. | 873 | Step 2: |
874 | Deal specially with case [ss, ss]. This establishes the | ||
875 | sorted order for [ss, ss] without any comparisons. | ||
876 | A clever trick, cryptically described as steps Q6b and Q6c | ||
877 | in SRC-124 (aka BW94). Compared to bzip2, this makes it | ||
878 | practical not to use a preliminary run-length coder. | ||
492 | --*/ | 879 | --*/ |
880 | { | ||
881 | Int32 put0, get0, put1, get1; | ||
882 | Int32 sbn = (ss << 8) + ss; | ||
883 | Int32 lo = ftab[sbn] & CLEARMASK; | ||
884 | Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1; | ||
885 | UChar ssc = (UChar)ss; | ||
886 | put0 = lo; | ||
887 | get0 = ftab[ss << 8] & CLEARMASK; | ||
888 | put1 = hi; | ||
889 | get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1; | ||
890 | while (get0 < put0) { | ||
891 | j = ptr[get0]-1; if (j < 0) j += nblock; | ||
892 | c1 = (UChar)(block[j] >> 8); | ||
893 | if (c1 == ssc) { ptr[put0] = j; put0++; }; | ||
894 | get0++; | ||
895 | } | ||
896 | while (get1 > put1) { | ||
897 | j = ptr[get1]-1; if (j < 0) j += nblock; | ||
898 | c1 = (UChar)(block[j] >> 8); | ||
899 | if (c1 == ssc) { ptr[put1] = j; put1--; }; | ||
900 | get1--; | ||
901 | } | ||
902 | ftab[sbn] |= SETMASK; | ||
903 | } | ||
493 | 904 | ||
494 | for (i = 0; i <= 255; i++) { | 905 | /*-- |
495 | 906 | Step 3: | |
496 | /*-- | 907 | The [ss] big bucket is now done. Record this fact, |
497 | Process big buckets, starting with the least full. | 908 | and update the quadrant descriptors. Remember to |
498 | Basically this is a 4-step process in which we call | 909 | update quadrants in the overshoot area too, if |
499 | qSort3 to sort the small buckets [ss, j], but | 910 | necessary. The "if (i < 255)" test merely skips |
500 | also make a big effort to avoid the calls if we can. | 911 | this updating for the last bucket processed, since |
501 | --*/ | 912 | updating for the last bucket is pointless. |
502 | ss = runningOrder[i]; | 913 | |
503 | 914 | The quadrant array provides a way to incrementally | |
504 | /*-- | 915 | cache sort orderings, as they appear, so as to |
505 | Step 1: | 916 | make subsequent comparisons in fullGtU() complete |
506 | Complete the big bucket [ss] by quicksorting | 917 | faster. For repetitive blocks this makes a big |
507 | any unsorted small buckets [ss, j], for j != ss. | 918 | difference (but not big enough to be able to avoid |
508 | Hopefully previous pointer-scanning phases have already | 919 | the fallback sorting mechanism, exponential radix sort). |
509 | completed many of the small buckets [ss, j], so | 920 | |
510 | we don't have to sort them at all. | 921 | The precise meaning is: at all times: |
511 | --*/ | 922 | |
512 | for (j = 0; j <= 255; j++) { | 923 | for 0 <= i < nblock and 0 <= j <= nblock |
513 | if (j != ss) { | 924 | |
514 | sb = (ss << 8) + j; | 925 | if block[i] != block[j], |
515 | if ( ! (ftab[sb] & SETMASK) ) { | 926 | |
516 | Int32 lo = ftab[sb] & CLEARMASK; | 927 | then the relative values of quadrant[i] and |
517 | Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; | 928 | quadrant[j] are meaningless. |
518 | if (hi > lo) { | 929 | |
519 | if (s->verbosity >= 4) | 930 | else { |
520 | VPrintf4( " qsort [0x%x, 0x%x] done %d this %d\n", | 931 | if quadrant[i] < quadrant[j] |
521 | ss, j, numQSorted, hi - lo + 1 ); | 932 | then the string starting at i lexicographically |
522 | qSort3 ( s, lo, hi, 2 ); | 933 | precedes the string starting at j |
523 | numQSorted += ( hi - lo + 1 ); | 934 | |
524 | if (*workDone > workLimit && firstAttempt) return; | 935 | else if quadrant[i] > quadrant[j] |
525 | } | 936 | then the string starting at j lexicographically |
937 | precedes the string starting at i | ||
938 | |||
939 | else | ||
940 | the relative ordering of the strings starting | ||
941 | at i and j has not yet been determined. | ||
526 | } | 942 | } |
527 | ftab[sb] |= SETMASK; | 943 | --*/ |
528 | } | 944 | bigDone[ss] = True; |
529 | } | ||
530 | 945 | ||
531 | /*-- | 946 | if (i < 255) { |
532 | Step 2: | 947 | Int32 bbStart = ftab[ss << 8] & CLEARMASK; |
533 | Deal specially with case [ss, ss]. This establishes the | 948 | Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; |
534 | sorted order for [ss, ss] without any comparisons. | 949 | Int32 shifts = 0; |
535 | A clever trick, cryptically described as steps Q6b and Q6c | ||
536 | in SRC-124 (aka BW94). This makes it entirely practical to | ||
537 | not use a preliminary run-length coder, but unfortunately | ||
538 | we are now stuck with the .bz2 file format. | ||
539 | --*/ | ||
540 | { | ||
541 | Int32 put0, get0, put1, get1; | ||
542 | Int32 sbn = (ss << 8) + ss; | ||
543 | Int32 lo = ftab[sbn] & CLEARMASK; | ||
544 | Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1; | ||
545 | UChar ssc = (UChar)ss; | ||
546 | put0 = lo; | ||
547 | get0 = ftab[ss << 8] & CLEARMASK; | ||
548 | put1 = hi; | ||
549 | get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1; | ||
550 | while (get0 < put0) { | ||
551 | j = zptr[get0]-1; if (j < 0) j += nblock; | ||
552 | c1 = block[j]; | ||
553 | if (c1 == ssc) { zptr[put0] = j; put0++; }; | ||
554 | get0++; | ||
555 | } | ||
556 | while (get1 > put1) { | ||
557 | j = zptr[get1]-1; if (j < 0) j += nblock; | ||
558 | c1 = block[j]; | ||
559 | if (c1 == ssc) { zptr[put1] = j; put1--; }; | ||
560 | get1--; | ||
561 | } | ||
562 | ftab[sbn] |= SETMASK; | ||
563 | } | ||
564 | 950 | ||
565 | /*-- | 951 | while ((bbSize >> shifts) > 65534) shifts++; |
566 | Step 3: | ||
567 | The [ss] big bucket is now done. Record this fact, | ||
568 | and update the quadrant descriptors. Remember to | ||
569 | update quadrants in the overshoot area too, if | ||
570 | necessary. The "if (i < 255)" test merely skips | ||
571 | this updating for the last bucket processed, since | ||
572 | updating for the last bucket is pointless. | ||
573 | |||
574 | The quadrant array provides a way to incrementally | ||
575 | cache sort orderings, as they appear, so as to | ||
576 | make subsequent comparisons in fullGtU() complete | ||
577 | faster. For repetitive blocks this makes a big | ||
578 | difference (but not big enough to be able to avoid | ||
579 | randomisation for very repetitive data.) | ||
580 | |||
581 | The precise meaning is: at all times: | ||
582 | |||
583 | for 0 <= i < nblock and 0 <= j <= nblock | ||
584 | |||
585 | if block[i] != block[j], | ||
586 | |||
587 | then the relative values of quadrant[i] and | ||
588 | quadrant[j] are meaningless. | ||
589 | |||
590 | else { | ||
591 | if quadrant[i] < quadrant[j] | ||
592 | then the string starting at i lexicographically | ||
593 | precedes the string starting at j | ||
594 | |||
595 | else if quadrant[i] > quadrant[j] | ||
596 | then the string starting at j lexicographically | ||
597 | precedes the string starting at i | ||
598 | |||
599 | else | ||
600 | the relative ordering of the strings starting | ||
601 | at i and j has not yet been determined. | ||
602 | } | ||
603 | --*/ | ||
604 | bigDone[ss] = True; | ||
605 | |||
606 | if (i < 255) { | ||
607 | Int32 bbStart = ftab[ss << 8] & CLEARMASK; | ||
608 | Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; | ||
609 | Int32 shifts = 0; | ||
610 | |||
611 | while ((bbSize >> shifts) > 65534) shifts++; | ||
612 | |||
613 | for (j = 0; j < bbSize; j++) { | ||
614 | Int32 a2update = zptr[bbStart + j]; | ||
615 | UInt16 qVal = (UInt16)(j >> shifts); | ||
616 | quadrant[a2update] = qVal; | ||
617 | if (a2update < BZ_NUM_OVERSHOOT_BYTES) | ||
618 | quadrant[a2update + nblock] = qVal; | ||
619 | } | ||
620 | 952 | ||
621 | AssertH ( ( ((bbSize-1) >> shifts) <= 65535 ), 1002 ); | 953 | for (j = 0; j < bbSize; j++) { |
954 | Int32 a2update = ptr[bbStart + j]; | ||
955 | UInt16 qVal = (UInt16)(j >> shifts); | ||
956 | quadrant[a2update] = qVal; | ||
957 | if (a2update < BZ_N_OVERSHOOT) | ||
958 | quadrant[a2update + nblock] = qVal; | ||
622 | } | 959 | } |
960 | AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); | ||
961 | } | ||
623 | 962 | ||
624 | /*-- | 963 | /*-- |
625 | Step 4: | 964 | Step 4: |
626 | Now scan this big bucket [ss] so as to synthesise the | 965 | Now scan this big bucket [ss] so as to synthesise the |
627 | sorted order for small buckets [t, ss] for all t != ss. | 966 | sorted order for small buckets [t, ss] for all t != ss. |
628 | This will avoid doing Real Work in subsequent Step 1's. | 967 | This will avoid doing Real Work in subsequent Step 1's. |
629 | --*/ | 968 | --*/ |
630 | for (j = 0; j <= 255; j++) | 969 | for (j = 0; j <= 255; j++) |
631 | copy[j] = ftab[(j << 8) + ss] & CLEARMASK; | 970 | copy[j] = ftab[(j << 8) + ss] & CLEARMASK; |
632 | 971 | ||
633 | for (j = ftab[ss << 8] & CLEARMASK; | 972 | m = ftab[(ss+1) << 8] & CLEARMASK; |
634 | j < (ftab[(ss+1) << 8] & CLEARMASK); | 973 | for (j = ftab[ss << 8] & CLEARMASK; j < m; j++) { |
635 | j++) { | 974 | k = ptr[j] - 1; if (k < 0) k += nblock; |
636 | k = zptr[j]-1; if (k < 0) k += nblock; | 975 | c1 = (UChar)(block[k] >> 8); |
637 | c1 = block[k]; | 976 | if ( ! bigDone[c1] ) { |
638 | if ( ! bigDone[c1] ) { | 977 | ptr[copy[c1]] = k; |
639 | zptr[copy[c1]] = k; | 978 | copy[c1] ++; |
640 | copy[c1] ++; | ||
641 | } | ||
642 | } | 979 | } |
643 | |||
644 | for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; | ||
645 | } | 980 | } |
646 | if (s->verbosity >= 4) | ||
647 | VPrintf3( " %d pointers, %d sorted, %d scanned\n", | ||
648 | nblock, numQSorted, nblock - numQSorted ); | ||
649 | } | ||
650 | } | ||
651 | |||
652 | 981 | ||
653 | /*---------------------------------------------*/ | 982 | for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; |
654 | static void randomiseBlock ( EState* s ) | ||
655 | { | ||
656 | Int32 i; | ||
657 | BZ_RAND_INIT_MASK; | ||
658 | for (i = 0; i < 256; i++) s->inUse[i] = False; | ||
659 | |||
660 | for (i = 0; i < s->nblock; i++) { | ||
661 | BZ_RAND_UPD_MASK; | ||
662 | s->block[i] ^= BZ_RAND_MASK; | ||
663 | s->inUse[s->block[i]] = True; | ||
664 | } | 983 | } |
984 | |||
985 | if (verb >= 4) | ||
986 | VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", | ||
987 | nblock, numQSorted, nblock - numQSorted ); | ||
665 | } | 988 | } |
666 | 989 | ||
990 | #undef BIGFREQ | ||
991 | #undef SETMASK | ||
992 | #undef CLEARMASK | ||
993 | |||
667 | 994 | ||
668 | /*---------------------------------------------*/ | 995 | /*---------------------------------------------*/ |
996 | /* Pre: | ||
997 | nblock > 0 | ||
998 | arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] | ||
999 | ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block | ||
1000 | arr1 exists for [0 .. nblock-1] | ||
1001 | |||
1002 | Post: | ||
1003 | ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block | ||
1004 | All other areas of block destroyed | ||
1005 | ftab [ 0 .. 65536 ] destroyed | ||
1006 | arr1 [0 .. nblock-1] holds sorted order | ||
1007 | */ | ||
669 | void blockSort ( EState* s ) | 1008 | void blockSort ( EState* s ) |
670 | { | 1009 | { |
671 | Int32 i; | 1010 | UInt32* ptr = s->ptr; |
672 | 1011 | UInt16* block = s->block; | |
673 | s->workLimit = s->workFactor * (s->nblock - 1); | 1012 | UInt32* ftab = s->ftab; |
674 | s->workDone = 0; | 1013 | Int32 nblock = s->nblock; |
675 | s->blockRandomised = False; | 1014 | Int32 verb = s->verbosity; |
676 | s->firstAttempt = True; | 1015 | Int32 wfact = s->workFactor; |
677 | 1016 | UInt16* quadrant; | |
678 | sortMain ( s ); | 1017 | Int32 budget; |
679 | 1018 | Int32 budgetInit; | |
680 | if (s->verbosity >= 3) | 1019 | Int32 i; |
681 | VPrintf3( " %d work, %d block, ratio %5.2f\n", | 1020 | |
682 | s->workDone, s->nblock-1, | 1021 | if (nblock < 10000) { |
683 | (float)(s->workDone) / (float)(s->nblock-1) ); | 1022 | for (i = 0; i < nblock; i++) block[i] <<= 8; |
684 | 1023 | fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); | |
685 | if (s->workDone > s->workLimit && s->firstAttempt) { | 1024 | } else { |
686 | if (s->verbosity >= 2) | 1025 | quadrant = &(block[nblock+BZ_N_OVERSHOOT]); |
687 | VPrintf0( " sorting aborted; randomising block\n" ); | 1026 | |
688 | randomiseBlock ( s ); | 1027 | /* (wfact-1) / 3 puts the default-factor-30 |
689 | s->workLimit = s->workDone = 0; | 1028 | transition point at very roughly the same place as |
690 | s->blockRandomised = True; | 1029 | with v0.1 and v0.9.0. |
691 | s->firstAttempt = False; | 1030 | Not that it particularly matters any more, since the |
692 | sortMain ( s ); | 1031 | resulting compressed stream is now the same regardless |
693 | if (s->verbosity >= 3) | 1032 | of whether or not we use the main sort or fallback sort. |
694 | VPrintf3( " %d work, %d block, ratio %f\n", | 1033 | */ |
695 | s->workDone, s->nblock-1, | 1034 | if (wfact < 1 ) wfact = 1; |
696 | (float)(s->workDone) / (float)(s->nblock-1) ); | 1035 | if (wfact > 100) wfact = 100; |
1036 | budgetInit = nblock * ((wfact-1) / 3); | ||
1037 | budget = budgetInit; | ||
1038 | |||
1039 | mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); | ||
1040 | if (verb >= 3) | ||
1041 | VPrintf3 ( " %d work, %d block, ratio %5.2f\n", | ||
1042 | budgetInit - budget, | ||
1043 | nblock, | ||
1044 | (float)(budgetInit - budget) / | ||
1045 | (float)(nblock==0 ? 1 : nblock) ); | ||
1046 | if (budget < 0) { | ||
1047 | if (verb >= 2) | ||
1048 | VPrintf0 ( " too repetitive; using fallback" | ||
1049 | " sorting algorithm\n" ); | ||
1050 | fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); | ||
1051 | } | ||
697 | } | 1052 | } |
698 | 1053 | ||
699 | s->origPtr = -1; | 1054 | s->origPtr = -1; |
700 | for (i = 0; i < s->nblock; i++) | 1055 | for (i = 0; i < s->nblock; i++) |
701 | if (s->zptr[i] == 0) | 1056 | if (ptr[i] == 0) |
702 | { s->origPtr = i; break; }; | 1057 | { s->origPtr = i; break; }; |
703 | 1058 | ||
704 | AssertH( s->origPtr != -1, 1003 ); | 1059 | AssertH( s->origPtr != -1, 1003 ); |
705 | } | 1060 | } |
706 | 1061 | ||
1062 | |||
707 | /*-------------------------------------------------------------*/ | 1063 | /*-------------------------------------------------------------*/ |
708 | /*--- end blocksort.c ---*/ | 1064 | /*--- end blocksort.c ---*/ |
709 | /*-------------------------------------------------------------*/ | 1065 | /*-------------------------------------------------------------*/ |