From f93cd82a9a7094ad90fd19bbc6ccf6f4627f8060 Mon Sep 17 00:00:00 2001
From: Julian Seward <jseward@acm.org>
Date: Sat, 4 Sep 1999 22:13:13 +0200
Subject: bzip2-0.9.5d

---
 blocksort.c | 1242 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 799 insertions(+), 443 deletions(-)

(limited to 'blocksort.c')

diff --git a/blocksort.c b/blocksort.c
index d8bb26a..85a02de 100644
--- a/blocksort.c
+++ b/blocksort.c
@@ -8,7 +8,7 @@
   This file is a part of bzip2 and/or libbzip2, a program and
   library for lossless, block-sorting data compression.
 
-  Copyright (C) 1996-1998 Julian R Seward.  All rights reserved.
+  Copyright (C) 1996-1999 Julian R Seward.  All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -41,9 +41,9 @@
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-  Julian Seward, Guildford, Surrey, UK.
+  Julian Seward, Cambridge, UK.
   jseward@acm.org
-  bzip2/libbzip2 version 0.9.0c of 18 October 1998
+  bzip2/libbzip2 version 0.9.5 of 24 May 1999
 
   This program is based on (at least) the work of:
      Mike Burrows
@@ -62,106 +62,404 @@
 #include "bzlib_private.h"
 
 /*---------------------------------------------*/
-/*--
-  Compare two strings in block.  We assume (see
-  discussion above) that i1 and i2 have a max
-  offset of 10 on entry, and that the first
-  bytes of both block and quadrant have been
-  copied into the "overshoot area", ie
-  into the subscript range
-  [nblock .. nblock+NUM_OVERSHOOT_BYTES-1].
---*/
-static __inline__ Bool fullGtU ( UChar*  block,
-                                 UInt16* quadrant,
-                                 UInt32  nblock,
-                                 Int32*  workDone, 
-                                 Int32   i1, 
-                                 Int32   i2
-                               )
+/*--- Fallback O(N log(N)^2) sorting        ---*/
+/*--- algorithm, for repetitive blocks      ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
+static 
+__inline__
+void fallbackSimpleSort ( UInt32* fmap, 
+                          UInt32* eclass, 
+                          Int32   lo, 
+                          Int32   hi )
+{
+   Int32 i, j, tmp;
+   UInt32 ec_tmp;
+
+   if (lo == hi) return;
+
+   if (hi - lo > 3) {
+      for ( i = hi-4; i >= lo; i-- ) {
+         tmp = fmap[i];
+         ec_tmp = eclass[tmp];
+         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
+            fmap[j-4] = fmap[j];
+         fmap[j-4] = tmp;
+      }
+   }
+
+   for ( i = hi-1; i >= lo; i-- ) {
+      tmp = fmap[i];
+      ec_tmp = eclass[tmp];
+      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
+         fmap[j-1] = fmap[j];
+      fmap[j-1] = tmp;
+   }
+}
+
+
+/*---------------------------------------------*/
+#define fswap(zz1, zz2) \
+   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
+
+#define fvswap(zzp1, zzp2, zzn)       \
+{                                     \
+   Int32 yyp1 = (zzp1);               \
+   Int32 yyp2 = (zzp2);               \
+   Int32 yyn  = (zzn);                \
+   while (yyn > 0) {                  \
+      fswap(fmap[yyp1], fmap[yyp2]);  \
+      yyp1++; yyp2++; yyn--;          \
+   }                                  \
+}
+
+
+#define fmin(a,b) ((a) < (b)) ? (a) : (b)
+
+#define fpush(lz,hz) { stackLo[sp] = lz; \
+                       stackHi[sp] = hz; \
+                       sp++; }
+
+#define fpop(lz,hz) { sp--;              \
+                      lz = stackLo[sp];  \
+                      hz = stackHi[sp]; }
+
+#define FALLBACK_QSORT_SMALL_THRESH 10
+#define FALLBACK_QSORT_STACK_SIZE   100
+
+
+static
+void fallbackQSort3 ( UInt32* fmap, 
+                      UInt32* eclass,
+                      Int32   loSt, 
+                      Int32   hiSt )
+{
+   Int32 unLo, unHi, ltLo, gtHi, n, m;
+   Int32 sp, lo, hi;
+   UInt32 med, r, r3;
+   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
+   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
+
+   r = 0;
+
+   sp = 0;
+   fpush ( loSt, hiSt );
+
+   while (sp > 0) {
+
+      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
+
+      fpop ( lo, hi );
+      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
+         fallbackSimpleSort ( fmap, eclass, lo, hi );
+         continue;
+      }
+
+      /* Random partitioning.  Median of 3 sometimes fails to
+         avoid bad cases.  Median of 9 seems to help but 
+         looks rather expensive.  This too seems to work but
+         is cheaper.  Guidance for the magic constants 
+         7621 and 32768 is taken from Sedgewick's algorithms
+         book, chapter 35.
+      */
+      r = ((r * 7621) + 1) % 32768;
+      r3 = r % 3;
+      if (r3 == 0) med = eclass[fmap[lo]]; else
+      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
+                   med = eclass[fmap[hi]];
+
+      unLo = ltLo = lo;
+      unHi = gtHi = hi;
+
+      while (1) {
+         while (1) {
+            if (unLo > unHi) break;
+            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
+            if (n == 0) { 
+               fswap(fmap[unLo], fmap[ltLo]); 
+               ltLo++; unLo++; 
+               continue; 
+            };
+            if (n > 0) break;
+            unLo++;
+         }
+         while (1) {
+            if (unLo > unHi) break;
+            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
+            if (n == 0) { 
+               fswap(fmap[unHi], fmap[gtHi]); 
+               gtHi--; unHi--; 
+               continue; 
+            };
+            if (n < 0) break;
+            unHi--;
+         }
+         if (unLo > unHi) break;
+         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
+      }
+
+      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
+
+      if (gtHi < ltLo) continue;
+
+      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
+      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
+
+      n = lo + unLo - ltLo - 1;
+      m = hi - (gtHi - unHi) + 1;
+
+      if (n - lo > hi - m) {
+         fpush ( lo, n );
+         fpush ( m, hi );
+      } else {
+         fpush ( m, hi );
+         fpush ( lo, n );
+      }
+   }
+}
+
+#undef fmin
+#undef fpush
+#undef fpop
+#undef fswap
+#undef fvswap
+#undef FALLBACK_QSORT_SMALL_THRESH
+#undef FALLBACK_QSORT_STACK_SIZE
+
+
+/*---------------------------------------------*/
+/* Pre:
+      nblock > 0
+      eclass exists for [0 .. nblock-1]
+      ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block
+      ptr exists for [0 .. nblock-1]
+
+   Post:
+      ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block
+      All other areas of eclass destroyed
+      fmap [0 .. nblock-1] holds sorted order
+      bhtab [ 0 .. 2+(nblock/32) ] destroyed
+*/
+
+#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
+#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
+#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
+#define      WORD_BH(zz)  bhtab[(zz) >> 5]
+#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
+
+static
+void fallbackSort ( UInt32* fmap, 
+                    UInt32* eclass, 
+                    UInt32* bhtab,
+                    Int32   nblock,
+                    Int32   verb )
+{
+   Int32 ftab[257];
+   Int32 ftabCopy[256];
+   Int32 H, i, j, k, l, r, cc, cc1;
+   Int32 nNotDone;
+   Int32 nBhtab;
+   UInt16* eclass16 = (UInt16*)eclass;
+
+   /*--
+      Initial 1-char radix sort to generate
+      initial fmap and initial BH bits.
+   --*/
+   if (verb >= 4)
+      VPrintf0 ( "        bucket sorting ...\n" );
+   for (i = 0; i < 257;    i++) ftab[i] = 0;
+   for (i = 0; i < nblock; i++) ftab[eclass16[i] >> 8]++;
+   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
+   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
+
+   for (i = 0; i < nblock; i++) {
+      j = eclass16[i] >> 8;
+      k = ftab[j] - 1;
+      ftab[j] = k;
+      fmap[k] = i;
+   }
+
+   nBhtab = 2 + (nblock / 32);
+   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
+   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
+
+   /*--
+      Inductively refine the buckets.  Kind-of an
+      "exponential radix sort" (!), inspired by the
+      Manber-Myers suffix array construction algorithm.
+   --*/
+
+   /*-- set sentinel bits for block-end detection --*/
+   for (i = 0; i < 32; i++) { 
+      SET_BH(nblock + 2*i);
+      CLEAR_BH(nblock + 2*i + 1);
+   }
+
+   /*-- the log(N) loop --*/
+   H = 1;
+   while (1) {
+
+      if (verb >= 4) 
+         VPrintf1 ( "        depth %6d has ", H );
+
+      j = 0;
+      for (i = 0; i < nblock; i++) {
+         if (ISSET_BH(i)) j = i;
+         k = fmap[i] - H; if (k < 0) k += nblock;
+         eclass[k] = j;
+      }
+
+      nNotDone = 0;
+      r = -1;
+      while (1) {
+
+	 /*-- find the next non-singleton bucket --*/
+         k = r + 1;
+         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
+         if (ISSET_BH(k)) {
+            while (WORD_BH(k) == 0xffffffff) k += 32;
+            while (ISSET_BH(k)) k++;
+         }
+         l = k - 1;
+         if (l >= nblock) break;
+         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
+         if (!ISSET_BH(k)) {
+            while (WORD_BH(k) == 0x00000000) k += 32;
+            while (!ISSET_BH(k)) k++;
+         }
+         r = k - 1;
+         if (r >= nblock) break;
+
+         /*-- now [l, r] bracket current bucket --*/
+         if (r > l) {
+            nNotDone += (r - l + 1);
+            fallbackQSort3 ( fmap, eclass, l, r );
+
+            /*-- scan bucket and generate header bits-- */
+            cc = -1;
+            for (i = l; i <= r; i++) {
+               cc1 = eclass[fmap[i]];
+               if (cc != cc1) { SET_BH(i); cc = cc1; };
+            }
+         }
+      }
+
+      if (verb >= 4) 
+         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
+
+      H *= 2;
+      if (H > nblock || nNotDone == 0) break;
+   }
+
+   /*-- 
+      Reconstruct the original block in
+      eclass16 [0 .. nblock-1] [15:8], since the
+      previous phase destroyed it.
+   --*/
+   if (verb >= 4)
+      VPrintf0 ( "        reconstructing block ...\n" );
+   j = 0;
+   for (i = 0; i < nblock; i++) {
+      while (ftabCopy[j] == 0) j++;
+      ftabCopy[j]--;
+      eclass16[fmap[i]] = j << 8;
+   }
+   AssertH ( j < 256, 1005 );
+}
+
+#undef       SET_BH
+#undef     CLEAR_BH
+#undef     ISSET_BH
+#undef      WORD_BH
+#undef UNALIGNED_BH
+
+
+/*---------------------------------------------*/
+/*--- The main, O(N^2 log(N)) sorting       ---*/
+/*--- algorithm.  Faster for "normal"       ---*/
+/*--- non-repetitive blocks.                ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
+static
+__inline__
+Bool mainGtU ( UInt32  i1, 
+               UInt32  i2,
+               UInt16* block, 
+               UInt16* quadrant,
+               UInt32  nblock,
+               Int32*  budget )
 {
    Int32 k;
-   UChar c1, c2;
    UInt16 s1, s2;
 
-   AssertD ( i1 != i2, "fullGtU(1)" );
+   AssertD ( i1 != i2, "mainGtU" );
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   c1 = block[i1];
-   c2 = block[i2];
-   if (c1 != c2) return (c1 > c2);
-   i1++; i2++;
+   s1 = block[i1]; s2 = block[i2];
+   if (s1 != s2) return (s1 > s2);
+   i1 += 2; i2 += 2;
 
-   k = nblock;
+   k = nblock + 8;
 
    do {
 
-      c1 = block[i1];
-      c2 = block[i2];
-      if (c1 != c2) return (c1 > c2);
-      s1 = quadrant[i1];
-      s2 = quadrant[i2];
+      s1 = block[i1]; s2 = block[i2];
       if (s1 != s2) return (s1 > s2);
-      i1++; i2++;
+      s1 = quadrant[i1]; s2 = quadrant[i2];
+      if (s1 != s2) return (s1 > s2);
+      i1 += 2; i2 += 2;
 
-      c1 = block[i1];
-      c2 = block[i2];
-      if (c1 != c2) return (c1 > c2);
-      s1 = quadrant[i1];
-      s2 = quadrant[i2];
+      s1 = block[i1]; s2 = block[i2];
+      if (s1 != s2) return (s1 > s2);
+      s1 = quadrant[i1]; s2 = quadrant[i2];
       if (s1 != s2) return (s1 > s2);
-      i1++; i2++;
+      i1 += 2; i2 += 2;
 
-      c1 = block[i1];
-      c2 = block[i2];
-      if (c1 != c2) return (c1 > c2);
-      s1 = quadrant[i1];
-      s2 = quadrant[i2];
+      s1 = block[i1]; s2 = block[i2];
       if (s1 != s2) return (s1 > s2);
-      i1++; i2++;
+      s1 = quadrant[i1]; s2 = quadrant[i2];
+      if (s1 != s2) return (s1 > s2);
+      i1 += 2; i2 += 2;
 
-      c1 = block[i1];
-      c2 = block[i2];
-      if (c1 != c2) return (c1 > c2);
-      s1 = quadrant[i1];
-      s2 = quadrant[i2];
+      s1 = block[i1]; s2 = block[i2];
+      if (s1 != s2) return (s1 > s2);
+      s1 = quadrant[i1]; s2 = quadrant[i2];
       if (s1 != s2) return (s1 > s2);
-      i1++; i2++;
+      i1 += 2; i2 += 2;
 
       if (i1 >= nblock) i1 -= nblock;
       if (i2 >= nblock) i2 -= nblock;
 
-      k -= 4;
-      (*workDone)++;
+      k -= 8;
+      (*budget)--;
    }
       while (k >= 0);
 
    return False;
 }
 
+
 /*---------------------------------------------*/
 /*--
    Knuth's increments seem to work better
@@ -169,22 +467,22 @@ static __inline__ Bool fullGtU ( UChar*  block,
    because the number of elems to sort is
    usually small, typically <= 20.
 --*/
-static Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
-                          9841, 29524, 88573, 265720,
-                          797161, 2391484 };
-
-static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
+Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
+                   9841, 29524, 88573, 265720,
+                   797161, 2391484 };
+
+static
+void mainSimpleSort ( UInt32* ptr,
+                      UInt16* block,
+                      UInt16* quadrant,
+                      Int32   nblock,
+                      Int32   lo, 
+                      Int32   hi, 
+                      Int32   d,
+                      Int32*  budget )
 {
    Int32 i, j, h, bigN, hp;
-   Int32 v;
-
-   UChar*  block        = s->block;
-   UInt32* zptr         = s->zptr;
-   UInt16* quadrant     = s->quadrant;
-   Int32*  workDone     = &(s->workDone);
-   Int32   nblock       = s->nblock;
-   Int32   workLimit    = s->workLimit;
-   Bool    firstAttempt = s->firstAttempt;
+   UInt32 v;
 
    bigN = hi - lo + 1;
    if (bigN < 2) return;
@@ -195,49 +493,53 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
 
    for (; hp >= 0; hp--) {
       h = incs[hp];
+
       i = lo + h;
       while (True) {
 
          /*-- copy 1 --*/
          if (i > hi) break;
-         v = zptr[i];
+         v = ptr[i];
          j = i;
-         while ( fullGtU ( block, quadrant, nblock, workDone,
-                           zptr[j-h]+d, v+d ) ) {
-            zptr[j] = zptr[j-h];
+         while ( mainGtU ( 
+                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
+                 ) ) {
+            ptr[j] = ptr[j-h];
             j = j - h;
             if (j <= (lo + h - 1)) break;
          }
-         zptr[j] = v;
+         ptr[j] = v;
          i++;
 
          /*-- copy 2 --*/
          if (i > hi) break;
-         v = zptr[i];
+         v = ptr[i];
          j = i;
-         while ( fullGtU ( block, quadrant, nblock, workDone,
-                 zptr[j-h]+d, v+d ) ) {
-            zptr[j] = zptr[j-h];
+         while ( mainGtU ( 
+                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
+                 ) ) {
+            ptr[j] = ptr[j-h];
             j = j - h;
             if (j <= (lo + h - 1)) break;
          }
-         zptr[j] = v;
+         ptr[j] = v;
          i++;
 
          /*-- copy 3 --*/
          if (i > hi) break;
-         v = zptr[i];
+         v = ptr[i];
          j = i;
-         while ( fullGtU ( block, quadrant, nblock, workDone,
-                           zptr[j-h]+d, v+d ) ) {
-            zptr[j] = zptr[j-h];
+         while ( mainGtU ( 
+                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
+                 ) ) {
+            ptr[j] = ptr[j-h];
             j = j - h;
             if (j <= (lo + h - 1)) break;
          }
-         zptr[j] = v;
+         ptr[j] = v;
          i++;
 
-         if (*workDone > workLimit && firstAttempt) return;
+         if (*budget < 0) return;
       }
    }
 }
@@ -252,20 +554,26 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
    Sedgewick and Jon L. Bentley.
 --*/
 
-#define swap(lv1, lv2) \
-   { Int32 tmp = lv1; lv1 = lv2; lv2 = tmp; }
-
-static void vswap ( UInt32* zptr, Int32 p1, Int32 p2, Int32 n )
-{
-   while (n > 0) {
-      swap(zptr[p1], zptr[p2]);
-      p1++; p2++; n--;
-   }
+#define mswap(zz1, zz2) \
+   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
+
+#define mvswap(zzp1, zzp2, zzn)       \
+{                                     \
+   Int32 yyp1 = (zzp1);               \
+   Int32 yyp2 = (zzp2);               \
+   Int32 yyn  = (zzn);                \
+   while (yyn > 0) {                  \
+      mswap(ptr[yyp1], ptr[yyp2]);    \
+      yyp1++; yyp2++; yyn--;          \
+   }                                  \
 }
 
-static UChar med3 ( UChar a, UChar b, UChar c )
+
+static 
+__inline__
+UInt16 mmed3 ( UInt16 a, UInt16 b, UInt16 c )
 {
-   UChar t;
+   UInt16 t;
    if (a > b) { t = a; a = b; b = t; };
    if (b > c) { t = b; b = c; c = t; };
    if (a > b)          b = a;
@@ -273,66 +581,72 @@ static UChar med3 ( UChar a, UChar b, UChar c )
 }
 
 
-#define min(a,b) ((a) < (b)) ? (a) : (b)
+#define mmin(a,b) ((a) < (b)) ? (a) : (b)
 
-typedef
-   struct { Int32 ll; Int32 hh; Int32 dd; }
-   StackElem;
+#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
+                          stackHi[sp] = hz; \
+                          stackD [sp] = dz; \
+                          sp++; }
 
-#define push(lz,hz,dz) { stack[sp].ll = lz; \
-                         stack[sp].hh = hz; \
-                         stack[sp].dd = dz; \
-                         sp++; }
+#define mpop(lz,hz,dz) { sp--;             \
+                         lz = stackLo[sp]; \
+                         hz = stackHi[sp]; \
+                         dz = stackD [sp]; }
 
-#define pop(lz,hz,dz) { sp--;               \
-                        lz = stack[sp].ll;  \
-                        hz = stack[sp].hh;  \
-                        dz = stack[sp].dd; }
 
-#define SMALL_THRESH 20
-#define DEPTH_THRESH 10
+#define mnextsize(az) (nextHi[az]-nextLo[az])
+
+#define mnextswap(az,bz)                                        \
+   { Int32 tz;                                                  \
+     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
+     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
+     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 
-/*--
-   If you are ever unlucky/improbable enough
-   to get a stack overflow whilst sorting,
-   increase the following constant and try
-   again.  In practice I have never seen the
-   stack go above 27 elems, so the following
-   limit seems very generous.
---*/
-#define QSORT_STACK_SIZE 1000
 
+#define MAIN_QSORT_SMALL_THRESH 20
+#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
+#define MAIN_QSORT_STACK_SIZE 100
 
-static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt )
+static
+void mainQSort3 ( UInt32* ptr,
+                  UInt16* block,
+                  UInt16* quadrant,
+                  Int32   nblock,
+                  Int32   loSt, 
+                  Int32   hiSt, 
+                  Int32   dSt,
+                  Int32*  budget )
 {
-   Int32 unLo, unHi, ltLo, gtHi, med, n, m;
+   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
    Int32 sp, lo, hi, d;
-   StackElem stack[QSORT_STACK_SIZE];
 
-   UChar*  block        = s->block;
-   UInt32* zptr         = s->zptr;
-   Int32*  workDone     = &(s->workDone);
-   Int32   workLimit    = s->workLimit;
-   Bool    firstAttempt = s->firstAttempt;
+   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
+   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
+   Int32 stackD [MAIN_QSORT_STACK_SIZE];
+
+   Int32 nextLo[3];
+   Int32 nextHi[3];
+   Int32 nextD [3];
 
    sp = 0;
-   push ( loSt, hiSt, dSt );
+   mpush ( loSt, hiSt, dSt );
 
    while (sp > 0) {
 
-      AssertH ( sp < QSORT_STACK_SIZE, 1001 );
+      AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
 
-      pop ( lo, hi, d );
-
-      if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) {
-         simpleSort ( s, lo, hi, d );
-         if (*workDone > workLimit && firstAttempt) return;
+      mpop ( lo, hi, d );
+      if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
+          d > MAIN_QSORT_DEPTH_THRESH) {
+         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
+         if (*budget < 0) return;
          continue;
       }
 
-      med = med3 ( block[zptr[ lo         ]+d],
-                   block[zptr[ hi         ]+d],
-                   block[zptr[ (lo+hi)>>1 ]+d] );
+      med = (Int32) 
+            mmed3 ( block[ptr[ lo         ]+d],
+                    block[ptr[ hi         ]+d],
+                    block[ptr[ (lo+hi)>>1 ]+d] );
 
       unLo = ltLo = lo;
       unHi = gtHi = hi;
@@ -340,370 +654,412 @@ static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt )
       while (True) {
          while (True) {
             if (unLo > unHi) break;
-            n = ((Int32)block[zptr[unLo]+d]) - med;
-            if (n == 0) { swap(zptr[unLo], zptr[ltLo]); ltLo++; unLo++; continue; };
+            n = ((Int32)block[ptr[unLo]+d]) - med;
+            if (n == 0) { 
+               mswap(ptr[unLo], ptr[ltLo]); 
+               ltLo++; unLo++; continue; 
+            };
             if (n >  0) break;
             unLo++;
          }
          while (True) {
             if (unLo > unHi) break;
-            n = ((Int32)block[zptr[unHi]+d]) - med;
-            if (n == 0) { swap(zptr[unHi], zptr[gtHi]); gtHi--; unHi--; continue; };
+            n = ((Int32)block[ptr[unHi]+d]) - med;
+            if (n == 0) { 
+               mswap(ptr[unHi], ptr[gtHi]); 
+               gtHi--; unHi--; continue; 
+            };
             if (n <  0) break;
             unHi--;
          }
          if (unLo > unHi) break;
-         swap(zptr[unLo], zptr[unHi]); unLo++; unHi--;
+         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
       }
 
-      AssertD ( unHi == unLo-1, "bad termination in qSort3" );
+      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 
       if (gtHi < ltLo) {
-         push(lo, hi, d+1 );
+         mpush(lo, hi, d+2 );
          continue;
       }
 
-      n = min(ltLo-lo, unLo-ltLo); vswap(zptr, lo, unLo-n, n);
-      m = min(hi-gtHi, gtHi-unHi); vswap(zptr, unLo, hi-m+1, m);
+      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
+      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 
       n = lo + unLo - ltLo - 1;
       m = hi - (gtHi - unHi) + 1;
 
-      push ( lo, n, d );
-      push ( n+1, m-1, d+1 );
-      push ( m, hi, d );
+      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
+      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
+      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+2;
+
+      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
+      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+
+      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
+      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
+
+      mpush (nextLo[0], nextHi[0], nextD[0]);
+      mpush (nextLo[1], nextHi[1], nextD[1]);
+      mpush (nextLo[2], nextHi[2], nextD[2]);
    }
 }
 
+#undef mswap
+#undef mvswap
+#undef mpush
+#undef mpop
+#undef mmin
+#undef mnextsize
+#undef mnextswap
+#undef MAIN_QSORT_SMALL_THRESH
+#undef MAIN_QSORT_DEPTH_THRESH
+#undef MAIN_QSORT_STACK_SIZE
+
 
 /*---------------------------------------------*/
+/* Pre:
+      nblock > N_OVERSHOOT
+      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block
+      ptr exists for [0 .. nblock-1]
+
+   Post:
+      ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block
+      All other areas of block32 destroyed
+      ftab [0 .. 65536 ] destroyed
+      ptr [0 .. nblock-1] holds sorted order
+      if (*budget < 0), sorting was abandoned
+*/
 
 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
-
 #define SETMASK (1 << 21)
 #define CLEARMASK (~(SETMASK))
 
-static void sortMain ( EState* s )
+static
+void mainSort ( UInt32* ptr, 
+                UInt16* block,
+                UInt16* quadrant, 
+                UInt32* ftab,
+                Int32   nblock,
+                Int32   verb,
+                Int32*  budget )
 {
-   Int32 i, j, k, ss, sb;
-   Int32 runningOrder[256];
-   Int32 copy[256];
-   Bool  bigDone[256];
-   UChar c1, c2;
-   Int32 numQSorted;
-
-   UChar*  block        = s->block;
-   UInt32* zptr         = s->zptr;
-   UInt16* quadrant     = s->quadrant;
-   Int32*  ftab         = s->ftab;
-   Int32*  workDone     = &(s->workDone);
-   Int32   nblock       = s->nblock;
-   Int32   workLimit    = s->workLimit;
-   Bool    firstAttempt = s->firstAttempt;
-
-   /*--
-      In the various block-sized structures, live data runs
-      from 0 to last+NUM_OVERSHOOT_BYTES inclusive.  First,
-      set up the overshoot area for block.
+   Int32  i, j, k, m, ss, sb;
+   Int32  runningOrder[256];
+   Int32  copy[256];
+   Bool   bigDone[256];
+   UChar  c1;
+   Int32  numQSorted;
+   Int32  biggestSoFar;
+   UInt16 s;
+
+   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
+
+   /*-- Stripe the block data into 16 bits, and at the
+        same time set up the 2-byte frequency table 
    --*/
+   for (i = 65536; i >= 0; i--) ftab[i] = 0;
+
+   s = block[0];
+   for (i = 1; i < nblock; i++) {
+      quadrant[i] = 0;
+      s = (s << 8) | block[i];
+      block[i-1] = s;
+      ftab[s]++;      
+   }
+   quadrant[0] = 0;
+   s = (s << 8) | (block[0] >> 8);
+   block[nblock-1] = s;
+   ftab[s]++;
+
+   /*-- (emphasises close relationship of block & quadrant) --*/
+   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
+      block   [nblock+i] = block[i];
+      quadrant[nblock+i] = 0;
+   }
 
-   if (s->verbosity >= 4)
-      VPrintf0( "        sort initialise ...\n" );
-
-   for (i = 0; i < BZ_NUM_OVERSHOOT_BYTES; i++)
-       block[nblock+i] = block[i % nblock];
-   for (i = 0; i < nblock+BZ_NUM_OVERSHOOT_BYTES; i++)
-       quadrant[i] = 0;
-
-
-   if (nblock <= 4000) {
-
-      /*--
-         Use simpleSort(), since the full sorting mechanism
-         has quite a large constant overhead.
-      --*/
-      if (s->verbosity >= 4) VPrintf0( "        simpleSort ...\n" );
-      for (i = 0; i < nblock; i++) zptr[i] = i;
-      firstAttempt = False;
-      *workDone = workLimit = 0;
-      simpleSort ( s, 0, nblock-1, 0 );
-      if (s->verbosity >= 4) VPrintf0( "        simpleSort done.\n" );
+   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 
-   } else {
+   /*-- Complete the initial radix sort --*/
+   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 
-      numQSorted = 0;
-      for (i = 0; i <= 255; i++) bigDone[i] = False;
+   for (i = 0; i < nblock; i++) {
+      s = block[i];
+      j = ftab[s] - 1;
+      ftab[s] = j;
+      ptr[j] = i;
+   }
 
-      if (s->verbosity >= 4) VPrintf0( "        bucket sorting ...\n" );
+   /*--
+      Now ftab contains the first loc of every small bucket.
+      Calculate the running order, from smallest to largest
+      big bucket.
+   --*/
+   for (i = 0; i <= 255; i++) {
+      bigDone     [i] = False;
+      runningOrder[i] = i;
+   }
 
-      for (i = 0; i <= 65536; i++) ftab[i] = 0;
+   {
+      Int32 vv;
+      Int32 h = 1;
+      do h = 3 * h + 1; while (h <= 256);
+      do {
+         h = h / 3;
+         for (i = h; i <= 255; i++) {
+            vv = runningOrder[i];
+            j = i;
+            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
+               runningOrder[j] = runningOrder[j-h];
+               j = j - h;
+               if (j <= (h - 1)) goto zero;
+            }
+            zero:
+            runningOrder[j] = vv;
+         }
+      } while (h != 1);
+   }
 
-      c1 = block[nblock-1];
-      for (i = 0; i < nblock; i++) {
-         c2 = block[i];
-         ftab[(c1 << 8) + c2]++;
-         c1 = c2;
-      }
+   /*--
+      The main sorting loop.
+   --*/
 
-      for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
+   biggestSoFar = numQSorted = 0;
 
-      c1 = block[0];
-      for (i = 0; i < nblock-1; i++) {
-         c2 = block[i+1];
-         j = (c1 << 8) + c2;
-         c1 = c2;
-         ftab[j]--;
-         zptr[ftab[j]] = i;
-      }
-      j = (block[nblock-1] << 8) + block[0];
-      ftab[j]--;
-      zptr[ftab[j]] = nblock-1;
+   for (i = 0; i <= 255; i++) {
 
       /*--
-         Now ftab contains the first loc of every small bucket.
-         Calculate the running order, from smallest to largest
-         big bucket.
+         Process big buckets, starting with the least full.
+         Basically this is a 4-step process in which we call
+         mainQSort3 to sort the small buckets [ss, j], but
+         also make a big effort to avoid the calls if we can.
       --*/
+      ss = runningOrder[i];
 
-      for (i = 0; i <= 255; i++) runningOrder[i] = i;
-
-      {
-         Int32 vv;
-         Int32 h = 1;
-         do h = 3 * h + 1; while (h <= 256);
-         do {
-            h = h / 3;
-            for (i = h; i <= 255; i++) {
-               vv = runningOrder[i];
-               j = i;
-               while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
-                  runningOrder[j] = runningOrder[j-h];
-                  j = j - h;
-                  if (j <= (h - 1)) goto zero;
+      /*--
+         Step 1:
+         Complete the big bucket [ss] by quicksorting
+         any unsorted small buckets [ss, j], for j != ss.  
+         Hopefully previous pointer-scanning phases have already
+         completed many of the small buckets [ss, j], so
+         we don't have to sort them at all.
+      --*/
+      for (j = 0; j <= 255; j++) {
+         if (j != ss) {
+            sb = (ss << 8) + j;
+            if ( ! (ftab[sb] & SETMASK) ) {
+               Int32 lo = ftab[sb]   & CLEARMASK;
+               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
+               if (hi > lo) {
+                  if (verb >= 4)
+                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
+                                "done %d   this %d\n",
+                                ss, j, numQSorted, hi - lo + 1 );
+                  mainQSort3 ( 
+                     ptr, block, quadrant, nblock, 
+                     lo, hi, BZ_N_RADIX, budget 
+                  );   
+                  numQSorted += (hi - lo + 1);
+                  if (*budget < 0) return;
                }
-               zero:
-               runningOrder[j] = vv;
             }
-         } while (h != 1);
+            ftab[sb] |= SETMASK;
+         }
       }
 
       /*--
-         The main sorting loop.
+         Step 2:
+         Deal specially with case [ss, ss].  This establishes the
+         sorted order for [ss, ss] without any comparisons.  
+         A clever trick, cryptically described as steps Q6b and Q6c
+         in SRC-124 (aka BW94).  Compared to bzip2, this makes it
+         practical not to use a preliminary run-length coder.
       --*/
+      {
+         Int32 put0, get0, put1, get1;
+         Int32 sbn = (ss << 8) + ss;
+         Int32 lo = ftab[sbn] & CLEARMASK;
+         Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1;
+         UChar ssc = (UChar)ss;
+         put0 = lo;
+         get0 = ftab[ss << 8] & CLEARMASK;
+         put1 = hi;
+         get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1;
+         while (get0 < put0) {
+            j = ptr[get0]-1; if (j < 0) j += nblock;
+            c1 = (UChar)(block[j] >> 8);
+            if (c1 == ssc) { ptr[put0] = j; put0++; };
+            get0++;
+         }
+         while (get1 > put1) {
+            j = ptr[get1]-1; if (j < 0) j += nblock;
+            c1 = (UChar)(block[j] >> 8);
+            if (c1 == ssc) { ptr[put1] = j; put1--; };
+            get1--;
+         }
+         ftab[sbn] |= SETMASK;
+      }
 
-      for (i = 0; i <= 255; i++) {
-
-         /*--
-            Process big buckets, starting with the least full.
-            Basically this is a 4-step process in which we call
-            qSort3 to sort the small buckets [ss, j], but
-            also make a big effort to avoid the calls if we can.
-         --*/
-         ss = runningOrder[i];
-
-         /*--
-            Step 1:
-            Complete the big bucket [ss] by quicksorting
-            any unsorted small buckets [ss, j], for j != ss.  
-            Hopefully previous pointer-scanning phases have already
-            completed many of the small buckets [ss, j], so
-            we don't have to sort them at all.
-         --*/
-         for (j = 0; j <= 255; j++) {
-            if (j != ss) {
-               sb = (ss << 8) + j;
-               if ( ! (ftab[sb] & SETMASK) ) {
-                  Int32 lo = ftab[sb]   & CLEARMASK;
-                  Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
-                  if (hi > lo) {
-                     if (s->verbosity >= 4)
-                        VPrintf4( "        qsort [0x%x, 0x%x]   done %d   this %d\n",
-                                  ss, j, numQSorted, hi - lo + 1 );
-                     qSort3 ( s, lo, hi, 2 );
-                     numQSorted += ( hi - lo + 1 );
-                     if (*workDone > workLimit && firstAttempt) return;
-                  }
+      /*--
+         Step 3:
+         The [ss] big bucket is now done.  Record this fact,
+         and update the quadrant descriptors.  Remember to
+         update quadrants in the overshoot area too, if
+         necessary.  The "if (i < 255)" test merely skips
+         this updating for the last bucket processed, since
+         updating for the last bucket is pointless.
+
+         The quadrant array provides a way to incrementally
+         cache sort orderings, as they appear, so as to 
+         make subsequent comparisons in fullGtU() complete
+         faster.  For repetitive blocks this makes a big
+         difference (but not big enough to be able to avoid
+         the fallback sorting mechanism, exponential radix sort).
+
+         The precise meaning is: at all times:
+
+            for 0 <= i < nblock and 0 <= j <= nblock
+
+            if block[i] != block[j], 
+
+               then the relative values of quadrant[i] and 
+                    quadrant[j] are meaningless.
+
+               else {
+                  if quadrant[i] < quadrant[j]
+                     then the string starting at i lexicographically
+                     precedes the string starting at j
+
+                  else if quadrant[i] > quadrant[j]
+                     then the string starting at j lexicographically
+                     precedes the string starting at i
+
+                  else
+                     the relative ordering of the strings starting
+                     at i and j has not yet been determined.
                }
-               ftab[sb] |= SETMASK;
-            }
-         }
+      --*/
+      bigDone[ss] = True;
 
-         /*--
-            Step 2:
-            Deal specially with case [ss, ss].  This establishes the
-            sorted order for [ss, ss] without any comparisons.  
-            A clever trick, cryptically described as steps Q6b and Q6c
-            in SRC-124 (aka BW94).  This makes it entirely practical to
-            not use a preliminary run-length coder, but unfortunately
-            we are now stuck with the .bz2 file format.
-         --*/
-         {
-            Int32 put0, get0, put1, get1;
-            Int32 sbn = (ss << 8) + ss;
-            Int32 lo = ftab[sbn] & CLEARMASK;
-            Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1;
-            UChar ssc = (UChar)ss;
-            put0 = lo;
-            get0 = ftab[ss << 8] & CLEARMASK;
-            put1 = hi;
-            get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1;
-            while (get0 < put0) {
-               j = zptr[get0]-1; if (j < 0) j += nblock;
-               c1 = block[j];
-               if (c1 == ssc) { zptr[put0] = j; put0++; };
-               get0++;
-            }
-            while (get1 > put1) {
-               j = zptr[get1]-1; if (j < 0) j += nblock;
-               c1 = block[j];
-               if (c1 == ssc) { zptr[put1] = j; put1--; };
-               get1--;
-            }
-            ftab[sbn] |= SETMASK;
-         }
+      if (i < 255) {
+         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
+         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
+         Int32 shifts   = 0;
 
-         /*--
-            Step 3:
-            The [ss] big bucket is now done.  Record this fact,
-            and update the quadrant descriptors.  Remember to
-            update quadrants in the overshoot area too, if
-            necessary.  The "if (i < 255)" test merely skips
-            this updating for the last bucket processed, since
-            updating for the last bucket is pointless.
-
-            The quadrant array provides a way to incrementally
-            cache sort orderings, as they appear, so as to 
-            make subsequent comparisons in fullGtU() complete
-            faster.  For repetitive blocks this makes a big
-            difference (but not big enough to be able to avoid
-            randomisation for very repetitive data.)
-
-            The precise meaning is: at all times:
-
-               for 0 <= i < nblock and 0 <= j <= nblock
-
-               if block[i] != block[j], 
-
-                  then the relative values of quadrant[i] and 
-                       quadrant[j] are meaningless.
-
-                  else {
-                     if quadrant[i] < quadrant[j]
-                        then the string starting at i lexicographically
-                        precedes the string starting at j
-
-                     else if quadrant[i] > quadrant[j]
-                        then the string starting at j lexicographically
-                        precedes the string starting at i
-
-                     else
-                        the relative ordering of the strings starting
-                        at i and j has not yet been determined.
-                  }
-         --*/
-         bigDone[ss] = True;
-
-         if (i < 255) {
-            Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
-            Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
-            Int32 shifts   = 0;
-
-            while ((bbSize >> shifts) > 65534) shifts++;
-
-            for (j = 0; j < bbSize; j++) {
-               Int32 a2update     = zptr[bbStart + j];
-               UInt16 qVal        = (UInt16)(j >> shifts);
-               quadrant[a2update] = qVal;
-               if (a2update < BZ_NUM_OVERSHOOT_BYTES)
-                  quadrant[a2update + nblock] = qVal;
-            }
+         while ((bbSize >> shifts) > 65534) shifts++;
 
-            AssertH ( ( ((bbSize-1) >> shifts) <= 65535 ), 1002 );
+         for (j = 0; j < bbSize; j++) {
+            Int32 a2update     = ptr[bbStart + j];
+            UInt16 qVal        = (UInt16)(j >> shifts);
+            quadrant[a2update] = qVal;
+            if (a2update < BZ_N_OVERSHOOT)
+               quadrant[a2update + nblock] = qVal;
          }
+         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
+      }
 
-         /*--
-            Step 4:
-            Now scan this big bucket [ss] so as to synthesise the
-            sorted order for small buckets [t, ss] for all t != ss.
-            This will avoid doing Real Work in subsequent Step 1's.
-         --*/
-         for (j = 0; j <= 255; j++)
-            copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
-
-         for (j = ftab[ss << 8] & CLEARMASK;
-              j < (ftab[(ss+1) << 8] & CLEARMASK);
-              j++) {
-            k = zptr[j]-1; if (k < 0) k += nblock;
-            c1 = block[k];
-            if ( ! bigDone[c1] ) {
-               zptr[copy[c1]] = k;
-               copy[c1] ++;
-            }
+      /*--
+         Step 4:
+         Now scan this big bucket [ss] so as to synthesise the
+         sorted order for small buckets [t, ss] for all t != ss.
+         This will avoid doing Real Work in subsequent Step 1's.
+      --*/
+      for (j = 0; j <= 255; j++)
+         copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
+
+      m = ftab[(ss+1) << 8] & CLEARMASK;
+      for (j = ftab[ss << 8] & CLEARMASK; j < m; j++) {
+         k = ptr[j] - 1; if (k < 0) k += nblock;
+         c1 = (UChar)(block[k] >> 8);
+         if ( ! bigDone[c1] ) {
+            ptr[copy[c1]] = k;
+            copy[c1] ++;
          }
-
-         for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
       }
-      if (s->verbosity >= 4)
-         VPrintf3( "        %d pointers, %d sorted, %d scanned\n",
-                   nblock, numQSorted, nblock - numQSorted );
-   }
-}
-
 
-/*---------------------------------------------*/
-static void randomiseBlock ( EState* s )
-{
-   Int32 i;
-   BZ_RAND_INIT_MASK;
-   for (i = 0; i < 256; i++) s->inUse[i] = False;
-
-   for (i = 0; i < s->nblock; i++) {
-      BZ_RAND_UPD_MASK;
-      s->block[i] ^= BZ_RAND_MASK;
-      s->inUse[s->block[i]] = True;
+      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
    }
+
+   if (verb >= 4)
+      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
+                 nblock, numQSorted, nblock - numQSorted );
 }
 
+#undef BIGFREQ
+#undef SETMASK
+#undef CLEARMASK
+
 
 /*---------------------------------------------*/
+/* Pre:
+      nblock > 0
+      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block
+      arr1 exists for [0 .. nblock-1]
+
+   Post:
+      ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block
+      All other areas of block destroyed
+      ftab [ 0 .. 65536 ] destroyed
+      arr1 [0 .. nblock-1] holds sorted order
+*/
 void blockSort ( EState* s )
 {
-   Int32 i;
-
-   s->workLimit       = s->workFactor * (s->nblock - 1);
-   s->workDone        = 0;
-   s->blockRandomised = False;
-   s->firstAttempt    = True;
-
-   sortMain ( s );
-
-   if (s->verbosity >= 3)
-      VPrintf3( "      %d work, %d block, ratio %5.2f\n",
-                s->workDone, s->nblock-1, 
-                (float)(s->workDone) / (float)(s->nblock-1) );
-
-   if (s->workDone > s->workLimit && s->firstAttempt) {
-      if (s->verbosity >= 2)
-         VPrintf0( "    sorting aborted; randomising block\n" );
-      randomiseBlock ( s );
-      s->workLimit = s->workDone = 0;
-      s->blockRandomised = True;
-      s->firstAttempt = False;
-      sortMain ( s );
-      if (s->verbosity >= 3)
-         VPrintf3( "      %d work, %d block, ratio %f\n",
-                   s->workDone, s->nblock-1, 
-                   (float)(s->workDone) / (float)(s->nblock-1) );
+   UInt32* ptr    = s->ptr; 
+   UInt16* block  = s->block;
+   UInt32* ftab   = s->ftab;
+   Int32   nblock = s->nblock;
+   Int32   verb   = s->verbosity;
+   Int32   wfact  = s->workFactor;
+   UInt16* quadrant;
+   Int32   budget;
+   Int32   budgetInit;
+   Int32   i;
+
+   if (nblock < 10000) {
+      for (i = 0; i < nblock; i++) block[i] <<= 8;
+      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+   } else {
+      quadrant = &(block[nblock+BZ_N_OVERSHOOT]);
+
+      /* (wfact-1) / 3 puts the default-factor-30
+         transition point at very roughly the same place as 
+         with v0.1 and v0.9.0.  
+         Not that it particularly matters any more, since the
+         resulting compressed stream is now the same regardless
+         of whether or not we use the main sort or fallback sort.
+      */
+      if (wfact < 1  ) wfact = 1;
+      if (wfact > 100) wfact = 100;
+      budgetInit = nblock * ((wfact-1) / 3);
+      budget = budgetInit;
+
+      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
+      if (verb >= 3) 
+         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
+                    budgetInit - budget,
+                    nblock, 
+                    (float)(budgetInit - budget) /
+                    (float)(nblock==0 ? 1 : nblock) ); 
+      if (budget < 0) {
+         if (verb >= 2) 
+            VPrintf0 ( "    too repetitive; using fallback"
+                       " sorting algorithm\n" );
+         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+      }
    }
 
    s->origPtr = -1;
    for (i = 0; i < s->nblock; i++)
-       if (s->zptr[i] == 0)
-          { s->origPtr = i; break; };
+      if (ptr[i] == 0)
+         { s->origPtr = i; break; };
 
    AssertH( s->origPtr != -1, 1003 );
 }
 
+
 /*-------------------------------------------------------------*/
 /*--- end                                       blocksort.c ---*/
 /*-------------------------------------------------------------*/
-- 
cgit v1.2.3-55-g6feb