From 33d134030248633ffa7d60c0a35a783c46da034b Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Thu, 7 Aug 1997 22:13:13 +0200 Subject: bzip2-0.1 --- bzip2.c | 4036 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4036 insertions(+) create mode 100644 bzip2.c (limited to 'bzip2.c') diff --git a/bzip2.c b/bzip2.c new file mode 100644 index 0000000..0fb45fb --- /dev/null +++ b/bzip2.c @@ -0,0 +1,4036 @@ + +/*-----------------------------------------------------------*/ +/*--- A block-sorting, lossless compressor bzip2.c ---*/ +/*-----------------------------------------------------------*/ + +/*-- + This program is bzip2, a lossless, block-sorting data compressor, + version 0.1pl0, dated 17-Aug-1997. + + Copyright (C) 1996, 1997 by Julian Seward. + Guildford, Surrey, UK + email: jseward@acm.org + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + The GNU General Public License is contained in the file LICENSE. + + This program is based on (at least) the work of: + Mike Burrows + David Wheeler + Peter Fenwick + Alistair Moffat + Radford Neal + Ian H. Witten + Robert Sedgewick + Jon L. Bentley + + For more information on these sources, see the file ALGORITHMS. +--*/ + +/*----------------------------------------------------*/ +/*--- IMPORTANT ---*/ +/*----------------------------------------------------*/ + +/*-- + WARNING: + This program (attempts to) compress data by performing several + non-trivial transformations on it. Unless you are 100% familiar + with *all* the algorithms contained herein, and with the + consequences of modifying them, you should NOT meddle with the + compression or decompression machinery. Incorrect changes can + and very likely *will* lead to disasterous loss of data. + + DISCLAIMER: + I TAKE NO RESPONSIBILITY FOR ANY LOSS OF DATA ARISING FROM THE + USE OF THIS PROGRAM, HOWSOEVER CAUSED. + + Every compression of a file implies an assumption that the + compressed file can be decompressed to reproduce the original. + Great efforts in design, coding and testing have been made to + ensure that this program works correctly. However, the + complexity of the algorithms, and, in particular, the presence + of various special cases in the code which occur with very low + but non-zero probability make it impossible to rule out the + possibility of bugs remaining in the program. DO NOT COMPRESS + ANY DATA WITH THIS PROGRAM UNLESS YOU ARE PREPARED TO ACCEPT THE + POSSIBILITY, HOWEVER SMALL, THAT THE DATA WILL NOT BE RECOVERABLE. + + That is not to say this program is inherently unreliable. + Indeed, I very much hope the opposite is true. bzip2 has been + carefully constructed and extensively tested. +--*/ + + + +/*----------------------------------------------------*/ +/*--- and now for something much more pleasant :-) ---*/ +/*----------------------------------------------------*/ + +/*---------------------------------------------*/ +/*-- + Place a 1 beside your platform, and 0 elsewhere. +--*/ + +/*-- + Generic 32-bit Unix. + Also works on 64-bit Unix boxes. +--*/ +#define BZ_UNIX 1 + +/*-- + Win32, as seen by Jacob Navia's excellent + port of (Chris Fraser & David Hanson)'s excellent + lcc compiler. +--*/ +#define BZ_LCCWIN32 0 + + + +/*---------------------------------------------*/ +/*-- + Some stuff for all platforms. +--*/ + +#include +#include +#if DEBUG + #include +#endif +#include +#include +#include +#include + +#define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); } +#define ERROR_IF_NOT_ZERO(i) { if ((i) != 0) ioError(); } +#define ERROR_IF_MINUS_ONE(i) { if ((i) == (-1)) ioError(); } + + +/*---------------------------------------------*/ +/*-- + Platform-specific stuff. +--*/ + +#if BZ_UNIX + #include + #include + #include + #include + #include + + #define Int32 int + #define UInt32 unsigned int + #define Char char + #define UChar unsigned char + #define Int16 short + #define UInt16 unsigned short + + #define PATH_SEP '/' + #define MY_LSTAT lstat + #define MY_S_IFREG S_ISREG + #define MY_STAT stat + + #define APPEND_FILESPEC(root, name) \ + root=snocString((root), (name)) + + #define SET_BINARY_MODE(fd) /**/ + + /*-- + You should try very hard to persuade your C compiler + to inline the bits marked INLINE. Otherwise bzip2 will + run rather slowly. gcc version 2.x is recommended. + --*/ + #ifdef __GNUC__ + #define INLINE inline + #define NORETURN __attribute__ ((noreturn)) + #else + #define INLINE /**/ + #define NORETURN /**/ + #endif +#endif + + + +#if BZ_LCCWIN32 + #include + #include + #include + + #define Int32 int + #define UInt32 unsigned int + #define Int16 short + #define UInt16 unsigned short + #define Char char + #define UChar unsigned char + + #define INLINE /**/ + #define NORETURN /**/ + #define PATH_SEP '\\' + #define MY_LSTAT _stat + #define MY_STAT _stat + #define MY_S_IFREG(x) ((x) & _S_IFREG) + + #if 0 + /*-- lcc-win32 seems to expand wildcards itself --*/ + #define APPEND_FILESPEC(root, spec) \ + do { \ + if ((spec)[0] == '-') { \ + root = snocString((root), (spec)); \ + } else { \ + struct _finddata_t c_file; \ + long hFile; \ + hFile = _findfirst((spec), &c_file); \ + if ( hFile == -1L ) { \ + root = snocString ((root), (spec)); \ + } else { \ + int anInt = 0; \ + while ( anInt == 0 ) { \ + root = snocString((root), \ + &c_file.name[0]); \ + anInt = _findnext(hFile, &c_file); \ + } \ + } \ + } \ + } while ( 0 ) + #else + #define APPEND_FILESPEC(root, name) \ + root = snocString ((root), (name)) + #endif + + #define SET_BINARY_MODE(fd) \ + do { \ + int retVal = setmode ( fileno ( fd ), \ + O_BINARY ); \ + ERROR_IF_MINUS_ONE ( retVal ); \ + } while ( 0 ) + +#endif + + +/*---------------------------------------------*/ +/*-- + Some more stuff for all platforms :-) +--*/ + +#define Bool unsigned char +#define True 1 +#define False 0 + +/*-- + IntNative is your platform's `native' int size. + Only here to avoid probs with 64-bit platforms. +--*/ +#define IntNative int + + +/*-- + change to 1, or compile with -DDEBUG=1 to debug +--*/ +#ifndef DEBUG +#define DEBUG 0 +#endif + + +/*---------------------------------------------------*/ +/*--- ---*/ +/*---------------------------------------------------*/ + +/*-- + Implementation notes, July 1997 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Memory allocation + ~~~~~~~~~~~~~~~~~ + All large data structures are allocated on the C heap, + for better or for worse. That includes the various + arrays of pointers, striped words, bytes, frequency + tables and buffers for compression and decompression. + + bzip2 can operate at various block-sizes, ranging from + 100k to 900k in 100k steps, and it allocates only as + much as it needs to. When compressing, we know from the + command-line options what the block-size is going to be, + so all allocation can be done at start-up; if that + succeeds, there can be no further allocation problems. + + Decompression is more complicated. Each compressed file + contains, in its header, a byte indicating the block + size used for compression. This means bzip2 potentially + needs to reallocate memory for each file it deals with, + which in turn opens the possibility for a memory allocation + failure part way through a run of files, by encountering + a file requiring a much larger block size than all the + ones preceding it. + + The policy is to simply give up if a memory allocation + failure occurs. During decompression, it would be + possible to move on to subsequent files in the hope that + some might ask for a smaller block size, but the + complications for doing this seem more trouble than they + are worth. + + + Compressed file formats + ~~~~~~~~~~~~~~~~~~~~~~~ + [This is now entirely different from both 0.21, and from + any previous Huffman-coded variant of bzip. + See the associated file bzip2.txt for details.] + + + Error conditions + ~~~~~~~~~~~~~~~~ + Dealing with error conditions is the least satisfactory + aspect of bzip2. The policy is to try and leave the + filesystem in a consistent state, then quit, even if it + means not processing some of the files mentioned in the + command line. `A consistent state' means that a file + exists either in its compressed or uncompressed form, + but not both. This boils down to the rule `delete the + output file if an error condition occurs, leaving the + input intact'. Input files are only deleted when we can + be pretty sure the output file has been written and + closed successfully. + + Errors are a dog because there's so many things to + deal with. The following can happen mid-file, and + require cleaning up. + + internal `panics' -- indicating a bug + corrupted or inconsistent compressed file + can't allocate enough memory to decompress this file + I/O error reading/writing/opening/closing + signal catches -- Control-C, SIGTERM, SIGHUP. + + Other conditions, primarily pertaining to file names, + can be checked in-between files, which makes dealing + with them easier. +--*/ + + + +/*---------------------------------------------------*/ +/*--- Misc (file handling) data decls ---*/ +/*---------------------------------------------------*/ + +UInt32 bytesIn, bytesOut; +Int32 verbosity; +Bool keepInputFiles, smallMode, testFailsExist; +UInt32 globalCrc; +Int32 numFileNames, numFilesProcessed; + + +/*-- source modes; F==file, I==stdin, O==stdout --*/ +#define SM_I2O 1 +#define SM_F2O 2 +#define SM_F2F 3 + +/*-- operation modes --*/ +#define OM_Z 1 +#define OM_UNZ 2 +#define OM_TEST 3 + +Int32 opMode; +Int32 srcMode; + + +Int32 longestFileName; +Char inName[1024]; +Char outName[1024]; +Char *progName; +Char progNameReally[1024]; +FILE *outputHandleJustInCase; + +void panic ( Char* ) NORETURN; +void ioError ( void ) NORETURN; +void compressOutOfMemory ( Int32, Int32 ) NORETURN; +void uncompressOutOfMemory ( Int32, Int32 ) NORETURN; +void blockOverrun ( void ) NORETURN; +void badBlockHeader ( void ) NORETURN; +void badBGLengths ( void ) NORETURN; +void crcError ( UInt32, UInt32 ) NORETURN; +void bitStreamEOF ( void ) NORETURN; +void cleanUpAndFail ( Int32 ) NORETURN; +void compressedStreamEOF ( void ) NORETURN; + +void* myMalloc ( Int32 ); + + + +/*---------------------------------------------------*/ +/*--- Data decls for the front end ---*/ +/*---------------------------------------------------*/ + +/*-- + The overshoot bytes allow us to avoid most of + the cost of pointer renormalisation during + comparison of rotations in sorting. + The figure of 20 is derived as follows: + qSort3 allows an overshoot of up to 10. + It then calls simpleSort, which calls + fullGtU, also with max overshoot 10. + fullGtU does up to 10 comparisons without + renormalising, giving 10+10 == 20. +--*/ +#define NUM_OVERSHOOT_BYTES 20 + +/*-- + These are the main data structures for + the Burrows-Wheeler transform. +--*/ + +/*-- + Pointers to compression and decompression + structures. Set by + allocateCompressStructures and + setDecompressStructureSizes + + The structures are always set to be suitable + for a block of size 100000 * blockSize100k. +--*/ +UChar *block; /*-- compress --*/ +UInt16 *quadrant; /*-- compress --*/ +Int32 *zptr; /*-- compress --*/ +UInt16 *szptr; /*-- overlays zptr ---*/ +Int32 *ftab; /*-- compress --*/ + +UInt16 *ll16; /*-- small decompress --*/ +UChar *ll4; /*-- small decompress --*/ + +Int32 *tt; /*-- fast decompress --*/ +UChar *ll8; /*-- fast decompress --*/ + + +/*-- + freq table collected to save a pass over the data + during decompression. +--*/ +Int32 unzftab[256]; + + +/*-- + index of the last char in the block, so + the block size == last + 1. +--*/ +Int32 last; + + +/*-- + index in zptr[] of original string after sorting. +--*/ +Int32 origPtr; + + +/*-- + always: in the range 0 .. 9. + The current block size is 100000 * this number. +--*/ +Int32 blockSize100k; + + +/*-- + Used when sorting. If too many long comparisons + happen, we stop sorting, randomise the block + slightly, and try again. +--*/ + +Int32 workFactor; +Int32 workDone; +Int32 workLimit; +Bool blockRandomised; +Bool firstAttempt; +Int32 nBlocksRandomised; + + + +/*---------------------------------------------------*/ +/*--- Data decls for the back end ---*/ +/*---------------------------------------------------*/ + +#define MAX_ALPHA_SIZE 258 +#define MAX_CODE_LEN 23 + +#define RUNA 0 +#define RUNB 1 + +#define N_GROUPS 6 +#define G_SIZE 50 +#define N_ITERS 4 + +#define MAX_SELECTORS (2 + (900000 / G_SIZE)) + +Bool inUse[256]; +Int32 nInUse; + +UChar seqToUnseq[256]; +UChar unseqToSeq[256]; + +UChar selector [MAX_SELECTORS]; +UChar selectorMtf[MAX_SELECTORS]; + +Int32 nMTF; + +Int32 mtfFreq[MAX_ALPHA_SIZE]; + +UChar len [N_GROUPS][MAX_ALPHA_SIZE]; + +/*-- decompress only --*/ +Int32 limit [N_GROUPS][MAX_ALPHA_SIZE]; +Int32 base [N_GROUPS][MAX_ALPHA_SIZE]; +Int32 perm [N_GROUPS][MAX_ALPHA_SIZE]; +Int32 minLens[N_GROUPS]; + +/*-- compress only --*/ +Int32 code [N_GROUPS][MAX_ALPHA_SIZE]; +Int32 rfreq[N_GROUPS][MAX_ALPHA_SIZE]; + + +/*---------------------------------------------------*/ +/*--- 32-bit CRC grunge ---*/ +/*---------------------------------------------------*/ + +/*-- + I think this is an implementation of the AUTODIN-II, + Ethernet & FDDI 32-bit CRC standard. Vaguely derived + from code by Rob Warnock, in Section 51 of the + comp.compression FAQ. +--*/ + +UInt32 crc32Table[256] = { + + /*-- Ugly, innit? --*/ + + 0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L, + 0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L, + 0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L, + 0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL, + 0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L, + 0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L, + 0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L, + 0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL, + 0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L, + 0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L, + 0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L, + 0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL, + 0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L, + 0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L, + 0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L, + 0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL, + 0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL, + 0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L, + 0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L, + 0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL, + 0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL, + 0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L, + 0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L, + 0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL, + 0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL, + 0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L, + 0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L, + 0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL, + 0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL, + 0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L, + 0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L, + 0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL, + 0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L, + 0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL, + 0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL, + 0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L, + 0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L, + 0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL, + 0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL, + 0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L, + 0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L, + 0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL, + 0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL, + 0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L, + 0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L, + 0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL, + 0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL, + 0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L, + 0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L, + 0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL, + 0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L, + 0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L, + 0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L, + 0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL, + 0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L, + 0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L, + 0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L, + 0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL, + 0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L, + 0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L, + 0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L, + 0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL, + 0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L, + 0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L +}; + + +/*---------------------------------------------*/ +void initialiseCRC ( void ) +{ + globalCrc = 0xffffffffL; +} + + +/*---------------------------------------------*/ +UInt32 getFinalCRC ( void ) +{ + return ~globalCrc; +} + + +/*---------------------------------------------*/ +UInt32 getGlobalCRC ( void ) +{ + return globalCrc; +} + + +/*---------------------------------------------*/ +void setGlobalCRC ( UInt32 newCrc ) +{ + globalCrc = newCrc; +} + + +/*---------------------------------------------*/ +#define UPDATE_CRC(crcVar,cha) \ +{ \ + crcVar = (crcVar << 8) ^ \ + crc32Table[(crcVar >> 24) ^ \ + ((UChar)cha)]; \ +} + + +/*---------------------------------------------------*/ +/*--- Bit stream I/O ---*/ +/*---------------------------------------------------*/ + + +UInt32 bsBuff; +Int32 bsLive; +FILE* bsStream; +Bool bsWriting; + + +/*---------------------------------------------*/ +void bsSetStream ( FILE* f, Bool wr ) +{ + if (bsStream != NULL) panic ( "bsSetStream" ); + bsStream = f; + bsLive = 0; + bsBuff = 0; + bytesOut = 0; + bytesIn = 0; + bsWriting = wr; +} + + +/*---------------------------------------------*/ +void bsFinishedWithStream ( void ) +{ + if (bsWriting) + while (bsLive > 0) { + fputc ( (UChar)(bsBuff >> 24), bsStream ); + bsBuff <<= 8; + bsLive -= 8; + bytesOut++; + } + bsStream = NULL; +} + + +/*---------------------------------------------*/ +#define bsNEEDR(nz) \ +{ \ + while (bsLive < nz) { \ + Int32 zzi = fgetc ( bsStream ); \ + if (zzi == EOF) compressedStreamEOF(); \ + bsBuff = (bsBuff << 8) | (zzi & 0xffL); \ + bsLive += 8; \ + } \ +} + + +/*---------------------------------------------*/ +#define bsNEEDW(nz) \ +{ \ + while (bsLive >= 8) { \ + fputc ( (UChar)(bsBuff >> 24), \ + bsStream ); \ + bsBuff <<= 8; \ + bsLive -= 8; \ + bytesOut++; \ + } \ +} + + +/*---------------------------------------------*/ +#define bsR1(vz) \ +{ \ + bsNEEDR(1); \ + vz = (bsBuff >> (bsLive-1)) & 1; \ + bsLive--; \ +} + + +/*---------------------------------------------*/ +INLINE UInt32 bsR ( Int32 n ) +{ + UInt32 v; + bsNEEDR ( n ); + v = (bsBuff >> (bsLive-n)) & ((1 << n)-1); + bsLive -= n; + return v; +} + + +/*---------------------------------------------*/ +INLINE void bsW ( Int32 n, UInt32 v ) +{ + bsNEEDW ( n ); + bsBuff |= (v << (32 - bsLive - n)); + bsLive += n; +} + + +/*---------------------------------------------*/ +UChar bsGetUChar ( void ) +{ + return (UChar)bsR(8); +} + + +/*---------------------------------------------*/ +void bsPutUChar ( UChar c ) +{ + bsW(8, (UInt32)c ); +} + + +/*---------------------------------------------*/ +Int32 bsGetUInt32 ( void ) +{ + UInt32 u; + u = 0; + u = (u << 8) | bsR(8); + u = (u << 8) | bsR(8); + u = (u << 8) | bsR(8); + u = (u << 8) | bsR(8); + return u; +} + + +/*---------------------------------------------*/ +UInt32 bsGetIntVS ( UInt32 numBits ) +{ + return (UInt32)bsR(numBits); +} + + +/*---------------------------------------------*/ +UInt32 bsGetInt32 ( void ) +{ + return (Int32)bsGetUInt32(); +} + + +/*---------------------------------------------*/ +void bsPutUInt32 ( UInt32 u ) +{ + bsW ( 8, (u >> 24) & 0xffL ); + bsW ( 8, (u >> 16) & 0xffL ); + bsW ( 8, (u >> 8) & 0xffL ); + bsW ( 8, u & 0xffL ); +} + + +/*---------------------------------------------*/ +void bsPutInt32 ( Int32 c ) +{ + bsPutUInt32 ( (UInt32)c ); +} + + +/*---------------------------------------------*/ +void bsPutIntVS ( Int32 numBits, UInt32 c ) +{ + bsW ( numBits, c ); +} + + +/*---------------------------------------------------*/ +/*--- Huffman coding low-level stuff ---*/ +/*---------------------------------------------------*/ + +#define WEIGHTOF(zz0) ((zz0) & 0xffffff00) +#define DEPTHOF(zz1) ((zz1) & 0x000000ff) +#define MYMAX(zz2,zz3) ((zz2) > (zz3) ? (zz2) : (zz3)) + +#define ADDWEIGHTS(zw1,zw2) \ + (WEIGHTOF(zw1)+WEIGHTOF(zw2)) | \ + (1 + MYMAX(DEPTHOF(zw1),DEPTHOF(zw2))) + +#define UPHEAP(z) \ +{ \ + Int32 zz, tmp; \ + zz = z; tmp = heap[zz]; \ + while (weight[tmp] < weight[heap[zz >> 1]]) { \ + heap[zz] = heap[zz >> 1]; \ + zz >>= 1; \ + } \ + heap[zz] = tmp; \ +} + +#define DOWNHEAP(z) \ +{ \ + Int32 zz, yy, tmp; \ + zz = z; tmp = heap[zz]; \ + while (True) { \ + yy = zz << 1; \ + if (yy > nHeap) break; \ + if (yy < nHeap && \ + weight[heap[yy+1]] < weight[heap[yy]]) \ + yy++; \ + if (weight[tmp] < weight[heap[yy]]) break; \ + heap[zz] = heap[yy]; \ + zz = yy; \ + } \ + heap[zz] = tmp; \ +} + + +/*---------------------------------------------*/ +void hbMakeCodeLengths ( UChar *len, + Int32 *freq, + Int32 alphaSize, + Int32 maxLen ) +{ + /*-- + Nodes and heap entries run from 1. Entry 0 + for both the heap and nodes is a sentinel. + --*/ + Int32 nNodes, nHeap, n1, n2, i, j, k; + Bool tooLong; + + Int32 heap [ MAX_ALPHA_SIZE + 2 ]; + Int32 weight [ MAX_ALPHA_SIZE * 2 ]; + Int32 parent [ MAX_ALPHA_SIZE * 2 ]; + + for (i = 0; i < alphaSize; i++) + weight[i+1] = (freq[i] == 0 ? 1 : freq[i]) << 8; + + while (True) { + + nNodes = alphaSize; + nHeap = 0; + + heap[0] = 0; + weight[0] = 0; + parent[0] = -2; + + for (i = 1; i <= alphaSize; i++) { + parent[i] = -1; + nHeap++; + heap[nHeap] = i; + UPHEAP(nHeap); + } + if (!(nHeap < (MAX_ALPHA_SIZE+2))) + panic ( "hbMakeCodeLengths(1)" ); + + while (nHeap > 1) { + n1 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1); + n2 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1); + nNodes++; + parent[n1] = parent[n2] = nNodes; + weight[nNodes] = ADDWEIGHTS(weight[n1], weight[n2]); + parent[nNodes] = -1; + nHeap++; + heap[nHeap] = nNodes; + UPHEAP(nHeap); + } + if (!(nNodes < (MAX_ALPHA_SIZE * 2))) + panic ( "hbMakeCodeLengths(2)" ); + + tooLong = False; + for (i = 1; i <= alphaSize; i++) { + j = 0; + k = i; + while (parent[k] >= 0) { k = parent[k]; j++; } + len[i-1] = j; + if (j > maxLen) tooLong = True; + } + + if (! tooLong) break; + + for (i = 1; i < alphaSize; i++) { + j = weight[i] >> 8; + j = 1 + (j / 2); + weight[i] = j << 8; + } + } +} + + +/*---------------------------------------------*/ +void hbAssignCodes ( Int32 *code, + UChar *length, + Int32 minLen, + Int32 maxLen, + Int32 alphaSize ) +{ + Int32 n, vec, i; + + vec = 0; + for (n = minLen; n <= maxLen; n++) { + for (i = 0; i < alphaSize; i++) + if (length[i] == n) { code[i] = vec; vec++; }; + vec <<= 1; + } +} + + +/*---------------------------------------------*/ +void hbCreateDecodeTables ( Int32 *limit, + Int32 *base, + Int32 *perm, + UChar *length, + Int32 minLen, + Int32 maxLen, + Int32 alphaSize ) +{ + Int32 pp, i, j, vec; + + pp = 0; + for (i = minLen; i <= maxLen; i++) + for (j = 0; j < alphaSize; j++) + if (length[j] == i) { perm[pp] = j; pp++; }; + + for (i = 0; i < MAX_CODE_LEN; i++) base[i] = 0; + for (i = 0; i < alphaSize; i++) base[length[i]+1]++; + + for (i = 1; i < MAX_CODE_LEN; i++) base[i] += base[i-1]; + + for (i = 0; i < MAX_CODE_LEN; i++) limit[i] = 0; + vec = 0; + + for (i = minLen; i <= maxLen; i++) { + vec += (base[i+1] - base[i]); + limit[i] = vec-1; + vec <<= 1; + } + for (i = minLen + 1; i <= maxLen; i++) + base[i] = ((limit[i-1] + 1) << 1) - base[i]; +} + + + +/*---------------------------------------------------*/ +/*--- Undoing the reversible transformation ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +#define SET_LL4(i,n) \ + { if (((i) & 0x1) == 0) \ + ll4[(i) >> 1] = (ll4[(i) >> 1] & 0xf0) | (n); else \ + ll4[(i) >> 1] = (ll4[(i) >> 1] & 0x0f) | ((n) << 4); \ + } + +#define GET_LL4(i) \ + (((UInt32)(ll4[(i) >> 1])) >> (((i) << 2) & 0x4) & 0xF) + +#define SET_LL(i,n) \ + { ll16[i] = (UInt16)(n & 0x0000ffff); \ + SET_LL4(i, n >> 16); \ + } + +#define GET_LL(i) \ + (((UInt32)ll16[i]) | (GET_LL4(i) << 16)) + + +/*---------------------------------------------*/ +/*-- + Manage memory for compression/decompression. + When compressing, a single block size applies to + all files processed, and that's set when the + program starts. But when decompressing, each file + processed could have been compressed with a + different block size, so we may have to free + and reallocate on a per-file basis. + + A call with argument of zero means + `free up everything.' And a value of zero for + blockSize100k means no memory is currently allocated. +--*/ + + +/*---------------------------------------------*/ +void allocateCompressStructures ( void ) +{ + Int32 n = 100000 * blockSize100k; + block = malloc ( (n + 1 + NUM_OVERSHOOT_BYTES) * sizeof(UChar) ); + quadrant = malloc ( (n + NUM_OVERSHOOT_BYTES) * sizeof(Int16) ); + zptr = malloc ( n * sizeof(Int32) ); + ftab = malloc ( 65537 * sizeof(Int32) ); + + if (block == NULL || quadrant == NULL || + zptr == NULL || ftab == NULL) { + Int32 totalDraw + = (n + 1 + NUM_OVERSHOOT_BYTES) * sizeof(UChar) + + (n + NUM_OVERSHOOT_BYTES) * sizeof(Int16) + + n * sizeof(Int32) + + 65537 * sizeof(Int32); + + compressOutOfMemory ( totalDraw, n ); + } + + /*-- + Since we want valid indexes for block of + -1 to n + NUM_OVERSHOOT_BYTES - 1 + inclusive. + --*/ + block++; + + /*-- + The back end needs a place to store the MTF values + whilst it calculates the coding tables. We could + put them in the zptr array. However, these values + will fit in a short, so we overlay szptr at the + start of zptr, in the hope of reducing the number + of cache misses induced by the multiple traversals + of the MTF values when calculating coding tables. + Seems to improve compression speed by about 1%. + --*/ + szptr = (UInt16*)zptr; +} + + +/*---------------------------------------------*/ +void setDecompressStructureSizes ( Int32 newSize100k ) +{ + if (! (0 <= newSize100k && newSize100k <= 9 && + 0 <= blockSize100k && blockSize100k <= 9)) + panic ( "setDecompressStructureSizes" ); + + if (newSize100k == blockSize100k) return; + + blockSize100k = newSize100k; + + if (ll16 != NULL) free ( ll16 ); + if (ll4 != NULL) free ( ll4 ); + if (ll8 != NULL) free ( ll8 ); + if (tt != NULL) free ( tt ); + + if (newSize100k == 0) return; + + if (smallMode) { + + Int32 n = 100000 * newSize100k; + ll16 = malloc ( n * sizeof(UInt16) ); + ll4 = malloc ( ((n+1) >> 1) * sizeof(UChar) ); + + if (ll4 == NULL || ll16 == NULL) { + Int32 totalDraw + = n * sizeof(Int16) + ((n+1) >> 1) * sizeof(UChar); + uncompressOutOfMemory ( totalDraw, n ); + } + + } else { + + Int32 n = 100000 * newSize100k; + ll8 = malloc ( n * sizeof(UChar) ); + tt = malloc ( n * sizeof(Int32) ); + + if (ll8 == NULL || tt == NULL) { + Int32 totalDraw + = n * sizeof(UChar) + n * sizeof(UInt32); + uncompressOutOfMemory ( totalDraw, n ); + } + + } +} + + + +/*---------------------------------------------------*/ +/*--- The new back end ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +void makeMaps ( void ) +{ + Int32 i; + nInUse = 0; + for (i = 0; i < 256; i++) + if (inUse[i]) { + seqToUnseq[nInUse] = i; + unseqToSeq[i] = nInUse; + nInUse++; + } +} + + +/*---------------------------------------------*/ +void generateMTFValues ( void ) +{ + UChar yy[256]; + Int32 i, j; + UChar tmp; + UChar tmp2; + Int32 zPend; + Int32 wr; + Int32 EOB; + + makeMaps(); + EOB = nInUse+1; + + for (i = 0; i <= EOB; i++) mtfFreq[i] = 0; + + wr = 0; + zPend = 0; + for (i = 0; i < nInUse; i++) yy[i] = (UChar) i; + + + for (i = 0; i <= last; i++) { + UChar ll_i; + + #if DEBUG + assert (wr <= i); + #endif + + ll_i = unseqToSeq[block[zptr[i] - 1]]; + #if DEBUG + assert (ll_i < nInUse); + #endif + + j = 0; + tmp = yy[j]; + while ( ll_i != tmp ) { + j++; + tmp2 = tmp; + tmp = yy[j]; + yy[j] = tmp2; + }; + yy[0] = tmp; + + if (j == 0) { + zPend++; + } else { + if (zPend > 0) { + zPend--; + while (True) { + switch (zPend % 2) { + case 0: szptr[wr] = RUNA; wr++; mtfFreq[RUNA]++; break; + case 1: szptr[wr] = RUNB; wr++; mtfFreq[RUNB]++; break; + }; + if (zPend < 2) break; + zPend = (zPend - 2) / 2; + }; + zPend = 0; + } + szptr[wr] = j+1; wr++; mtfFreq[j+1]++; + } + } + + if (zPend > 0) { + zPend--; + while (True) { + switch (zPend % 2) { + case 0: szptr[wr] = RUNA; wr++; mtfFreq[RUNA]++; break; + case 1: szptr[wr] = RUNB; wr++; mtfFreq[RUNB]++; break; + }; + if (zPend < 2) break; + zPend = (zPend - 2) / 2; + }; + } + + szptr[wr] = EOB; wr++; mtfFreq[EOB]++; + + nMTF = wr; +} + + +/*---------------------------------------------*/ +#define LESSER_ICOST 0 +#define GREATER_ICOST 15 + +void sendMTFValues ( void ) +{ + Int32 v, t, i, j, gs, ge, totc, bt, bc, iter; + Int32 nSelectors, alphaSize, minLen, maxLen, selCtr; + Int32 nGroups, nBytes; + + /*-- + UChar len [N_GROUPS][MAX_ALPHA_SIZE]; + is a global since the decoder also needs it. + + Int32 code[N_GROUPS][MAX_ALPHA_SIZE]; + Int32 rfreq[N_GROUPS][MAX_ALPHA_SIZE]; + are also globals only used in this proc. + Made global to keep stack frame size small. + --*/ + + + UInt16 cost[N_GROUPS]; + Int32 fave[N_GROUPS]; + + if (verbosity >= 3) + fprintf ( stderr, + " %d in block, %d after MTF & 1-2 coding, %d+2 syms in use\n", + last+1, nMTF, nInUse ); + + alphaSize = nInUse+2; + for (t = 0; t < N_GROUPS; t++) + for (v = 0; v < alphaSize; v++) + len[t][v] = GREATER_ICOST; + + /*--- Decide how many coding tables to use ---*/ + if (nMTF <= 0) panic ( "sendMTFValues(0)" ); + if (nMTF < 200) nGroups = 2; else + if (nMTF < 800) nGroups = 4; else + nGroups = 6; + + /*--- Generate an initial set of coding tables ---*/ + { + Int32 nPart, remF, tFreq, aFreq; + + nPart = nGroups; + remF = nMTF; + gs = 0; + while (nPart > 0) { + tFreq = remF / nPart; + ge = gs-1; + aFreq = 0; + while (aFreq < tFreq && ge < alphaSize-1) { + ge++; + aFreq += mtfFreq[ge]; + } + + if (ge > gs + && nPart != nGroups && nPart != 1 + && ((nGroups-nPart) % 2 == 1)) { + aFreq -= mtfFreq[ge]; + ge--; + } + + if (verbosity >= 3) + fprintf ( stderr, + " initial group %d, [%d .. %d], has %d syms (%4.1f%%)\n", + nPart, gs, ge, aFreq, + (100.0 * (float)aFreq) / (float)nMTF ); + + for (v = 0; v < alphaSize; v++) + if (v >= gs && v <= ge) + len[nPart-1][v] = LESSER_ICOST; else + len[nPart-1][v] = GREATER_ICOST; + + nPart--; + gs = ge+1; + remF -= aFreq; + } + } + + /*--- + Iterate up to N_ITERS times to improve the tables. + ---*/ + for (iter = 0; iter < N_ITERS; iter++) { + + for (t = 0; t < nGroups; t++) fave[t] = 0; + + for (t = 0; t < nGroups; t++) + for (v = 0; v < alphaSize; v++) + rfreq[t][v] = 0; + + nSelectors = 0; + totc = 0; + gs = 0; + while (True) { + + /*--- Set group start & end marks. --*/ + if (gs >= nMTF) break; + ge = gs + G_SIZE - 1; + if (ge >= nMTF) ge = nMTF-1; + + /*-- + Calculate the cost of this group as coded + by each of the coding tables. + --*/ + for (t = 0; t < nGroups; t++) cost[t] = 0; + + if (nGroups == 6) { + register UInt16 cost0, cost1, cost2, cost3, cost4, cost5; + cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0; + for (i = gs; i <= ge; i++) { + UInt16 icv = szptr[i]; + cost0 += len[0][icv]; + cost1 += len[1][icv]; + cost2 += len[2][icv]; + cost3 += len[3][icv]; + cost4 += len[4][icv]; + cost5 += len[5][icv]; + } + cost[0] = cost0; cost[1] = cost1; cost[2] = cost2; + cost[3] = cost3; cost[4] = cost4; cost[5] = cost5; + } else { + for (i = gs; i <= ge; i++) { + UInt16 icv = szptr[i]; + for (t = 0; t < nGroups; t++) cost[t] += len[t][icv]; + } + } + + /*-- + Find the coding table which is best for this group, + and record its identity in the selector table. + --*/ + bc = 999999999; bt = -1; + for (t = 0; t < nGroups; t++) + if (cost[t] < bc) { bc = cost[t]; bt = t; }; + totc += bc; + fave[bt]++; + selector[nSelectors] = bt; + nSelectors++; + + /*-- + Increment the symbol frequencies for the selected table. + --*/ + for (i = gs; i <= ge; i++) + rfreq[bt][ szptr[i] ]++; + + gs = ge+1; + } + if (verbosity >= 3) { + fprintf ( stderr, + " pass %d: size is %d, grp uses are ", + iter+1, totc/8 ); + for (t = 0; t < nGroups; t++) + fprintf ( stderr, "%d ", fave[t] ); + fprintf ( stderr, "\n" ); + } + + /*-- + Recompute the tables based on the accumulated frequencies. + --*/ + for (t = 0; t < nGroups; t++) + hbMakeCodeLengths ( &len[t][0], &rfreq[t][0], alphaSize, 20 ); + } + + + if (!(nGroups < 8)) panic ( "sendMTFValues(1)" ); + if (!(nSelectors < 32768 && + nSelectors <= (2 + (900000 / G_SIZE)))) + panic ( "sendMTFValues(2)" ); + + + /*--- Compute MTF values for the selectors. ---*/ + { + UChar pos[N_GROUPS], ll_i, tmp2, tmp; + for (i = 0; i < nGroups; i++) pos[i] = i; + for (i = 0; i < nSelectors; i++) { + ll_i = selector[i]; + j = 0; + tmp = pos[j]; + while ( ll_i != tmp ) { + j++; + tmp2 = tmp; + tmp = pos[j]; + pos[j] = tmp2; + }; + pos[0] = tmp; + selectorMtf[i] = j; + } + }; + + /*--- Assign actual codes for the tables. --*/ + for (t = 0; t < nGroups; t++) { + minLen = 32; + maxLen = 0; + for (i = 0; i < alphaSize; i++) { + if (len[t][i] > maxLen) maxLen = len[t][i]; + if (len[t][i] < minLen) minLen = len[t][i]; + } + if (maxLen > 20) panic ( "sendMTFValues(3)" ); + if (minLen < 1) panic ( "sendMTFValues(4)" ); + hbAssignCodes ( &code[t][0], &len[t][0], + minLen, maxLen, alphaSize ); + } + + /*--- Transmit the mapping table. ---*/ + { + Bool inUse16[16]; + for (i = 0; i < 16; i++) { + inUse16[i] = False; + for (j = 0; j < 16; j++) + if (inUse[i * 16 + j]) inUse16[i] = True; + } + + nBytes = bytesOut; + for (i = 0; i < 16; i++) + if (inUse16[i]) bsW(1,1); else bsW(1,0); + + for (i = 0; i < 16; i++) + if (inUse16[i]) + for (j = 0; j < 16; j++) + if (inUse[i * 16 + j]) bsW(1,1); else bsW(1,0); + + if (verbosity >= 3) + fprintf ( stderr, " bytes: mapping %d, ", bytesOut-nBytes ); + } + + /*--- Now the selectors. ---*/ + nBytes = bytesOut; + bsW ( 3, nGroups ); + bsW ( 15, nSelectors ); + for (i = 0; i < nSelectors; i++) { + for (j = 0; j < selectorMtf[i]; j++) bsW(1,1); + bsW(1,0); + } + if (verbosity >= 3) + fprintf ( stderr, "selectors %d, ", bytesOut-nBytes ); + + /*--- Now the coding tables. ---*/ + nBytes = bytesOut; + + for (t = 0; t < nGroups; t++) { + Int32 curr = len[t][0]; + bsW ( 5, curr ); + for (i = 0; i < alphaSize; i++) { + while (curr < len[t][i]) { bsW(2,2); curr++; /* 10 */ }; + while (curr > len[t][i]) { bsW(2,3); curr--; /* 11 */ }; + bsW ( 1, 0 ); + } + } + + if (verbosity >= 3) + fprintf ( stderr, "code lengths %d, ", bytesOut-nBytes ); + + /*--- And finally, the block data proper ---*/ + nBytes = bytesOut; + selCtr = 0; + gs = 0; + while (True) { + if (gs >= nMTF) break; + ge = gs + G_SIZE - 1; + if (ge >= nMTF) ge = nMTF-1; + for (i = gs; i <= ge; i++) { + #if DEBUG + assert (selector[selCtr] < nGroups); + #endif + bsW ( len [selector[selCtr]] [szptr[i]], + code [selector[selCtr]] [szptr[i]] ); + } + + gs = ge+1; + selCtr++; + } + if (!(selCtr == nSelectors)) panic ( "sendMTFValues(5)" ); + + if (verbosity >= 3) + fprintf ( stderr, "codes %d\n", bytesOut-nBytes ); +} + + +/*---------------------------------------------*/ +void moveToFrontCodeAndSend ( void ) +{ + bsPutIntVS ( 24, origPtr ); + generateMTFValues(); + sendMTFValues(); +} + + +/*---------------------------------------------*/ +void recvDecodingTables ( void ) +{ + Int32 i, j, t, nGroups, nSelectors, alphaSize; + Int32 minLen, maxLen; + Bool inUse16[16]; + + /*--- Receive the mapping table ---*/ + for (i = 0; i < 16; i++) + if (bsR(1) == 1) + inUse16[i] = True; else + inUse16[i] = False; + + for (i = 0; i < 256; i++) inUse[i] = False; + + for (i = 0; i < 16; i++) + if (inUse16[i]) + for (j = 0; j < 16; j++) + if (bsR(1) == 1) inUse[i * 16 + j] = True; + + makeMaps(); + alphaSize = nInUse+2; + + /*--- Now the selectors ---*/ + nGroups = bsR ( 3 ); + nSelectors = bsR ( 15 ); + for (i = 0; i < nSelectors; i++) { + j = 0; + while (bsR(1) == 1) j++; + selectorMtf[i] = j; + } + + /*--- Undo the MTF values for the selectors. ---*/ + { + UChar pos[N_GROUPS], tmp, v; + for (v = 0; v < nGroups; v++) pos[v] = v; + + for (i = 0; i < nSelectors; i++) { + v = selectorMtf[i]; + tmp = pos[v]; + while (v > 0) { pos[v] = pos[v-1]; v--; } + pos[0] = tmp; + selector[i] = tmp; + } + } + + /*--- Now the coding tables ---*/ + for (t = 0; t < nGroups; t++) { + Int32 curr = bsR ( 5 ); + for (i = 0; i < alphaSize; i++) { + while (bsR(1) == 1) { + if (bsR(1) == 0) curr++; else curr--; + } + len[t][i] = curr; + } + } + + /*--- Create the Huffman decoding tables ---*/ + for (t = 0; t < nGroups; t++) { + minLen = 32; + maxLen = 0; + for (i = 0; i < alphaSize; i++) { + if (len[t][i] > maxLen) maxLen = len[t][i]; + if (len[t][i] < minLen) minLen = len[t][i]; + } + hbCreateDecodeTables ( + &limit[t][0], &base[t][0], &perm[t][0], &len[t][0], + minLen, maxLen, alphaSize + ); + minLens[t] = minLen; + } +} + + +/*---------------------------------------------*/ +#define GET_MTF_VAL(lval) \ +{ \ + Int32 zt, zn, zvec, zj; \ + if (groupPos == 0) { \ + groupNo++; \ + groupPos = G_SIZE; \ + } \ + groupPos--; \ + zt = selector[groupNo]; \ + zn = minLens[zt]; \ + zvec = bsR ( zn ); \ + while (zvec > limit[zt][zn]) { \ + zn++; bsR1(zj); \ + zvec = (zvec << 1) | zj; \ + }; \ + lval = perm[zt][zvec - base[zt][zn]]; \ +} + + +/*---------------------------------------------*/ +void getAndMoveToFrontDecode ( void ) +{ + UChar yy[256]; + Int32 i, j, nextSym, limitLast; + Int32 EOB, groupNo, groupPos; + + limitLast = 100000 * blockSize100k; + origPtr = bsGetIntVS ( 24 ); + + recvDecodingTables(); + EOB = nInUse+1; + groupNo = -1; + groupPos = 0; + + /*-- + Setting up the unzftab entries here is not strictly + necessary, but it does save having to do it later + in a separate pass, and so saves a block's worth of + cache misses. + --*/ + for (i = 0; i <= 255; i++) unzftab[i] = 0; + + for (i = 0; i <= 255; i++) yy[i] = (UChar) i; + + last = -1; + + GET_MTF_VAL(nextSym); + + while (True) { + + if (nextSym == EOB) break; + + if (nextSym == RUNA || nextSym == RUNB) { + UChar ch; + Int32 s = -1; + Int32 N = 1; + do { + if (nextSym == RUNA) s = s + (0+1) * N; else + if (nextSym == RUNB) s = s + (1+1) * N; + N = N * 2; + GET_MTF_VAL(nextSym); + } + while (nextSym == RUNA || nextSym == RUNB); + + s++; + ch = seqToUnseq[yy[0]]; + unzftab[ch] += s; + + if (smallMode) + while (s > 0) { + last++; + ll16[last] = ch; + s--; + } + else + while (s > 0) { + last++; + ll8[last] = ch; + s--; + }; + + if (last >= limitLast) blockOverrun(); + continue; + + } else { + + UChar tmp; + last++; if (last >= limitLast) blockOverrun(); + + tmp = yy[nextSym-1]; + unzftab[seqToUnseq[tmp]]++; + if (smallMode) + ll16[last] = seqToUnseq[tmp]; else + ll8[last] = seqToUnseq[tmp]; + + /*-- + This loop is hammered during decompression, + hence the unrolling. + + for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1]; + --*/ + + j = nextSym-1; + for (; j > 3; j -= 4) { + yy[j] = yy[j-1]; + yy[j-1] = yy[j-2]; + yy[j-2] = yy[j-3]; + yy[j-3] = yy[j-4]; + } + for (; j > 0; j--) yy[j] = yy[j-1]; + + yy[0] = tmp; + GET_MTF_VAL(nextSym); + continue; + } + } +} + + +/*---------------------------------------------------*/ +/*--- Block-sorting machinery ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +/*-- + Compare two strings in block. We assume (see + discussion above) that i1 and i2 have a max + offset of 10 on entry, and that the first + bytes of both block and quadrant have been + copied into the "overshoot area", ie + into the subscript range + [last+1 .. last+NUM_OVERSHOOT_BYTES]. +--*/ +INLINE Bool fullGtU ( Int32 i1, Int32 i2 ) +{ + Int32 k; + UChar c1, c2; + UInt16 s1, s2; + + #if DEBUG + /*-- + shellsort shouldn't ask to compare + something with itself. + --*/ + assert (i1 != i2); + #endif + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + k = last + 1; + + do { + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; + s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; + s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; + s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + + c1 = block[i1]; + c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; + s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + + if (i1 > last) { i1 -= last; i1--; }; + if (i2 > last) { i2 -= last; i2--; }; + + k -= 4; + workDone++; + } + while (k >= 0); + + return False; +} + +/*---------------------------------------------*/ +/*-- + Knuth's increments seem to work better + than Incerpi-Sedgewick here. Possibly + because the number of elems to sort is + usually small, typically <= 20. +--*/ +Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, + 9841, 29524, 88573, 265720, + 797161, 2391484 }; + +void simpleSort ( Int32 lo, Int32 hi, Int32 d ) +{ + Int32 i, j, h, bigN, hp; + Int32 v; + + bigN = hi - lo + 1; + if (bigN < 2) return; + + hp = 0; + while (incs[hp] < bigN) hp++; + hp--; + + for (; hp >= 0; hp--) { + h = incs[hp]; + if (verbosity >= 5) + fprintf ( stderr, " shell increment %d\n", h ); + + i = lo + h; + while (True) { + + /*-- copy 1 --*/ + if (i > hi) break; + v = zptr[i]; + j = i; + while ( fullGtU ( zptr[j-h]+d, v+d ) ) { + zptr[j] = zptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + zptr[j] = v; + i++; + + /*-- copy 2 --*/ + if (i > hi) break; + v = zptr[i]; + j = i; + while ( fullGtU ( zptr[j-h]+d, v+d ) ) { + zptr[j] = zptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + zptr[j] = v; + i++; + + /*-- copy 3 --*/ + if (i > hi) break; + v = zptr[i]; + j = i; + while ( fullGtU ( zptr[j-h]+d, v+d ) ) { + zptr[j] = zptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + zptr[j] = v; + i++; + + if (workDone > workLimit && firstAttempt) return; + } + } +} + + +/*---------------------------------------------*/ +/*-- + The following is an implementation of + an elegant 3-way quicksort for strings, + described in a paper "Fast Algorithms for + Sorting and Searching Strings", by Robert + Sedgewick and Jon L. Bentley. +--*/ + +#define swap(lv1, lv2) \ + { Int32 tmp = lv1; lv1 = lv2; lv2 = tmp; } + +INLINE void vswap ( Int32 p1, Int32 p2, Int32 n ) +{ + while (n > 0) { + swap(zptr[p1], zptr[p2]); + p1++; p2++; n--; + } +} + +INLINE UChar med3 ( UChar a, UChar b, UChar c ) +{ + UChar t; + if (a > b) { t = a; a = b; b = t; }; + if (b > c) { t = b; b = c; c = t; }; + if (a > b) b = a; + return b; +} + + +#define min(a,b) ((a) < (b)) ? (a) : (b) + +typedef + struct { Int32 ll; Int32 hh; Int32 dd; } + StackElem; + +#define push(lz,hz,dz) { stack[sp].ll = lz; \ + stack[sp].hh = hz; \ + stack[sp].dd = dz; \ + sp++; } + +#define pop(lz,hz,dz) { sp--; \ + lz = stack[sp].ll; \ + hz = stack[sp].hh; \ + dz = stack[sp].dd; } + +#define SMALL_THRESH 20 +#define DEPTH_THRESH 10 + +/*-- + If you are ever unlucky/improbable enough + to get a stack overflow whilst sorting, + increase the following constant and try + again. In practice I have never seen the + stack go above 27 elems, so the following + limit seems very generous. +--*/ +#define QSORT_STACK_SIZE 1000 + + +void qSort3 ( Int32 loSt, Int32 hiSt, Int32 dSt ) +{ + Int32 unLo, unHi, ltLo, gtHi, med, n, m; + Int32 sp, lo, hi, d; + StackElem stack[QSORT_STACK_SIZE]; + + sp = 0; + push ( loSt, hiSt, dSt ); + + while (sp > 0) { + + if (sp >= QSORT_STACK_SIZE) panic ( "stack overflow in qSort3" ); + + pop ( lo, hi, d ); + + if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) { + simpleSort ( lo, hi, d ); + if (workDone > workLimit && firstAttempt) return; + continue; + } + + med = med3 ( block[zptr[ lo ]+d], + block[zptr[ hi ]+d], + block[zptr[ (lo+hi)>>1 ]+d] ); + + unLo = ltLo = lo; + unHi = gtHi = hi; + + while (True) { + while (True) { + if (unLo > unHi) break; + n = ((Int32)block[zptr[unLo]+d]) - med; + if (n == 0) { swap(zptr[unLo], zptr[ltLo]); ltLo++; unLo++; continue; }; + if (n > 0) break; + unLo++; + } + while (True) { + if (unLo > unHi) break; + n = ((Int32)block[zptr[unHi]+d]) - med; + if (n == 0) { swap(zptr[unHi], zptr[gtHi]); gtHi--; unHi--; continue; }; + if (n < 0) break; + unHi--; + } + if (unLo > unHi) break; + swap(zptr[unLo], zptr[unHi]); unLo++; unHi--; + } + #if DEBUG + assert (unHi == unLo-1); + #endif + + if (gtHi < ltLo) { + push(lo, hi, d+1 ); + continue; + } + + n = min(ltLo-lo, unLo-ltLo); vswap(lo, unLo-n, n); + m = min(hi-gtHi, gtHi-unHi); vswap(unLo, hi-m+1, m); + + n = lo + unLo - ltLo - 1; + m = hi - (gtHi - unHi) + 1; + + push ( lo, n, d ); + push ( n+1, m-1, d+1 ); + push ( m, hi, d ); + } +} + + +/*---------------------------------------------*/ + +#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) + +#define SETMASK (1 << 21) +#define CLEARMASK (~(SETMASK)) + +void sortIt ( void ) +{ + Int32 i, j, ss, sb; + Int32 runningOrder[256]; + Int32 copy[256]; + Bool bigDone[256]; + UChar c1, c2; + Int32 numQSorted; + + /*-- + In the various block-sized structures, live data runs + from 0 to last+NUM_OVERSHOOT_BYTES inclusive. First, + set up the overshoot area for block. + --*/ + + if (verbosity >= 4) fprintf ( stderr, " sort initialise ...\n" ); + for (i = 0; i < NUM_OVERSHOOT_BYTES; i++) + block[last+i+1] = block[i % (last+1)]; + for (i = 0; i <= last+NUM_OVERSHOOT_BYTES; i++) + quadrant[i] = 0; + + block[-1] = block[last]; + + if (last < 4000) { + + /*-- + Use simpleSort(), since the full sorting mechanism + has quite a large constant overhead. + --*/ + if (verbosity >= 4) fprintf ( stderr, " simpleSort ...\n" ); + for (i = 0; i <= last; i++) zptr[i] = i; + firstAttempt = False; + workDone = workLimit = 0; + simpleSort ( 0, last, 0 ); + if (verbosity >= 4) fprintf ( stderr, " simpleSort done.\n" ); + + } else { + + numQSorted = 0; + for (i = 0; i <= 255; i++) bigDone[i] = False; + + if (verbosity >= 4) fprintf ( stderr, " bucket sorting ...\n" ); + + for (i = 0; i <= 65536; i++) ftab[i] = 0; + + c1 = block[-1]; + for (i = 0; i <= last; i++) { + c2 = block[i]; + ftab[(c1 << 8) + c2]++; + c1 = c2; + } + + for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; + + c1 = block[0]; + for (i = 0; i < last; i++) { + c2 = block[i+1]; + j = (c1 << 8) + c2; + c1 = c2; + ftab[j]--; + zptr[ftab[j]] = i; + } + j = (block[last] << 8) + block[0]; + ftab[j]--; + zptr[ftab[j]] = last; + + /*-- + Now ftab contains the first loc of every small bucket. + Calculate the running order, from smallest to largest + big bucket. + --*/ + + for (i = 0; i <= 255; i++) runningOrder[i] = i; + + { + Int32 vv; + Int32 h = 1; + do h = 3 * h + 1; while (h <= 256); + do { + h = h / 3; + for (i = h; i <= 255; i++) { + vv = runningOrder[i]; + j = i; + while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { + runningOrder[j] = runningOrder[j-h]; + j = j - h; + if (j <= (h - 1)) goto zero; + } + zero: + runningOrder[j] = vv; + } + } while (h != 1); + } + + /*-- + The main sorting loop. + --*/ + + for (i = 0; i <= 255; i++) { + + /*-- + Process big buckets, starting with the least full. + --*/ + ss = runningOrder[i]; + + /*-- + Complete the big bucket [ss] by quicksorting + any unsorted small buckets [ss, j]. Hopefully + previous pointer-scanning phases have already + completed many of the small buckets [ss, j], so + we don't have to sort them at all. + --*/ + for (j = 0; j <= 255; j++) { + sb = (ss << 8) + j; + if ( ! (ftab[sb] & SETMASK) ) { + Int32 lo = ftab[sb] & CLEARMASK; + Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; + if (hi > lo) { + if (verbosity >= 4) + fprintf ( stderr, + " qsort [0x%x, 0x%x] done %d this %d\n", + ss, j, numQSorted, hi - lo + 1 ); + qSort3 ( lo, hi, 2 ); + numQSorted += ( hi - lo + 1 ); + if (workDone > workLimit && firstAttempt) return; + } + ftab[sb] |= SETMASK; + } + } + + /*-- + The ss big bucket is now done. Record this fact, + and update the quadrant descriptors. Remember to + update quadrants in the overshoot area too, if + necessary. The "if (i < 255)" test merely skips + this updating for the last bucket processed, since + updating for the last bucket is pointless. + --*/ + bigDone[ss] = True; + + if (i < 255) { + Int32 bbStart = ftab[ss << 8] & CLEARMASK; + Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; + Int32 shifts = 0; + + while ((bbSize >> shifts) > 65534) shifts++; + + for (j = 0; j < bbSize; j++) { + Int32 a2update = zptr[bbStart + j]; + UInt16 qVal = (UInt16)(j >> shifts); + quadrant[a2update] = qVal; + if (a2update < NUM_OVERSHOOT_BYTES) + quadrant[a2update + last + 1] = qVal; + } + + if (! ( ((bbSize-1) >> shifts) <= 65535 )) panic ( "sortIt" ); + } + + /*-- + Now scan this big bucket so as to synthesise the + sorted order for small buckets [t, ss] for all t != ss. + --*/ + for (j = 0; j <= 255; j++) + copy[j] = ftab[(j << 8) + ss] & CLEARMASK; + + for (j = ftab[ss << 8] & CLEARMASK; + j < (ftab[(ss+1) << 8] & CLEARMASK); + j++) { + c1 = block[zptr[j]-1]; + if ( ! bigDone[c1] ) { + zptr[copy[c1]] = zptr[j] == 0 ? last : zptr[j] - 1; + copy[c1] ++; + } + } + + for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; + } + if (verbosity >= 4) + fprintf ( stderr, " %d pointers, %d sorted, %d scanned\n", + last+1, numQSorted, (last+1) - numQSorted ); + } +} + + +/*---------------------------------------------------*/ +/*--- Stuff for randomising repetitive blocks ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +Int32 rNums[512] = { + 619, 720, 127, 481, 931, 816, 813, 233, 566, 247, + 985, 724, 205, 454, 863, 491, 741, 242, 949, 214, + 733, 859, 335, 708, 621, 574, 73, 654, 730, 472, + 419, 436, 278, 496, 867, 210, 399, 680, 480, 51, + 878, 465, 811, 169, 869, 675, 611, 697, 867, 561, + 862, 687, 507, 283, 482, 129, 807, 591, 733, 623, + 150, 238, 59, 379, 684, 877, 625, 169, 643, 105, + 170, 607, 520, 932, 727, 476, 693, 425, 174, 647, + 73, 122, 335, 530, 442, 853, 695, 249, 445, 515, + 909, 545, 703, 919, 874, 474, 882, 500, 594, 612, + 641, 801, 220, 162, 819, 984, 589, 513, 495, 799, + 161, 604, 958, 533, 221, 400, 386, 867, 600, 782, + 382, 596, 414, 171, 516, 375, 682, 485, 911, 276, + 98, 553, 163, 354, 666, 933, 424, 341, 533, 870, + 227, 730, 475, 186, 263, 647, 537, 686, 600, 224, + 469, 68, 770, 919, 190, 373, 294, 822, 808, 206, + 184, 943, 795, 384, 383, 461, 404, 758, 839, 887, + 715, 67, 618, 276, 204, 918, 873, 777, 604, 560, + 951, 160, 578, 722, 79, 804, 96, 409, 713, 940, + 652, 934, 970, 447, 318, 353, 859, 672, 112, 785, + 645, 863, 803, 350, 139, 93, 354, 99, 820, 908, + 609, 772, 154, 274, 580, 184, 79, 626, 630, 742, + 653, 282, 762, 623, 680, 81, 927, 626, 789, 125, + 411, 521, 938, 300, 821, 78, 343, 175, 128, 250, + 170, 774, 972, 275, 999, 639, 495, 78, 352, 126, + 857, 956, 358, 619, 580, 124, 737, 594, 701, 612, + 669, 112, 134, 694, 363, 992, 809, 743, 168, 974, + 944, 375, 748, 52, 600, 747, 642, 182, 862, 81, + 344, 805, 988, 739, 511, 655, 814, 334, 249, 515, + 897, 955, 664, 981, 649, 113, 974, 459, 893, 228, + 433, 837, 553, 268, 926, 240, 102, 654, 459, 51, + 686, 754, 806, 760, 493, 403, 415, 394, 687, 700, + 946, 670, 656, 610, 738, 392, 760, 799, 887, 653, + 978, 321, 576, 617, 626, 502, 894, 679, 243, 440, + 680, 879, 194, 572, 640, 724, 926, 56, 204, 700, + 707, 151, 457, 449, 797, 195, 791, 558, 945, 679, + 297, 59, 87, 824, 713, 663, 412, 693, 342, 606, + 134, 108, 571, 364, 631, 212, 174, 643, 304, 329, + 343, 97, 430, 751, 497, 314, 983, 374, 822, 928, + 140, 206, 73, 263, 980, 736, 876, 478, 430, 305, + 170, 514, 364, 692, 829, 82, 855, 953, 676, 246, + 369, 970, 294, 750, 807, 827, 150, 790, 288, 923, + 804, 378, 215, 828, 592, 281, 565, 555, 710, 82, + 896, 831, 547, 261, 524, 462, 293, 465, 502, 56, + 661, 821, 976, 991, 658, 869, 905, 758, 745, 193, + 768, 550, 608, 933, 378, 286, 215, 979, 792, 961, + 61, 688, 793, 644, 986, 403, 106, 366, 905, 644, + 372, 567, 466, 434, 645, 210, 389, 550, 919, 135, + 780, 773, 635, 389, 707, 100, 626, 958, 165, 504, + 920, 176, 193, 713, 857, 265, 203, 50, 668, 108, + 645, 990, 626, 197, 510, 357, 358, 850, 858, 364, + 936, 638 +}; + + +#define RAND_DECLS \ + Int32 rNToGo = 0; \ + Int32 rTPos = 0; \ + +#define RAND_MASK ((rNToGo == 1) ? 1 : 0) + +#define RAND_UPD_MASK \ + if (rNToGo == 0) { \ + rNToGo = rNums[rTPos]; \ + rTPos++; if (rTPos == 512) rTPos = 0; \ + } \ + rNToGo--; + + + +/*---------------------------------------------------*/ +/*--- The Reversible Transformation (tm) ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +void randomiseBlock ( void ) +{ + Int32 i; + RAND_DECLS; + for (i = 0; i < 256; i++) inUse[i] = False; + + for (i = 0; i <= last; i++) { + RAND_UPD_MASK; + block[i] ^= RAND_MASK; + inUse[block[i]] = True; + } +} + + +/*---------------------------------------------*/ +void doReversibleTransformation ( void ) +{ + Int32 i; + + if (verbosity >= 2) fprintf ( stderr, "\n" ); + + workLimit = workFactor * last; + workDone = 0; + blockRandomised = False; + firstAttempt = True; + + sortIt (); + + if (verbosity >= 3) + fprintf ( stderr, " %d work, %d block, ratio %5.2f\n", + workDone, last, (float)workDone / (float)(last) ); + + if (workDone > workLimit && firstAttempt) { + if (verbosity >= 2) + fprintf ( stderr, " sorting aborted; randomising block\n" ); + randomiseBlock (); + workLimit = workDone = 0; + blockRandomised = True; + firstAttempt = False; + sortIt(); + if (verbosity >= 3) + fprintf ( stderr, " %d work, %d block, ratio %f\n", + workDone, last, (float)workDone / (float)(last) ); + } + + origPtr = -1; + for (i = 0; i <= last; i++) + if (zptr[i] == 0) + { origPtr = i; break; }; + + if (origPtr == -1) panic ( "doReversibleTransformation" ); +} + + +/*---------------------------------------------*/ + +INLINE Int32 indexIntoF ( Int32 indx, Int32 *cftab ) +{ + Int32 nb, na, mid; + nb = 0; + na = 256; + do { + mid = (nb + na) >> 1; + if (indx >= cftab[mid]) nb = mid; else na = mid; + } + while (na - nb != 1); + return nb; +} + + +#define GET_SMALL(cccc) \ + \ + cccc = indexIntoF ( tPos, cftab ); \ + tPos = GET_LL(tPos); + + +void undoReversibleTransformation_small ( FILE* dst ) +{ + Int32 cftab[257], cftabAlso[257]; + Int32 i, j, tmp, tPos; + UChar ch; + + /*-- + We assume here that the global array unzftab will + already be holding the frequency counts for + ll8[0 .. last]. + --*/ + + /*-- Set up cftab to facilitate generation of indexIntoF --*/ + cftab[0] = 0; + for (i = 1; i <= 256; i++) cftab[i] = unzftab[i-1]; + for (i = 1; i <= 256; i++) cftab[i] += cftab[i-1]; + + /*-- Make a copy of it, used in generation of T --*/ + for (i = 0; i <= 256; i++) cftabAlso[i] = cftab[i]; + + /*-- compute the T vector --*/ + for (i = 0; i <= last; i++) { + ch = (UChar)ll16[i]; + SET_LL(i, cftabAlso[ch]); + cftabAlso[ch]++; + } + + /*-- + Compute T^(-1) by pointer reversal on T. This is rather + subtle, in that, if the original block was two or more + (in general, N) concatenated copies of the same thing, + the T vector will consist of N cycles, each of length + blocksize / N, and decoding will involve traversing one + of these cycles N times. Which particular cycle doesn't + matter -- they are all equivalent. The tricky part is to + make sure that the pointer reversal creates a correct + reversed cycle for us to traverse. So, the code below + simply reverses whatever cycle origPtr happens to fall into, + without regard to the cycle length. That gives one reversed + cycle, which for normal blocks, is the entire block-size long. + For repeated blocks, it will be interspersed with the other + N-1 non-reversed cycles. Providing that the F-subscripting + phase which follows starts at origPtr, all then works ok. + --*/ + i = origPtr; + j = GET_LL(i); + do { + tmp = GET_LL(j); + SET_LL(j, i); + i = j; + j = tmp; + } + while (i != origPtr); + + /*-- + We recreate the original by subscripting F through T^(-1). + The run-length-decoder below requires characters incrementally, + so tPos is set to a starting value, and is updated by + the GET_SMALL macro. + --*/ + tPos = origPtr; + + /*-------------------------------------------------*/ + /*-- + This is pretty much a verbatim copy of the + run-length decoder present in the distribution + bzip-0.21; it has to be here to avoid creating + block[] as an intermediary structure. As in 0.21, + this code derives from some sent to me by + Christian von Roques. + + It allows dst==NULL, so as to support the test (-t) + option without slowing down the fast decompression + code. + --*/ + { + IntNative retVal; + Int32 i2, count, chPrev, ch2; + UInt32 localCrc; + + count = 0; + i2 = 0; + ch2 = 256; /*-- not a char and not EOF --*/ + localCrc = getGlobalCRC(); + + { + RAND_DECLS; + while ( i2 <= last ) { + chPrev = ch2; + GET_SMALL(ch2); + if (blockRandomised) { + RAND_UPD_MASK; + ch2 ^= (UInt32)RAND_MASK; + } + i2++; + + if (dst) + retVal = putc ( ch2, dst ); + + UPDATE_CRC ( localCrc, (UChar)ch2 ); + + if (ch2 != chPrev) { + count = 1; + } else { + count++; + if (count >= 4) { + Int32 j2; + UChar z; + GET_SMALL(z); + if (blockRandomised) { + RAND_UPD_MASK; + z ^= RAND_MASK; + } + for (j2 = 0; j2 < (Int32)z; j2++) { + if (dst) retVal = putc (ch2, dst); + UPDATE_CRC ( localCrc, (UChar)ch2 ); + } + i2++; + count = 0; + } + } + } + } + + setGlobalCRC ( localCrc ); + } + /*-- end of the in-line run-length-decoder. --*/ +} +#undef GET_SMALL + + +/*---------------------------------------------*/ + +#define GET_FAST(cccc) \ + \ + cccc = ll8[tPos]; \ + tPos = tt[tPos]; + + +void undoReversibleTransformation_fast ( FILE* dst ) +{ + Int32 cftab[257]; + Int32 i, tPos; + UChar ch; + + /*-- + We assume here that the global array unzftab will + already be holding the frequency counts for + ll8[0 .. last]. + --*/ + + /*-- Set up cftab to facilitate generation of T^(-1) --*/ + cftab[0] = 0; + for (i = 1; i <= 256; i++) cftab[i] = unzftab[i-1]; + for (i = 1; i <= 256; i++) cftab[i] += cftab[i-1]; + + /*-- compute the T^(-1) vector --*/ + for (i = 0; i <= last; i++) { + ch = (UChar)ll8[i]; + tt[cftab[ch]] = i; + cftab[ch]++; + } + + /*-- + We recreate the original by subscripting L through T^(-1). + The run-length-decoder below requires characters incrementally, + so tPos is set to a starting value, and is updated by + the GET_FAST macro. + --*/ + tPos = tt[origPtr]; + + /*-------------------------------------------------*/ + /*-- + This is pretty much a verbatim copy of the + run-length decoder present in the distribution + bzip-0.21; it has to be here to avoid creating + block[] as an intermediary structure. As in 0.21, + this code derives from some sent to me by + Christian von Roques. + --*/ + { + IntNative retVal; + Int32 i2, count, chPrev, ch2; + UInt32 localCrc; + + count = 0; + i2 = 0; + ch2 = 256; /*-- not a char and not EOF --*/ + localCrc = getGlobalCRC(); + + if (blockRandomised) { + RAND_DECLS; + while ( i2 <= last ) { + chPrev = ch2; + GET_FAST(ch2); + RAND_UPD_MASK; + ch2 ^= (UInt32)RAND_MASK; + i2++; + + retVal = putc ( ch2, dst ); + UPDATE_CRC ( localCrc, (UChar)ch2 ); + + if (ch2 != chPrev) { + count = 1; + } else { + count++; + if (count >= 4) { + Int32 j2; + UChar z; + GET_FAST(z); + RAND_UPD_MASK; + z ^= RAND_MASK; + for (j2 = 0; j2 < (Int32)z; j2++) { + retVal = putc (ch2, dst); + UPDATE_CRC ( localCrc, (UChar)ch2 ); + } + i2++; + count = 0; + } + } + } + + } else { + + while ( i2 <= last ) { + chPrev = ch2; + GET_FAST(ch2); + i2++; + + retVal = putc ( ch2, dst ); + UPDATE_CRC ( localCrc, (UChar)ch2 ); + + if (ch2 != chPrev) { + count = 1; + } else { + count++; + if (count >= 4) { + Int32 j2; + UChar z; + GET_FAST(z); + for (j2 = 0; j2 < (Int32)z; j2++) { + retVal = putc (ch2, dst); + UPDATE_CRC ( localCrc, (UChar)ch2 ); + } + i2++; + count = 0; + } + } + } + + } /*-- if (blockRandomised) --*/ + + setGlobalCRC ( localCrc ); + } + /*-- end of the in-line run-length-decoder. --*/ +} +#undef GET_FAST + + +/*---------------------------------------------------*/ +/*--- The block loader and RLEr ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +/* Top 16: run length, 1 to 255. +* Lower 16: the char, or MY_EOF for EOF. +*/ + +#define MY_EOF 257 + +INLINE Int32 getRLEpair ( FILE* src ) +{ + Int32 runLength; + IntNative ch, chLatest; + + ch = getc ( src ); + + /*--- Because I have no idea what kind of a value EOF is. ---*/ + if (ch == EOF) { + ERROR_IF_NOT_ZERO ( errno ); + return (1 << 16) | MY_EOF; + } + + runLength = 0; + do { + chLatest = getc ( src ); + runLength++; + bytesIn++; + } + while (ch == chLatest && runLength < 255); + + if ( chLatest != EOF ) { + if ( ungetc ( chLatest, src ) == EOF ) + panic ( "getRLEpair: ungetc failed" ); + } else { + ERROR_IF_NOT_ZERO ( errno ); + } + + /*--- Conditional is just a speedup hack. ---*/ + if (runLength == 1) { + UPDATE_CRC ( globalCrc, (UChar)ch ); + return (1 << 16) | ch; + } else { + Int32 i; + for (i = 1; i <= runLength; i++) + UPDATE_CRC ( globalCrc, (UChar)ch ); + return (runLength << 16) | ch; + } +} + + +/*---------------------------------------------*/ +void loadAndRLEsource ( FILE* src ) +{ + Int32 ch, allowableBlockSize, i; + + last = -1; + ch = 0; + + for (i = 0; i < 256; i++) inUse[i] = False; + + /*--- 20 is just a paranoia constant ---*/ + allowableBlockSize = 100000 * blockSize100k - 20; + + while (last < allowableBlockSize && ch != MY_EOF) { + Int32 rlePair, runLen; + rlePair = getRLEpair ( src ); + ch = rlePair & 0xFFFF; + runLen = (UInt32)rlePair >> 16; + + #if DEBUG + assert (runLen >= 1 && runLen <= 255); + #endif + + if (ch != MY_EOF) { + inUse[ch] = True; + switch (runLen) { + case 1: + last++; block[last] = (UChar)ch; break; + case 2: + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; break; + case 3: + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; break; + default: + inUse[runLen-4] = True; + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)ch; + last++; block[last] = (UChar)(runLen-4); break; + } + } + } +} + + +/*---------------------------------------------------*/ +/*--- Processing of complete files and streams ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +void compressStream ( FILE *stream, FILE *zStream ) +{ + IntNative retVal; + UInt32 blockCRC, combinedCRC; + Int32 blockNo; + + blockNo = 0; + bytesIn = 0; + bytesOut = 0; + nBlocksRandomised = 0; + + SET_BINARY_MODE(stream); + SET_BINARY_MODE(zStream); + + ERROR_IF_NOT_ZERO ( ferror(stream) ); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + + bsSetStream ( zStream, True ); + + /*--- Write `magic' bytes B and Z, + then h indicating file-format == huffmanised, + followed by a digit indicating blockSize100k. + ---*/ + bsPutUChar ( 'B' ); + bsPutUChar ( 'Z' ); + bsPutUChar ( 'h' ); + bsPutUChar ( '0' + blockSize100k ); + + combinedCRC = 0; + + if (verbosity >= 2) fprintf ( stderr, "\n" ); + + while (True) { + + blockNo++; + initialiseCRC (); + loadAndRLEsource ( stream ); + ERROR_IF_NOT_ZERO ( ferror(stream) ); + if (last == -1) break; + + blockCRC = getFinalCRC (); + combinedCRC = (combinedCRC << 1) | (combinedCRC >> 31); + combinedCRC ^= blockCRC; + + if (verbosity >= 2) + fprintf ( stderr, " block %d: crc = 0x%8x, combined CRC = 0x%8x, size = %d", + blockNo, blockCRC, combinedCRC, last+1 ); + + /*-- sort the block and establish posn of original string --*/ + doReversibleTransformation (); + + /*-- + A 6-byte block header, the value chosen arbitrarily + as 0x314159265359 :-). A 32 bit value does not really + give a strong enough guarantee that the value will not + appear by chance in the compressed datastream. Worst-case + probability of this event, for a 900k block, is about + 2.0e-3 for 32 bits, 1.0e-5 for 40 bits and 4.0e-8 for 48 bits. + For a compressed file of size 100Gb -- about 100000 blocks -- + only a 48-bit marker will do. NB: normal compression/ + decompression do *not* rely on these statistical properties. + They are only important when trying to recover blocks from + damaged files. + --*/ + bsPutUChar ( 0x31 ); bsPutUChar ( 0x41 ); + bsPutUChar ( 0x59 ); bsPutUChar ( 0x26 ); + bsPutUChar ( 0x53 ); bsPutUChar ( 0x59 ); + + /*-- Now the block's CRC, so it is in a known place. --*/ + bsPutUInt32 ( blockCRC ); + + /*-- Now a single bit indicating randomisation. --*/ + if (blockRandomised) { + bsW(1,1); nBlocksRandomised++; + } else + bsW(1,0); + + /*-- Finally, block's contents proper. --*/ + moveToFrontCodeAndSend (); + + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + } + + if (verbosity >= 2 && nBlocksRandomised > 0) + fprintf ( stderr, " %d block%s needed randomisation\n", + nBlocksRandomised, + nBlocksRandomised == 1 ? "" : "s" ); + + /*-- + Now another magic 48-bit number, 0x177245385090, to + indicate the end of the last block. (sqrt(pi), if + you want to know. I did want to use e, but it contains + too much repetition -- 27 18 28 18 28 46 -- for me + to feel statistically comfortable. Call me paranoid.) + --*/ + + bsPutUChar ( 0x17 ); bsPutUChar ( 0x72 ); + bsPutUChar ( 0x45 ); bsPutUChar ( 0x38 ); + bsPutUChar ( 0x50 ); bsPutUChar ( 0x90 ); + + bsPutUInt32 ( combinedCRC ); + if (verbosity >= 2) + fprintf ( stderr, " final combined CRC = 0x%x\n ", combinedCRC ); + + /*-- Close the files in an utterly paranoid way. --*/ + bsFinishedWithStream (); + + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + retVal = fflush ( zStream ); + ERROR_IF_EOF ( retVal ); + retVal = fclose ( zStream ); + ERROR_IF_EOF ( retVal ); + + ERROR_IF_NOT_ZERO ( ferror(stream) ); + retVal = fclose ( stream ); + ERROR_IF_EOF ( retVal ); + + if (bytesIn == 0) bytesIn = 1; + if (bytesOut == 0) bytesOut = 1; + + if (verbosity >= 1) + fprintf ( stderr, "%6.3f:1, %6.3f bits/byte, " + "%5.2f%% saved, %d in, %d out.\n", + (float)bytesIn / (float)bytesOut, + (8.0 * (float)bytesOut) / (float)bytesIn, + 100.0 * (1.0 - (float)bytesOut / (float)bytesIn), + bytesIn, + bytesOut + ); +} + + +/*---------------------------------------------*/ +Bool uncompressStream ( FILE *zStream, FILE *stream ) +{ + UChar magic1, magic2, magic3, magic4; + UChar magic5, magic6; + UInt32 storedBlockCRC, storedCombinedCRC; + UInt32 computedBlockCRC, computedCombinedCRC; + Int32 currBlockNo; + IntNative retVal; + + SET_BINARY_MODE(stream); + SET_BINARY_MODE(zStream); + + ERROR_IF_NOT_ZERO ( ferror(stream) ); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + + bsSetStream ( zStream, False ); + + /*-- + A bad magic number is `recoverable from'; + return with False so the caller skips the file. + --*/ + magic1 = bsGetUChar (); + magic2 = bsGetUChar (); + magic3 = bsGetUChar (); + magic4 = bsGetUChar (); + if (magic1 != 'B' || + magic2 != 'Z' || + magic3 != 'h' || + magic4 < '1' || + magic4 > '9') { + bsFinishedWithStream(); + retVal = fclose ( stream ); + ERROR_IF_EOF ( retVal ); + return False; + } + + setDecompressStructureSizes ( magic4 - '0' ); + computedCombinedCRC = 0; + + if (verbosity >= 2) fprintf ( stderr, "\n " ); + currBlockNo = 0; + + while (True) { + magic1 = bsGetUChar (); + magic2 = bsGetUChar (); + magic3 = bsGetUChar (); + magic4 = bsGetUChar (); + magic5 = bsGetUChar (); + magic6 = bsGetUChar (); + if (magic1 == 0x17 && magic2 == 0x72 && + magic3 == 0x45 && magic4 == 0x38 && + magic5 == 0x50 && magic6 == 0x90) break; + + if (magic1 != 0x31 || magic2 != 0x41 || + magic3 != 0x59 || magic4 != 0x26 || + magic5 != 0x53 || magic6 != 0x59) badBlockHeader(); + + storedBlockCRC = bsGetUInt32 (); + + if (bsR(1) == 1) + blockRandomised = True; else + blockRandomised = False; + + currBlockNo++; + if (verbosity >= 2) + fprintf ( stderr, "[%d: huff+mtf ", currBlockNo ); + getAndMoveToFrontDecode (); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + + initialiseCRC(); + if (verbosity >= 2) fprintf ( stderr, "rt+rld" ); + if (smallMode) + undoReversibleTransformation_small ( stream ); + else + undoReversibleTransformation_fast ( stream ); + + ERROR_IF_NOT_ZERO ( ferror(stream) ); + + computedBlockCRC = getFinalCRC(); + if (verbosity >= 3) + fprintf ( stderr, " {0x%x, 0x%x}", storedBlockCRC, computedBlockCRC ); + if (verbosity >= 2) fprintf ( stderr, "] " ); + + /*-- A bad CRC is considered a fatal error. --*/ + if (storedBlockCRC != computedBlockCRC) + crcError ( storedBlockCRC, computedBlockCRC ); + + computedCombinedCRC = (computedCombinedCRC << 1) | (computedCombinedCRC >> 31); + computedCombinedCRC ^= computedBlockCRC; + }; + + if (verbosity >= 2) fprintf ( stderr, "\n " ); + + storedCombinedCRC = bsGetUInt32 (); + if (verbosity >= 2) + fprintf ( stderr, + "combined CRCs: stored = 0x%x, computed = 0x%x\n ", + storedCombinedCRC, computedCombinedCRC ); + if (storedCombinedCRC != computedCombinedCRC) + crcError ( storedCombinedCRC, computedCombinedCRC ); + + + bsFinishedWithStream (); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + retVal = fclose ( zStream ); + ERROR_IF_EOF ( retVal ); + + ERROR_IF_NOT_ZERO ( ferror(stream) ); + retVal = fflush ( stream ); + ERROR_IF_NOT_ZERO ( retVal ); + if (stream != stdout) { + retVal = fclose ( stream ); + ERROR_IF_EOF ( retVal ); + } + return True; +} + + +/*---------------------------------------------*/ +Bool testStream ( FILE *zStream ) +{ + UChar magic1, magic2, magic3, magic4; + UChar magic5, magic6; + UInt32 storedBlockCRC, storedCombinedCRC; + UInt32 computedBlockCRC, computedCombinedCRC; + Int32 currBlockNo; + IntNative retVal; + + SET_BINARY_MODE(zStream); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + + bsSetStream ( zStream, False ); + + magic1 = bsGetUChar (); + magic2 = bsGetUChar (); + magic3 = bsGetUChar (); + magic4 = bsGetUChar (); + if (magic1 != 'B' || + magic2 != 'Z' || + magic3 != 'h' || + magic4 < '1' || + magic4 > '9') { + bsFinishedWithStream(); + fclose ( zStream ); + fprintf ( stderr, "\n%s: bad magic number (ie, not created by bzip2)\n", + inName ); + return False; + } + + smallMode = True; + setDecompressStructureSizes ( magic4 - '0' ); + computedCombinedCRC = 0; + + if (verbosity >= 2) fprintf ( stderr, "\n" ); + currBlockNo = 0; + + while (True) { + magic1 = bsGetUChar (); + magic2 = bsGetUChar (); + magic3 = bsGetUChar (); + magic4 = bsGetUChar (); + magic5 = bsGetUChar (); + magic6 = bsGetUChar (); + if (magic1 == 0x17 && magic2 == 0x72 && + magic3 == 0x45 && magic4 == 0x38 && + magic5 == 0x50 && magic6 == 0x90) break; + + currBlockNo++; + if (magic1 != 0x31 || magic2 != 0x41 || + magic3 != 0x59 || magic4 != 0x26 || + magic5 != 0x53 || magic6 != 0x59) { + bsFinishedWithStream(); + fclose ( zStream ); + fprintf ( stderr, + "\n%s, block %d: bad header (not == 0x314159265359)\n", + inName, currBlockNo ); + return False; + } + storedBlockCRC = bsGetUInt32 (); + + if (bsR(1) == 1) + blockRandomised = True; else + blockRandomised = False; + + if (verbosity >= 2) + fprintf ( stderr, " block [%d: huff+mtf ", currBlockNo ); + getAndMoveToFrontDecode (); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + + initialiseCRC(); + if (verbosity >= 2) fprintf ( stderr, "rt+rld" ); + undoReversibleTransformation_small ( NULL ); + + computedBlockCRC = getFinalCRC(); + if (verbosity >= 3) + fprintf ( stderr, " {0x%x, 0x%x}", storedBlockCRC, computedBlockCRC ); + if (verbosity >= 2) fprintf ( stderr, "] " ); + + if (storedBlockCRC != computedBlockCRC) { + bsFinishedWithStream(); + fclose ( zStream ); + fprintf ( stderr, "\n%s, block %d: computed CRC does not match stored one\n", + inName, currBlockNo ); + return False; + } + + if (verbosity >= 2) fprintf ( stderr, "ok\n" ); + computedCombinedCRC = (computedCombinedCRC << 1) | (computedCombinedCRC >> 31); + computedCombinedCRC ^= computedBlockCRC; + }; + + storedCombinedCRC = bsGetUInt32 (); + if (verbosity >= 2) + fprintf ( stderr, + " combined CRCs: stored = 0x%x, computed = 0x%x\n ", + storedCombinedCRC, computedCombinedCRC ); + if (storedCombinedCRC != computedCombinedCRC) { + bsFinishedWithStream(); + fclose ( zStream ); + fprintf ( stderr, "\n%s: computed CRC does not match stored one\n", + inName ); + return False; + } + + bsFinishedWithStream (); + ERROR_IF_NOT_ZERO ( ferror(zStream) ); + retVal = fclose ( zStream ); + ERROR_IF_EOF ( retVal ); + return True; +} + + + +/*---------------------------------------------------*/ +/*--- Error [non-] handling grunge ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +void cadvise ( void ) +{ + fprintf ( + stderr, + "\nIt is possible that the compressed file(s) have become corrupted.\n" + "You can use the -tvv option to test integrity of such files.\n\n" + "You can use the `bzip2recover' program to *attempt* to recover\n" + "data from undamaged sections of corrupted files.\n\n" + ); +} + + +/*---------------------------------------------*/ +void showFileNames ( void ) +{ + fprintf ( + stderr, + "\tInput file = %s, output file = %s\n", + inName==NULL ? "(null)" : inName, + outName==NULL ? "(null)" : outName + ); +} + + +/*---------------------------------------------*/ +void cleanUpAndFail ( Int32 ec ) +{ + IntNative retVal; + + if ( srcMode == SM_F2F && opMode != OM_TEST ) { + fprintf ( stderr, "%s: Deleting output file %s, if it exists.\n", + progName, + outName==NULL ? "(null)" : outName ); + if (outputHandleJustInCase != NULL) + fclose ( outputHandleJustInCase ); + retVal = remove ( outName ); + if (retVal != 0) + fprintf ( stderr, + "%s: WARNING: deletion of output file (apparently) failed.\n", + progName ); + } + if (numFileNames > 0 && numFilesProcessed < numFileNames) { + fprintf ( stderr, + "%s: WARNING: some files have not been processed:\n" + "\t%d specified on command line, %d not processed yet.\n\n", + progName, numFileNames, + numFileNames - numFilesProcessed ); + } + exit ( ec ); +} + + +/*---------------------------------------------*/ +void panic ( Char* s ) +{ + fprintf ( stderr, + "\n%s: PANIC -- internal consistency error:\n" + "\t%s\n" + "\tThis is a BUG. Please report it to me at:\n" + "\tjseward@acm.org\n", + progName, s ); + showFileNames(); + cleanUpAndFail( 3 ); +} + + +/*---------------------------------------------*/ +void badBGLengths ( void ) +{ + fprintf ( stderr, + "\n%s: error when reading background model code lengths,\n" + "\twhich probably means the compressed file is corrupted.\n", + progName ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void crcError ( UInt32 crcStored, UInt32 crcComputed ) +{ + fprintf ( stderr, + "\n%s: Data integrity error when decompressing.\n" + "\tStored CRC = 0x%x, computed CRC = 0x%x\n", + progName, crcStored, crcComputed ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void compressedStreamEOF ( void ) +{ + fprintf ( stderr, + "\n%s: Compressed file ends unexpectedly;\n\t" + "perhaps it is corrupted? *Possible* reason follows.\n", + progName ); + perror ( progName ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void ioError ( ) +{ + fprintf ( stderr, + "\n%s: I/O or other error, bailing out. Possible reason follows.\n", + progName ); + perror ( progName ); + showFileNames(); + cleanUpAndFail( 1 ); +} + + +/*---------------------------------------------*/ +void blockOverrun () +{ + fprintf ( stderr, + "\n%s: block overrun during decompression,\n" + "\twhich probably means the compressed file\n" + "\tis corrupted.\n", + progName ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void badBlockHeader () +{ + fprintf ( stderr, + "\n%s: bad block header in the compressed file,\n" + "\twhich probably means it is corrupted.\n", + progName ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void bitStreamEOF () +{ + fprintf ( stderr, + "\n%s: read past the end of compressed data,\n" + "\twhich probably means it is corrupted.\n", + progName ); + showFileNames(); + cadvise(); + cleanUpAndFail( 2 ); +} + + +/*---------------------------------------------*/ +void mySignalCatcher ( IntNative n ) +{ + fprintf ( stderr, + "\n%s: Control-C (or similar) caught, quitting.\n", + progName ); + cleanUpAndFail(1); +} + + +/*---------------------------------------------*/ +void mySIGSEGVorSIGBUScatcher ( IntNative n ) +{ + if (opMode == OM_Z) + fprintf ( stderr, + "\n%s: Caught a SIGSEGV or SIGBUS whilst compressing,\n" + "\twhich probably indicates a bug in bzip2. Please\n" + "\treport it to me at: jseward@acm.org\n", + progName ); + else + fprintf ( stderr, + "\n%s: Caught a SIGSEGV or SIGBUS whilst decompressing,\n" + "\twhich probably indicates that the compressed data\n" + "\tis corrupted.\n", + progName ); + + showFileNames(); + if (opMode == OM_Z) + cleanUpAndFail( 3 ); else + { cadvise(); cleanUpAndFail( 2 ); } +} + + +/*---------------------------------------------*/ +void uncompressOutOfMemory ( Int32 draw, Int32 blockSize ) +{ + fprintf ( stderr, + "\n%s: Can't allocate enough memory for decompression.\n" + "\tRequested %d bytes for a block size of %d.\n" + "\tTry selecting space-economic decompress (with flag -s)\n" + "\tand failing that, find a machine with more memory.\n", + progName, draw, blockSize ); + showFileNames(); + cleanUpAndFail(1); +} + + +/*---------------------------------------------*/ +void compressOutOfMemory ( Int32 draw, Int32 blockSize ) +{ + fprintf ( stderr, + "\n%s: Can't allocate enough memory for compression.\n" + "\tRequested %d bytes for a block size of %d.\n" + "\tTry selecting a small block size (with flag -s).\n", + progName, draw, blockSize ); + showFileNames(); + cleanUpAndFail(1); +} + + +/*---------------------------------------------------*/ +/*--- The main driver machinery ---*/ +/*---------------------------------------------------*/ + +/*---------------------------------------------*/ +void pad ( Char *s ) +{ + Int32 i; + if ( (Int32)strlen(s) >= longestFileName ) return; + for (i = 1; i <= longestFileName - (Int32)strlen(s); i++) + fprintf ( stderr, " " ); +} + + +/*---------------------------------------------*/ +Bool fileExists ( Char* name ) +{ + FILE *tmp = fopen ( name, "rb" ); + Bool exists = (tmp != NULL); + if (tmp != NULL) fclose ( tmp ); + return exists; +} + + +/*---------------------------------------------*/ +/*-- + if in doubt, return True +--*/ +Bool notABogStandardFile ( Char* name ) +{ + IntNative i; + struct MY_STAT statBuf; + + i = MY_LSTAT ( name, &statBuf ); + if (i != 0) return True; + if (MY_S_IFREG(statBuf.st_mode)) return False; + return True; +} + + +/*---------------------------------------------*/ +void copyDateAndPermissions ( Char *srcName, Char *dstName ) +{ + #if BZ_UNIX + IntNative retVal; + struct MY_STAT statBuf; + struct utimbuf uTimBuf; + + retVal = MY_LSTAT ( srcName, &statBuf ); + ERROR_IF_NOT_ZERO ( retVal ); + uTimBuf.actime = statBuf.st_atime; + uTimBuf.modtime = statBuf.st_mtime; + + retVal = chmod ( dstName, statBuf.st_mode ); + ERROR_IF_NOT_ZERO ( retVal ); + retVal = utime ( dstName, &uTimBuf ); + ERROR_IF_NOT_ZERO ( retVal ); + #endif +} + + +/*---------------------------------------------*/ +Bool endsInBz2 ( Char* name ) +{ + Int32 n = strlen ( name ); + if (n <= 4) return False; + return + (name[n-4] == '.' && + name[n-3] == 'b' && + name[n-2] == 'z' && + name[n-1] == '2'); +} + + +/*---------------------------------------------*/ +Bool containsDubiousChars ( Char* name ) +{ + Bool cdc = False; + for (; *name != '\0'; name++) + if (*name == '?' || *name == '*') cdc = True; + return cdc; +} + + +/*---------------------------------------------*/ +void compress ( Char *name ) +{ + FILE *inStr; + FILE *outStr; + + if (name == NULL && srcMode != SM_I2O) + panic ( "compress: bad modes\n" ); + + switch (srcMode) { + case SM_I2O: strcpy ( inName, "(stdin)" ); + strcpy ( outName, "(stdout)" ); break; + case SM_F2F: strcpy ( inName, name ); + strcpy ( outName, name ); + strcat ( outName, ".bz2" ); break; + case SM_F2O: strcpy ( inName, name ); + strcpy ( outName, "(stdout)" ); break; + } + + if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { + fprintf ( stderr, "%s: There are no files matching `%s'.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && !fileExists ( inName ) ) { + fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && endsInBz2 ( inName )) { + fprintf ( stderr, "%s: Input file name %s ends in `.bz2', skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && notABogStandardFile ( inName )) { + fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n", + progName, inName ); + return; + } + if ( srcMode == SM_F2F && fileExists ( outName ) ) { + fprintf ( stderr, "%s: Output file %s already exists, skipping.\n", + progName, outName ); + return; + } + + switch ( srcMode ) { + + case SM_I2O: + inStr = stdin; + outStr = stdout; + if ( isatty ( fileno ( stdout ) ) ) { + fprintf ( stderr, + "%s: I won't write compressed data to a terminal.\n", + progName ); + fprintf ( stderr, "%s: For help, type: `%s --help'.\n", + progName, progName ); + return; + }; + break; + + case SM_F2O: + inStr = fopen ( inName, "rb" ); + outStr = stdout; + if ( isatty ( fileno ( stdout ) ) ) { + fprintf ( stderr, + "%s: I won't write compressed data to a terminal.\n", + progName ); + fprintf ( stderr, "%s: For help, type: `%s --help'.\n", + progName, progName ); + return; + }; + if ( inStr == NULL ) { + fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", + progName, inName ); + return; + }; + break; + + case SM_F2F: + inStr = fopen ( inName, "rb" ); + outStr = fopen ( outName, "wb" ); + if ( outStr == NULL) { + fprintf ( stderr, "%s: Can't create output file %s, skipping.\n", + progName, outName ); + return; + } + if ( inStr == NULL ) { + fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", + progName, inName ); + return; + }; + break; + + default: + panic ( "compress: bad srcMode" ); + break; + } + + if (verbosity >= 1) { + fprintf ( stderr, " %s: ", inName ); + pad ( inName ); + fflush ( stderr ); + } + + /*--- Now the input and output handles are sane. Do the Biz. ---*/ + errno = 0; + outputHandleJustInCase = outStr; + compressStream ( inStr, outStr ); + outputHandleJustInCase = NULL; + + /*--- If there was an I/O error, we won't get here. ---*/ + if ( srcMode == SM_F2F ) { + copyDateAndPermissions ( inName, outName ); + if ( !keepInputFiles ) { + IntNative retVal = remove ( inName ); + ERROR_IF_NOT_ZERO ( retVal ); + } + } +} + + +/*---------------------------------------------*/ +void uncompress ( Char *name ) +{ + FILE *inStr; + FILE *outStr; + Bool magicNumberOK; + + if (name == NULL && srcMode != SM_I2O) + panic ( "uncompress: bad modes\n" ); + + switch (srcMode) { + case SM_I2O: strcpy ( inName, "(stdin)" ); + strcpy ( outName, "(stdout)" ); break; + case SM_F2F: strcpy ( inName, name ); + strcpy ( outName, name ); + if (endsInBz2 ( outName )) + outName [ strlen ( outName ) - 4 ] = '\0'; + break; + case SM_F2O: strcpy ( inName, name ); + strcpy ( outName, "(stdout)" ); break; + } + + if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { + fprintf ( stderr, "%s: There are no files matching `%s'.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && !fileExists ( inName ) ) { + fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && !endsInBz2 ( inName )) { + fprintf ( stderr, + "%s: Input file name %s doesn't end in `.bz2', skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && notABogStandardFile ( inName )) { + fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n", + progName, inName ); + return; + } + if ( srcMode == SM_F2F && fileExists ( outName ) ) { + fprintf ( stderr, "%s: Output file %s already exists, skipping.\n", + progName, outName ); + return; + } + + switch ( srcMode ) { + + case SM_I2O: + inStr = stdin; + outStr = stdout; + if ( isatty ( fileno ( stdin ) ) ) { + fprintf ( stderr, + "%s: I won't read compressed data from a terminal.\n", + progName ); + fprintf ( stderr, "%s: For help, type: `%s --help'.\n", + progName, progName ); + return; + }; + break; + + case SM_F2O: + inStr = fopen ( inName, "rb" ); + outStr = stdout; + if ( inStr == NULL ) { + fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", + progName, inName ); + return; + }; + break; + + case SM_F2F: + inStr = fopen ( inName, "rb" ); + outStr = fopen ( outName, "wb" ); + if ( outStr == NULL) { + fprintf ( stderr, "%s: Can't create output file %s, skipping.\n", + progName, outName ); + return; + } + if ( inStr == NULL ) { + fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", + progName, inName ); + return; + }; + break; + + default: + panic ( "uncompress: bad srcMode" ); + break; + } + + if (verbosity >= 1) { + fprintf ( stderr, " %s: ", inName ); + pad ( inName ); + fflush ( stderr ); + } + + /*--- Now the input and output handles are sane. Do the Biz. ---*/ + errno = 0; + outputHandleJustInCase = outStr; + magicNumberOK = uncompressStream ( inStr, outStr ); + outputHandleJustInCase = NULL; + + /*--- If there was an I/O error, we won't get here. ---*/ + if ( magicNumberOK ) { + if ( srcMode == SM_F2F ) { + copyDateAndPermissions ( inName, outName ); + if ( !keepInputFiles ) { + IntNative retVal = remove ( inName ); + ERROR_IF_NOT_ZERO ( retVal ); + } + } + } else { + if ( srcMode == SM_F2F ) { + IntNative retVal = remove ( outName ); + ERROR_IF_NOT_ZERO ( retVal ); + } + } + + if ( magicNumberOK ) { + if (verbosity >= 1) + fprintf ( stderr, "done\n" ); + } else { + if (verbosity >= 1) + fprintf ( stderr, "not a bzip2 file, skipping.\n" ); else + fprintf ( stderr, + "%s: %s is not a bzip2 file, skipping.\n", + progName, inName ); + } + +} + + +/*---------------------------------------------*/ +void testf ( Char *name ) +{ + FILE *inStr; + Bool allOK; + + if (name == NULL && srcMode != SM_I2O) + panic ( "testf: bad modes\n" ); + + strcpy ( outName, "(none)" ); + switch (srcMode) { + case SM_I2O: strcpy ( inName, "(stdin)" ); break; + case SM_F2F: strcpy ( inName, name ); break; + case SM_F2O: strcpy ( inName, name ); break; + } + + if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { + fprintf ( stderr, "%s: There are no files matching `%s'.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && !fileExists ( inName ) ) { + fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && !endsInBz2 ( inName )) { + fprintf ( stderr, + "%s: Input file name %s doesn't end in `.bz2', skipping.\n", + progName, inName ); + return; + } + if ( srcMode != SM_I2O && notABogStandardFile ( inName )) { + fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n", + progName, inName ); + return; + } + + switch ( srcMode ) { + + case SM_I2O: + if ( isatty ( fileno ( stdin ) ) ) { + fprintf ( stderr, + "%s: I won't read compressed data from a terminal.\n", + progName ); + fprintf ( stderr, "%s: For help, type: `%s --help'.\n", + progName, progName ); + return; + }; + inStr = stdin; + break; + + case SM_F2O: case SM_F2F: + inStr = fopen ( inName, "rb" ); + if ( inStr == NULL ) { + fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", + progName, inName ); + return; + }; + break; + + default: + panic ( "testf: bad srcMode" ); + break; + } + + if (verbosity >= 1) { + fprintf ( stderr, " %s: ", inName ); + pad ( inName ); + fflush ( stderr ); + } + + /*--- Now the input handle is sane. Do the Biz. ---*/ + errno = 0; + allOK = testStream ( inStr ); + + if (allOK && verbosity >= 1) fprintf ( stderr, "ok\n" ); + if (!allOK) testFailsExist = True; +} + + +/*---------------------------------------------*/ +void license ( void ) +{ + fprintf ( stderr, + + "bzip2, a block-sorting file compressor. " + "Version 0.1pl0, 17-Aug-97.\n" + " \n" + " Copyright (C) 1996, 1997 by Julian Seward.\n" + " \n" + " This program is free software; you can redistribute it and/or modify\n" + " it under the terms of the GNU General Public License as published by\n" + " the Free Software Foundation; either version 2 of the License, or\n" + " (at your option) any later version.\n" + " \n" + " This program is distributed in the hope that it will be useful,\n" + " but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + " GNU General Public License for more details.\n" + " \n" + " You should have received a copy of the GNU General Public License\n" + " along with this program; if not, write to the Free Software\n" + " Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n" + " \n" + " The GNU General Public License is contained in the file LICENSE.\n" + " \n" + ); +} + + +/*---------------------------------------------*/ +void usage ( Char *fullProgName ) +{ + fprintf ( + stderr, + "bzip2, a block-sorting file compressor. " + "Version 0.1pl0, 17-Aug-97.\n" + "\n usage: %s [flags and input files in any order]\n" + "\n" + " -h --help print this message\n" + " -d --decompress force decompression\n" + " -f --compress force compression\n" + " -t --test test compressed file integrity\n" + " -c --stdout output to standard out\n" + " -v --verbose be verbose (a 2nd -v gives more)\n" + " -k --keep keep (don't delete) input files\n" + " -L --license display software version & license\n" + " -V --version display software version & license\n" + " -s --small use less memory (at most 2500k)\n" + " -1 .. -9 set block size to 100k .. 900k\n" + " --repetitive-fast compress repetitive blocks faster\n" + " --repetitive-best compress repetitive blocks better\n" + "\n" + " If invoked as `bzip2', the default action is to compress.\n" + " as `bunzip2', the default action is to decompress.\n" + "\n" + " If no file names are given, bzip2 compresses or decompresses\n" + " from standard input to standard output. You can combine\n" + " flags, so `-v -4' means the same as -v4 or -4v, &c.\n" + #if BZ_UNIX + "\n" + #endif + , + + fullProgName + ); +} + + +/*---------------------------------------------*/ +/*-- + All the garbage from here to main() is purely to + implement a linked list of command-line arguments, + into which main() copies argv[1 .. argc-1]. + + The purpose of this ridiculous exercise is to + facilitate the expansion of wildcard characters + * and ? in filenames for halfwitted OSs like + MSDOS, Windows 95 and NT. + + The actual Dirty Work is done by the platform-specific + macro APPEND_FILESPEC. +--*/ + +typedef + struct zzzz { + Char *name; + struct zzzz *link; + } + Cell; + + +/*---------------------------------------------*/ +void *myMalloc ( Int32 n ) +{ + void* p; + + p = malloc ( (size_t)n ); + if (p == NULL) { + fprintf ( + stderr, + "%s: `malloc' failed on request for %d bytes.\n", + progName, n + ); + exit ( 1 ); + } + return p; +} + + +/*---------------------------------------------*/ +Cell *mkCell ( void ) +{ + Cell *c; + + c = (Cell*) myMalloc ( sizeof ( Cell ) ); + c->name = NULL; + c->link = NULL; + return c; +} + + +/*---------------------------------------------*/ +Cell *snocString ( Cell *root, Char *name ) +{ + if (root == NULL) { + Cell *tmp = mkCell(); + tmp->name = (Char*) myMalloc ( 5 + strlen(name) ); + strcpy ( tmp->name, name ); + return tmp; + } else { + Cell *tmp = root; + while (tmp->link != NULL) tmp = tmp->link; + tmp->link = snocString ( tmp->link, name ); + return root; + } +} + + + +/*---------------------------------------------*/ +#define ISFLAG(s) (strcmp(aa->name, (s))==0) + + +IntNative main ( IntNative argc, Char *argv[] ) +{ + Int32 i, j; + Char *tmp; + Cell *argList; + Cell *aa; + + + #if DEBUG + fprintf ( stderr, "bzip2: *** compiled with debugging ON ***\n" ); + #endif + + /*-- Be really really really paranoid :-) --*/ + if (sizeof(Int32) != 4 || sizeof(UInt32) != 4 || + sizeof(Int16) != 2 || sizeof(UInt16) != 2 || + sizeof(Char) != 1 || sizeof(UChar) != 1) { + fprintf ( stderr, + "bzip2: I'm not configured correctly for this platform!\n" + "\tI require Int32, Int16 and Char to have sizes\n" + "\tof 4, 2 and 1 bytes to run properly, and they don't.\n" + "\tProbably you can fix this by defining them correctly,\n" + "\tand recompiling. Bye!\n" ); + exit(1); + } + + + /*-- Set up signal handlers --*/ + signal (SIGINT, mySignalCatcher); + signal (SIGTERM, mySignalCatcher); + signal (SIGSEGV, mySIGSEGVorSIGBUScatcher); + #if BZ_UNIX + signal (SIGHUP, mySignalCatcher); + signal (SIGBUS, mySIGSEGVorSIGBUScatcher); + #endif + + + /*-- Initialise --*/ + outputHandleJustInCase = NULL; + ftab = NULL; + ll4 = NULL; + ll16 = NULL; + ll8 = NULL; + tt = NULL; + block = NULL; + zptr = NULL; + errno = 0; + smallMode = False; + keepInputFiles = False; + verbosity = 0; + blockSize100k = 9; + testFailsExist = False; + bsStream = NULL; + numFileNames = 0; + numFilesProcessed = 0; + workFactor = 30; + + strcpy ( inName, "(none)" ); + strcpy ( outName, "(none)" ); + + strcpy ( progNameReally, argv[0] ); + progName = &progNameReally[0]; + for (tmp = &progNameReally[0]; *tmp != '\0'; tmp++) + if (*tmp == PATH_SEP) progName = tmp + 1; + + + /*-- Expand filename wildcards in arg list --*/ + argList = NULL; + for (i = 1; i <= argc-1; i++) + APPEND_FILESPEC(argList, argv[i]); + + + /*-- Find the length of the longest filename --*/ + longestFileName = 7; + numFileNames = 0; + for (aa = argList; aa != NULL; aa = aa->link) + if (aa->name[0] != '-') { + numFileNames++; + if (longestFileName < (Int32)strlen(aa->name) ) + longestFileName = (Int32)strlen(aa->name); + } + + + /*-- Determine what to do (compress/uncompress/test). --*/ + /*-- Note that subsequent flag handling may change this. --*/ + opMode = OM_Z; + + if ( (strcmp ( "bunzip2", progName ) == 0) || + (strcmp ( "BUNZIP2", progName ) == 0) || + (strcmp ( "bunzip2.exe", progName ) == 0) || + (strcmp ( "BUNZIP2.EXE", progName ) == 0) ) + opMode = OM_UNZ; + + + /*-- Determine source modes; flag handling may change this too. --*/ + if (numFileNames == 0) + srcMode = SM_I2O; else srcMode = SM_F2F; + + + /*-- Look at the flags. --*/ + for (aa = argList; aa != NULL; aa = aa->link) + if (aa->name[0] == '-' && aa->name[1] != '-') + for (j = 1; aa->name[j] != '\0'; j++) + switch (aa->name[j]) { + case 'c': srcMode = SM_F2O; break; + case 'd': opMode = OM_UNZ; break; + case 'f': opMode = OM_Z; break; + case 't': opMode = OM_TEST; break; + case 'k': keepInputFiles = True; break; + case 's': smallMode = True; break; + case '1': blockSize100k = 1; break; + case '2': blockSize100k = 2; break; + case '3': blockSize100k = 3; break; + case '4': blockSize100k = 4; break; + case '5': blockSize100k = 5; break; + case '6': blockSize100k = 6; break; + case '7': blockSize100k = 7; break; + case '8': blockSize100k = 8; break; + case '9': blockSize100k = 9; break; + case 'V': + case 'L': license(); break; + case 'v': verbosity++; break; + case 'h': usage ( progName ); + exit ( 1 ); + break; + default: fprintf ( stderr, "%s: Bad flag `%s'\n", + progName, aa->name ); + usage ( progName ); + exit ( 1 ); + break; + } + + /*-- And again ... --*/ + for (aa = argList; aa != NULL; aa = aa->link) { + if (ISFLAG("--stdout")) srcMode = SM_F2O; else + if (ISFLAG("--decompress")) opMode = OM_UNZ; else + if (ISFLAG("--compress")) opMode = OM_Z; else + if (ISFLAG("--test")) opMode = OM_TEST; else + if (ISFLAG("--keep")) keepInputFiles = True; else + if (ISFLAG("--small")) smallMode = True; else + if (ISFLAG("--version")) license(); else + if (ISFLAG("--license")) license(); else + if (ISFLAG("--repetitive-fast")) workFactor = 5; else + if (ISFLAG("--repetitive-best")) workFactor = 150; else + if (ISFLAG("--verbose")) verbosity++; else + if (ISFLAG("--help")) { usage ( progName ); exit ( 1 ); } + else + if (strncmp ( aa->name, "--", 2) == 0) { + fprintf ( stderr, "%s: Bad flag `%s'\n", progName, aa->name ); + usage ( progName ); + exit ( 1 ); + } + } + + if (opMode == OM_Z && smallMode) blockSize100k = 2; + + if (opMode == OM_Z && srcMode == SM_F2O && numFileNames > 1) { + fprintf ( stderr, "%s: I won't compress multiple files to stdout.\n", + progName ); + exit ( 1 ); + } + + if (opMode == OM_TEST && srcMode == SM_F2O) { + fprintf ( stderr, "%s: -c and -t cannot be used together.\n", + progName ); + exit ( 1 ); + } + + if (opMode != OM_Z) blockSize100k = 0; + + if (opMode == OM_Z) { + allocateCompressStructures(); + if (srcMode == SM_I2O) + compress ( NULL ); + else + for (aa = argList; aa != NULL; aa = aa->link) + if (aa->name[0] != '-') { + numFilesProcessed++; + compress ( aa->name ); + } + } else + if (opMode == OM_UNZ) { + if (srcMode == SM_I2O) + uncompress ( NULL ); + else + for (aa = argList; aa != NULL; aa = aa->link) + if (aa->name[0] != '-') { + numFilesProcessed++; + uncompress ( aa->name ); + } + } else { + testFailsExist = False; + if (srcMode == SM_I2O) + testf ( NULL ); + else + for (aa = argList; aa != NULL; aa = aa->link) + if (aa->name[0] != '-') { + numFilesProcessed++; + testf ( aa->name ); + } + if (testFailsExist) { + fprintf ( stderr, + "\n" + "You can use the `bzip2recover' program to *attempt* to recover\n" + "data from undamaged sections of corrupted files.\n\n" + ); + exit(2); + } + } + return 0; +} + + +/*-----------------------------------------------------------*/ +/*--- end bzip2.c ---*/ +/*-----------------------------------------------------------*/ -- cgit v1.2.3-55-g6feb