Add gznorm.c example, which normalizes gzip files.

author: Mark Adler <madler@alumni.caltech.edu> 2018-10-05 23:06:36 -0700
committer: Mark Adler <madler@alumni.caltech.edu> 2018-10-07 13:55:00 -0700
commit: 354fa43d123b9759e4308c81e181caa1f97187ed (patch)
tree: 1afd6fd8f57e943ac1b18e6536bc2a3e0edcadca /examples
parent: cd16ff0b3a3665273e47141de8973bf1088cdf59 (diff)
download: zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.gz
zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.bz2
zlib-354fa43d123b9759e4308c81e181caa1f97187ed.zip
2 files changed, 474 insertions, 0 deletions
diff --git a/examples/README.examples b/examples/README.examples
index 56a3171..42d9414 100644
--- a/examples/README.examples
+++ b/examples/README.examples
@@ -34,6 +34,10 @@ gzlog.h
      and deflateSetDictionary()
    - illustrates use of a gzip header extra field
+gznorm.c
+    normalize a gzip file by combining members into a single member
+    - demonstrates how to concatenate deflate streams using Z_BLOCK
 zlib_how.html
    painfully comprehensive description of zpipe.c (see below)
    - describes in excruciating detail the use of deflate() and inflate()
diff --git a/examples/gznorm.c b/examples/gznorm.c
new file mode 100644
index 0000000..68e0a0f
--- /dev/null
+++ b/examples/gznorm.c
@@ -0,0 +1,470 @@
+/* gznorm.c -- normalize a gzip stream
+ * Copyright (C) 2018 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ * Version 1.0  7 Oct 2018  Mark Adler */
+// gznorm takes a gzip stream, potentially containing multiple members, and
+// converts it to a gzip stream with a single member. In addition the gzip
+// header is normalized, removing the file name and time stamp, and setting the
+// other header contents (XFL, OS) to fixed values. gznorm does not recompress
+// the data, so it is fast, but no advantage is gained from the history that
+// could be available across member boundaries.
+#include <stdio.h>      // fread, fwrite, putc, fflush, ferror, fprintf,
+                        // vsnprintf, stdout, stderr, NULL, FILE
+#include <stdlib.h>     // malloc, free
+#include <string.h>     // strerror
+#include <errno.h>      // errno
+#include <stdarg.h>     // va_list, va_start, va_end
+#include "zlib.h"       // inflateInit2, inflate, inflateReset, inflateEnd,
+                        // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
+                        // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
+                        // Z_MEM_ERROR
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+#  include <fcntl.h>
+#  include <io.h>
+#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+#  define SET_BINARY_MODE(file)
+#endif
+#define local static
+// printf to an allocated string. fmt and the arguments that follow it are
+// interpreted as by printf(). Return the formatted string, which the caller
+// must free(), or NULL if the formatting or the allocation fails.
+local char *aprintf(const char *fmt, ...) {
+    // Get the length of the result of the printf. With a NULL buffer of size
+    // zero, vsnprintf() writes nothing and returns the length that is needed.
+    va_list args;
+    va_start(args, fmt);
+    int len = vsnprintf(NULL, 0, fmt, args);
+    va_end(args);
+    if (len < 0)
+        return NULL;
+    // Allocate the required space and printf to it. The size is computed as a
+    // size_t so that len + 1 cannot overflow an int.
+    size_t size = (size_t)len + 1;
+    char *str = malloc(size);
+    if (str == NULL)
+        return NULL;
+    // args was ended above, so it must be started again before reuse.
+    va_start(args, fmt);
+    int ret = vsnprintf(str, size, fmt, args);
+    va_end(args);
+    if (ret < 0) {
+        // The second formatting failed, so the contents of str cannot be
+        // trusted. Honor the contract above by returning NULL.
+        free(str);
+        return NULL;
+    }
+    return str;
+}
+// Return 1 from the enclosing function, first ending the inflate state in strm
+// and putting an allocated error message in *err (strm and err must be in
+// scope). inflateEnd() on an ended state, or one set to Z_NULL, is permitted.
+#define BYE(...) \
+    do { \
+        inflateEnd(&strm); \
+        *err = aprintf(__VA_ARGS__); \
+        return 1; \
+    } while (0)
+// Size in bytes of each read from the input and of the inflate output buffer.
+// gzip_normalize() keeps two such buffers on its stack. Must fit in an unsigned.
+#define CHUNK 16384
+// Read a gzip stream from in and write an equivalent normalized gzip stream to
+// out. If given no input, an empty gzip stream will be written. If successful,
+// 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
+// details of the error are returned in *err, a pointer to an allocated string
+// that the caller should free(). *err can be NULL on error if the message
+// could not be allocated.
+//
+// The input may be a stream with multiple gzip members, which is converted to
+// a single gzip member on the output. Each gzip member is decompressed at the
+// level of deflate blocks. This enables clearing the last-block bit, shifting
+// the compressed data to concatenate to the previous member's compressed data,
+// which can end at an arbitrary bit boundary, and identifying stored blocks in
+// order to resynchronize those to byte boundaries. The deflate compressed data
+// is terminated with a 10-bit empty fixed block. If any members on the input
+// end with a 10-bit empty fixed block, then that block is excised from the
+// stream. This avoids appending empty fixed blocks for every normalization,
+// and assures that gzip_normalize applied a second time will not change the
+// input. The pad bits after stored block headers and after the final deflate
+// block are all forced to zeros.
+local int gzip_normalize(FILE *in, FILE *out, char **err) {
+    // initialize the inflate engine to process a gzip member
+    z_stream strm;
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.avail_in = 0;
+    strm.next_in = Z_NULL;
+    if (inflateInit2(&strm, 15 + 16) != Z_OK)
+        BYE("out of memory");
+    // State while processing the input gzip stream.
+    enum {              // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
+        BETWEEN,        // between gzip members (must end in this state)
+        HEAD,           // reading a gzip header
+        BLOCK,          // reading deflate blocks
+        TAIL            // reading a gzip trailer
+    } state = BETWEEN;              // current component being processed
+    unsigned long crc = 0;          // accumulated CRC of uncompressed data
+    unsigned long len = 0;          // accumulated length of uncompressed data
+    unsigned long buf = 0;          // deflate stream bit buffer of num bits
+    int num = 0;                    // number of bits in buf (at bottom)
+    // Parsing state that has to survive from one chunk of input to the next,
+    // since a gzip member, a deflate block, or a trailer can straddle chunk
+    // boundaries. These are declared and initialized here, outside of the
+    // read loop, because an object declared inside the loop body has an
+    // indeterminate value each time its declaration is reached.
+    int prev = 0;                   // number of bits from previous block in
+                                    // the bit buffer, or -1 if not at the
+                                    // start of a block
+    unsigned long long memb = 0;    // uncompressed length of member
+    size_t tail = 0;                // number of trailer bytes read (0..8)
+    unsigned long part = 0;         // accumulated trailer component
+    // Write a canonical gzip header (no mod time, file name, comment, extra
+    // block, or extra flags, and OS is marked as unknown).
+    fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
+    // Process the gzip stream from in until reaching the end of the input,
+    // encountering invalid input, or experiencing an i/o error.
+    int more;                       // true if not at the end of the input
+    do {
+        // State inside this loop. put is set anew for every chunk.
+        unsigned char *put;         // next input buffer location to process
+        // Get the next chunk of input from in.
+        unsigned char dat[CHUNK];
+        strm.avail_in = fread(dat, 1, CHUNK, in);
+        if (strm.avail_in == 0)
+            break;
+        more = strm.avail_in == CHUNK;
+        strm.next_in = put = dat;
+        // Run that chunk of input through the inflate engine to exhaustion.
+        do {
+            // At this point it is assured that strm.avail_in > 0.
+            // Inflate until the end of a gzip component (header, deflate
+            // block, trailer) is reached, or until all of the chunk is
+            // consumed. The resulting decompressed data is discarded, though
+            // the total size of the decompressed data in each member is
+            // tracked, for the calculation of the total CRC.
+            do {
+                // inflate and handle any errors
+                unsigned char scrap[CHUNK];
+                strm.avail_out = CHUNK;
+                strm.next_out = scrap;
+                int ret = inflate(&strm, Z_BLOCK);
+                if (ret == Z_MEM_ERROR)
+                    BYE("out of memory");
+                if (ret == Z_DATA_ERROR)
+                    BYE("input invalid: %s", strm.msg);
+                if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
+                    BYE("internal error");
+                // Update the number of uncompressed bytes generated in this
+                // member. The actual count (not modulo 2^32) is required to
+                // correctly compute the total CRC.
+                unsigned got = CHUNK - strm.avail_out;
+                memb += got;
+                if (memb < got)
+                    BYE("overflow error");
+                // Continue to process this chunk until it is consumed, or
+                // until the end of a component (header, deflate block, or
+                // trailer) is reached.
+            } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
+            // Since strm.avail_in was > 0 for the inflate call, some input was
+            // just consumed. It is therefore assured that put < strm.next_in.
+            // Disposition the consumed component or part of a component.
+            switch (state) {
+                case BETWEEN:
+                    state = HEAD;
+                    // Fall through to HEAD when some or all of the header is
+                    // processed.
+                case HEAD:
+                    // Discard the header.
+                    if (strm.data_type & 0x80) {
+                        // End of header reached -- deflate blocks follow.
+                        put = strm.next_in;
+                        prev = num;
+                        memb = 0;
+                        state = BLOCK;
+                    }
+                    break;
+                case BLOCK:
+                    // Copy the deflate stream to the output, but with the
+                    // last-block-bit cleared. Re-synchronize stored block
+                    // headers to the output byte boundaries. The bytes at
+                    // put..strm.next_in-1 are the compressed data that has
+                    // been processed and is ready to be copied to the output.
+                    // At this point, it is assured that new compressed data is
+                    // available, i.e., put < strm.next_in. If prev is -1, then
+                    // that compressed data starts in the middle of a deflate
+                    // block. If prev is not -1, then the bits in the bit
+                    // buffer, possibly combined with the bits in *put, contain
+                    // the three-bit header of the new deflate block. In that
+                    // case, prev is the number of bits from the previous block
+                    // that remain in the bit buffer. Since num is the number
+                    // of bits in the bit buffer, we have that num - prev is
+                    // the number of bits from the new block currently in the
+                    // bit buffer.
+                    // If strm.data_type & 0xc0 is 0x80, then the last byte of
+                    // the available compressed data includes the last bits of
+                    // the end of a deflate block. In that case, that last byte
+                    // also has strm.data_type & 0x1f bits of the next deflate
+                    // block, in the range 0..7. If strm.data_type & 0xc0 is
+                    // 0xc0, then the last byte of the compressed data is the
+                    // end of the deflate stream, followed by strm.data_type &
+                    // 0x1f pad bits, also in the range 0..7.
+                    // Set bits to the number of bits not yet consumed from the
+                    // last byte. If we are at the end of the block, bits is
+                    // either the number of bits in the last byte belonging to
+                    // the next block, or the number of pad bits after the
+                    // final block. In either of those cases, bits is in the
+                    // range 0..7.
+                    ;                   // (required due to C syntax oddity)
+                    int bits = strm.data_type & 0x1f;
+                    if (prev != -1) {
+                        // We are at the start of a new block. Clear the last
+                        // block bit, and check for special cases. If it is a
+                        // stored block, then emit the header and pad to the
+                        // next byte boundary. If it is a final, empty fixed
+                        // block, then excise it.
+                        // Some or all of the three header bits for this block
+                        // may already be in the bit buffer. Load any remaining
+                        // header bits into the bit buffer.
+                        if (num - prev < 3) {
+                            buf += (unsigned long)*put++ << num;
+                            num += 8;
+                        }
+                        // Set last to have a 1 in the position of the last
+                        // block bit in the bit buffer.
+                        unsigned long last = (unsigned long)1 << prev;
+                        if (((buf >> prev) & 7) == 3) {
+                            // This is a final fixed block. Load at least ten
+                            // bits from this block, including the header, into
+                            // the bit buffer. We already have at least three,
+                            // so at most one more byte needs to be loaded.
+                            if (num - prev < 10) {
+                                if (put == strm.next_in)
+                                    // Need to go get and process more input.
+                                    // We'll end up back here to finish this.
+                                    break;
+                                buf += (unsigned long)*put++ << num;
+                                num += 8;
+                            }
+                            if (((buf >> prev) & 0x3ff) == 3) {
+                                // That final fixed block is empty. Delete it
+                                // to avoid adding an empty block every time a
+                                // gzip stream is normalized.
+                                num = prev;
+                                buf &= last - 1;    // zero the pad bits
+                            }
+                        }
+                        else if (((buf >> prev) & 6) == 0) {
+                            // This is a stored block. Flush to the next
+                            // byte boundary after the three-bit header.
+                            num = (prev + 10) & ~7;
+                            buf &= last - 1;        // zero the pad bits
+                        }
+                        // Clear the last block bit.
+                        buf &= ~last;
+                        // Write out complete bytes in the bit buffer.
+                        while (num >= 8) {
+                            putc(buf, out);
+                            buf >>= 8;
+                            num -= 8;
+                        }
+                        // If no more bytes left to process, then we have
+                        // consumed the byte that had bits from the next block.
+                        if (put == strm.next_in)
+                            bits = 0;
+                    }
+                    // We are done handling the deflate block header. Now copy
+                    // all or almost all of the remaining compressed data that
+                    // has been processed so far. Don't copy one byte at the
+                    // end if it contains bits from the next deflate block or
+                    // pad bits at the end of a deflate block.
+                    // mix is 1 if we are at the end of a deflate block, and if
+                    // some of the bits in the last byte follow this block. mix
+                    // is 0 if we are in the middle of a deflate block, if the
+                    // deflate block ended on a byte boundary, or if all of the
+                    // compressed data processed so far has been consumed.
+                    int mix = (strm.data_type & 0x80) && bits;
+                    // Copy all of the processed compressed data to the output,
+                    // except for the last byte if it contains bits from the
+                    // next deflate block or pad bits at the end of the deflate
+                    // stream. Copy the data after shifting in num bits from
+                    // buf in front of it, leaving num bits from the end of the
+                    // compressed data in buf when done.
+                    unsigned char *end = strm.next_in - mix;
+                    if (put < end) {
+                        if (num)
+                            // Insert num bits from buf before the data being
+                            // copied.
+                            do {
+                                buf += (unsigned)(*put++) << num;
+                                putc(buf, out);
+                                buf >>= 8;
+                            } while (put < end);
+                        else {
+                            // No shifting needed -- write directly.
+                            fwrite(put, 1, end - put, out);
+                            put = end;
+                        }
+                    }
+                    // Process the last processed byte if it wasn't written.
+                    if (mix) {
+                        // Load the last byte into the bit buffer.
+                        buf += (unsigned)(*put++) << num;
+                        num += 8;
+                        if (strm.data_type & 0x40) {
+                            // We are at the end of the deflate stream and
+                            // there are bits pad bits. Discard the pad bits
+                            // and write a byte to the output, if available.
+                            // Leave the num bits left over in buf to prepend
+                            // to the next deflate stream.
+                            num -= bits;
+                            if (num >= 8) {
+                                putc(buf, out);
+                                num -= 8;
+                                buf >>= 8;
+                            }
+                            // Force the pad bits in the bit buffer to zeros.
+                            buf &= ((unsigned long)1 << num) - 1;
+                            // Don't need to set prev here since going to TAIL.
+                        }
+                        else
+                            // At the end of an internal deflate block. Leave
+                            // the last byte in the bit buffer to examine on
+                            // the next entry to BLOCK, when more bits from the
+                            // next block will be available.
+                            prev = num - bits;      // number of bits in buffer
+                                                    // from current block
+                    }
+                    // Don't have a byte left over, so we are in the middle of
+                    // a deflate block, or the deflate block ended on a byte
+                    // boundary. Set prev appropriately for the next entry into
+                    // BLOCK.
+                    else if (strm.data_type & 0x80)
+                        // The block ended on a byte boundary, so no header
+                        // bits are in the bit buffer.
+                        prev = num;
+                    else
+                        // In the middle of a deflate block, so no header here.
+                        prev = -1;
+                    // Check for the end of the deflate stream.
+                    if ((strm.data_type & 0xc0) == 0xc0) {
+                        // That ends the deflate stream on the input side, the
+                        // pad bits were discarded, and any remaining bits from
+                        // the last block in the stream are saved in the bit
+                        // buffer to prepend to the next stream. Process the
+                        // gzip trailer next.
+                        tail = 0;
+                        part = 0;
+                        state = TAIL;
+                    }
+                    break;
+                case TAIL:
+                    // Accumulate available trailer bytes to update the total
+                    // CRC and the total uncompressed length.
+                    do {
+                        part = (part >> 8) + ((unsigned long)(*put++) << 24);
+                        tail++;
+                        if (tail == 4) {
+                            // Update the total CRC.
+                            z_off_t len2 = memb;
+                            if (len2 < 0 || (unsigned long long)len2 != memb)
+                                BYE("overflow error");
+                            crc = crc ? crc32_combine(crc, part, len2) : part;
+                            part = 0;
+                        }
+                        else if (tail == 8) {
+                            // Update the total uncompressed length. (It's ok
+                            // if this sum is done modulo 2^32.)
+                            len += part;
+                            // At the end of a member. Set up to inflate an
+                            // immediately following gzip member. (If we made
+                            // it this far, then the trailer was valid.)
+                            if (inflateReset(&strm) != Z_OK)
+                                BYE("internal error");
+                            state = BETWEEN;
+                            break;
+                        }
+                    } while (put < strm.next_in);
+                    break;
+            }
+            // Process the input buffer until completely consumed.
+        } while (strm.avail_in > 0);
+        // Process input until end of file, invalid input, or i/o error.
+    } while (more);
+    // Done with the inflate engine.
+    inflateEnd(&strm);
+    // Verify the validity of the input.
+    if (state != BETWEEN)
+        BYE("input invalid: incomplete gzip stream");
+    // Write the remaining deflate stream bits, followed by a terminating
+    // deflate fixed block.
+    buf += (unsigned long)3 << num;
+    putc(buf, out);
+    putc(buf >> 8, out);
+    if (num > 6)
+        putc(0, out);
+    // Write the gzip trailer, which is the CRC and the uncompressed length
+    // modulo 2^32, both in little-endian order.
+    putc(crc, out);
+    putc(crc >> 8, out);
+    putc(crc >> 16, out);
+    putc(crc >> 24, out);
+    putc(len, out);
+    putc(len >> 8, out);
+    putc(len >> 16, out);
+    putc(len >> 24, out);
+    fflush(out);
+    // Check for any i/o errors.
+    if (ferror(in) || ferror(out))
+        BYE("i/o error: %s", strerror(errno));
+    // All good!
+    *err = NULL;
+    return 0;
+}
+// Normalize the gzip stream on stdin, writing the result to stdout. The exit
+// status is 0 on success, or 1 on error, in which case a message describing
+// the error is written to stderr.
+int main(void) {
+    // Avoid end-of-line conversions on evil operating systems.
+    SET_BINARY_MODE(stdin);
+    SET_BINARY_MODE(stdout);
+    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
+    char *err = NULL;
+    int ret = gzip_normalize(stdin, stdout, &err);
+    if (ret)
+        // err is NULL here if the error message could not be formatted or
+        // allocated. Passing NULL for %s is undefined behavior, so substitute
+        // a fixed string in that case.
+        fprintf(stderr, "gznorm error: %s\n",
+                err != NULL ? err : "(details unavailable)");
+    // free(NULL) is a no-op, so this is safe on success as well.
+    free(err);
+    return ret;
+}
author	Mark Adler <madler@alumni.caltech.edu>	2018-10-05 23:06:36 -0700
committer	Mark Adler <madler@alumni.caltech.edu>	2018-10-07 13:55:00 -0700
commit	354fa43d123b9759e4308c81e181caa1f97187ed (patch)
tree	1afd6fd8f57e943ac1b18e6536bc2a3e0edcadca /examples
parent	cd16ff0b3a3665273e47141de8973bf1088cdf59 (diff)
download	zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.gz zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.bz2 zlib-354fa43d123b9759e4308c81e181caa1f97187ed.zip

diff --git a/examples/README.examples b/examples/README.examples index 56a3171..42d9414 100644 --- a/examples/README.examples +++ b/examples/README.examples
@@ -34,6 +34,10 @@ gzlog.h
34	and deflateSetDictionary()	34	and deflateSetDictionary()
35	- illustrates use of a gzip header extra field	35	- illustrates use of a gzip header extra field
36		36
		37	gznorm.c
		38	normalize a gzip file by combining members into a single member
		39	- demonstrates how to concatenate deflate streams using Z_BLOCK
		40
37	zlib_how.html	41	zlib_how.html
38	painfully comprehensive description of zpipe.c (see below)	42	painfully comprehensive description of zpipe.c (see below)
39	- describes in excruciating detail the use of deflate() and inflate()	43	- describes in excruciating detail the use of deflate() and inflate()


diff --git a/examples/gznorm.c b/examples/gznorm.c new file mode 100644 index 0000000..68e0a0f --- /dev/null +++ b/examples/gznorm.c
@@ -0,0 +1,470 @@
		1	/* gznorm.c -- normalize a gzip stream
		2	* Copyright (C) 2018 Mark Adler
		3	* For conditions of distribution and use, see copyright notice in zlib.h
		4	* Version 1.0 7 Oct 2018 Mark Adler */
		5
		6	// gznorm takes a gzip stream, potentially containing multiple members, and
		7	// converts it to a gzip stream with a single member. In addition the gzip
		8	// header is normalized, removing the file name and time stamp, and setting the
		9	// other header contents (XFL, OS) to fixed values. gznorm does not recompress
		10	// the data, so it is fast, but no advantage is gained from the history that
		11	// could be available across member boundaries.
		12
		13	#include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
		14	// vsnprintf, stdout, stderr, NULL, FILE
		15	#include <stdlib.h> // malloc, free
		16	#include <string.h> // strerror
		17	#include <errno.h> // errno
		18	#include <stdarg.h> // va_list, va_start, va_end
		19	#include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
		20	// z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
		21	// Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
		22	// Z_MEM_ERROR
		23
		24	#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
		25	# include <fcntl.h>
		26	# include <io.h>
		27	# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
		28	#else
		29	# define SET_BINARY_MODE(file)
		30	#endif
		31
		32	#define local static
		33
		34	// printf to an allocated string. Return the string, or NULL if the printf or
		35	// allocation fails.
		36	local char *aprintf(char *fmt, ...) {
		37	// Get the length of the result of the printf.
		38	va_list args;
		39	va_start(args, fmt);
		40	int len = vsnprintf(NULL, 0, fmt, args);
		41	va_end(args);
		42	if (len < 0)
		43	return NULL;
		44
		45	// Allocate the required space and printf to it.
		46	char *str = malloc(len + 1);
		47	if (str == NULL)
		48	return NULL;
		49	va_start(args, fmt);
		50	vsnprintf(str, len + 1, fmt, args);
		51	va_end(args);
		52	return str;
		53	}
		54
		55	// Return with an error, putting an allocated error message in *err. Doing an
		56	// inflateEnd() on an already ended state, or one with state set to Z_NULL, is
		57	// permitted.
		58	#define BYE(...) \
		59	do { \
		60	inflateEnd(&strm); \
		61	*err = aprintf(__VA_ARGS__); \
		62	return 1; \
		63	} while (0)
		64
		65	// Chunk size for buffered reads and for decompression. Twice this many bytes
		66	// will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
		67	#define CHUNK 16384
		68
		69	// Read a gzip stream from in and write an equivalent normalized gzip stream to
		70	// out. If given no input, an empty gzip stream will be written. If successful,
		71	// 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
		72	// details of the error are returned in *err, a pointer to an allocated string.
		73	//
		74	// The input may be a stream with multiple gzip members, which is converted to
		75	// a single gzip member on the output. Each gzip member is decompressed at the
		76	// level of deflate blocks. This enables clearing the last-block bit, shifting
		77	// the compressed data to concatenate to the previous member's compressed data,
		78	// which can end at an arbitrary bit boundary, and identifying stored blocks in
		79	// order to resynchronize those to byte boundaries. The deflate compressed data
		80	// is terminated with a 10-bit empty fixed block. If any members on the input
		81	// end with a 10-bit empty fixed block, then that block is excised from the
		82	// stream. This avoids appending empty fixed blocks for every normalization,
		83	// and assures that gzip_normalize applied a second time will not change the
		84	// input. The pad bits after stored block headers and after the final deflate
		85	// block are all forced to zeros.
		86	local int gzip_normalize(FILE *in, FILE *out, char **err) {
		87	// initialize the inflate engine to process a gzip member
		88	z_stream strm;
		89	strm.zalloc = Z_NULL;
		90	strm.zfree = Z_NULL;
		91	strm.opaque = Z_NULL;
		92	strm.avail_in = 0;
		93	strm.next_in = Z_NULL;
		94	if (inflateInit2(&strm, 15 + 16) != Z_OK)
		95	BYE("out of memory");
		96
		97	// State while processing the input gzip stream.
		98	enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
		99	BETWEEN, // between gzip members (must end in this state)
		100	HEAD, // reading a gzip header
		101	BLOCK, // reading deflate blocks
		102	TAIL // reading a gzip trailer
		103	} state = BETWEEN; // current component being processed
		104	unsigned long crc = 0; // accumulated CRC of uncompressed data
		105	unsigned long len = 0; // accumulated length of uncompressed data
		106	unsigned long buf = 0; // deflate stream bit buffer of num bits
		107	int num = 0; // number of bits in buf (at bottom)
		108
		109	// Write a canonical gzip header (no mod time, file name, comment, extra
		110	// block, or extra flags, and OS is marked as unknown).
		111	fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
		112
		113	// Process the gzip stream from in until reaching the end of the input,
		114	// encountering invalid input, or experiencing an i/o error.
		115	int more; // true if not at the end of the input
		116	do {
		117	// State inside this loop.
		118	unsigned char *put; // next input buffer location to process
		119	int prev; // number of bits from previous block in
		120	// the bit buffer, or -1 if not at the
		121	// start of a block
		122	unsigned long long memb; // uncompressed length of member
		123	size_t tail; // number of trailer bytes read (0..8)
		124	unsigned long part; // accumulated trailer component
		125
		126	// Get the next chunk of input from in.
		127	unsigned char dat[CHUNK];
		128	strm.avail_in = fread(dat, 1, CHUNK, in);
		129	if (strm.avail_in == 0)
		130	break;
		131	more = strm.avail_in == CHUNK;
		132	strm.next_in = put = dat;
		133
		134	// Run that chunk of input through the inflate engine to exhaustion.
		135	do {
		136	// At this point it is assured that strm.avail_in > 0.
		137
		138	// Inflate until the end of a gzip component (header, deflate
		139	// block, trailer) is reached, or until all of the chunk is
		140	// consumed. The resulting decompressed data is discarded, though
		141	// the total size of the decompressed data in each member is
		142	// tracked, for the calculation of the total CRC.
		143	do {
		144	// inflate and handle any errors
		145	unsigned char scrap[CHUNK];
		146	strm.avail_out = CHUNK;
		147	strm.next_out = scrap;
		148	int ret = inflate(&strm, Z_BLOCK);
		149	if (ret == Z_MEM_ERROR)
		150	BYE("out of memory");
		151	if (ret == Z_DATA_ERROR)
		152	BYE("input invalid: %s", strm.msg);
		153	if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
		154	BYE("internal error");
		155
		156	// Update the number of uncompressed bytes generated in this
		157	// member. The actual count (not modulo 2^32) is required to
		158	// correctly compute the total CRC.
		159	unsigned got = CHUNK - strm.avail_out;
		160	memb += got;
		161	if (memb < got)
		162	BYE("overflow error");
		163
		164	// Continue to process this chunk until it is consumed, or
		165	// until the end of a component (header, deflate block, or
		166	// trailer) is reached.
		167	} while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
		168
		169	// Since strm.avail_in was > 0 for the inflate call, some input was
		170	// just consumed. It is therefore assured that put < strm.next_in.
		171
		172	// Disposition the consumed component or part of a component.
		173	switch (state) {
		174	case BETWEEN:
		175	state = HEAD;
		176	// Fall through to HEAD when some or all of the header is
		177	// processed.
		178
		179	case HEAD:
		180	// Discard the header.
		181	if (strm.data_type & 0x80) {
		182	// End of header reached -- deflate blocks follow.
		183	put = strm.next_in;
		184	prev = num;
		185	memb = 0;
		186	state = BLOCK;
		187	}
		188	break;
		189
		190	case BLOCK:
		191	// Copy the deflate stream to the output, but with the
		192	// last-block-bit cleared. Re-synchronize stored block
		193	// headers to the output byte boundaries. The bytes at
		194	// put..strm.next_in-1 is the compressed data that has been
		195	// processed and is ready to be copied to the output.
		196
		197	// At this point, it is assured that new compressed data is
		198	// available, i.e., put < strm.next_in. If prev is -1, then
		199	// that compressed data starts in the middle of a deflate
		200	// block. If prev is not -1, then the bits in the bit
		201	// buffer, possibly combined with the bits in *put, contain
		202	// the three-bit header of the new deflate block. In that
		203	// case, prev is the number of bits from the previous block
		204	// that remain in the bit buffer. Since num is the number
		205	// of bits in the bit buffer, we have that num - prev is
		206	// the number of bits from the new block currently in the
		207	// bit buffer.
		208
		209	// If strm.data_type & 0xc0 is 0x80, then the last byte of
		210	// the available compressed data includes the last bits of
		211	// the end of a deflate block. In that case, that last byte
		212	// also has strm.data_type & 0x1f bits of the next deflate
		213	// block, in the range 0..7. If strm.data_type & 0xc0 is
		214	// 0xc0, then the last byte of the compressed data is the
		215	// end of the deflate stream, followed by strm.data_type &
		216	// 0x1f pad bits, also in the range 0..7.
		217
		218	// Set bits to the number of bits not yet consumed from the
		219	// last byte. If we are at the end of the block, bits is
		220	// either the number of bits in the last byte belonging to
		221	// the next block, or the number of pad bits after the
		222	// final block. In either of those cases, bits is in the
		223	// range 0..7.
		224	; // (required due to C syntax oddity)
		225	int bits = strm.data_type & 0x1f;
		226
		227	if (prev != -1) {
		228	// We are at the start of a new block. Clear the last
		229	// block bit, and check for special cases. If it is a
		230	// stored block, then emit the header and pad to the
		231	// next byte boundary. If it is a final, empty fixed
		232	// block, then excise it.
		233
		234	// Some or all of the three header bits for this block
		235	// may already be in the bit buffer. Load any remaining
		236	// header bits into the bit buffer.
		237	if (num - prev < 3) {
		238	buf += (unsigned long)*put++ << num;
		239	num += 8;
		240	}
		241
		242	// Set last to have a 1 in the position of the last
		243	// block bit in the bit buffer.
		244	unsigned long last = (unsigned long)1 << prev;
		245
		246	if (((buf >> prev) & 7) == 3) {
		247	// This is a final fixed block. Load at least ten
		248	// bits from this block, including the header, into
		249	// the bit buffer. We already have at least three,
		250	// so at most one more byte needs to be loaded.
		251	if (num - prev < 10) {
		252	if (put == strm.next_in)
		253	// Need to go get and process more input.
		254	// We'll end up back here to finish this.
		255	break;
		256	buf += (unsigned long)*put++ << num;
		257	num += 8;
		258	}
		259	if (((buf >> prev) & 0x3ff) == 3) {
		260	// That final fixed block is empty. Delete it
		261	// to avoid adding an empty block every time a
		262	// gzip stream is normalized.
		263	num = prev;
		264	buf &= last - 1; // zero the pad bits
		265	}
		266	}
		267	else if (((buf >> prev) & 6) == 0) {
		268	// This is a stored block. Flush to the next
		269	// byte boundary after the three-bit header.
		270	num = (prev + 10) & ~7;
		271	buf &= last - 1; // zero the pad bits
		272	}
		273
		274	// Clear the last block bit.
		275	buf &= ~last;
		276
		277	// Write out complete bytes in the bit buffer.
		278	while (num >= 8) {
		279	putc(buf, out);
		280	buf >>= 8;
		281	num -= 8;
		282	}
		283
		284	// If no more bytes left to process, then we have
		285	// consumed the byte that had bits from the next block.
		286	if (put == strm.next_in)
		287	bits = 0;
		288	}
		289
		290	// We are done handling the deflate block header. Now copy
		291	// all or almost all of the remaining compressed data that
		292	// has been processed so far. Don't copy one byte at the
		293	// end if it contains bits from the next deflate block or
		294	// pad bits at the end of a deflate block.
		295
		296	// mix is 1 if we are at the end of a deflate block, and if
		297	// some of the bits in the last byte follow this block. mix
		298	// is 0 if we are in the middle of a deflate block, if the
		299	// deflate block ended on a byte boundary, or if all of the
		300	// compressed data processed so far has been consumed.
		301	int mix = (strm.data_type & 0x80) && bits;
		302
		303	// Copy all of the processed compressed data to the output,
		304	// except for the last byte if it contains bits from the
		305	// next deflate block or pad bits at the end of the deflate
		306	// stream. Copy the data after shifting in num bits from
		307	// buf in front of it, leaving num bits from the end of the
		308	// compressed data in buf when done.
		309	unsigned char *end = strm.next_in - mix;
		310	if (put < end) {
		311	if (num)
		312	// Insert num bits from buf before the data being
		313	// copied.
		314	do {
		315	buf += (unsigned)(*put++) << num;
		316	putc(buf, out);
		317	buf >>= 8;
		318	} while (put < end);
		319	else {
		320	// No shifting needed -- write directly.
		321	fwrite(put, 1, end - put, out);
		322	put = end;
		323	}
		324	}
		325
		326	// Process the last processed byte if it wasn't written.
		327	if (mix) {
		328	// Load the last byte into the bit buffer.
		329	buf += (unsigned)(*put++) << num;
		330	num += 8;
		331
		332	if (strm.data_type & 0x40) {
		333	// We are at the end of the deflate stream and
		334	// there are bits pad bits. Discard the pad bits
		335	// and write a byte to the output, if available.
		336	// Leave the num bits left over in buf to prepend
		337	// to the next deflate stream.
		338	num -= bits;
		339	if (num >= 8) {
		340	putc(buf, out);
		341	num -= 8;
		342	buf >>= 8;
		343	}
		344
		345	// Force the pad bits in the bit buffer to zeros.
		346	buf &= ((unsigned long)1 << num) - 1;
		347
		348	// Don't need to set prev here since going to TAIL.
		349	}
		350	else
		351	// At the end of an internal deflate block. Leave
		352	// the last byte in the bit buffer to examine on
		353	// the next entry to BLOCK, when more bits from the
		354	// next block will be available.
		355	prev = num - bits; // number of bits in buffer
		356	// from current block
		357	}
		358
		359	// Don't have a byte left over, so we are in the middle of
		360	// a deflate block, or the deflate block ended on a byte
		361	// boundary. Set prev appropriately for the next entry into
		362	// BLOCK.
		363	else if (strm.data_type & 0x80)
		364	// The block ended on a byte boundary, so no header
		365	// bits are in the bit buffer.
		366	prev = num;
		367	else
		368	// In the middle of a deflate block, so no header here.
		369	prev = -1;
		370
		371	// Check for the end of the deflate stream.
		372	if ((strm.data_type & 0xc0) == 0xc0) {
		373	// That ends the deflate stream on the input side, the
		374	// pad bits were discarded, and any remaining bits from
		375	// the last block in the stream are saved in the bit
		376	// buffer to prepend to the next stream. Process the
		377	// gzip trailer next.
		378	tail = 0;
		379	part = 0;
		380	state = TAIL;
		381	}
		382	break;
		383
		384	case TAIL:
		385	// Accumulate available trailer bytes to update the total
		386	// CRC and the total uncompressed length.
		387	do {
		388	part = (part >> 8) + ((unsigned long)(*put++) << 24);
		389	tail++;
		390	if (tail == 4) {
		391	// Update the total CRC.
		392	z_off_t len2 = memb;
		393	if (len2 < 0 \|\| (unsigned long long)len2 != memb)
		394	BYE("overflow error");
		395	crc = crc ? crc32_combine(crc, part, len2) : part;
		396	part = 0;
		397	}
		398	else if (tail == 8) {
		399	// Update the total uncompressed length. (It's ok
		400	// if this sum is done modulo 2^32.)
		401	len += part;
		402
		403	// At the end of a member. Set up to inflate an
		404	// immediately following gzip member. (If we made
		405	// it this far, then the trailer was valid.)
		406	if (inflateReset(&strm) != Z_OK)
		407	BYE("internal error");
		408	state = BETWEEN;
		409	break;
		410	}
		411	} while (put < strm.next_in);
		412	break;
		413	}
		414
		415	// Process the input buffer until completely consumed.
		416	} while (strm.avail_in > 0);
		417
		418	// Process input until end of file, invalid input, or i/o error.
		419	} while (more);
		420
		421	// Done with the inflate engine.
		422	inflateEnd(&strm);
		423
		424	// Verify the validity of the input.
		425	if (state != BETWEEN)
		426	BYE("input invalid: incomplete gzip stream");
		427
		428	// Write the remaining deflate stream bits, followed by a terminating
		429	// deflate fixed block.
		430	buf += (unsigned long)3 << num;
		431	putc(buf, out);
		432	putc(buf >> 8, out);
		433	if (num > 6)
		434	putc(0, out);
		435
		436	// Write the gzip trailer, which is the CRC and the uncompressed length
		437	// modulo 2^32, both in little-endian order.
		438	putc(crc, out);
		439	putc(crc >> 8, out);
		440	putc(crc >> 16, out);
		441	putc(crc >> 24, out);
		442	putc(len, out);
		443	putc(len >> 8, out);
		444	putc(len >> 16, out);
		445	putc(len >> 24, out);
		446	fflush(out);
		447
		448	// Check for any i/o errors.
		449	if (ferror(in) \|\| ferror(out))
		450	BYE("i/o error: %s", strerror(errno));
		451
		452	// All good!
		453	*err = NULL;
		454	return 0;
		455	}
		456
		// Program entry point: read a gzip stream from stdin and write its normalized
		// form to stdout. The exit status is whatever gzip_normalize() returns (0 when
		// it succeeds, non-zero when it reports an error).
		int main(void) {
		    // Switch both standard streams to binary mode so that no end-of-line
		    // translation is applied to the compressed data.
		    SET_BINARY_MODE(stdin);
		    SET_BINARY_MODE(stdout);

		    // Do the work. On success gzip_normalize() stores NULL in msg; on failure
		    // it is expected to store a message there (presumably heap-allocated,
		    // since it is handed to free() below -- confirm against the BYE macro).
		    char *msg;
		    int status = gzip_normalize(stdin, stdout, &msg);
		    if (status != 0) {
		        fprintf(stderr, "gznorm error: %s\n", msg);
		    }

		    // Release the message, if any. free(NULL) is a harmless no-op.
		    free(msg);
		    return status;
		}