diff options
author | Mark Adler <madler@alumni.caltech.edu> | 2018-10-05 23:06:36 -0700 |
---|---|---|
committer | Mark Adler <madler@alumni.caltech.edu> | 2018-10-07 13:55:00 -0700 |
commit | 354fa43d123b9759e4308c81e181caa1f97187ed (patch) | |
tree | 1afd6fd8f57e943ac1b18e6536bc2a3e0edcadca /examples | |
parent | cd16ff0b3a3665273e47141de8973bf1088cdf59 (diff) | |
download | zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.gz zlib-354fa43d123b9759e4308c81e181caa1f97187ed.tar.bz2 zlib-354fa43d123b9759e4308c81e181caa1f97187ed.zip |
Add gznorm.c example, which normalizes gzip files.
Diffstat (limited to 'examples')
-rw-r--r-- | examples/README.examples | 4 | ||||
-rw-r--r-- | examples/gznorm.c | 470 |
2 files changed, 474 insertions, 0 deletions
diff --git a/examples/README.examples b/examples/README.examples index 56a3171..42d9414 100644 --- a/examples/README.examples +++ b/examples/README.examples | |||
@@ -34,6 +34,10 @@ gzlog.h | |||
34 | and deflateSetDictionary() | 34 | and deflateSetDictionary() |
35 | - illustrates use of a gzip header extra field | 35 | - illustrates use of a gzip header extra field |
36 | 36 | ||
37 | gznorm.c | ||
38 | normalize a gzip file by combining members into a single member | ||
39 | - demonstrates how to concatenate deflate streams using Z_BLOCK | ||
40 | |||
37 | zlib_how.html | 41 | zlib_how.html |
38 | painfully comprehensive description of zpipe.c (see below) | 42 | painfully comprehensive description of zpipe.c (see below) |
39 | - describes in excruciating detail the use of deflate() and inflate() | 43 | - describes in excruciating detail the use of deflate() and inflate() |
diff --git a/examples/gznorm.c b/examples/gznorm.c new file mode 100644 index 0000000..68e0a0f --- /dev/null +++ b/examples/gznorm.c | |||
@@ -0,0 +1,470 @@ | |||
1 | /* gznorm.c -- normalize a gzip stream | ||
2 | * Copyright (C) 2018 Mark Adler | ||
3 | * For conditions of distribution and use, see copyright notice in zlib.h | ||
4 | * Version 1.0 7 Oct 2018 Mark Adler */ | ||
5 | |||
6 | // gznorm takes a gzip stream, potentially containing multiple members, and | ||
7 | // converts it to a gzip stream with a single member. In addition the gzip | ||
8 | // header is normalized, removing the file name and time stamp, and setting the | ||
9 | // other header contents (XFL, OS) to fixed values. gznorm does not recompress | ||
10 | // the data, so it is fast, but no advantage is gained from the history that | ||
11 | // could be available across member boundaries. | ||
12 | |||
13 | #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf, | ||
14 | // vsnprintf, stdout, stderr, NULL, FILE | ||
15 | #include <stdlib.h> // malloc, free | ||
16 | #include <string.h> // strerror | ||
17 | #include <errno.h> // errno | ||
18 | #include <stdarg.h> // va_list, va_start, va_end | ||
19 | #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd, | ||
20 | // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK, | ||
21 | // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR, | ||
22 | // Z_MEM_ERROR | ||
23 | |||
24 | #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) | ||
25 | # include <fcntl.h> | ||
26 | # include <io.h> | ||
27 | # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) | ||
28 | #else | ||
29 | # define SET_BINARY_MODE(file) | ||
30 | #endif | ||
31 | |||
32 | #define local static | ||
33 | |||
34 | // printf to an allocated string. Return the string, or NULL if the printf or | ||
35 | // allocation fails. | ||
36 | local char *aprintf(char *fmt, ...) { | ||
37 | // Get the length of the result of the printf. | ||
38 | va_list args; | ||
39 | va_start(args, fmt); | ||
40 | int len = vsnprintf(NULL, 0, fmt, args); | ||
41 | va_end(args); | ||
42 | if (len < 0) | ||
43 | return NULL; | ||
44 | |||
45 | // Allocate the required space and printf to it. | ||
46 | char *str = malloc(len + 1); | ||
47 | if (str == NULL) | ||
48 | return NULL; | ||
49 | va_start(args, fmt); | ||
50 | vsnprintf(str, len + 1, fmt, args); | ||
51 | va_end(args); | ||
52 | return str; | ||
53 | } | ||
54 | |||
55 | // Return with an error, putting an allocated error message in *err. Doing an | ||
56 | // inflateEnd() on an already ended state, or one with state set to Z_NULL, is | ||
57 | // permitted. | ||
58 | #define BYE(...) \ | ||
59 | do { \ | ||
60 | inflateEnd(&strm); \ | ||
61 | *err = aprintf(__VA_ARGS__); \ | ||
62 | return 1; \ | ||
63 | } while (0) | ||
64 | |||
65 | // Chunk size for buffered reads and for decompression. Twice this many bytes | ||
66 | // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned. | ||
67 | #define CHUNK 16384 | ||
68 | |||
69 | // Read a gzip stream from in and write an equivalent normalized gzip stream to | ||
70 | // out. If given no input, an empty gzip stream will be written. If successful, | ||
71 | // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the | ||
72 | // details of the error are returned in *err, a pointer to an allocated string. | ||
73 | // | ||
74 | // The input may be a stream with multiple gzip members, which is converted to | ||
75 | // a single gzip member on the output. Each gzip member is decompressed at the | ||
76 | // level of deflate blocks. This enables clearing the last-block bit, shifting | ||
77 | // the compressed data to concatenate to the previous member's compressed data, | ||
78 | // which can end at an arbitrary bit boundary, and identifying stored blocks in | ||
79 | // order to resynchronize those to byte boundaries. The deflate compressed data | ||
80 | // is terminated with a 10-bit empty fixed block. If any members on the input | ||
81 | // end with a 10-bit empty fixed block, then that block is excised from the | ||
82 | // stream. This avoids appending empty fixed blocks for every normalization, | ||
83 | // and assures that gzip_normalize applied a second time will not change the | ||
84 | // input. The pad bits after stored block headers and after the final deflate | ||
85 | // block are all forced to zeros. | ||
86 | local int gzip_normalize(FILE *in, FILE *out, char **err) { | ||
87 | // initialize the inflate engine to process a gzip member | ||
88 | z_stream strm; | ||
89 | strm.zalloc = Z_NULL; | ||
90 | strm.zfree = Z_NULL; | ||
91 | strm.opaque = Z_NULL; | ||
92 | strm.avail_in = 0; | ||
93 | strm.next_in = Z_NULL; | ||
94 | if (inflateInit2(&strm, 15 + 16) != Z_OK) | ||
95 | BYE("out of memory"); | ||
96 | |||
97 | // State while processing the input gzip stream. | ||
98 | enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ... | ||
99 | BETWEEN, // between gzip members (must end in this state) | ||
100 | HEAD, // reading a gzip header | ||
101 | BLOCK, // reading deflate blocks | ||
102 | TAIL // reading a gzip trailer | ||
103 | } state = BETWEEN; // current component being processed | ||
104 | unsigned long crc = 0; // accumulated CRC of uncompressed data | ||
105 | unsigned long len = 0; // accumulated length of uncompressed data | ||
106 | unsigned long buf = 0; // deflate stream bit buffer of num bits | ||
107 | int num = 0; // number of bits in buf (at bottom) | ||
108 | |||
109 | // Write a canonical gzip header (no mod time, file name, comment, extra | ||
110 | // block, or extra flags, and OS is marked as unknown). | ||
111 | fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); | ||
112 | |||
113 | // Process the gzip stream from in until reaching the end of the input, | ||
114 | // encountering invalid input, or experiencing an i/o error. | ||
115 | int more; // true if not at the end of the input | ||
116 | do { | ||
117 | // State inside this loop. | ||
118 | unsigned char *put; // next input buffer location to process | ||
119 | int prev; // number of bits from previous block in | ||
120 | // the bit buffer, or -1 if not at the | ||
121 | // start of a block | ||
122 | unsigned long long memb; // uncompressed length of member | ||
123 | size_t tail; // number of trailer bytes read (0..8) | ||
124 | unsigned long part; // accumulated trailer component | ||
125 | |||
126 | // Get the next chunk of input from in. | ||
127 | unsigned char dat[CHUNK]; | ||
128 | strm.avail_in = fread(dat, 1, CHUNK, in); | ||
129 | if (strm.avail_in == 0) | ||
130 | break; | ||
131 | more = strm.avail_in == CHUNK; | ||
132 | strm.next_in = put = dat; | ||
133 | |||
134 | // Run that chunk of input through the inflate engine to exhaustion. | ||
135 | do { | ||
136 | // At this point it is assured that strm.avail_in > 0. | ||
137 | |||
138 | // Inflate until the end of a gzip component (header, deflate | ||
139 | // block, trailer) is reached, or until all of the chunk is | ||
140 | // consumed. The resulting decompressed data is discarded, though | ||
141 | // the total size of the decompressed data in each member is | ||
142 | // tracked, for the calculation of the total CRC. | ||
143 | do { | ||
144 | // inflate and handle any errors | ||
145 | unsigned char scrap[CHUNK]; | ||
146 | strm.avail_out = CHUNK; | ||
147 | strm.next_out = scrap; | ||
148 | int ret = inflate(&strm, Z_BLOCK); | ||
149 | if (ret == Z_MEM_ERROR) | ||
150 | BYE("out of memory"); | ||
151 | if (ret == Z_DATA_ERROR) | ||
152 | BYE("input invalid: %s", strm.msg); | ||
153 | if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END) | ||
154 | BYE("internal error"); | ||
155 | |||
156 | // Update the number of uncompressed bytes generated in this | ||
157 | // member. The actual count (not modulo 2^32) is required to | ||
158 | // correctly compute the total CRC. | ||
159 | unsigned got = CHUNK - strm.avail_out; | ||
160 | memb += got; | ||
161 | if (memb < got) | ||
162 | BYE("overflow error"); | ||
163 | |||
164 | // Continue to process this chunk until it is consumed, or | ||
165 | // until the end of a component (header, deflate block, or | ||
166 | // trailer) is reached. | ||
167 | } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0); | ||
168 | |||
169 | // Since strm.avail_in was > 0 for the inflate call, some input was | ||
170 | // just consumed. It is therefore assured that put < strm.next_in. | ||
171 | |||
172 | // Disposition the consumed component or part of a component. | ||
173 | switch (state) { | ||
174 | case BETWEEN: | ||
175 | state = HEAD; | ||
176 | // Fall through to HEAD when some or all of the header is | ||
177 | // processed. | ||
178 | |||
179 | case HEAD: | ||
180 | // Discard the header. | ||
181 | if (strm.data_type & 0x80) { | ||
182 | // End of header reached -- deflate blocks follow. | ||
183 | put = strm.next_in; | ||
184 | prev = num; | ||
185 | memb = 0; | ||
186 | state = BLOCK; | ||
187 | } | ||
188 | break; | ||
189 | |||
190 | case BLOCK: | ||
191 | // Copy the deflate stream to the output, but with the | ||
192 | // last-block-bit cleared. Re-synchronize stored block | ||
193 | // headers to the output byte boundaries. The bytes at | ||
194 | // put..strm.next_in-1 is the compressed data that has been | ||
195 | // processed and is ready to be copied to the output. | ||
196 | |||
197 | // At this point, it is assured that new compressed data is | ||
198 | // available, i.e., put < strm.next_in. If prev is -1, then | ||
199 | // that compressed data starts in the middle of a deflate | ||
200 | // block. If prev is not -1, then the bits in the bit | ||
201 | // buffer, possibly combined with the bits in *put, contain | ||
202 | // the three-bit header of the new deflate block. In that | ||
203 | // case, prev is the number of bits from the previous block | ||
204 | // that remain in the bit buffer. Since num is the number | ||
205 | // of bits in the bit buffer, we have that num - prev is | ||
206 | // the number of bits from the new block currently in the | ||
207 | // bit buffer. | ||
208 | |||
209 | // If strm.data_type & 0xc0 is 0x80, then the last byte of | ||
210 | // the available compressed data includes the last bits of | ||
211 | // the end of a deflate block. In that case, that last byte | ||
212 | // also has strm.data_type & 0x1f bits of the next deflate | ||
213 | // block, in the range 0..7. If strm.data_type & 0xc0 is | ||
214 | // 0xc0, then the last byte of the compressed data is the | ||
215 | // end of the deflate stream, followed by strm.data_type & | ||
216 | // 0x1f pad bits, also in the range 0..7. | ||
217 | |||
218 | // Set bits to the number of bits not yet consumed from the | ||
219 | // last byte. If we are at the end of the block, bits is | ||
220 | // either the number of bits in the last byte belonging to | ||
221 | // the next block, or the number of pad bits after the | ||
222 | // final block. In either of those cases, bits is in the | ||
223 | // range 0..7. | ||
224 | ; // (required due to C syntax oddity) | ||
225 | int bits = strm.data_type & 0x1f; | ||
226 | |||
227 | if (prev != -1) { | ||
228 | // We are at the start of a new block. Clear the last | ||
229 | // block bit, and check for special cases. If it is a | ||
230 | // stored block, then emit the header and pad to the | ||
231 | // next byte boundary. If it is a final, empty fixed | ||
232 | // block, then excise it. | ||
233 | |||
234 | // Some or all of the three header bits for this block | ||
235 | // may already be in the bit buffer. Load any remaining | ||
236 | // header bits into the bit buffer. | ||
237 | if (num - prev < 3) { | ||
238 | buf += (unsigned long)*put++ << num; | ||
239 | num += 8; | ||
240 | } | ||
241 | |||
242 | // Set last to have a 1 in the position of the last | ||
243 | // block bit in the bit buffer. | ||
244 | unsigned long last = (unsigned long)1 << prev; | ||
245 | |||
246 | if (((buf >> prev) & 7) == 3) { | ||
247 | // This is a final fixed block. Load at least ten | ||
248 | // bits from this block, including the header, into | ||
249 | // the bit buffer. We already have at least three, | ||
250 | // so at most one more byte needs to be loaded. | ||
251 | if (num - prev < 10) { | ||
252 | if (put == strm.next_in) | ||
253 | // Need to go get and process more input. | ||
254 | // We'll end up back here to finish this. | ||
255 | break; | ||
256 | buf += (unsigned long)*put++ << num; | ||
257 | num += 8; | ||
258 | } | ||
259 | if (((buf >> prev) & 0x3ff) == 3) { | ||
260 | // That final fixed block is empty. Delete it | ||
261 | // to avoid adding an empty block every time a | ||
262 | // gzip stream is normalized. | ||
263 | num = prev; | ||
264 | buf &= last - 1; // zero the pad bits | ||
265 | } | ||
266 | } | ||
267 | else if (((buf >> prev) & 6) == 0) { | ||
268 | // This is a stored block. Flush to the next | ||
269 | // byte boundary after the three-bit header. | ||
270 | num = (prev + 10) & ~7; | ||
271 | buf &= last - 1; // zero the pad bits | ||
272 | } | ||
273 | |||
274 | // Clear the last block bit. | ||
275 | buf &= ~last; | ||
276 | |||
277 | // Write out complete bytes in the bit buffer. | ||
278 | while (num >= 8) { | ||
279 | putc(buf, out); | ||
280 | buf >>= 8; | ||
281 | num -= 8; | ||
282 | } | ||
283 | |||
284 | // If no more bytes left to process, then we have | ||
285 | // consumed the byte that had bits from the next block. | ||
286 | if (put == strm.next_in) | ||
287 | bits = 0; | ||
288 | } | ||
289 | |||
290 | // We are done handling the deflate block header. Now copy | ||
291 | // all or almost all of the remaining compressed data that | ||
292 | // has been processed so far. Don't copy one byte at the | ||
293 | // end if it contains bits from the next deflate block or | ||
294 | // pad bits at the end of a deflate block. | ||
295 | |||
296 | // mix is 1 if we are at the end of a deflate block, and if | ||
297 | // some of the bits in the last byte follow this block. mix | ||
298 | // is 0 if we are in the middle of a deflate block, if the | ||
299 | // deflate block ended on a byte boundary, or if all of the | ||
300 | // compressed data processed so far has been consumed. | ||
301 | int mix = (strm.data_type & 0x80) && bits; | ||
302 | |||
303 | // Copy all of the processed compressed data to the output, | ||
304 | // except for the last byte if it contains bits from the | ||
305 | // next deflate block or pad bits at the end of the deflate | ||
306 | // stream. Copy the data after shifting in num bits from | ||
307 | // buf in front of it, leaving num bits from the end of the | ||
308 | // compressed data in buf when done. | ||
309 | unsigned char *end = strm.next_in - mix; | ||
310 | if (put < end) { | ||
311 | if (num) | ||
312 | // Insert num bits from buf before the data being | ||
313 | // copied. | ||
314 | do { | ||
315 | buf += (unsigned)(*put++) << num; | ||
316 | putc(buf, out); | ||
317 | buf >>= 8; | ||
318 | } while (put < end); | ||
319 | else { | ||
320 | // No shifting needed -- write directly. | ||
321 | fwrite(put, 1, end - put, out); | ||
322 | put = end; | ||
323 | } | ||
324 | } | ||
325 | |||
326 | // Process the last processed byte if it wasn't written. | ||
327 | if (mix) { | ||
328 | // Load the last byte into the bit buffer. | ||
329 | buf += (unsigned)(*put++) << num; | ||
330 | num += 8; | ||
331 | |||
332 | if (strm.data_type & 0x40) { | ||
333 | // We are at the end of the deflate stream and | ||
334 | // there are bits pad bits. Discard the pad bits | ||
335 | // and write a byte to the output, if available. | ||
336 | // Leave the num bits left over in buf to prepend | ||
337 | // to the next deflate stream. | ||
338 | num -= bits; | ||
339 | if (num >= 8) { | ||
340 | putc(buf, out); | ||
341 | num -= 8; | ||
342 | buf >>= 8; | ||
343 | } | ||
344 | |||
345 | // Force the pad bits in the bit buffer to zeros. | ||
346 | buf &= ((unsigned long)1 << num) - 1; | ||
347 | |||
348 | // Don't need to set prev here since going to TAIL. | ||
349 | } | ||
350 | else | ||
351 | // At the end of an internal deflate block. Leave | ||
352 | // the last byte in the bit buffer to examine on | ||
353 | // the next entry to BLOCK, when more bits from the | ||
354 | // next block will be available. | ||
355 | prev = num - bits; // number of bits in buffer | ||
356 | // from current block | ||
357 | } | ||
358 | |||
359 | // Don't have a byte left over, so we are in the middle of | ||
360 | // a deflate block, or the deflate block ended on a byte | ||
361 | // boundary. Set prev appropriately for the next entry into | ||
362 | // BLOCK. | ||
363 | else if (strm.data_type & 0x80) | ||
364 | // The block ended on a byte boundary, so no header | ||
365 | // bits are in the bit buffer. | ||
366 | prev = num; | ||
367 | else | ||
368 | // In the middle of a deflate block, so no header here. | ||
369 | prev = -1; | ||
370 | |||
371 | // Check for the end of the deflate stream. | ||
372 | if ((strm.data_type & 0xc0) == 0xc0) { | ||
373 | // That ends the deflate stream on the input side, the | ||
374 | // pad bits were discarded, and any remaining bits from | ||
375 | // the last block in the stream are saved in the bit | ||
376 | // buffer to prepend to the next stream. Process the | ||
377 | // gzip trailer next. | ||
378 | tail = 0; | ||
379 | part = 0; | ||
380 | state = TAIL; | ||
381 | } | ||
382 | break; | ||
383 | |||
384 | case TAIL: | ||
385 | // Accumulate available trailer bytes to update the total | ||
386 | // CRC and the total uncompressed length. | ||
387 | do { | ||
388 | part = (part >> 8) + ((unsigned long)(*put++) << 24); | ||
389 | tail++; | ||
390 | if (tail == 4) { | ||
391 | // Update the total CRC. | ||
392 | z_off_t len2 = memb; | ||
393 | if (len2 < 0 || (unsigned long long)len2 != memb) | ||
394 | BYE("overflow error"); | ||
395 | crc = crc ? crc32_combine(crc, part, len2) : part; | ||
396 | part = 0; | ||
397 | } | ||
398 | else if (tail == 8) { | ||
399 | // Update the total uncompressed length. (It's ok | ||
400 | // if this sum is done modulo 2^32.) | ||
401 | len += part; | ||
402 | |||
403 | // At the end of a member. Set up to inflate an | ||
404 | // immediately following gzip member. (If we made | ||
405 | // it this far, then the trailer was valid.) | ||
406 | if (inflateReset(&strm) != Z_OK) | ||
407 | BYE("internal error"); | ||
408 | state = BETWEEN; | ||
409 | break; | ||
410 | } | ||
411 | } while (put < strm.next_in); | ||
412 | break; | ||
413 | } | ||
414 | |||
415 | // Process the input buffer until completely consumed. | ||
416 | } while (strm.avail_in > 0); | ||
417 | |||
418 | // Process input until end of file, invalid input, or i/o error. | ||
419 | } while (more); | ||
420 | |||
421 | // Done with the inflate engine. | ||
422 | inflateEnd(&strm); | ||
423 | |||
424 | // Verify the validity of the input. | ||
425 | if (state != BETWEEN) | ||
426 | BYE("input invalid: incomplete gzip stream"); | ||
427 | |||
428 | // Write the remaining deflate stream bits, followed by a terminating | ||
429 | // deflate fixed block. | ||
430 | buf += (unsigned long)3 << num; | ||
431 | putc(buf, out); | ||
432 | putc(buf >> 8, out); | ||
433 | if (num > 6) | ||
434 | putc(0, out); | ||
435 | |||
436 | // Write the gzip trailer, which is the CRC and the uncompressed length | ||
437 | // modulo 2^32, both in little-endian order. | ||
438 | putc(crc, out); | ||
439 | putc(crc >> 8, out); | ||
440 | putc(crc >> 16, out); | ||
441 | putc(crc >> 24, out); | ||
442 | putc(len, out); | ||
443 | putc(len >> 8, out); | ||
444 | putc(len >> 16, out); | ||
445 | putc(len >> 24, out); | ||
446 | fflush(out); | ||
447 | |||
448 | // Check for any i/o errors. | ||
449 | if (ferror(in) || ferror(out)) | ||
450 | BYE("i/o error: %s", strerror(errno)); | ||
451 | |||
452 | // All good! | ||
453 | *err = NULL; | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | // Normalize the gzip stream on stdin, writing the result to stdout. | ||
458 | int main(void) { | ||
459 | // Avoid end-of-line conversions on evil operating systems. | ||
460 | SET_BINARY_MODE(stdin); | ||
461 | SET_BINARY_MODE(stdout); | ||
462 | |||
463 | // Normalize from stdin to stdout, returning 1 on error, 0 if ok. | ||
464 | char *err; | ||
465 | int ret = gzip_normalize(stdin, stdout, &err); | ||
466 | if (ret) | ||
467 | fprintf(stderr, "gznorm error: %s\n", err); | ||
468 | free(err); | ||
469 | return ret; | ||
470 | } | ||