summaryrefslogtreecommitdiff
path: root/examples/gzjoin.c
diff options
context:
space:
mode:
Diffstat (limited to 'examples/gzjoin.c')
-rw-r--r--examples/gzjoin.c447
1 files changed, 447 insertions, 0 deletions
diff --git a/examples/gzjoin.c b/examples/gzjoin.c
new file mode 100644
index 0000000..7434c5b
--- /dev/null
+++ b/examples/gzjoin.c
@@ -0,0 +1,447 @@
1/* gzjoin -- command to join gzip files into one gzip file
2
3 Copyright (C) 2004 Mark Adler, all rights reserved
4 version 1.0, 11 Dec 2004
5
6 This software is provided 'as-is', without any express or implied
7 warranty. In no event will the author be held liable for any damages
8 arising from the use of this software.
9
10 Permission is granted to anyone to use this software for any purpose,
11 including commercial applications, and to alter it and redistribute it
12 freely, subject to the following restrictions:
13
14 1. The origin of this software must not be misrepresented; you must not
15 claim that you wrote the original software. If you use this software
16 in a product, an acknowledgment in the product documentation would be
17 appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and must not be
19 misrepresented as being the original software.
20 3. This notice may not be removed or altered from any source distribution.
21
22 Mark Adler madler@alumni.caltech.edu
23 */
24
25/*
26 * Change history:
27 *
28 * 1.0 11 Dec 2004 - First version
29 */
30
31/*
32 gzjoin takes one or more gzip files on the command line and writes out a
33 single gzip file that will uncompress to the concatenation of the
34 uncompressed data from the individual gzip files. gzjoin does this without
35 having to recompress any of the data and without having to calculate a new
36 crc32 for the concatenated uncompressed data. gzjoin does however have to
37 decompress all of the input data in order to find the bits in the compressed
38 data that need to be modified to concatenate the streams.
39
40 gzjoin does not do an integrity check on the input gzip files other than
41 checking the gzip header and decompressing the compressed data. They are
42 otherwise assumed to be complete and correct.
43
44 Each joint between gzip files removes at least 18 bytes of previous trailer
45 and subsequent header, and inserts an average of about three bytes to the
46 compressed data in order to connect the streams. The output gzip file
47 has a minimal ten-byte gzip header with no file name or modification time.
48
49 This program was written to illustrate the use of the Z_BLOCK option of
50 inflate() and the crc32_combine() function. gzjoin will not compile with
51 versions of zlib earlier than 1.2.3.
52 */
53
54#include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
55#include <stdlib.h> /* exit(), malloc(), free() */
56#include <fcntl.h> /* open() */
57#include <unistd.h> /* close(), read(), lseek() */
58#include "zlib.h"
59 /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
60
#define local static

/* Report a fatal error on stderr and terminate the program with exit status 1.
   The message is the concatenation of why1 and why2 wrapped in a fixed
   "gzjoin error: ..., output incomplete" frame.  This function never returns;
   it is declared int only so that a call can appear as an operand of an
   expression (for example in one arm of a conditional operator). */
local int bail(char *why1, char *why2)
{
    fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
    exit(1);
    return 0;                   /* not reached -- satisfies the return type */
}
70
/* -- simple buffered file input with access to the buffer -- */

/* size of the input buffer in bytes (2^15): must be a power of two and fit in
   an unsigned */
#define CHUNK (1 << 15)

/* buffered input file: a file descriptor plus a CHUNK-byte buffer, with a
   cursor (next) and a count (left) describing the unread part of the buffer */
typedef struct bin_s {
    char *name;             /* file name, kept for error messages */
    int fd;                 /* descriptor being read */
    unsigned left;          /* number of unread bytes starting at next */
    unsigned char *next;    /* first unread byte in buf */
    unsigned char *buf;     /* allocated buffer, CHUNK bytes long */
} bin;
83
84/* close a buffered file and free allocated memory */
85local void bclose(bin *in)
86{
87 if (in != NULL) {
88 if (in->fd != -1)
89 close(in->fd);
90 if (in->buf != NULL)
91 free(in->buf);
92 free(in);
93 }
94}
95
96/* open a buffered file for input, return a pointer to type bin, or NULL on
97 failure */
98local bin *bopen(char *name)
99{
100 bin *in;
101
102 in = malloc(sizeof(bin));
103 if (in == NULL)
104 return NULL;
105 in->buf = malloc(CHUNK);
106 in->fd = open(name, O_RDONLY, 0);
107 if (in->buf == NULL || in->fd == -1) {
108 bclose(in);
109 return NULL;
110 }
111 in->left = 0;
112 in->next = in->buf;
113 in->name = name;
114 return in;
115}
116
117/* load buffer from file, return -1 on read error, 0 or 1 on success, with
118 1 indicating that end-of-file was reached */
119local int bload(bin *in)
120{
121 ssize_t len;
122
123 if (in == NULL)
124 return -1;
125 if (in->left != 0)
126 return 0;
127 in->next = in->buf;
128 do {
129 len = read(in->fd, in->buf + in->left, CHUNK - in->left);
130 if (len < 0)
131 return -1;
132 in->left += (unsigned)len;
133 } while (len != 0 && in->left < CHUNK);
134 return len == 0 ? 1 : 0;
135}
136
/* Get the next byte from the buffered file in, as an int.  The buffer is
   reloaded with bload() when it is empty; if it is still empty afterwards the
   program bails with an "unexpected end of file" error.  The argument is
   evaluated several times, so it must not have side effects; it is fully
   parenthesized so that any pointer-valued expression may be passed. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))
141
142/* get a four-byte little-endian unsigned integer from file */
143local unsigned long bget4(bin *in)
144{
145 unsigned long val;
146
147 val = bget(in);
148 val += (unsigned long)(bget(in)) << 8;
149 val += (unsigned long)(bget(in)) << 16;
150 val += (unsigned long)(bget(in)) << 24;
151 return val;
152}
153
154/* skip bytes in file */
155local void bskip(bin *in, unsigned skip)
156{
157 /* check pointer */
158 if (in == NULL)
159 return;
160
161 /* easy case -- skip bytes in buffer */
162 if (skip <= in->left) {
163 in->left -= skip;
164 in->next += skip;
165 return;
166 }
167
168 /* skip what's in buffer, discard buffer contents */
169 skip -= in->left;
170 in->left = 0;
171
172 /* seek past multiples of CHUNK bytes */
173 if (skip > CHUNK) {
174 unsigned left;
175
176 left = skip & (CHUNK - 1);
177 if (left == 0) {
178 /* exact number of chunks: seek all the way minus one byte to check
179 for end-of-file with a read */
180 lseek(in->fd, skip - 1, SEEK_CUR);
181 if (read(in->fd, in->buf, 1) != 1)
182 bail("unexpected end of file on ", in->name);
183 return;
184 }
185
186 /* skip the integral chunks, update skip with remainder */
187 lseek(in->fd, skip - left, SEEK_CUR);
188 skip = left;
189 }
190
191 /* read more input and skip remainder */
192 bload(in);
193 if (skip > in->left)
194 bail("unexpected end of file on ", in->name);
195 in->left -= skip;
196 in->next += skip;
197}
198
199/* -- end of buffered input functions -- */
200
201/* skip the gzip header from file in */
202local void gzhead(bin *in)
203{
204 int flags;
205
206 /* verify gzip magic header and compression method */
207 if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
208 bail(in->name, " is not a valid gzip file");
209
210 /* get and verify flags */
211 flags = bget(in);
212 if ((flags & 0xe0) != 0)
213 bail("unknown reserved bits set in ", in->name);
214
215 /* skip modification time, extra flags, and os */
216 bskip(in, 6);
217
218 /* skip extra field if present */
219 if (flags & 4) {
220 unsigned len;
221
222 len = bget(in);
223 len += (unsigned)(bget(in)) << 8;
224 bskip(in, len);
225 }
226
227 /* skip file name if present */
228 if (flags & 8)
229 while (bget(in) != 0)
230 ;
231
232 /* skip comment if present */
233 if (flags & 16)
234 while (bget(in) != 0)
235 ;
236
237 /* skip header crc if present */
238 if (flags & 2)
239 bskip(in, 2);
240}
241
242/* write a four-byte little-endian unsigned integer to out */
243local void put4(unsigned long val, FILE *out)
244{
245 putc(val & 0xff, out);
246 putc((val >> 8) & 0xff, out);
247 putc((val >> 16) & 0xff, out);
248 putc((val >> 24) & 0xff, out);
249}
250
251/* Load up zlib stream from buffered input, bail if end of file */
252local void zpull(z_streamp strm, bin *in)
253{
254 if (in->left == 0)
255 bload(in);
256 if (in->left == 0)
257 bail("unexpected end of file on ", in->name);
258 strm->avail_in = in->left;
259 strm->next_in = in->next;
260}
261
262/* Write header for gzip file to out and initialize trailer. */
263local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
264{
265 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
266 *crc = crc32(0L, Z_NULL, 0);
267 *tot = 0;
268}
269
270/* Copy the compressed data from name, zeroing the last block bit of the last
271 block if clr is true, and adding empty blocks as needed to get to a byte
272 boundary. If clr is false, then the last block becomes the last block of
273 the output, and the gzip trailer is written. crc and tot maintains the
274 crc and length (modulo 2^32) of the output for the trailer. The resulting
275 gzip file is written to out. gzinit() must be called before the first call
276 of gzcopy() to write the gzip header and to initialize crc and tot. */
277local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
278 FILE *out)
279{
280 int ret; /* return value from zlib functions */
281 int pos; /* where the "last block" bit is in byte */
282 int last; /* true if processing the last block */
283 bin *in; /* buffered input file */
284 unsigned char *start; /* start of compressed data in buffer */
285 unsigned char *junk; /* buffer for uncompressed data -- discarded */
286 z_off_t len; /* length of uncompressed data (support > 4 GB) */
287 z_stream strm; /* zlib inflate stream */
288
289 /* open gzip file and skip header */
290 in = bopen(name);
291 if (in == NULL)
292 bail("could not open ", name);
293 gzhead(in);
294
295 /* allocate buffer for uncompressed data and initialize raw inflate
296 stream */
297 junk = malloc(CHUNK);
298 strm.zalloc = Z_NULL;
299 strm.zfree = Z_NULL;
300 strm.opaque = Z_NULL;
301 strm.avail_in = 0;
302 strm.next_in = Z_NULL;
303 ret = inflateInit2(&strm, -15);
304 if (junk == NULL || ret != Z_OK)
305 bail("out of memory", "");
306
307 /* inflate and copy compressed data, clear last-block bit if requested */
308 len = 0;
309 zpull(&strm, in);
310 start = strm.next_in;
311 last = start[0] & 1;
312 if (last && clr)
313 start[0] &= ~1;
314 strm.avail_out = 0;
315 for (;;) {
316 /* if input used and output done, write used input and get more */
317 if (strm.avail_in == 0 && strm.avail_out != 0) {
318 fwrite(start, 1, strm.next_in - start, out);
319 start = in->buf;
320 in->left = 0;
321 zpull(&strm, in);
322 }
323
324 /* decompress -- return early when end-of-block reached */
325 strm.avail_out = CHUNK;
326 strm.next_out = junk;
327 ret = inflate(&strm, Z_BLOCK);
328 switch (ret) {
329 case Z_MEM_ERROR:
330 bail("out of memory", "");
331 case Z_DATA_ERROR:
332 bail("invalid compressed data in ", in->name);
333 }
334
335 /* update length of uncompressed data */
336 len += CHUNK - strm.avail_out;
337
338 /* check for block boundary (only get this when block copied out) */
339 if (strm.data_type & 128) {
340 /* if that was the last block, then done */
341 if (last)
342 break;
343
344 /* number of unused bits in last byte */
345 pos = strm.data_type & 7;
346
347 /* find the next last-block bit */
348 if (pos != 0) {
349 /* next last-block bit is in last used byte */
350 pos = 0x100 >> pos;
351 last = strm.next_in[-1] & pos;
352 if (last && clr)
353 strm.next_in[-1] &= ~pos;
354 }
355 else {
356 /* next last-block bit is in next unused byte */
357 if (strm.avail_in == 0) {
358 /* don't have that byte yet -- get it */
359 fwrite(start, 1, strm.next_in - start, out);
360 start = in->buf;
361 in->left = 0;
362 zpull(&strm, in);
363 }
364 last = strm.next_in[0] & 1;
365 if (last && clr)
366 strm.next_in[0] &= ~1;
367 }
368 }
369 }
370
371 /* update buffer with unused input */
372 in->left = strm.avail_in;
373 in->next = strm.next_in;
374
375 /* copy used input, write empty blocks to get to byte boundary */
376 pos = strm.data_type & 7;
377 fwrite(start, 1, in->next - start - 1, out);
378 last = in->next[-1];
379 if (pos == 0 || !clr)
380 /* already at byte boundary, or last file: write last byte */
381 putc(last, out);
382 else {
383 /* append empty blocks to last byte */
384 last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
385 if (pos & 1) {
386 /* odd -- append an empty stored block */
387 putc(last, out);
388 if (pos == 1)
389 putc(0, out); /* two more bits in block header */
390 fwrite("\0\0\xff\xff", 1, 4, out);
391 }
392 else {
393 /* even -- append 1, 2, or 3 empty fixed blocks */
394 switch (pos) {
395 case 6:
396 putc(last | 8, out);
397 last = 0;
398 case 4:
399 putc(last | 0x20, out);
400 last = 0;
401 case 2:
402 putc(last | 0x80, out);
403 putc(0, out);
404 }
405 }
406 }
407
408 /* update crc and tot */
409 *crc = crc32_combine(*crc, bget4(in), len);
410 *tot += (unsigned long)len;
411
412 /* clean up */
413 inflateEnd(&strm);
414 free(junk);
415 bclose(in);
416
417 /* write trailer if this is the last gzip file */
418 if (!clr) {
419 put4(*crc, out);
420 put4(*tot, out);
421 }
422}
423
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no file arguments a usage message is
   printed to stderr instead.

   Returns 0 on success and after printing the usage message.  Every failure,
   including a failure to write the output, is reported through bail(), which
   exits with status 1. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* show usage if no file arguments (argc < 2 also covers an empty argument
       vector, where there is no command name to skip) */
    if (argc < 2) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* skip command name */
    argc--;
    argv++;

    /* join gzip files on command line and write to stdout -- argc is the
       number of files still to come after the current one, so it is zero
       (clr false) only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* none of the writes above are checked individually: push out whatever is
       still buffered and test the stream's sticky error indicator so that a
       truncated output never ends with a success status */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("could not write to ", "standard output");

    /* done */
    return 0;
}