diff options
Diffstat (limited to 'examples/gzjoin.c')
-rw-r--r-- | examples/gzjoin.c | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/examples/gzjoin.c b/examples/gzjoin.c new file mode 100644 index 0000000..7434c5b --- /dev/null +++ b/examples/gzjoin.c | |||
@@ -0,0 +1,447 @@ | |||
1 | /* gzjoin -- command to join gzip files into one gzip file | ||
2 | |||
3 | Copyright (C) 2004 Mark Adler, all rights reserved | ||
4 | version 1.0, 11 Dec 2004 | ||
5 | |||
6 | This software is provided 'as-is', without any express or implied | ||
7 | warranty. In no event will the author be held liable for any damages | ||
8 | arising from the use of this software. | ||
9 | |||
10 | Permission is granted to anyone to use this software for any purpose, | ||
11 | including commercial applications, and to alter it and redistribute it | ||
12 | freely, subject to the following restrictions: | ||
13 | |||
14 | 1. The origin of this software must not be misrepresented; you must not | ||
15 | claim that you wrote the original software. If you use this software | ||
16 | in a product, an acknowledgment in the product documentation would be | ||
17 | appreciated but is not required. | ||
18 | 2. Altered source versions must be plainly marked as such, and must not be | ||
19 | misrepresented as being the original software. | ||
20 | 3. This notice may not be removed or altered from any source distribution. | ||
21 | |||
22 | Mark Adler madler@alumni.caltech.edu | ||
23 | */ | ||
24 | |||
25 | /* | ||
26 | * Change history: | ||
27 | * | ||
28 | * 1.0 11 Dec 2004 - First version | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | gzjoin takes one or more gzip files on the command line and writes out a | ||
33 | single gzip file that will uncompress to the concatenation of the | ||
34 | uncompressed data from the individual gzip files. gzjoin does this without | ||
35 | having to recompress any of the data and without having to calculate a new | ||
36 | crc32 for the concatenated uncompressed data. gzjoin does however have to | ||
37 | decompress all of the input data in order to find the bits in the compressed | ||
38 | data that need to be modified to concatenate the streams. | ||
39 | |||
40 | gzjoin does not do an integrity check on the input gzip files other than | ||
41 | checking the gzip header and decompressing the compressed data. They are | ||
42 | otherwise assumed to be complete and correct. | ||
43 | |||
44 | Each joint between gzip files removes at least 18 bytes of previous trailer | ||
45 | and subsequent header, and inserts an average of about three bytes to the | ||
46 | compressed data in order to connect the streams. The output gzip file | ||
47 | has a minimal ten-byte gzip header with no file name or modification time. | ||
48 | |||
49 | This program was written to illustrate the use of the Z_BLOCK option of | ||
50 | inflate() and the crc32_combine() function. gzjoin will not compile with | ||
51 | versions of zlib earlier than 1.2.3. | ||
52 | */ | ||
53 | |||
54 | #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */ | ||
55 | #include <stdlib.h> /* exit(), malloc(), free() */ | ||
56 | #include <fcntl.h> /* open() */ | ||
57 | #include <unistd.h> /* close(), read(), lseek() */ | ||
58 | #include "zlib.h" | ||
59 | /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */ | ||
60 | |||
/* shorthand giving internal linkage to every function in this file except
   main() */
#define local static
62 | |||
63 | /* exit with an error (return a value to allow use in an expression) */ | ||
64 | local int bail(char *why1, char *why2) | ||
65 | { | ||
66 | fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2); | ||
67 | exit(1); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | /* -- simple buffered file input with access to the buffer -- */ | ||
72 | |||
/* size in bytes of every input buffer and of the discard buffer used while
   inflating; must be a power of two (bskip() masks with CHUNK - 1) and must
   fit in an unsigned */
#define CHUNK 32768

/* bin: state of one buffered input file */
typedef struct {
    char *name;             /* file name, kept only for error messages */
    int fd;                 /* descriptor from open(), -1 if open failed */
    unsigned left;          /* number of unread bytes starting at next */
    unsigned char *next;    /* first unread byte, points into buf */
    unsigned char *buf;     /* malloc'ed buffer of CHUNK bytes */
} bin;
83 | |||
84 | /* close a buffered file and free allocated memory */ | ||
85 | local void bclose(bin *in) | ||
86 | { | ||
87 | if (in != NULL) { | ||
88 | if (in->fd != -1) | ||
89 | close(in->fd); | ||
90 | if (in->buf != NULL) | ||
91 | free(in->buf); | ||
92 | free(in); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* open a buffered file for input, return a pointer to type bin, or NULL on | ||
97 | failure */ | ||
98 | local bin *bopen(char *name) | ||
99 | { | ||
100 | bin *in; | ||
101 | |||
102 | in = malloc(sizeof(bin)); | ||
103 | if (in == NULL) | ||
104 | return NULL; | ||
105 | in->buf = malloc(CHUNK); | ||
106 | in->fd = open(name, O_RDONLY, 0); | ||
107 | if (in->buf == NULL || in->fd == -1) { | ||
108 | bclose(in); | ||
109 | return NULL; | ||
110 | } | ||
111 | in->left = 0; | ||
112 | in->next = in->buf; | ||
113 | in->name = name; | ||
114 | return in; | ||
115 | } | ||
116 | |||
117 | /* load buffer from file, return -1 on read error, 0 or 1 on success, with | ||
118 | 1 indicating that end-of-file was reached */ | ||
119 | local int bload(bin *in) | ||
120 | { | ||
121 | ssize_t len; | ||
122 | |||
123 | if (in == NULL) | ||
124 | return -1; | ||
125 | if (in->left != 0) | ||
126 | return 0; | ||
127 | in->next = in->buf; | ||
128 | do { | ||
129 | len = read(in->fd, in->buf + in->left, CHUNK - in->left); | ||
130 | if (len < 0) | ||
131 | return -1; | ||
132 | in->left += (unsigned)len; | ||
133 | } while (len != 0 && in->left < CHUNK); | ||
134 | return len == 0 ? 1 : 0; | ||
135 | } | ||
136 | |||
/* Get the next byte from buffered file in, as an int.  When the buffer is
   empty it is first reloaded with bload(); if there is still nothing to read
   after that, bail() is called, which exits.  The argument is parenthesized
   so that any pointer-valued expression may be passed, but it is evaluated
   more than once and so must not have side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))
141 | |||
142 | /* get a four-byte little-endian unsigned integer from file */ | ||
143 | local unsigned long bget4(bin *in) | ||
144 | { | ||
145 | unsigned long val; | ||
146 | |||
147 | val = bget(in); | ||
148 | val += (unsigned long)(bget(in)) << 8; | ||
149 | val += (unsigned long)(bget(in)) << 16; | ||
150 | val += (unsigned long)(bget(in)) << 24; | ||
151 | return val; | ||
152 | } | ||
153 | |||
154 | /* skip bytes in file */ | ||
155 | local void bskip(bin *in, unsigned skip) | ||
156 | { | ||
157 | /* check pointer */ | ||
158 | if (in == NULL) | ||
159 | return; | ||
160 | |||
161 | /* easy case -- skip bytes in buffer */ | ||
162 | if (skip <= in->left) { | ||
163 | in->left -= skip; | ||
164 | in->next += skip; | ||
165 | return; | ||
166 | } | ||
167 | |||
168 | /* skip what's in buffer, discard buffer contents */ | ||
169 | skip -= in->left; | ||
170 | in->left = 0; | ||
171 | |||
172 | /* seek past multiples of CHUNK bytes */ | ||
173 | if (skip > CHUNK) { | ||
174 | unsigned left; | ||
175 | |||
176 | left = skip & (CHUNK - 1); | ||
177 | if (left == 0) { | ||
178 | /* exact number of chunks: seek all the way minus one byte to check | ||
179 | for end-of-file with a read */ | ||
180 | lseek(in->fd, skip - 1, SEEK_CUR); | ||
181 | if (read(in->fd, in->buf, 1) != 1) | ||
182 | bail("unexpected end of file on ", in->name); | ||
183 | return; | ||
184 | } | ||
185 | |||
186 | /* skip the integral chunks, update skip with remainder */ | ||
187 | lseek(in->fd, skip - left, SEEK_CUR); | ||
188 | skip = left; | ||
189 | } | ||
190 | |||
191 | /* read more input and skip remainder */ | ||
192 | bload(in); | ||
193 | if (skip > in->left) | ||
194 | bail("unexpected end of file on ", in->name); | ||
195 | in->left -= skip; | ||
196 | in->next += skip; | ||
197 | } | ||
198 | |||
199 | /* -- end of buffered input functions -- */ | ||
200 | |||
201 | /* skip the gzip header from file in */ | ||
202 | local void gzhead(bin *in) | ||
203 | { | ||
204 | int flags; | ||
205 | |||
206 | /* verify gzip magic header and compression method */ | ||
207 | if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8) | ||
208 | bail(in->name, " is not a valid gzip file"); | ||
209 | |||
210 | /* get and verify flags */ | ||
211 | flags = bget(in); | ||
212 | if ((flags & 0xe0) != 0) | ||
213 | bail("unknown reserved bits set in ", in->name); | ||
214 | |||
215 | /* skip modification time, extra flags, and os */ | ||
216 | bskip(in, 6); | ||
217 | |||
218 | /* skip extra field if present */ | ||
219 | if (flags & 4) { | ||
220 | unsigned len; | ||
221 | |||
222 | len = bget(in); | ||
223 | len += (unsigned)(bget(in)) << 8; | ||
224 | bskip(in, len); | ||
225 | } | ||
226 | |||
227 | /* skip file name if present */ | ||
228 | if (flags & 8) | ||
229 | while (bget(in) != 0) | ||
230 | ; | ||
231 | |||
232 | /* skip comment if present */ | ||
233 | if (flags & 16) | ||
234 | while (bget(in) != 0) | ||
235 | ; | ||
236 | |||
237 | /* skip header crc if present */ | ||
238 | if (flags & 2) | ||
239 | bskip(in, 2); | ||
240 | } | ||
241 | |||
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first.  Errors reported by putc() are not checked here. */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc((int)(val & 0xff), out);
        val >>= 8;
    }
}
250 | |||
251 | /* Load up zlib stream from buffered input, bail if end of file */ | ||
252 | local void zpull(z_streamp strm, bin *in) | ||
253 | { | ||
254 | if (in->left == 0) | ||
255 | bload(in); | ||
256 | if (in->left == 0) | ||
257 | bail("unexpected end of file on ", in->name); | ||
258 | strm->avail_in = in->left; | ||
259 | strm->next_in = in->next; | ||
260 | } | ||
261 | |||
262 | /* Write header for gzip file to out and initialize trailer. */ | ||
263 | local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out) | ||
264 | { | ||
265 | fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); | ||
266 | *crc = crc32(0L, Z_NULL, 0); | ||
267 | *tot = 0; | ||
268 | } | ||
269 | |||
270 | /* Copy the compressed data from name, zeroing the last block bit of the last | ||
271 | block if clr is true, and adding empty blocks as needed to get to a byte | ||
272 | boundary. If clr is false, then the last block becomes the last block of | ||
273 | the output, and the gzip trailer is written. crc and tot maintains the | ||
274 | crc and length (modulo 2^32) of the output for the trailer. The resulting | ||
275 | gzip file is written to out. gzinit() must be called before the first call | ||
276 | of gzcopy() to write the gzip header and to initialize crc and tot. */ | ||
277 | local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, | ||
278 | FILE *out) | ||
279 | { | ||
280 | int ret; /* return value from zlib functions */ | ||
281 | int pos; /* where the "last block" bit is in byte */ | ||
282 | int last; /* true if processing the last block */ | ||
283 | bin *in; /* buffered input file */ | ||
284 | unsigned char *start; /* start of compressed data in buffer */ | ||
285 | unsigned char *junk; /* buffer for uncompressed data -- discarded */ | ||
286 | z_off_t len; /* length of uncompressed data (support > 4 GB) */ | ||
287 | z_stream strm; /* zlib inflate stream */ | ||
288 | |||
289 | /* open gzip file and skip header */ | ||
290 | in = bopen(name); | ||
291 | if (in == NULL) | ||
292 | bail("could not open ", name); | ||
293 | gzhead(in); | ||
294 | |||
295 | /* allocate buffer for uncompressed data and initialize raw inflate | ||
296 | stream */ | ||
297 | junk = malloc(CHUNK); | ||
298 | strm.zalloc = Z_NULL; | ||
299 | strm.zfree = Z_NULL; | ||
300 | strm.opaque = Z_NULL; | ||
301 | strm.avail_in = 0; | ||
302 | strm.next_in = Z_NULL; | ||
303 | ret = inflateInit2(&strm, -15); | ||
304 | if (junk == NULL || ret != Z_OK) | ||
305 | bail("out of memory", ""); | ||
306 | |||
307 | /* inflate and copy compressed data, clear last-block bit if requested */ | ||
308 | len = 0; | ||
309 | zpull(&strm, in); | ||
310 | start = strm.next_in; | ||
311 | last = start[0] & 1; | ||
312 | if (last && clr) | ||
313 | start[0] &= ~1; | ||
314 | strm.avail_out = 0; | ||
315 | for (;;) { | ||
316 | /* if input used and output done, write used input and get more */ | ||
317 | if (strm.avail_in == 0 && strm.avail_out != 0) { | ||
318 | fwrite(start, 1, strm.next_in - start, out); | ||
319 | start = in->buf; | ||
320 | in->left = 0; | ||
321 | zpull(&strm, in); | ||
322 | } | ||
323 | |||
324 | /* decompress -- return early when end-of-block reached */ | ||
325 | strm.avail_out = CHUNK; | ||
326 | strm.next_out = junk; | ||
327 | ret = inflate(&strm, Z_BLOCK); | ||
328 | switch (ret) { | ||
329 | case Z_MEM_ERROR: | ||
330 | bail("out of memory", ""); | ||
331 | case Z_DATA_ERROR: | ||
332 | bail("invalid compressed data in ", in->name); | ||
333 | } | ||
334 | |||
335 | /* update length of uncompressed data */ | ||
336 | len += CHUNK - strm.avail_out; | ||
337 | |||
338 | /* check for block boundary (only get this when block copied out) */ | ||
339 | if (strm.data_type & 128) { | ||
340 | /* if that was the last block, then done */ | ||
341 | if (last) | ||
342 | break; | ||
343 | |||
344 | /* number of unused bits in last byte */ | ||
345 | pos = strm.data_type & 7; | ||
346 | |||
347 | /* find the next last-block bit */ | ||
348 | if (pos != 0) { | ||
349 | /* next last-block bit is in last used byte */ | ||
350 | pos = 0x100 >> pos; | ||
351 | last = strm.next_in[-1] & pos; | ||
352 | if (last && clr) | ||
353 | strm.next_in[-1] &= ~pos; | ||
354 | } | ||
355 | else { | ||
356 | /* next last-block bit is in next unused byte */ | ||
357 | if (strm.avail_in == 0) { | ||
358 | /* don't have that byte yet -- get it */ | ||
359 | fwrite(start, 1, strm.next_in - start, out); | ||
360 | start = in->buf; | ||
361 | in->left = 0; | ||
362 | zpull(&strm, in); | ||
363 | } | ||
364 | last = strm.next_in[0] & 1; | ||
365 | if (last && clr) | ||
366 | strm.next_in[0] &= ~1; | ||
367 | } | ||
368 | } | ||
369 | } | ||
370 | |||
371 | /* update buffer with unused input */ | ||
372 | in->left = strm.avail_in; | ||
373 | in->next = strm.next_in; | ||
374 | |||
375 | /* copy used input, write empty blocks to get to byte boundary */ | ||
376 | pos = strm.data_type & 7; | ||
377 | fwrite(start, 1, in->next - start - 1, out); | ||
378 | last = in->next[-1]; | ||
379 | if (pos == 0 || !clr) | ||
380 | /* already at byte boundary, or last file: write last byte */ | ||
381 | putc(last, out); | ||
382 | else { | ||
383 | /* append empty blocks to last byte */ | ||
384 | last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */ | ||
385 | if (pos & 1) { | ||
386 | /* odd -- append an empty stored block */ | ||
387 | putc(last, out); | ||
388 | if (pos == 1) | ||
389 | putc(0, out); /* two more bits in block header */ | ||
390 | fwrite("\0\0\xff\xff", 1, 4, out); | ||
391 | } | ||
392 | else { | ||
393 | /* even -- append 1, 2, or 3 empty fixed blocks */ | ||
394 | switch (pos) { | ||
395 | case 6: | ||
396 | putc(last | 8, out); | ||
397 | last = 0; | ||
398 | case 4: | ||
399 | putc(last | 0x20, out); | ||
400 | last = 0; | ||
401 | case 2: | ||
402 | putc(last | 0x80, out); | ||
403 | putc(0, out); | ||
404 | } | ||
405 | } | ||
406 | } | ||
407 | |||
408 | /* update crc and tot */ | ||
409 | *crc = crc32_combine(*crc, bget4(in), len); | ||
410 | *tot += (unsigned long)len; | ||
411 | |||
412 | /* clean up */ | ||
413 | inflateEnd(&strm); | ||
414 | free(junk); | ||
415 | bclose(in); | ||
416 | |||
417 | /* write trailer if this is the last gzip file */ | ||
418 | if (!clr) { | ||
419 | put4(*crc, out); | ||
420 | put4(*tot, out); | ||
421 | } | ||
422 | } | ||
423 | |||
/* Join the gzip files named on the command line, in order, into a single gzip
   stream written to stdout.  With no file arguments a usage line is printed
   on stderr instead.  Returns 0 in both cases; any failure while joining ends
   the program inside the helpers. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */
    int nfiles = argc - 1;      /* arguments after the command name */
    int i;

    /* show usage if no arguments */
    if (nfiles == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* write the header, then append each file; the second argument of gzcopy
       is the number of files still to come, which is zero (false) only for
       the final one */
    gzinit(&crc, &tot, stdout);
    for (i = 1; i <= nfiles; i++)
        gzcopy(argv[i], nfiles - i, &crc, &tot, stdout);

    /* done */
    return 0;
}