summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:26:40 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:26:40 -0700
commitf6194ef39af5864f792412460c354cc339dde7d1 (patch)
tree5ea1e6849128e9b2194c66ee3d82afa36b4ac07c /examples
parent639be997883d9016baaf46017a2802b2ce1698bd (diff)
downloadzlib-1.2.3.4.tar.gz
zlib-1.2.3.4.tar.bz2
zlib-1.2.3.4.zip
zlib 1.2.3.4v1.2.3.4
Diffstat (limited to 'examples')
-rw-r--r--examples/README.examples21
-rw-r--r--examples/enough.c569
-rw-r--r--examples/gzlog.c1303
-rw-r--r--examples/gzlog.h93
-rw-r--r--examples/pigz.c452
5 files changed, 2074 insertions, 364 deletions
diff --git a/examples/README.examples b/examples/README.examples
index 5632d7a..146919c 100644
--- a/examples/README.examples
+++ b/examples/README.examples
@@ -1,4 +1,10 @@
1This directory contains examples of the use of zlib. 1This directory contains examples of the use of zlib and other relevant
2programs and documentation.
3
4enough.c
5 calculation and justification of ENOUGH parameter in inftrees.h
6 - calculates the maximum table space used in inflate tree
7 construction over all possible Huffman codes
2 8
3fitblk.c 9fitblk.c
4 compress just enough input to nearly fill a requested output size 10 compress just enough input to nearly fill a requested output size
@@ -23,9 +29,16 @@ gzjoin.c
23 29
24gzlog.c 30gzlog.c
25gzlog.h 31gzlog.h
26 efficiently maintain a message log file in gzip format 32 efficiently and robustly maintain a message log file in gzip format
27 - illustrates use of raw deflate and Z_SYNC_FLUSH 33 - illustrates use of raw deflate, Z_PARTIAL_FLUSH, deflatePrime(),
28 - illustrates use of gzip header extra field 34 and deflateSetDictionary()
35 - illustrates use of a gzip header extra field
36
37pigz.c
38 parallel implementation of gzip compression
39 - uses pthreads to speed up compression on multiple core machines
40 - illustrates the use of deflateSetDictionary() with raw deflate
41 - illustrates the use of crc32_combine()
29 42
30zlib_how.html 43zlib_how.html
31 painfully comprehensive description of zpipe.c (see below) 44 painfully comprehensive description of zpipe.c (see below)
diff --git a/examples/enough.c b/examples/enough.c
new file mode 100644
index 0000000..b570707
--- /dev/null
+++ b/examples/enough.c
@@ -0,0 +1,569 @@
1/* enough.c -- determine the maximum size of inflate's Huffman code tables over
2 * all possible valid and complete Huffman codes, subject to a length limit.
3 * Copyright (C) 2007, 2008 Mark Adler
4 * Version 1.3 17 February 2008 Mark Adler
5 */
6
7/* Version history:
8 1.0 3 Jan 2007 First version (derived from codecount.c version 1.4)
9 1.1 4 Jan 2007 Use faster incremental table usage computation
10 Prune examine() search on previously visited states
11 1.2 5 Jan 2007 Comments clean up
12 As inflate does, decrease root for short codes
13 Refuse cases where inflate would increase root
14 1.3 17 Feb 2008 Add argument for initial root table size
15 Fix bug for initial root table size == max - 1
16 Use a macro to compute the history index
17 */
18
19/*
20 Examine all possible Huffman codes for a given number of symbols and a
21 maximum code length in bits to determine the maximum table size for zilb's
22 inflate. Only complete Huffman codes are counted.
23
24 Two codes are considered distinct if the vectors of the number of codes per
25 length are not identical. So permutations of the symbol assignments result
26 in the same code for the counting, as do permutations of the assignments of
27 the bit values to the codes (i.e. only canonical codes are counted).
28
29 We build a code from shorter to longer lengths, determining how many symbols
30 are coded at each length. At each step, we have how many symbols remain to
31 be coded, what the last code length used was, and how many bit patterns of
32 that length remain unused. Then we add one to the code length and double the
33 number of unused patterns to graduate to the next code length. We then
34 assign all portions of the remaining symbols to that code length that
35 preserve the properties of a correct and eventually complete code. Those
36 properties are: we cannot use more bit patterns than are available; and when
37 all the symbols are used, there are exactly zero possible bit patterns
38 remaining.
39
40 The inflate Huffman decoding algorithm uses two-level lookup tables for
41 speed. There is a single first-level table to decode codes up to root bits
42 in length (root == 9 in the current inflate implementation). The table
43 has 1 << root entries and is indexed by the next root bits of input. Codes
44 shorter than root bits have replicated table entries, so that the correct
45 entry is pointed to regardless of the bits that follow the short code. If
46 the code is longer than root bits, then the table entry points to a second-
47 level table. The size of that table is determined by the longest code with
48 that root-bit prefix. If that longest code has length len, then the table
49 has size 1 << (len - root), to index the remaining bits in that set of
50 codes. Each subsequent root-bit prefix then has its own sub-table. The
51 total number of table entries required by the code is calculated
52 incrementally as the number of codes at each bit length is populated. When
53 all of the codes are shorter than root bits, then root is reduced to the
54 longest code length, resulting in a single, smaller, one-level table.
55
56 The inflate algorithm also provides for small values of root (relative to
57 the log2 of the number of symbols), where the shortest code has more bits
58 than root. In that case, root is increased to the length of the shortest
59 code. This program, by design, does not handle that case, so it is verified
60 that the number of symbols is less than 2^(root + 1).
61
62 In order to speed up the examination (by about ten orders of magnitude for
63 the default arguments), the intermediate states in the build-up of a code
64 are remembered and previously visited branches are pruned. The memory
65 required for this will increase rapidly with the total number of symbols and
66 the maximum code length in bits. However this is a very small price to pay
67 for the vast speedup.
68
69 First, all of the possible Huffman codes are counted, and reachable
70 intermediate states are noted by a non-zero count in a saved-results array.
71 Second, the intermediate states that lead to (root + 1) bit or longer codes
72 are used to look at all sub-codes from those junctures for their inflate
73 memory usage. (The amount of memory used is not affected by the number of
74 codes of root bits or less in length.) Third, the visited states in the
75 construction of those sub-codes and the associated calculation of the table
76 size is recalled in order to avoid recalculating from the same juncture.
77 Beginning the code examination at (root + 1) bit codes, which is enabled by
78 identifying the reachable nodes, accounts for about six of the orders of
79 magnitude of improvement for the default arguments. About another four
80 orders of magnitude come from not revisiting previous states. Out of
81 approximately 2x10^16 possible Huffman codes, only about 2x10^6 sub-codes
82 need to be examined to cover all of the possible table memory usage cases
83 for the default arguments of 286 symbols limited to 15-bit codes.
84
85 Note that an unsigned long long type is used for counting. It is quite easy
86 to exceed the capacity of an eight-byte integer with a large number of
87 symbols and a large maximum code length, so multiple-precision arithmetic
88 would need to replace the unsigned long long arithmetic in that case. This
89 program will abort if an overflow occurs. The big_t type identifies where
90 the counting takes place.
91
92 An unsigned long long type is also used for calculating the number of
93 possible codes remaining at the maximum length. This limits the maximum
94 code length to the number of bits in a long long minus the number of bits
95 needed to represent the symbols in a flat code. The code_t type identifies
96 where the bit pattern counting takes place.
97 */
98
99#include <stdio.h>
100#include <stdlib.h>
101#include <string.h>
102#include <assert.h>
103
104#define local static
105
106/* special data types */
107typedef unsigned long long big_t; /* type for code counting */
108typedef unsigned long long code_t; /* type for bit pattern counting */
109struct tab { /* type for been here check */
110 size_t len; /* length of bit vector in char's */
111 char *vec; /* allocated bit vector */
112};
113
114/* The array for saving results, num[], is indexed with this triplet:
115
116 syms: number of symbols remaining to code
117 left: number of available bit patterns at length len
118 len: number of bits in the codes currently being assigned
119
120 Those indices are constrained thusly when saving results:
121
122 syms: 3..totsym (totsym == total symbols to code)
123 left: 2..syms - 1, but only the evens (so syms == 8 -> 2, 4, 6)
124 len: 1..max - 1 (max == maximum code length in bits)
125
126 syms == 2 is not saved since that immediately leads to a single code. left
127 must be even, since it represents the number of available bit patterns at
128 the current length, which is double the number at the previous length.
129 left ends at syms-1 since left == syms immediately results in a single code.
130 (left > sym is not allowed since that would result in an incomplete code.)
131 len is less than max, since the code completes immediately when len == max.
132
133 The offset into the array is calculated for the three indices with the
134 first one (syms) being outermost, and the last one (len) being innermost.
135 We build the array with length max-1 lists for the len index, with syms-3
136 of those for each symbol. There are totsym-2 of those, with each one
137 varying in length as a function of sym. See the calculation of index in
138 count() for the index, and the calculation of size in main() for the size
139 of the array.
140
141 For the deflate example of 286 symbols limited to 15-bit codes, the array
142 has 284,284 entries, taking up 2.17 MB for an 8-byte big_t. More than
143 half of the space allocated for saved results is actually used -- not all
144 possible triplets are reached in the generation of valid Huffman codes.
145 */
146
147/* The array for tracking visited states, done[], is itself indexed identically
148 to the num[] array as described above for the (syms, left, len) triplet.
149 Each element in the array is further indexed by the (mem, rem) doublet,
150 where mem is the amount of inflate table space used so far, and rem is the
151 remaining unused entries in the current inflate sub-table. Each indexed
152 element is simply one bit indicating whether the state has been visited or
153 not. Since the ranges for mem and rem are not known a priori, each bit
154 vector is of a variable size, and grows as needed to accommodate the visited
155 states. mem and rem are used to calculate a single index in a triangular
156 array. Since the range of mem is expected in the default case to be about
157 ten times larger than the range of rem, the array is skewed to reduce the
158 memory usage, with eight times the range for mem than for rem. See the
159 calculations for offset and bit in beenhere() for the details.
160
161 For the deflate example of 286 symbols limited to 15-bit codes, the bit
162 vectors grow to total approximately 21 MB, in addition to the 4.3 MB done[]
163 array itself.
164 */
165
166/* Globals to avoid propagating constants or constant pointers recursively */
167local int max; /* maximum allowed bit length for the codes */
168local int root; /* size of base code table in bits */
169local int large; /* largest code table so far */
170local size_t size; /* number of elements in num and done */
171local int *code; /* number of symbols assigned to each bit length */
172local big_t *num; /* saved results array for code counting */
173local struct tab *done; /* states already evaluated array */
174
175/* Index function for num[] and done[] */
176#define INDEX(i,j,k) (((size_t)((i-1)>>1)*((i-2)>>1)+(j>>1)-1)*(max-1)+k-1)
177
178/* Free allocated space. Uses globals code, num, and done. */
179local void cleanup(void)
180{
181 size_t n;
182
183 if (done != NULL) {
184 for (n = 0; n < size; n++)
185 if (done[n].len)
186 free(done[n].vec);
187 free(done);
188 }
189 if (num != NULL)
190 free(num);
191 if (code != NULL)
192 free(code);
193}
194
195/* Return the number of possible Huffman codes using bit patterns of lengths
196 len through max inclusive, coding syms symbols, with left bit patterns of
197 length len unused -- return -1 if there is an overflow in the counting.
198 Keep a record of previous results in num to prevent repeating the same
199 calculation. Uses the globals max and num. */
200local big_t count(int syms, int len, int left)
201{
202 big_t sum; /* number of possible codes from this juncture */
203 big_t got; /* value returned from count() */
204 int least; /* least number of syms to use at this juncture */
205 int most; /* most number of syms to use at this juncture */
206 int use; /* number of bit patterns to use in next call */
207 size_t index; /* index of this case in *num */
208
209 /* see if only one possible code */
210 if (syms == left)
211 return 1;
212
213 /* note and verify the expected state */
214 assert(syms > left && left > 0 && len < max);
215
216 /* see if we've done this one already */
217 index = INDEX(syms, left, len);
218 got = num[index];
219 if (got)
220 return got; /* we have -- return the saved result */
221
222 /* we need to use at least this many bit patterns so that the code won't be
223 incomplete at the next length (more bit patterns than symbols) */
224 least = (left << 1) - syms;
225 if (least < 0)
226 least = 0;
227
228 /* we can use at most this many bit patterns, lest there not be enough
229 available for the remaining symbols at the maximum length (if there were
230 no limit to the code length, this would become: most = left - 1) */
231 most = (((code_t)left << (max - len)) - syms) /
232 (((code_t)1 << (max - len)) - 1);
233
234 /* count all possible codes from this juncture and add them up */
235 sum = 0;
236 for (use = least; use <= most; use++) {
237 got = count(syms - use, len + 1, (left - use) << 1);
238 sum += got;
239 if (got == -1 || sum < got) /* overflow */
240 return -1;
241 }
242
243 /* verify that all recursive calls are productive */
244 assert(sum != 0);
245
246 /* save the result and return it */
247 num[index] = sum;
248 return sum;
249}
250
251/* Return true if we've been here before, set to true if not. Set a bit in a
252 bit vector to indicate visiting this state. Each (syms,len,left) state
253 has a variable size bit vector indexed by (mem,rem). The bit vector is
254 lengthened if needed to allow setting the (mem,rem) bit. */
255local int beenhere(int syms, int len, int left, int mem, int rem)
256{
257 size_t index; /* index for this state's bit vector */
258 size_t offset; /* offset in this state's bit vector */
259 int bit; /* mask for this state's bit */
260 size_t length; /* length of the bit vector in bytes */
261 char *vector; /* new or enlarged bit vector */
262
263 /* point to vector for (syms,left,len), bit in vector for (mem,rem) */
264 index = INDEX(syms, left, len);
265 mem -= 1 << root;
266 offset = (mem >> 3) + rem;
267 offset = ((offset * (offset + 1)) >> 1) + rem;
268 bit = 1 << (mem & 7);
269
270 /* see if we've been here */
271 length = done[index].len;
272 if (offset < length && (done[index].vec[offset] & bit) != 0)
273 return 1; /* done this! */
274
275 /* we haven't been here before -- set the bit to show we have now */
276
277 /* see if we need to lengthen the vector in order to set the bit */
278 if (length <= offset) {
279 /* if we have one already, enlarge it, zero out the appended space */
280 if (length) {
281 do {
282 length <<= 1;
283 } while (length <= offset);
284 vector = realloc(done[index].vec, length);
285 if (vector != NULL)
286 memset(vector + done[index].len, 0, length - done[index].len);
287 }
288
289 /* otherwise we need to make a new vector and zero it out */
290 else {
291 length = 1 << (len - root);
292 while (length <= offset)
293 length <<= 1;
294 vector = calloc(length, sizeof(char));
295 }
296
297 /* in either case, bail if we can't get the memory */
298 if (vector == NULL) {
299 fputs("abort: unable to allocate enough memory\n", stderr);
300 cleanup();
301 exit(1);
302 }
303
304 /* install the new vector */
305 done[index].len = length;
306 done[index].vec = vector;
307 }
308
309 /* set the bit */
310 done[index].vec[offset] |= bit;
311 return 0;
312}
313
314/* Examine all possible codes from the given node (syms, len, left). Compute
315 the amount of memory required to build inflate's decoding tables, where the
316 number of code structures used so far is mem, and the number remaining in
317 the current sub-table is rem. Uses the globals max, code, root, large, and
318 done. */
319local void examine(int syms, int len, int left, int mem, int rem)
320{
321 int least; /* least number of syms to use at this juncture */
322 int most; /* most number of syms to use at this juncture */
323 int use; /* number of bit patterns to use in next call */
324
325 /* see if we have a complete code */
326 if (syms == left) {
327 /* set the last code entry */
328 code[len] = left;
329
330 /* complete computation of memory used by this code */
331 while (rem < left) {
332 left -= rem;
333 rem = 1 << (len - root);
334 mem += rem;
335 }
336 assert(rem == left);
337
338 /* if this is a new maximum, show the entries used and the sub-code */
339 if (mem > large) {
340 large = mem;
341 printf("max %d: ", mem);
342 for (use = root + 1; use <= max; use++)
343 if (code[use])
344 printf("%d[%d] ", code[use], use);
345 putchar('\n');
346 fflush(stdout);
347 }
348
349 /* remove entries as we drop back down in the recursion */
350 code[len] = 0;
351 return;
352 }
353
354 /* prune the tree if we can */
355 if (beenhere(syms, len, left, mem, rem))
356 return;
357
358 /* we need to use at least this many bit patterns so that the code won't be
359 incomplete at the next length (more bit patterns than symbols) */
360 least = (left << 1) - syms;
361 if (least < 0)
362 least = 0;
363
364 /* we can use at most this many bit patterns, lest there not be enough
365 available for the remaining symbols at the maximum length (if there were
366 no limit to the code length, this would become: most = left - 1) */
367 most = (((code_t)left << (max - len)) - syms) /
368 (((code_t)1 << (max - len)) - 1);
369
370 /* occupy least table spaces, creating new sub-tables as needed */
371 use = least;
372 while (rem < use) {
373 use -= rem;
374 rem = 1 << (len - root);
375 mem += rem;
376 }
377 rem -= use;
378
379 /* examine codes from here, updating table space as we go */
380 for (use = least; use <= most; use++) {
381 code[len] = use;
382 examine(syms - use, len + 1, (left - use) << 1,
383 mem + (rem ? 1 << (len - root) : 0), rem << 1);
384 if (rem == 0) {
385 rem = 1 << (len - root);
386 mem += rem;
387 }
388 rem--;
389 }
390
391 /* remove entries as we drop back down in the recursion */
392 code[len] = 0;
393}
394
395/* Look at all sub-codes starting with root + 1 bits. Look at only the valid
396 intermediate code states (syms, left, len). For each completed code,
397 calculate the amount of memory required by inflate to build the decoding
398 tables. Find the maximum amount of memory required and show the code that
399 requires that maximum. Uses the globals max, root, and num. */
400local void enough(int syms)
401{
402 int n; /* number of remaing symbols for this node */
403 int left; /* number of unused bit patterns at this length */
404 size_t index; /* index of this case in *num */
405
406 /* clear code */
407 for (n = 0; n <= max; n++)
408 code[n] = 0;
409
410 /* look at all (root + 1) bit and longer codes */
411 large = 1 << root; /* base table */
412 if (root < max) /* otherwise, there's only a base table */
413 for (n = 3; n <= syms; n++)
414 for (left = 2; left < n; left += 2)
415 {
416 /* look at all reachable (root + 1) bit nodes, and the
417 resulting codes (complete at root + 2 or more) */
418 index = INDEX(n, left, root + 1);
419 if (root + 1 < max && num[index]) /* reachable node */
420 examine(n, root + 1, left, 1 << root, 0);
421
422 /* also look at root bit codes with completions at root + 1
423 bits (not saved in num, since complete), just in case */
424 if (num[index - 1] && n <= left << 1)
425 examine((n - left) << 1, root + 1, (n - left) << 1,
426 1 << root, 0);
427 }
428
429 /* done */
430 printf("done: maximum of %d table entries\n", large);
431}
432
433/*
434 Examine and show the total number of possible Huffman codes for a given
435 maximum number of symbols, initial root table size, and maximum code length
436 in bits -- those are the command arguments in that order. The default
437 values are 286, 9, and 15 respectively, for the deflate literal/length code.
438 The possible codes are counted for each number of coded symbols from two to
439 the maximum. The counts for each of those and the total number of codes are
440 shown. The maximum number of inflate table entires is then calculated
441 across all possible codes. Each new maximum number of table entries and the
442 associated sub-code (starting at root + 1 == 10 bits) is shown.
443
444 To count and examine Huffman codes that are not length-limited, provide a
445 maximum length equal to the number of symbols minus one.
446
447 For the deflate literal/length code, use "enough". For the deflate distance
448 code, use "enough 30 6".
449
450 This uses the %llu printf format to print big_t numbers, which assumes that
451 big_t is an unsigned long long. If the big_t type is changed (for example
452 to a multiple precision type), the method of printing will also need to be
453 updated.
454 */
455int main(int argc, char **argv)
456{
457 int syms; /* total number of symbols to code */
458 int n; /* number of symbols to code for this run */
459 big_t got; /* return value of count() */
460 big_t sum; /* accumulated number of codes over n */
461
462 /* set up globals for cleanup() */
463 code = NULL;
464 num = NULL;
465 done = NULL;
466
467 /* get arguments -- default to the deflate literal/length code */
468 syms = 286;
469 root = 9;
470 max = 15;
471 if (argc > 1) {
472 syms = atoi(argv[1]);
473 if (argc > 2) {
474 root = atoi(argv[2]);
475 if (argc > 3)
476 max = atoi(argv[3]);
477 }
478 }
479 if (argc > 4 || syms < 2 || root < 1 || max < 1) {
480 fputs("invalid arguments, need: [sym >= 2 [root >= 1 [max >= 1]]]\n",
481 stderr);
482 return 1;
483 }
484
485 /* if not restricting the code length, the longest is syms - 1 */
486 if (max > syms - 1)
487 max = syms - 1;
488
489 /* determine the number of bits in a code_t */
490 n = 0;
491 while (((code_t)1 << n) != 0)
492 n++;
493
494 /* make sure that the calculation of most will not overflow */
495 if (max > n || syms - 2 >= (((code_t)0 - 1) >> (max - 1))) {
496 fputs("abort: code length too long for internal types\n", stderr);
497 return 1;
498 }
499
500 /* reject impossible code requests */
501 if (syms - 1 > ((code_t)1 << max) - 1) {
502 fprintf(stderr, "%d symbols cannot be coded in %d bits\n",
503 syms, max);
504 return 1;
505 }
506
507 /* allocate code vector */
508 code = calloc(max + 1, sizeof(int));
509 if (code == NULL) {
510 fputs("abort: unable to allocate enough memory\n", stderr);
511 return 1;
512 }
513
514 /* determine size of saved results array, checking for overflows,
515 allocate and clear the array (set all to zero with calloc()) */
516 if (syms == 2) /* iff max == 1 */
517 num = NULL; /* won't be saving any results */
518 else {
519 size = syms >> 1;
520 if (size > ((size_t)0 - 1) / (n = (syms - 1) >> 1) ||
521 (size *= n, size > ((size_t)0 - 1) / (n = max - 1)) ||
522 (size *= n, size > ((size_t)0 - 1) / sizeof(big_t)) ||
523 (num = calloc(size, sizeof(big_t))) == NULL) {
524 fputs("abort: unable to allocate enough memory\n", stderr);
525 cleanup();
526 return 1;
527 }
528 }
529
530 /* count possible codes for all numbers of symbols, add up counts */
531 sum = 0;
532 for (n = 2; n <= syms; n++) {
533 got = count(n, 1, 2);
534 sum += got;
535 if (got == -1 || sum < got) { /* overflow */
536 fputs("abort: can't count that high!\n", stderr);
537 cleanup();
538 return 1;
539 }
540 printf("%llu %d-codes\n", got, n);
541 }
542 printf("%llu total codes for 2 to %d symbols", sum, syms);
543 if (max < syms - 1)
544 printf(" (%d-bit length limit)\n", max);
545 else
546 puts(" (no length limit)");
547
548 /* allocate and clear done array for beenhere() */
549 if (syms == 2)
550 done = NULL;
551 else if (size > ((size_t)0 - 1) / sizeof(struct tab) ||
552 (done = calloc(size, sizeof(struct tab))) == NULL) {
553 fputs("abort: unable to allocate enough memory\n", stderr);
554 cleanup();
555 return 1;
556 }
557
558 /* find and show maximum inflate table usage */
559 if (root > max) /* reduce root to max length */
560 root = max;
561 if (syms < ((code_t)1 << (root + 1)))
562 enough(syms);
563 else
564 puts("cannot handle minimum code lengths > root");
565
566 /* done */
567 cleanup();
568 return 0;
569}
diff --git a/examples/gzlog.c b/examples/gzlog.c
index b6acdef..4daf1c2 100644
--- a/examples/gzlog.c
+++ b/examples/gzlog.c
@@ -1,413 +1,1058 @@
1/* 1/*
2 * gzlog.c 2 * gzlog.c
3 * Copyright (C) 2004 Mark Adler 3 * Copyright (C) 2004, 2008 Mark Adler, all rights reserved
4 * For conditions of distribution and use, see copyright notice in gzlog.h 4 * For conditions of distribution and use, see copyright notice in gzlog.h
5 * version 1.0, 26 Nov 2004 5 * version 2.0, 25 Apr 2008
6 *
7 */ 6 */
8 7
9#include <string.h> /* memcmp() */ 8/*
10#include <stdlib.h> /* malloc(), free(), NULL */ 9 gzlog provides a mechanism for frequently appending short strings to a gzip
11#include <sys/types.h> /* size_t, off_t */ 10 file that is efficient both in execution time and compression ratio. The
12#include <unistd.h> /* read(), close(), sleep(), ftruncate(), */ 11 strategy is to write the short strings in an uncompressed form to the end of
13 /* lseek() */ 12 the gzip file, only compressing when the amount of uncompressed data has
14#include <fcntl.h> /* open() */ 13 reached a given threshold.
15#include <sys/file.h> /* flock() */ 14
16#include "zlib.h" /* deflateInit2(), deflate(), deflateEnd() */ 15 gzlog also provides protection against interruptions in the process due to
16 system crashes. The status of the operation is recorded in an extra field
17 in the gzip file, and is only updated once the gzip file is brought to a
18 valid state. The last data to be appended or compressed is saved in an
19 auxiliary file, so that if the operation is interrupted, it can be completed
20 the next time an append operation is attempted.
21
22 gzlog maintains another auxiliary file with the last 32K of data from the
23 compressed portion, which is preloaded for the compression of the subsequent
24 data. This minimizes the impact to the compression ratio of appending.
25 */
26
27/*
28 Operations Concept:
29
30 Files (log name "foo"):
31 foo.gz -- gzip file with the complete log
32 foo.add -- last message to append or last data to compress
33 foo.dict -- dictionary of the last 32K of data for next compression
34 foo.temp -- temporary dictionary file for compression after this one
35 foo.lock -- lock file for reading and writing the other files
36 foo.repairs -- log file for log file recovery operations (not compressed)
37
38 gzip file structure:
39 - fixed-length (no file name) header with extra field (see below)
40 - compressed data ending initially with empty stored block
41 - uncompressed data filling out originally empty stored block and
42 subsequent stored blocks as needed (16K max each)
43 - gzip trailer
44 - no junk at end (no other gzip streams)
45
46 When appending data, the information in the first three items above plus the
47 foo.add file are sufficient to recover an interrupted append operation. The
48 extra field has the necessary information to restore the start of the last
49 stored block and determine where to append the data in the foo.add file, as
50 well as the crc and length of the gzip data before the append operation.
51
52 The foo.add file is created before the gzip file is marked for append, and
53 deleted after the gzip file is marked as complete. So if the append
54 operation is interrupted, the data to add will still be there. If due to
55 some external force, the foo.add file gets deleted between when the append
56 operation was interrupted and when recovery is attempted, the gzip file will
57 still be restored, but without the appended data.
58
59 When compressing data, the information in the first two items above plus the
60 foo.add file are sufficient to recover an interrupted compress operation.
61 The extra field has the necessary information to find the end of the
62 compressed data, and contains both the crc and length of just the compressed
63 data and of the complete set of data including the contents of the foo.add
64 file.
65
66 Again, the foo.add file is maintained during the compress operation in case
67 of an interruption. If in the unlikely event the foo.add file with the data
68 to be compressed is missing due to some external force, a gzip file with
69 just the previous compressed data will be reconstructed. In this case, all
70 of the data that was to be compressed is lost (approximately one megabyte).
71 This will not occur if all that happened was an interruption of the compress
72 operation.
73
74 The third state that is marked is the replacement of the old dictionary with
75 the new dictionary after a compress operation. Once compression is
76 complete, the gzip file is marked as being in the replace state. This
77 completes the gzip file, so an interrupt after being so marked does not
78 result in recompression. Then the dictionary file is replaced, and the gzip
79 file is marked as completed. This state prevents the possibility of
80 restarting compression with the wrong dictionary file.
81
82 All three operations are wrapped by a lock/unlock procedure. In order to
83 gain exclusive access to the log files, first a foo.lock file must be
84 exclusively created. When all operations are complete, the lock is
85 released by deleting the foo.lock file. If when attempting to create the
86 lock file, it already exists and the modify time of the lock file is more
87 than five minutes old (set by the PATIENCE define below), then the old
88 lock file is considered stale and deleted, and the exclusive creation of
89 the lock file is retried. To assure that there are no false assessments
90 of the staleness of the lock file, the operations periodically touch the
91 lock file to update the modified date.
92
93 Following is the definition of the extra field with all of the information
94 required to enable the above append and compress operations and their
95 recovery if interrupted. Multi-byte values are stored little endian
96 (consistent with the gzip format). File pointers are eight bytes long.
97 The crc's and lengths for the gzip trailer are four bytes long. (Note that
98 the length at the end of a gzip file is used for error checking only, and
99 for large files is actually the length modulo 2^32.) The stored block
100 length is two bytes long. The gzip extra field two-byte identification is
101 "ap" for append. It is assumed that writing the extra field to the file is
102 an "atomic" operation. That is, either all of the extra field is written
103 to the file, or none of it is, if the operation is interrupted right at the
104 point of updating the extra field. This is a reasonable assumption, since
105 the extra field is within the first 52 bytes of the file, which is smaller
106 than any expected block size for a mass storage device (usually 512 bytes or
107 larger).
108
109 Extra field (35 bytes):
110 - Pointer to first stored block length -- this points to the two-byte length
111 of the first stored block, which is followed by the two-byte, one's
112 complement of that length. The stored block length is preceded by the
113 three-bit header of the stored block, which is the actual start of the
114 stored block in the deflate format. See the bit offset field below.
115 - Pointer to the last stored block length. This is the same as above, but
116 for the last stored block of the uncompressed data in the gzip file.
117 Initially this is the same as the first stored block length pointer.
118 When the stored block gets to 16K (see the MAX_STORE define), then a new
119 stored block as added, at which point the last stored block length pointer
120 is different from the first stored block length pointer. When they are
121 different, the first bit of the last stored block header is eight bits, or
122 one byte back from the block length.
123 - Compressed data crc and length. This is the crc and length of the data
124 that is in the compressed portion of the deflate stream. These are used
125 only in the event that the foo.add file containing the data to compress is
126 lost after a compress operation is interrupted.
127 - Total data crc and length. This is the crc and length of all of the data
128 stored in the gzip file, compressed and uncompressed. It is used to
129 reconstruct the gzip trailer when compressing, as well as when recovering
130 interrupted operations.
131 - Final stored block length. This is used to quickly find where to append,
132 and allows the restoration of the original final stored block state when
133 an append operation is interrupted.
134 - First stored block start as the number of bits back from the final stored
135 block first length byte. This value is in the range of 3..10, and is
136 stored as the low three bits of the final byte of the extra field after
137 subtracting three (0..7). This allows the last-block bit of the stored
138 block header to be updated when a new stored block is added, for the case
139 when the first stored block and the last stored block are the same. (When
140 they are different, the numbers of bits back is known to be eight.) This
141 also allows for new compressed data to be appended to the old compressed
142 data in the compress operation, overwriting the previous first stored
143 block, or for the compressed data to be terminated and a valid gzip file
144 reconstructed on the off chance that a compression operation was
145 interrupted and the data to compress in the foo.add file was deleted.
146 - The operation in process. This is the next two bits in the last byte (the
147 bits under the mask 0x18). The are interpreted as 0: nothing in process,
148 1: append in process, 2: compress in process, 3: replace in process.
149 - The top three bits of the last byte in the extra field are reserved and
150 are currently set to zero.
151
152 Main procedure:
153 - Exclusively create the foo.lock file using the O_CREAT and O_EXCL modes of
154 the system open() call. If the modify time of an existing lock file is
155 more than PATIENCE seconds old, then the lock file is deleted and the
156 exclusive create is retried.
157 - Load the extra field from the foo.gz file, and see if an operation was in
158 progress but not completed. If so, apply the recovery procedure below.
159 - Perform the append procedure with the provided data.
160 - If the uncompressed data in the foo.gz file is 1MB or more, apply the
161 compress procedure.
162 - Delete the foo.lock file.
163
164 Append procedure:
165 - Put what to append in the foo.add file so that the operation can be
166 restarted if this procedure is interrupted.
167 - Mark the foo.gz extra field with the append operation in progress.
168 + Restore the original last-block bit and stored block length of the last
169 stored block from the information in the extra field, in case a previous
170 append operation was interrupted.
171 - Append the provided data to the last stored block, creating new stored
172 blocks as needed and updating the stored blocks last-block bits and
173 lengths.
174 - Update the crc and length with the new data, and write the gzip trailer.
175 - Write over the extra field (with a single write operation) with the new
176 pointers, lengths, and crc's, and mark the gzip file as not in process.
177 Though there is still a foo.add file, it will be ignored since nothing
178 is in process. If a foo.add file is leftover from a previously
179 completed operation, it is truncated when writing new data to it.
180 - Delete the foo.add file.
181
182 Compress and replace procedures:
183 - Read all of the uncompressed data in the stored blocks in foo.gz and write
184 it to foo.add. Also write foo.temp with the last 32K of that data to
185 provide a dictionary for the next invocation of this procedure.
186 - Rewrite the extra field marking foo.gz with a compression in process.
187 * If there is no data provided to compress (due to a missing foo.add file
188 when recovering), reconstruct and truncate the foo.gz file to contain
189 only the previous compressed data and proceed to the step after the next
190 one. Otherwise ...
191 - Compress the data with the dictionary in foo.dict, and write to the
192 foo.gz file starting at the bit immediately following the last previously
193 compressed block. If there is no foo.dict, proceed anyway with the
194 compression at slightly reduced efficiency. (For the foo.dict file to be
195 missing requires some external failure beyond simply the interruption of
196 a compress operation.) During this process, the foo.lock file is
197 periodically touched to assure that that file is not considered stale by
198 another process before we're done. The deflation is terminated with a
199 non-last empty static block (10 bits long), that is then located and
200 written over by a last-bit-set empty stored block.
201 - Append the crc and length of the data in the gzip file (previously
202 calculated during the append operations).
203 - Write over the extra field with the updated stored block offsets, bits
204 back, crc's, and lengths, and mark foo.gz as in process for a replacement
205 of the dictionary.
206 @ Delete the foo.add file.
207 - Replace foo.dict with foo.temp.
208 - Write over the extra field, marking foo.gz as complete.
209
210 Recovery procedure:
211 - If not a replace recovery, read in the foo.add file, and provide that data
212 to the appropriate recovery below. If there is no foo.add file, provide
213 a zero data length to the recovery. In that case, the append recovery
214 restores the foo.gz to the previous compressed + uncompressed data state.
215 For the the compress recovery, a missing foo.add file results in foo.gz
216 being restored to the previous compressed-only data state.
217 - Append recovery:
218 - Pick up append at + step above
219 - Compress recovery:
220 - Pick up compress at * step above
221 - Replace recovery:
222 - Pick up compress at @ step above
223 - Log the repair with a date stamp in foo.repairs
224 */
225
226#include <sys/types.h>
227#include <stdio.h> /* rename, fopen, fprintf, fclose */
228#include <stdlib.h> /* malloc, free */
229#include <string.h> /* strlen, strrchr, strcpy, strncpy, strcmp */
230#include <fcntl.h> /* open */
231#include <unistd.h> /* lseek, read, write, close, unlink, sleep, */
232 /* ftruncate, fsync */
233#include <errno.h> /* errno */
234#include <time.h> /* time, ctime */
235#include <sys/stat.h> /* stat */
236#include <sys/time.h> /* utimes */
237#include "zlib.h" /* crc32 */
238
239#include "gzlog.h" /* header for external access */
17 240
18#include "gzlog.h" /* interface */
19#define local static 241#define local static
242typedef unsigned int uint;
243typedef unsigned long ulong;
244
245/* Macro for debugging to deterministically force recovery operations */
246#ifdef DEBUG
247 #include <setjmp.h> /* longjmp */
248 jmp_buf gzlog_jump; /* where to go back to */
249 int gzlog_bail = 0; /* which point to bail at (1..8) */
250 int gzlog_count = -1; /* number of times through to wait */
251# define BAIL(n) do { if (n == gzlog_bail && gzlog_count-- == 0) \
252 longjmp(gzlog_jump, gzlog_bail); } while (0)
253#else
254# define BAIL(n)
255#endif
256
257/* how old the lock file can be in seconds before considering it stale */
258#define PATIENCE 300
259
260/* maximum stored block size in Kbytes -- must be in 1..63 */
261#define MAX_STORE 16
20 262
21/* log object structure */ 263/* number of stored Kbytes to trigger compression (must be >= 32 to allow
22typedef struct { 264 dictionary construction, and <= 204 * MAX_STORE, in order for >> 10 to
23 int id; /* object identifier */ 265 discard the stored block headers contribution of five bytes each) */
24 int fd; /* log file descriptor */ 266#define TRIGGER 1024
25 off_t extra; /* offset of extra "ap" subfield */ 267
26 off_t mark_off; /* offset of marked data */ 268/* size of a deflate dictionary (this cannot be changed) */
27 off_t last_off; /* offset of last block */ 269#define DICT 32768U
28 unsigned long crc; /* uncompressed crc */ 270
29 unsigned long len; /* uncompressed length (modulo 2^32) */ 271/* values for the operation (2 bits) */
30 unsigned stored; /* length of current stored block */ 272#define NO_OP 0
31} gz_log; 273#define APPEND_OP 1
32 274#define COMPRESS_OP 2
33#define GZLOGID 19334 /* gz_log object identifier */ 275#define REPLACE_OP 3
34 276
35#define LOCK_RETRY 1 /* retry lock once a second */ 277/* macros to extract little-endian integers from an unsigned byte buffer */
36#define LOCK_PATIENCE 1200 /* try about twenty minutes before forcing */ 278#define PULL2(p) ((p)[0]+((uint)((p)[1])<<8))
37 279#define PULL4(p) (PULL2(p)+((ulong)PULL2(p+2)<<16))
38/* acquire a lock on a file */ 280#define PULL8(p) (PULL4(p)+((off_t)PULL4(p+4)<<32))
39local int lock(int fd) 281
282/* macros to store integers into a byte buffer in little-endian order */
283#define PUT2(p,a) do {(p)[0]=a;(p)[1]=(a)>>8;} while(0)
284#define PUT4(p,a) do {PUT2(p,a);PUT2(p+2,a>>16);} while(0)
285#define PUT8(p,a) do {PUT4(p,a);PUT4(p+4,a>>32);} while(0)
286
287/* internal structure for log information */
288#define LOGID "\106\035\172" /* should be three non-zero characters */
289struct log {
290 char id[4]; /* contains LOGID to detect inadvertent overwrites */
291 int fd; /* file descriptor for .gz file, opened read/write */
292 char *path; /* allocated path, e.g. "/var/log/foo" or "foo" */
293 char *end; /* end of path, for appending suffices such as ".gz" */
294 off_t first; /* offset of first stored block first length byte */
295 int back; /* location of first block id in bits back from first */
296 uint stored; /* bytes currently in last stored block */
297 off_t last; /* offset of last stored block first length byte */
298 ulong ccrc; /* crc of compressed data */
299 ulong clen; /* length (modulo 2^32) of compressed data */
300 ulong tcrc; /* crc of total data */
301 ulong tlen; /* length (modulo 2^32) of total data */
302 time_t lock; /* last modify time of our lock file */
303};
304
305/* gzip header for gzlog */
306local unsigned char log_gzhead[] = {
307 0x1f, 0x8b, /* magic gzip id */
308 8, /* compression method is deflate */
309 4, /* there is an extra field (no file name) */
310 0, 0, 0, 0, /* no modification time provided */
311 0, 0xff, /* no extra flags, no OS specified */
312 39, 0, 'a', 'p', 35, 0 /* extra field with "ap" subfield */
313 /* 35 is EXTRA, 39 is EXTRA + 4 */
314};
315
316#define HEAD sizeof(log_gzhead) /* should be 16 */
317
318/* initial gzip extra field content (52 == HEAD + EXTRA + 1) */
319local unsigned char log_gzext[] = {
320 52, 0, 0, 0, 0, 0, 0, 0, /* offset of first stored block length */
321 52, 0, 0, 0, 0, 0, 0, 0, /* offset of last stored block length */
322 0, 0, 0, 0, 0, 0, 0, 0, /* compressed data crc and length */
323 0, 0, 0, 0, 0, 0, 0, 0, /* total data crc and length */
324 0, 0, /* final stored block data length */
325 5 /* op is NO_OP, last bit 8 bits back */
326};
327
328#define EXTRA sizeof(log_gzext) /* should be 35 */
329
330/* initial gzip data and trailer */
331local unsigned char log_gzbody[] = {
332 1, 0, 0, 0xff, 0xff, /* empty stored block (last) */
333 0, 0, 0, 0, /* crc */
334 0, 0, 0, 0 /* uncompressed length */
335};
336
337#define BODY sizeof(log_gzbody)
338
339/* Exclusively create foo.lock in order to negotiate exclusive access to the
340 foo.* files. If the modify time of an existing lock file is greater than
341 PATIENCE seconds in the past, then consider the lock file to have been
342 abandoned, delete it, and try the exclusive create again. Save the lock
343 file modify time for verification of ownership. Return 0 on success, or -1
344 on failure, usually due to an access restriction or invalid path. Note that
345 if stat() or unlink() fails, it may be due to another process noticing the
346 abandoned lock file a smidge sooner and deleting it, so those are not
347 flagged as an error. */
348local int log_lock(struct log *log)
40{ 349{
41 int patience; 350 int fd;
351 struct stat st;
42 352
43 /* try to lock every LOCK_RETRY seconds for LOCK_PATIENCE seconds */ 353 strcpy(log->end, ".lock");
44 patience = LOCK_PATIENCE; 354 while ((fd = open(log->path, O_CREAT | O_EXCL, 0644)) < 0) {
45 do { 355 if (errno != EEXIST)
46 if (flock(fd, LOCK_EX + LOCK_NB) == 0) 356 return -1;
47 return 0; 357 if (stat(log->path, &st) == 0 && time(NULL) - st.st_mtime > PATIENCE) {
48 (void)sleep(LOCK_RETRY); 358 unlink(log->path);
49 patience -= LOCK_RETRY; 359 continue;
50 } while (patience > 0); 360 }
361 sleep(2); /* relinquish the CPU for two seconds while waiting */
362 }
363 close(fd);
364 if (stat(log->path, &st) == 0)
365 log->lock = st.st_mtime;
366 return 0;
367}
51 368
52 /* we've run out of patience -- give up */ 369/* Update the modify time of the lock file to now, in order to prevent another
53 return -1; 370 task from thinking that the lock is stale. Save the lock file modify time
371 for verification of ownership. */
372local void log_touch(struct log *log)
373{
374 struct stat st;
375
376 strcpy(log->end, ".lock");
377 utimes(log->path, NULL);
378 if (stat(log->path, &st) == 0)
379 log->lock = st.st_mtime;
54} 380}
55 381
56/* release lock */ 382/* Check the log file modify time against what is expected. Return true if
57local void unlock(int fd) 383 this is not our lock. If it is our lock, touch it to keep it. */
384local int log_check(struct log *log)
58{ 385{
59 (void)flock(fd, LOCK_UN); 386 struct stat st;
387
388 strcpy(log->end, ".lock");
389 if (stat(log->path, &st) || st.st_mtime != log->lock)
390 return 1;
391 log_touch(log);
392 return 0;
60} 393}
61 394
62/* release a log object */ 395/* Unlock a previously acquired lock, but only if it's ours. */
63local void log_clean(gz_log *log) 396local void log_unlock(struct log *log)
64{ 397{
65 unlock(log->fd); 398 if (log_check(log))
66 (void)close(log->fd); 399 return;
67 free(log); 400 strcpy(log->end, ".lock");
401 unlink(log->path);
402 log->lock = 0;
68} 403}
69 404
70/* read an unsigned long from a byte buffer little-endian */ 405/* Check the gzip header and read in the extra field, filling in the values in
71local unsigned long make_ulg(unsigned char *buf) 406 the log structure. Return op on success or -1 if the gzip header was not as
407 expected. op is the current operation in progress last written to the extra
408 field. This assumes that the gzip file has already been opened, with the
409 file descriptor log->fd. */
410local int log_head(struct log *log)
72{ 411{
73 int n; 412 int op;
74 unsigned long val; 413 unsigned char buf[HEAD + EXTRA];
75 414
76 val = (unsigned long)(*buf++); 415 if (lseek(log->fd, 0, SEEK_SET) < 0 ||
77 for (n = 8; n < 32; n += 8) 416 read(log->fd, buf, HEAD + EXTRA) != HEAD + EXTRA ||
78 val += (unsigned long)(*buf++) << n; 417 memcmp(buf, log_gzhead, HEAD)) {
79 return val; 418 return -1;
419 }
420 log->first = PULL8(buf + HEAD);
421 log->last = PULL8(buf + HEAD + 8);
422 log->ccrc = PULL4(buf + HEAD + 16);
423 log->clen = PULL4(buf + HEAD + 20);
424 log->tcrc = PULL4(buf + HEAD + 24);
425 log->tlen = PULL4(buf + HEAD + 28);
426 log->stored = PULL2(buf + HEAD + 32);
427 log->back = 3 + (buf[HEAD + 34] & 7);
428 op = (buf[HEAD + 34] >> 3) & 3;
429 return op;
80} 430}
81 431
82/* read an off_t from a byte buffer little-endian */ 432/* Write over the extra field contents, marking the operation as op. Use fsync
83local off_t make_off(unsigned char *buf) 433 to assure that the device is written to, and in the requested order. This
434 operation, and only this operation, is assumed to be atomic in order to
435 assure that the log is recoverable in the event of an interruption at any
436 point in the process. Return -1 if the write to foo.gz failed. */
437local int log_mark(struct log *log, int op)
84{ 438{
85 int n; 439 int ret;
86 off_t val; 440 unsigned char ext[EXTRA];
87 441
88 val = (off_t)(*buf++); 442 PUT8(ext, log->first);
89 for (n = 8; n < 64; n += 8) 443 PUT8(ext + 8, log->last);
90 val += (off_t)(*buf++) << n; 444 PUT4(ext + 16, log->ccrc);
91 return val; 445 PUT4(ext + 20, log->clen);
446 PUT4(ext + 24, log->tcrc);
447 PUT4(ext + 28, log->tlen);
448 PUT2(ext + 32, log->stored);
449 ext[34] = log->back - 3 + (op << 3);
450 fsync(log->fd);
451 ret = lseek(log->fd, HEAD, SEEK_SET) < 0 ||
452 write(log->fd, ext, EXTRA) != EXTRA ? -1 : 0;
453 fsync(log->fd);
454 return ret;
92} 455}
93 456
94/* write an unsigned long little-endian to byte buffer */ 457/* Rewrite the last block header bits and subsequent zero bits to get to a byte
95local void dice_ulg(unsigned long val, unsigned char *buf) 458 boundary, setting the last block bit if last is true, and then write the
459 remainder of the stored block header (length and one's complement). Leave
460 the file pointer after the end of the last stored block data. Return -1 if
461 there is a read or write failure on the foo.gz file */
462local int log_last(struct log *log, int last)
96{ 463{
97 int n; 464 int back, len, mask;
465 unsigned char buf[6];
466
467 /* determine the locations of the bytes and bits to modify */
468 back = log->last == log->first ? log->back : 8;
469 len = back > 8 ? 2 : 1; /* bytes back from log->last */
470 mask = 0x80 >> ((back - 1) & 7); /* mask for block last-bit */
471
472 /* get the byte to modify (one or two back) into buf[0] -- don't need to
473 read the byte if the last-bit is eight bits back, since in that case
474 the entire byte will be modified */
475 buf[0] = 0;
476 if (back != 8 && (lseek(log->fd, log->last - len, SEEK_SET) < 0 ||
477 read(log->fd, buf, 1) != 1))
478 return -1;
479
480 /* change the last-bit of the last stored block as requested -- note
481 that all bits above the last-bit are set to zero, per the type bits
482 of a stored block being 00 and per the convention that the bits to
483 bring the stream to a byte boundary are also zeros */
484 buf[1] = 0;
485 buf[2 - len] = (*buf & (mask - 1)) + (last ? mask : 0);
98 486
99 for (n = 0; n < 4; n++) { 487 /* write the modified stored block header and lengths, move the file
100 *buf++ = val & 0xff; 488 pointer to after the last stored block data */
101 val >>= 8; 489 PUT2(buf + 2, log->stored);
490 PUT2(buf + 4, log->stored ^ 0xffff);
491 return lseek(log->fd, log->last - len, SEEK_SET) < 0 ||
492 write(log->fd, buf + 2 - len, len + 4) != len + 4 ||
493 lseek(log->fd, log->stored, SEEK_CUR) < 0 ? -1 : 0;
494}
495
496/* Append len bytes from data to the locked and open log file. len may be zero
497 if recovering and no .add file was found. In that case, the previous state
498 of the foo.gz file is restored. The data is appended uncompressed in
499 deflate stored blocks. Return -1 if there was an error reading or writing
500 the foo.gz file. */
501local int log_append(struct log *log, unsigned char *data, size_t len)
502{
503 uint put;
504 off_t end;
505 unsigned char buf[8];
506
507 /* set the last block last-bit and length, in case recovering an
508 interrupted append, then position the file pointer to append to the
509 block */
510 if (log_last(log, 1))
511 return -1;
512
513 /* append, adding stored blocks and updating the offset of the last stored
514 block as needed, and update the total crc and length */
515 while (len) {
516 /* append as much as we can to the last block */
517 put = (MAX_STORE << 10) - log->stored;
518 if (put > len)
519 put = (uint)len;
520 if (put) {
521 if (write(log->fd, data, put) != put)
522 return -1;
523 BAIL(1);
524 log->tcrc = crc32(log->tcrc, data, put);
525 log->tlen += put;
526 log->stored += put;
527 data += put;
528 len -= put;
529 }
530
531 /* if we need to, add a new empty stored block */
532 if (len) {
533 /* mark current block as not last */
534 if (log_last(log, 0))
535 return -1;
536
537 /* point to new, empty stored block */
538 log->last += 4 + log->stored + 1;
539 log->stored = 0;
540 }
541
542 /* mark last block as last, update its length */
543 if (log_last(log, 1))
544 return -1;
545 BAIL(2);
102 } 546 }
547
548 /* write the new crc and length trailer, and truncate just in case (could
549 be recovering from partial append with a missing foo.add file) */
550 PUT4(buf, log->tcrc);
551 PUT4(buf + 4, log->tlen);
552 if (write(log->fd, buf, 8) != 8 ||
553 (end = lseek(log->fd, 0, SEEK_CUR)) < 0 || ftruncate(log->fd, end))
554 return -1;
555
556 /* write the extra field, marking the log file as done, delete .add file */
557 if (log_mark(log, NO_OP))
558 return -1;
559 strcpy(log->end, ".add");
560 unlink(log->path); /* ignore error, since may not exist */
561 return 0;
103} 562}
104 563
105/* write an off_t little-endian to byte buffer */ 564/* Replace the foo.dict file with the foo.temp file. Also delete the foo.add
106local void dice_off(off_t val, unsigned char *buf) 565 file, since the compress operation may have been interrupted before that was
566 done. Returns 1 if memory could not be allocated, or -1 if reading or
567 writing foo.gz fails, or if the rename fails for some reason other than
568 foo.temp not existing. foo.temp not existing is a permitted error, since
569 the replace operation may have been interrupted after the rename is done,
570 but before foo.gz is marked as complete. */
571local int log_replace(struct log *log)
107{ 572{
108 int n; 573 int ret;
574 char *dest;
575
576 /* delete foo.add file */
577 strcpy(log->end, ".add");
578 unlink(log->path); /* ignore error, since may not exist */
579 BAIL(3);
580
581 /* rename foo.name to foo.dict, replacing foo.dict if it exists */
582 strcpy(log->end, ".dict");
583 dest = malloc(strlen(log->path) + 1);
584 if (dest == NULL)
585 return -2;
586 strcpy(dest, log->path);
587 strcpy(log->end, ".temp");
588 ret = rename(log->path, dest);
589 free(dest);
590 if (ret && errno != ENOENT)
591 return -1;
592 BAIL(4);
109 593
110 for (n = 0; n < 8; n++) { 594 /* mark the foo.gz file as done */
111 *buf++ = val & 0xff; 595 return log_mark(log, NO_OP);
112 val >>= 8; 596}
597
598/* Compress the len bytes at data and append the compressed data to the
599 foo.gz deflate data immediately after the previous compressed data. This
600 overwrites the previous uncompressed data, which was stored in foo.add
601 and is the data provided in data[0..len-1]. If this operation is
602 interrupted, it picks up at the start of this routine, with the foo.add
603 file read in again. If there is no data to compress (len == 0), then we
604 simply terminate the foo.gz file after the previously compressed data,
605 appending a final empty stored block and the gzip trailer. Return -1 if
606 reading or writing the log.gz file failed, or -2 if there was a memory
607 allocation failure. */
608local int log_compress(struct log *log, unsigned char *data, size_t len)
609{
610 int fd;
611 uint got, max;
612 ssize_t dict;
613 off_t end;
614 z_stream strm;
615 unsigned char buf[DICT];
616
617 /* compress and append compressed data */
618 if (len) {
619 /* set up for deflate, allocating memory */
620 strm.zalloc = Z_NULL;
621 strm.zfree = Z_NULL;
622 strm.opaque = Z_NULL;
623 if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, 8,
624 Z_DEFAULT_STRATEGY) != Z_OK)
625 return -2;
626
627 /* read in dictionary (last 32K of data that was compressed) */
628 strcpy(log->end, ".dict");
629 fd = open(log->path, O_RDONLY, 0);
630 if (fd >= 0) {
631 dict = read(fd, buf, DICT);
632 close(fd);
633 if (dict < 0) {
634 deflateEnd(&strm);
635 return -1;
636 }
637 if (dict)
638 deflateSetDictionary(&strm, buf, (uint)dict);
639 }
640 log_touch(log);
641
642 /* prime deflate with last bits of previous block, position write
643 pointer to write those bits and overwrite what follows */
644 if (lseek(log->fd, log->first - (log->back > 8 ? 2 : 1),
645 SEEK_SET) < 0 ||
646 read(log->fd, buf, 1) != 1 || lseek(log->fd, -1, SEEK_CUR) < 0) {
647 deflateEnd(&strm);
648 return -1;
649 }
650 deflatePrime(&strm, (8 - log->back) & 7, *buf);
651
652 /* compress, finishing with a partial non-last empty static block */
653 strm.next_in = data;
654 max = (((uint)0 - 1) >> 1) + 1; /* in case int smaller than size_t */
655 do {
656 strm.avail_in = len > max ? max : (uint)len;
657 len -= strm.avail_in;
658 do {
659 strm.avail_out = DICT;
660 strm.next_out = buf;
661 deflate(&strm, len ? Z_NO_FLUSH : Z_PARTIAL_FLUSH);
662 got = DICT - strm.avail_out;
663 if (got && write(log->fd, buf, got) != got) {
664 deflateEnd(&strm);
665 return -1;
666 }
667 log_touch(log);
668 } while (strm.avail_out == 0);
669 } while (len);
670 deflateEnd(&strm);
671 BAIL(5);
672
673 /* find start of empty static block -- scanning backwards the first one
674 bit is the second bit of the block, if the last byte is zero, then
675 we know the byte before that has a one in the top bit, since an
676 empty static block is ten bits long */
677 if ((log->first = lseek(log->fd, -1, SEEK_CUR)) < 0 ||
678 read(log->fd, buf, 1) != 1)
679 return -1;
680 log->first++;
681 if (*buf) {
682 log->back = 1;
683 while ((*buf & ((uint)1 << (8 - log->back++))) == 0)
684 ; /* guaranteed to terminate, since *buf != 0 */
685 }
686 else
687 log->back = 10;
688
689 /* update compressed crc and length */
690 log->ccrc = log->tcrc;
691 log->clen = log->tlen;
692 }
693 else {
694 /* no data to compress -- fix up existing gzip stream */
695 log->tcrc = log->ccrc;
696 log->tlen = log->clen;
113 } 697 }
698
699 /* complete and truncate gzip stream */
700 log->last = log->first;
701 log->stored = 0;
702 PUT4(buf, log->tcrc);
703 PUT4(buf + 4, log->tlen);
704 if (log_last(log, 1) || write(log->fd, buf, 8) != 8 ||
705 (end = lseek(log->fd, 0, SEEK_CUR)) < 0 || ftruncate(log->fd, end))
706 return -1;
707 BAIL(6);
708
709 /* mark as being in the replace operation */
710 if (log_mark(log, REPLACE_OP))
711 return -1;
712
713 /* execute the replace operation and mark the file as done */
714 return log_replace(log);
114} 715}
115 716
116/* initial, empty gzip file for appending */ 717/* log a repair record to the .repairs file */
117local char empty_gz[] = { 718local void log_log(struct log *log, int op, char *record)
118 0x1f, 0x8b, /* magic gzip id */ 719{
119 8, /* compression method is deflate */ 720 time_t now;
120 4, /* there is an extra field */ 721 FILE *rec;
121 0, 0, 0, 0, /* no modification time provided */
122 0, 0xff, /* no extra flags, no OS */
123 20, 0, 'a', 'p', 16, 0, /* extra field with "ap" subfield */
124 32, 0, 0, 0, 0, 0, 0, 0, /* offset of uncompressed data */
125 32, 0, 0, 0, 0, 0, 0, 0, /* offset of last block */
126 1, 0, 0, 0xff, 0xff, /* empty stored block (last) */
127 0, 0, 0, 0, /* crc */
128 0, 0, 0, 0 /* uncompressed length */
129};
130 722
131/* initialize a log object with locking */ 723 now = time(NULL);
132void *gzlog_open(char *path) 724 strcpy(log->end, ".repairs");
725 rec = fopen(log->path, "a");
726 if (rec == NULL)
727 return;
728 fprintf(rec, "%.24s %s recovery: %s\n", ctime(&now), op == APPEND_OP ?
729 "append" : (op == COMPRESS_OP ? "compress" : "replace"), record);
730 fclose(rec);
731 return;
732}
733
734/* Recover the interrupted operation op. First read foo.add for recovering an
735 append or compress operation. Return -1 if there was an error reading or
736 writing foo.gz or reading an existing foo.add, or -2 if there was a memory
737 allocation failure. */
738local int log_recover(struct log *log, int op)
133{ 739{
134 unsigned xlen; 740 int fd, ret = 0;
135 unsigned char temp[20]; 741 unsigned char *data = NULL;
136 unsigned sub_len; 742 size_t len = 0;
137 int good; 743 struct stat st;
138 gz_log *log;
139
140 /* allocate log structure */
141 log = malloc(sizeof(gz_log));
142 if (log == NULL)
143 return NULL;
144 log->id = GZLOGID;
145 744
146 /* open file, creating it if necessary, and locking it */ 745 /* log recovery */
147 log->fd = open(path, O_RDWR | O_CREAT, 0600); 746 log_log(log, op, "start");
148 if (log->fd < 0) { 747
149 free(log); 748 /* load foo.add file if expected and present */
150 return NULL; 749 if (op == APPEND_OP || op == COMPRESS_OP) {
750 strcpy(log->end, ".add");
751 if (stat(log->path, &st) == 0 && st.st_size) {
752 len = (size_t)(st.st_size);
753 if (len != st.st_size || (data = malloc(st.st_size)) == NULL) {
754 log_log(log, op, "allocation failure");
755 return -2;
756 }
757 if ((fd = open(log->path, O_RDONLY, 0)) < 0) {
758 log_log(log, op, ".add file read failure");
759 return -1;
760 }
761 ret = read(fd, data, len) != len;
762 close(fd);
763 if (ret) {
764 log_log(log, op, ".add file read failure");
765 return -1;
766 }
767 log_log(log, op, "loaded .add file");
768 }
769 else
770 log_log(log, op, "missing .add file!");
771 }
772
773 /* recover the interrupted operation */
774 switch (op) {
775 case APPEND_OP:
776 ret = log_append(log, data, len);
777 break;
778 case COMPRESS_OP:
779 ret = log_compress(log, data, len);
780 break;
781 case REPLACE_OP:
782 ret = log_replace(log);
151 } 783 }
152 if (lock(log->fd)) { 784
785 /* log status */
786 log_log(log, op, ret ? "failure" : "complete");
787
788 /* clean up */
789 if (data != NULL)
790 free(data);
791 return ret;
792}
793
794/* Close the foo.gz file (if open) and release the lock. */
795local void log_close(struct log *log)
796{
797 if (log->fd >= 0)
153 close(log->fd); 798 close(log->fd);
154 free(log); 799 log->fd = -1;
155 return NULL; 800 log_unlock(log);
801}
802
803/* Open foo.gz, verify the header, and load the extra field contents, after
804 first creating the foo.lock file to gain exclusive access to the foo.*
805 files. If foo.gz does not exist or is empty, then write the initial header,
806 extra, and body content of an empty foo.gz log file. If there is an error
807 creating the lock file due to access restrictions, or an error reading or
808 writing the foo.gz file, or if the foo.gz file is not a proper log file for
809 this object (e.g. not a gzip file or does not contain the expected extra
810 field), then return true. If there is an error, the lock is released.
811 Otherwise, the lock is left in place. */
812local int log_open(struct log *log)
813{
814 int op;
815
816 /* release open file resource if left over -- can occur if lock lost
817 between gzlog_open() and gzlog_write() */
818 if (log->fd >= 0)
819 close(log->fd);
820 log->fd = -1;
821
822 /* negotiate exclusive access */
823 if (log_lock(log) < 0)
824 return -1;
825
826 /* open the log file, foo.gz */
827 strcpy(log->end, ".gz");
828 log->fd = open(log->path, O_RDWR | O_CREAT, 0644);
829 if (log->fd < 0) {
830 log_close(log);
831 return -1;
156 } 832 }
157 833
158 /* if file is empty, write new gzip stream */ 834 /* if new, initialize foo.gz with an empty log, delete old dictionary */
159 if (lseek(log->fd, 0, SEEK_END) == 0) { 835 if (lseek(log->fd, 0, SEEK_END) == 0) {
160 if (write(log->fd, empty_gz, sizeof(empty_gz)) != sizeof(empty_gz)) { 836 if (write(log->fd, log_gzhead, HEAD) != HEAD ||
161 log_clean(log); 837 write(log->fd, log_gzext, EXTRA) != EXTRA ||
162 return NULL; 838 write(log->fd, log_gzbody, BODY) != BODY) {
839 log_close(log);
840 return -1;
163 } 841 }
842 strcpy(log->end, ".dict");
843 unlink(log->path);
164 } 844 }
165 845
166 /* check gzip header */ 846 /* verify log file and load extra field information */
167 (void)lseek(log->fd, 0, SEEK_SET); 847 if ((op = log_head(log)) < 0) {
168 if (read(log->fd, temp, 12) != 12 || temp[0] != 0x1f || 848 log_close(log);
169 temp[1] != 0x8b || temp[2] != 8 || (temp[3] & 4) == 0) { 849 return -1;
170 log_clean(log);
171 return NULL;
172 } 850 }
173 851
174 /* process extra field to find "ap" sub-field */ 852 /* check for interrupted process and if so, recover */
175 xlen = temp[10] + (temp[11] << 8); 853 if (op != NO_OP && log_recover(log, op)) {
176 good = 0; 854 log_close(log);
177 while (xlen) { 855 return -1;
178 if (xlen < 4 || read(log->fd, temp, 4) != 4)
179 break;
180 sub_len = temp[2];
181 sub_len += temp[3] << 8;
182 xlen -= 4;
183 if (memcmp(temp, "ap", 2) == 0 && sub_len == 16) {
184 good = 1;
185 break;
186 }
187 if (xlen < sub_len)
188 break;
189 (void)lseek(log->fd, sub_len, SEEK_CUR);
190 xlen -= sub_len;
191 } 856 }
192 if (!good) { 857
193 log_clean(log); 858 /* touch the lock file to prevent another process from grabbing it */
859 log_touch(log);
860 return 0;
861}
862
863/* See gzlog.h for the description of the external methods below */
864gzlog *gzlog_open(char *path)
865{
866 size_t n;
867 struct log *log;
868
869 /* check arguments */
870 if (path == NULL || *path == 0)
194 return NULL; 871 return NULL;
195 }
196 872
197 /* read in "ap" sub-field */ 873 /* allocate and initialize log structure */
198 log->extra = lseek(log->fd, 0, SEEK_CUR); 874 log = malloc(sizeof(struct log));
199 if (read(log->fd, temp, 16) != 16) { 875 if (log == NULL)
200 log_clean(log); 876 return NULL;
877 strcpy(log->id, LOGID);
878 log->fd = -1;
879
880 /* save path and end of path for name construction */
881 n = strlen(path);
882 log->path = malloc(n + 9); /* allow for ".repairs" */
883 if (log->path == NULL) {
884 free(log);
201 return NULL; 885 return NULL;
202 } 886 }
203 log->mark_off = make_off(temp); 887 strcpy(log->path, path);
204 log->last_off = make_off(temp + 8); 888 log->end = log->path + n;
205 889
206 /* get crc, length of gzip file */ 890 /* gain exclusive access and verify log file -- may perform a
207 (void)lseek(log->fd, log->last_off, SEEK_SET); 891 recovery operation if needed */
208 if (read(log->fd, temp, 13) != 13 || 892 if (log_open(log)) {
209 memcmp(temp, "\001\000\000\377\377", 5) != 0) { 893 free(log->path);
210 log_clean(log); 894 free(log);
211 return NULL; 895 return NULL;
212 } 896 }
213 log->crc = make_ulg(temp + 5);
214 log->len = make_ulg(temp + 9);
215 897
216 /* set up to write over empty last block */ 898 /* return pointer to log structure */
217 (void)lseek(log->fd, log->last_off + 5, SEEK_SET); 899 return log;
218 log->stored = 0;
219 return (void *)log;
220} 900}
221 901
222/* maximum amount to put in a stored block before starting a new one */ 902/* gzlog_compress() return values:
223#define MAX_BLOCK 16384 903 0: all good
224 904 -1: file i/o error (usually access issue)
225/* write a block to a log object */ 905 -2: memory allocation failure
226int gzlog_write(void *obj, char *data, size_t len) 906 -3: invalid log pointer argument */
907int gzlog_compress(gzlog *logd)
227{ 908{
228 size_t some; 909 int fd, ret;
229 unsigned char temp[5]; 910 uint block;
230 gz_log *log; 911 size_t len, next;
912 unsigned char *data, buf[5];
913 struct log *log = logd;
231 914
232 /* check object */ 915 /* check arguments */
233 log = (gz_log *)obj; 916 if (log == NULL || strcmp(log->id, LOGID) || len < 0)
234 if (log == NULL || log->id != GZLOGID) 917 return -3;
235 return 1;
236 918
237 /* write stored blocks until all of the input is written */ 919 /* see if we lost the lock -- if so get it again and reload the extra
238 do { 920 field information (it probably changed), recover last operation if
239 some = MAX_BLOCK - log->stored; 921 necessary */
240 if (some > len) 922 if (log_check(log) && log_open(log))
241 some = len; 923 return -1;
242 if (write(log->fd, data, some) != some)
243 return 1;
244 log->crc = crc32(log->crc, (unsigned char *)data, some);
245 log->len += some;
246 len -= some;
247 data += some;
248 log->stored += some;
249
250 /* if the stored block is full, end it and start another */
251 if (log->stored == MAX_BLOCK) {
252 (void)lseek(log->fd, log->last_off, SEEK_SET);
253 temp[0] = 0;
254 dice_ulg(log->stored + ((unsigned long)(~log->stored) << 16),
255 temp + 1);
256 if (write(log->fd, temp, 5) != 5)
257 return 1;
258 log->last_off = lseek(log->fd, log->stored, SEEK_CUR);
259 (void)lseek(log->fd, 5, SEEK_CUR);
260 log->stored = 0;
261 }
262 } while (len);
263 return 0;
264}
265 924
266/* recompress the remaining stored deflate data in place */ 925 /* create space for uncompressed data */
267local int recomp(gz_log *log) 926 len = ((size_t)(log->last - log->first) & ~(((size_t)1 << 10) - 1)) +
268{ 927 log->stored;
269 z_stream strm; 928 if ((data = malloc(len)) == NULL)
270 size_t len, max; 929 return -2;
271 unsigned char *in;
272 unsigned char *out;
273 unsigned char temp[16];
274
275 /* allocate space and read it all in (it's around 1 MB) */
276 len = log->last_off - log->mark_off;
277 max = len + (len >> 12) + (len >> 14) + 11;
278 out = malloc(max);
279 if (out == NULL)
280 return 1;
281 in = malloc(len);
282 if (in == NULL) {
283 free(out);
284 return 1;
285 }
286 (void)lseek(log->fd, log->mark_off, SEEK_SET);
287 if (read(log->fd, in, len) != len) {
288 free(in);
289 free(out);
290 return 1;
291 }
292 930
293 /* recompress in memory, decoding stored data as we go */ 931 /* do statement here is just a cheap trick for error handling */
294 /* note: this assumes that unsigned is four bytes or more */ 932 do {
295 /* consider not making that assumption */ 933 /* read in the uncompressed data */
296 strm.zalloc = Z_NULL; 934 if (lseek(log->fd, log->first - 1, SEEK_SET) < 0)
297 strm.zfree = Z_NULL;
298 strm.opaque = Z_NULL;
299 if (deflateInit2(&strm, Z_BEST_COMPRESSION, Z_DEFLATED, -15, 8,
300 Z_DEFAULT_STRATEGY) != Z_OK) {
301 free(in);
302 free(out);
303 return 1;
304 }
305 strm.next_in = in;
306 strm.avail_out = max;
307 strm.next_out = out;
308 while (len >= 5) {
309 if (strm.next_in[0] != 0)
310 break; 935 break;
311 strm.avail_in = strm.next_in[1] + (strm.next_in[2] << 8); 936 next = 0;
312 strm.next_in += 5; 937 while (next < len) {
313 len -= 5; 938 if (read(log->fd, buf, 5) != 5)
314 if (strm.avail_in != 0) {
315 if (len < strm.avail_in)
316 break; 939 break;
317 len -= strm.avail_in; 940 block = PULL2(buf + 1);
318 (void)deflate(&strm, Z_NO_FLUSH); 941 if (next + block > len ||
319 if (strm.avail_in != 0 || strm.avail_out == 0) 942 read(log->fd, (char *)data + next, block) != block)
320 break; 943 break;
944 next += block;
321 } 945 }
322 } 946 if (lseek(log->fd, 0, SEEK_CUR) != log->last + 4 + log->stored)
323 (void)deflate(&strm, Z_SYNC_FLUSH); 947 break;
324 (void)deflateEnd(&strm); 948 log_touch(log);
325 free(in);
326 if (len != 0 || strm.avail_out == 0) {
327 free(out);
328 return 1;
329 }
330 949
331 /* overwrite stored data with compressed data */ 950 /* write the uncompressed data to the .add file */
332 (void)lseek(log->fd, log->mark_off, SEEK_SET); 951 strcpy(log->end, ".add");
333 len = max - strm.avail_out; 952 fd = open(log->path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
334 if (write(log->fd, out, len) != len) { 953 if (fd < 0)
335 free(out); 954 break;
336 return 1; 955 ret = write(fd, data, len) != len;
337 } 956 if (ret | close(fd))
338 free(out); 957 break;
339 958 log_touch(log);
340 /* write last empty block, crc, and length */
341 log->mark_off = log->last_off = lseek(log->fd, 0, SEEK_CUR);
342 temp[0] = 1;
343 dice_ulg(0xffffL << 16, temp + 1);
344 dice_ulg(log->crc, temp + 5);
345 dice_ulg(log->len, temp + 9);
346 if (write(log->fd, temp, 13) != 13)
347 return 1;
348 959
349 /* truncate file to discard remaining stored data and old trailer */ 960 /* write the dictionary for the next compress to the .temp file */
350 ftruncate(log->fd, lseek(log->fd, 0, SEEK_CUR)); 961 strcpy(log->end, ".temp");
962 fd = open(log->path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
963 if (fd < 0)
964 break;
965 next = DICT > len ? len : DICT;
966 ret = write(fd, (char *)data + len - next, next) != next;
967 if (ret | close(fd))
968 break;
969 log_touch(log);
351 970
352 /* update extra field to point to new last empty block */ 971 /* roll back to compressed data, mark the compress in progress */
353 (void)lseek(log->fd, log->extra, SEEK_SET); 972 log->last = log->first;
354 dice_off(log->mark_off, temp); 973 log->stored = 0;
355 dice_off(log->last_off, temp + 8); 974 if (log_mark(log, COMPRESS_OP))
356 if (write(log->fd, temp, 16) != 16) 975 break;
357 return 1; 976 BAIL(7);
358 return 0; 977
359} 978 /* compress and append the data (clears mark) */
979 ret = log_compress(log, data, len);
980 free(data);
981 return ret;
982 } while (0);
360 983
361/* maximum accumulation of stored blocks before compressing */ 984 /* broke out of do above on i/o error */
362#define MAX_STORED 1048576 985 free(data);
986 return -1;
987}
363 988
364/* close log object */ 989/* gzlog_write() return values:
365int gzlog_close(void *obj) 990 0: all good
991 -1: file i/o error (usually access issue)
992 -2: memory allocation failure
993 -3: invalid log pointer argument */
994int gzlog_write(gzlog *logd, void *data, size_t len)
366{ 995{
367 unsigned char temp[8]; 996 int fd, ret;
368 gz_log *log; 997 struct log *log = logd;
369 998
370 /* check object */ 999 /* check arguments */
371 log = (gz_log *)obj; 1000 if (log == NULL || strcmp(log->id, LOGID) || len < 0)
372 if (log == NULL || log->id != GZLOGID) 1001 return -3;
373 return 1; 1002 if (data == NULL || len == 0)
1003 return 0;
374 1004
375 /* go to start of most recent block being written */ 1005 /* see if we lost the lock -- if so get it again and reload the extra
376 (void)lseek(log->fd, log->last_off, SEEK_SET); 1006 field information (it probably changed), recover last operation if
377 1007 necessary */
378 /* if some stuff was put there, update block */ 1008 if (log_check(log) && log_open(log))
379 if (log->stored) { 1009 return -1;
380 temp[0] = 0;
381 dice_ulg(log->stored + ((unsigned long)(~log->stored) << 16),
382 temp + 1);
383 if (write(log->fd, temp, 5) != 5)
384 return 1;
385 log->last_off = lseek(log->fd, log->stored, SEEK_CUR);
386 }
387 1010
388 /* write last block (empty) */ 1011 /* create and write .add file */
389 if (write(log->fd, "\001\000\000\377\377", 5) != 5) 1012 strcpy(log->end, ".add");
390 return 1; 1013 fd = open(log->path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
1014 if (fd < 0)
1015 return -1;
1016 ret = write(fd, data, len) != len;
1017 if (ret | close(fd))
1018 return -1;
1019 log_touch(log);
391 1020
392 /* write updated crc and uncompressed length */ 1021 /* mark log file with append in progress */
393 dice_ulg(log->crc, temp); 1022 if (log_mark(log, APPEND_OP))
394 dice_ulg(log->len, temp + 4); 1023 return -1;
395 if (write(log->fd, temp, 8) != 8) 1024 BAIL(8);
396 return 1;
397 1025
398 /* put offset of that last block in gzip extra block */ 1026 /* append data (clears mark) */
399 (void)lseek(log->fd, log->extra + 8, SEEK_SET); 1027 if (log_append(log, data, len))
400 dice_off(log->last_off, temp); 1028 return -1;
401 if (write(log->fd, temp, 8) != 8)
402 return 1;
403 1029
404 /* if more than 1 MB stored, then time to compress it */ 1030 /* check to see if it's time to compress -- if not, then done */
405 if (log->last_off - log->mark_off > MAX_STORED) { 1031 if (((log->last - log->first) >> 10) + (log->stored >> 10) < TRIGGER)
406 if (recomp(log)) 1032 return 0;
407 return 1; 1033
408 } 1034 /* time to compress */
1035 return gzlog_compress(log);
1036}
1037
1038/* gzlog_close() return values:
1039 0: ok
1040 -3: invalid log pointer argument */
1041int gzlog_close(gzlog *logd)
1042{
1043 struct log *log = logd;
409 1044
410 /* unlock and close file */ 1045 /* check arguments */
411 log_clean(log); 1046 if (log == NULL || strcmp(log->id, LOGID))
1047 return -3;
1048
1049 /* close the log file and release the lock */
1050 log_close(log);
1051
1052 /* free structure and return */
1053 if (log->path != NULL)
1054 free(log->path);
1055 strcpy(log->id, "bad");
1056 free(log);
412 return 0; 1057 return 0;
413} 1058}
diff --git a/examples/gzlog.h b/examples/gzlog.h
index a800bd5..c461426 100644
--- a/examples/gzlog.h
+++ b/examples/gzlog.h
@@ -1,6 +1,6 @@
1/* gzlog.h 1/* gzlog.h
2 Copyright (C) 2004 Mark Adler, all rights reserved 2 Copyright (C) 2004, 2008 Mark Adler, all rights reserved
3 version 1.0, 26 Nov 2004 3 version 2.0, 25 Apr 2008
4 4
5 This software is provided 'as-is', without any express or implied 5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the author be held liable for any damages 6 warranty. In no event will the author be held liable for any damages
@@ -21,38 +21,69 @@
21 Mark Adler madler@alumni.caltech.edu 21 Mark Adler madler@alumni.caltech.edu
22 */ 22 */
23 23
24/* Version History:
25 1.0 26 Nov 2004 First version
26 2.0 25 Apr 2008 Complete redesign for recovery of interrupted operations
27 Interface changed slightly in that now path is a prefix
28 Compression now occurs as needed during gzlog_write()
29 gzlog_write() now always leaves the log file as valid gzip
30 */
31
24/* 32/*
25 The gzlog object allows writing short messages to a gzipped log file, 33 The gzlog object allows writing short messages to a gzipped log file,
26 opening the log file locked for small bursts, and then closing it. The log 34 opening the log file locked for small bursts, and then closing it. The log
27 object works by appending stored data to the gzip file until 1 MB has been 35 object works by appending stored (uncompressed) data to the gzip file until
28 accumulated. At that time, the stored data is compressed, and replaces the 36 1 MB has been accumulated. At that time, the stored data is compressed, and
29 uncompressed data in the file. The log file is truncated to its new size at 37 replaces the uncompressed data in the file. The log file is truncated to
30 that time. After closing, the log file is always valid gzip file that can 38 its new size at that time. After each write operation, the log file is a
31 decompressed to recover what was written. 39 valid gzip file that can decompressed to recover what was written.
32 40
33 A gzip header "extra" field contains two file offsets for appending. The 41 The gzlog operations can be interupted at any point due to an application or
34 first points to just after the last compressed data. The second points to 42 system crash, and the log file will be recovered the next time the log is
35 the last stored block in the deflate stream, which is empty. All of the 43 opened with gzlog_open().
36 data between those pointers is uncompressed.
37 */ 44 */
38 45
46#ifndef GZLOG_H
47#define GZLOG_H
48
49/* gzlog object type */
50typedef void gzlog;
51
39/* Open a gzlog object, creating the log file if it does not exist. Return 52/* Open a gzlog object, creating the log file if it does not exist. Return
40 NULL on error. Note that gzlog_open() could take a long time to return if 53 NULL on error. Note that gzlog_open() could take a while to complete if it
41 there is difficulty in locking the file. */ 54 has to wait to verify that a lock is stale (possibly for five minutes), or
42void *gzlog_open(char *path); 55 if there is significant contention with other instantiations of this object
43 56 when locking the resource. path is the prefix of the file names created by
44/* Write to a gzlog object. Return non-zero on error. This function will 57 this object. If path is "foo", then the log file will be "foo.gz", and
45 simply write data to the file uncompressed. Compression of the data 58 other auxiliary files will be created and destroyed during the process:
46 will not occur until gzlog_close() is called. It is expected that 59 "foo.dict" for a compression dictionary, "foo.temp" for a temporary (next)
47 gzlog_write() is used for a short message, and then gzlog_close() is 60 dictionary, "foo.add" for data being added or compressed, "foo.lock" for the
48 called. If a large amount of data is to be written, then the application 61 lock file, and "foo.repairs" to log recovery operations performed due to
49 should write no more than 1 MB at a time with gzlog_write() before 62 interrupted gzlog operations. A gzlog_open() followed by a gzlog_close()
50 calling gzlog_close() and then gzlog_open() again. */ 63 will recover a previously interrupted operation, if any. */
51int gzlog_write(void *log, char *data, size_t len); 64gzlog *gzlog_open(char *path);
52 65
53/* Close a gzlog object. Return non-zero on error. The log file is locked 66/* Write to a gzlog object. Return zero on success, -1 if there is a file i/o
54 until this function is called. This function will compress stored data 67 error on any of the gzlog files (this should not happen if gzlog_open()
55 at the end of the gzip file if at least 1 MB has been accumulated. Note 68 succeeded, unless the device has run out of space or leftover auxiliary
56 that the file will not be a valid gzip file until this function completes. 69 files have permissions or ownership that prevent their use), -2 if there is
57 */ 70 a memory allocation failure, or -3 if the log argument is invalid (e.g. if
58int gzlog_close(void *log); 71 it was not created by gzlog_open()). This function will write data to the
72 file uncompressed, until 1 MB has been accumulated, at which time that data
73 will be compressed. The log file will be a valid gzip file upon successful
74 return. */
75int gzlog_write(gzlog *log, void *data, size_t len);
76
77/* Force compression of any uncompressed data in the log. This should be used
78 sparingly, if at all. The main application would be when a log file will
79 not be appended to again. If this is used to compress frequently while
80 appending, it will both significantly increase the execution time and
81 reduce the compression ratio. The return codes are the same as for
82 gzlog_write(). */
83int gzlog_compress(gzlog *log);
84
85/* Close a gzlog object. Return zero on success, -3 if the log argument is
86 invalid. The log object is freed, and so cannot be referenced again. */
87int gzlog_close(gzlog *log);
88
89#endif
diff --git a/examples/pigz.c b/examples/pigz.c
new file mode 100644
index 0000000..42794d0
--- /dev/null
+++ b/examples/pigz.c
@@ -0,0 +1,452 @@
1/* pigz.c -- parallel implementation of gzip
2 * Copyright (C) 2007 Mark Adler
3 * Version 1.1 28 January 2007 Mark Adler
4 */
5
6/* Version history:
7 1.0 17 Jan 2007 First version
8 1.1 28 Jan 2007 Avoid void * arithmetic (some compilers don't get that)
9 Add note about requiring zlib 1.2.3
10 Allow compression level 0 (no compression)
11 Completely rewrite parallelism -- add a write thread
12 Use deflateSetDictionary() to make use of history
13 Tune argument defaults to best performance on four cores
14 */
15
16/*
17 pigz compresses from stdin to stdout using threads to make use of multiple
18 processors and cores. The input is broken up into 128 KB chunks, and each
19 is compressed separately. The CRC for each chunk is also calculated
20 separately. The compressed chunks are written in order to the output,
21 and the overall CRC is calculated from the CRC's of the chunks.
22
23 The compressed data format generated is the gzip format using the deflate
24 compression method. First a gzip header is written, followed by raw deflate
25 partial streams. They are partial, in that they do not have a terminating
26 block. At the end, the deflate stream is terminated with a final empty
27 static block, and lastly a gzip trailer is written with the CRC and the
28 number of input bytes.
29
30 Each raw deflate partial stream is terminated by an empty stored block
31 (using the Z_SYNC_FLUSH option of zlib), in order to end that partial
32 bit stream at a byte boundary. That allows the partial streams to be
33 concantenated simply as sequences of bytes. This adds a very small four
34 or five byte overhead to the output for each input chunk.
35
36 zlib's crc32_combine() routine allows the calcuation of the CRC of the
37 entire input using the independent CRC's of the chunks. pigz requires zlib
38 version 1.2.3 or later, since that is the first version that provides the
39 crc32_combine() function.
40
41 pigz uses the POSIX pthread library for thread control and communication.
42 */
43
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <pthread.h>
48#include <sys/types.h>
49#include <sys/uio.h>
50#include <unistd.h>
51#include "zlib.h"
52
53#define local static
54
55/* exit with error */
56local void bail(char *msg)
57{
58 fprintf(stderr, "pigz abort: %s\n", msg);
59 exit(1);
60}
61
62/* read up to len bytes into buf, repeating read() calls as needed */
63local size_t readn(int desc, unsigned char *buf, size_t len)
64{
65 ssize_t ret;
66 size_t got;
67
68 got = 0;
69 while (len) {
70 ret = read(desc, buf, len);
71 if (ret < 0)
72 bail("read error");
73 if (ret == 0)
74 break;
75 buf += ret;
76 len -= ret;
77 got += ret;
78 }
79 return got;
80}
81
82/* write len bytes, repeating write() calls as needed */
83local void writen(int desc, unsigned char *buf, size_t len)
84{
85 ssize_t ret;
86
87 while (len) {
88 ret = write(desc, buf, len);
89 if (ret < 1)
90 bail("write error");
91 buf += ret;
92 len -= ret;
93 }
94}
95
96/* a flag variable for communication between two threads */
97struct flag {
98 int value; /* value of flag */
99 pthread_mutex_t lock; /* lock for checking and changing flag */
100 pthread_cond_t cond; /* condition for signaling on flag change */
101};
102
103/* initialize a flag for use, starting with value val */
104local void flag_init(struct flag *me, int val)
105{
106 me->value = val;
107 pthread_mutex_init(&(me->lock), NULL);
108 pthread_cond_init(&(me->cond), NULL);
109}
110
111/* set the flag to val, signal another process that may be waiting for it */
112local void flag_set(struct flag *me, int val)
113{
114 pthread_mutex_lock(&(me->lock));
115 me->value = val;
116 pthread_cond_signal(&(me->cond));
117 pthread_mutex_unlock(&(me->lock));
118}
119
120/* if it isn't already, wait for some other thread to set the flag to val */
121local void flag_wait(struct flag *me, int val)
122{
123 pthread_mutex_lock(&(me->lock));
124 while (me->value != val)
125 pthread_cond_wait(&(me->cond), &(me->lock));
126 pthread_mutex_unlock(&(me->lock));
127}
128
129/* if flag is equal to val, wait for some other thread to change it */
130local void flag_wait_not(struct flag *me, int val)
131{
132 pthread_mutex_lock(&(me->lock));
133 while (me->value == val)
134 pthread_cond_wait(&(me->cond), &(me->lock));
135 pthread_mutex_unlock(&(me->lock));
136}
137
138/* clean up the flag when done with it */
139local void flag_done(struct flag *me)
140{
141 pthread_cond_destroy(&(me->cond));
142 pthread_mutex_destroy(&(me->lock));
143}
144
145/* a unit of work to feed to compress_thread() -- it is assumed that the out
146 buffer is large enough to hold the maximum size len bytes could deflate to,
147 plus five bytes for the final sync marker */
148struct work {
149 size_t len; /* length of input */
150 unsigned long crc; /* crc of input */
151 unsigned char *buf; /* input */
152 unsigned char *out; /* space for output (guaranteed big enough) */
153 z_stream strm; /* pre-initialized z_stream */
154 struct flag busy; /* busy flag indicating work unit in use */
155 pthread_t comp; /* this compression thread */
156};
157
158/* busy flag values */
159#define IDLE 0 /* compress and writing done -- can start compress */
160#define COMP 1 /* compress -- input and output buffers in use */
161#define WRITE 2 /* compress done, writing output -- can read input */
162
163/* read-only globals (set by main/read thread before others started) */
164local int ind; /* input file descriptor */
165local int outd; /* output file descriptor */
166local int level; /* compression level */
167local int procs; /* number of compression threads (>= 2) */
168local size_t size; /* uncompressed input size per thread (>= 32K) */
169local struct work *jobs; /* work units: jobs[0..procs-1] */
170
171/* next and previous jobs[] indices */
172#define NEXT(n) ((n) == procs - 1 ? 0 : (n) + 1)
173#define PREV(n) ((n) == 0 ? procs - 1 : (n) - 1)
174
175/* sliding dictionary size for deflate */
176#define DICT 32768U
177
178/* largest power of 2 that fits in an unsigned int -- used to limit requests
179 to zlib functions that use unsigned int lengths */
180#define MAX ((((unsigned)-1) >> 1) + 1)
181
182/* compress thread: compress the input in the provided work unit and compute
183 its crc -- assume that the amount of space at job->out is guaranteed to be
184 enough for the compressed output, as determined by the maximum expansion
185 of deflate compression -- use the input in the previous work unit (if there
186 is one) to set the deflate dictionary for better compression */
187local void *compress_thread(void *arg)
188{
189 size_t len; /* input length for this work unit */
190 unsigned long crc; /* crc of input data */
191 struct work *prev; /* previous work unit */
192 struct work *job = arg; /* work unit for this thread */
193 z_stream *strm = &(job->strm); /* zlib stream for this work unit */
194
195 /* reset state for a new compressed stream */
196 (void)deflateReset(strm);
197
198 /* initialize input, output, and crc */
199 strm->next_in = job->buf;
200 strm->next_out = job->out;
201 len = job->len;
202 crc = crc32(0L, Z_NULL, 0);
203
204 /* set dictionary if this isn't the first work unit, and if we will be
205 compressing something (the read thread assures that the dictionary
206 data in the previous work unit is still there) */
207 prev = jobs + PREV(job - jobs);
208 if (prev->buf != NULL && len != 0)
209 deflateSetDictionary(strm, prev->buf + (size - DICT), DICT);
210
211 /* run MAX-sized amounts of input through deflate and crc32 -- this loop
212 is needed for those cases where the integer type is smaller than the
213 size_t type, or when len is close to the limit of the size_t type */
214 while (len > MAX) {
215 strm->avail_in = MAX;
216 strm->avail_out = (unsigned)-1;
217 crc = crc32(crc, strm->next_in, strm->avail_in);
218 (void)deflate(strm, Z_NO_FLUSH);
219 len -= MAX;
220 }
221
222 /* run last piece through deflate and crc32, follow with a sync marker */
223 if (len) {
224 strm->avail_in = len;
225 strm->avail_out = (unsigned)-1;
226 crc = crc32(crc, strm->next_in, strm->avail_in);
227 (void)deflate(strm, Z_SYNC_FLUSH);
228 }
229
230 /* don't need to Z_FINISH, since we'd delete the last two bytes anyway */
231
232 /* return result */
233 job->crc = crc;
234 return NULL;
235}
236
237/* put a 4-byte integer into a byte array in LSB order */
238#define PUT4(a,b) (*(a)=(b),(a)[1]=(b)>>8,(a)[2]=(b)>>16,(a)[3]=(b)>>24)
239
240/* write thread: wait for compression threads to complete, write output in
241 order, also write gzip header and trailer around the compressed data */
242local void *write_thread(void *arg)
243{
244 int n; /* compress thread index */
245 size_t len; /* length of input processed */
246 unsigned long tot; /* total uncompressed size (overflow ok) */
247 unsigned long crc; /* CRC-32 of uncompressed data */
248 unsigned char wrap[10]; /* gzip header or trailer */
249
250 /* write simple gzip header */
251 memcpy(wrap, "\037\213\10\0\0\0\0\0\0\3", 10);
252 wrap[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
253 writen(outd, wrap, 10);
254
255 /* process output of compress threads until end of input */
256 tot = 0;
257 crc = crc32(0L, Z_NULL, 0);
258 n = 0;
259 do {
260 /* wait for compress thread to start, then wait to complete */
261 flag_wait(&(jobs[n].busy), COMP);
262 pthread_join(jobs[n].comp, NULL);
263
264 /* now that compress is done, allow read thread to use input buffer */
265 flag_set(&(jobs[n].busy), WRITE);
266
267 /* write compressed data and update length and crc */
268 writen(outd, jobs[n].out, jobs[n].strm.next_out - jobs[n].out);
269 len = jobs[n].len;
270 tot += len;
271 crc = crc32_combine(crc, jobs[n].crc, len);
272
273 /* release this work unit and go to the next work unit */
274 flag_set(&(jobs[n].busy), IDLE);
275 n = NEXT(n);
276
277 /* an input buffer less than size in length indicates end of input */
278 } while (len == size);
279
280 /* write final static block and gzip trailer (crc and len mod 2^32) */
281 wrap[0] = 3; wrap[1] = 0;
282 PUT4(wrap + 2, crc);
283 PUT4(wrap + 6, tot);
284 writen(outd, wrap, 10);
285 return NULL;
286}
287
288/* one-time initialization of a work unit -- this is where we set the deflate
289 compression level and request raw deflate, and also where we set the size
290 of the output buffer to guarantee enough space for a worst-case deflate
291 ending with a Z_SYNC_FLUSH */
292local void job_init(struct work *job)
293{
294 int ret; /* deflateInit2() return value */
295
296 job->buf = malloc(size);
297 job->out = malloc(size + (size >> 11) + 10);
298 job->strm.zfree = Z_NULL;
299 job->strm.zalloc = Z_NULL;
300 job->strm.opaque = Z_NULL;
301 ret = deflateInit2(&(job->strm), level, Z_DEFLATED, -15, 8,
302 Z_DEFAULT_STRATEGY);
303 if (job->buf == NULL || job->out == NULL || ret != Z_OK)
304 bail("not enough memory");
305}
306
307/* compress ind to outd in the gzip format, using multiple threads for the
308 compression and crc calculation and another thread for writing the output --
309 the read thread is the main thread */
310local void read_thread(void)
311{
312 int n; /* general index */
313 size_t got; /* amount read */
314 pthread_attr_t attr; /* thread attributes (left at defaults) */
315 pthread_t write; /* write thread */
316
317 /* set defaults (not all pthread implementations default to joinable) */
318 pthread_attr_init(&attr);
319 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
320
321 /* allocate and set up work list (individual work units will be initialized
322 as needed, in case the input is short), assure that allocation size
323 arithmetic does not overflow */
324 if (size + (size >> 11) + 10 < (size >> 11) + 10 ||
325 (ssize_t)(size + (size >> 11) + 10) < 0 ||
326 ((size_t)0 - 1) / procs <= sizeof(struct work) ||
327 (jobs = malloc(procs * sizeof(struct work))) == NULL)
328 bail("not enough memory");
329 for (n = 0; n < procs; n++) {
330 jobs[n].buf = NULL;
331 flag_init(&(jobs[n].busy), IDLE);
332 }
333
334 /* start write thread */
335 pthread_create(&write, &attr, write_thread, NULL);
336
337 /* read from input and start compress threads (write thread will pick up
338 the output of the compress threads) */
339 n = 0;
340 do {
341 /* initialize this work unit if it's the first time it's used */
342 if (jobs[n].buf == NULL)
343 job_init(jobs + n);
344
345 /* read input data, but wait for last compress on this work unit to be
346 done, and wait for the dictionary to be used by the last compress on
347 the next work unit */
348 flag_wait_not(&(jobs[n].busy), COMP);
349 flag_wait_not(&(jobs[NEXT(n)].busy), COMP);
350 got = readn(ind, jobs[n].buf, size);
351
352 /* start compress thread, but wait for write to be done first */
353 flag_wait(&(jobs[n].busy), IDLE);
354 jobs[n].len = got;
355 pthread_create(&(jobs[n].comp), &attr, compress_thread, jobs + n);
356
357 /* mark work unit so write thread knows compress was started */
358 flag_set(&(jobs[n].busy), COMP);
359
360 /* go to the next work unit */
361 n = NEXT(n);
362
363 /* do until end of input, indicated by a read less than size */
364 } while (got == size);
365
366 /* wait for the write thread to complete -- the write thread will join with
367 all of the compress threads, so this waits for all of the threads to
368 complete */
369 pthread_join(write, NULL);
370
371 /* free up all requested resources and return */
372 for (n = procs - 1; n >= 0; n--) {
373 flag_done(&(jobs[n].busy));
374 (void)deflateEnd(&(jobs[n].strm));
375 free(jobs[n].out);
376 free(jobs[n].buf);
377 }
378 free(jobs);
379 pthread_attr_destroy(&attr);
380}
381
382/* Process arguments for level, size, and procs, compress from stdin to
383 stdout in the gzip format. Note that procs must be at least two in
384 order to provide a dictionary in one work unit for the other work
385 unit, and that size must be at least 32K to store a full dictionary. */
386int main(int argc, char **argv)
387{
388 int n; /* general index */
389 int get; /* command line parameters to get */
390 char *arg; /* command line argument */
391
392 /* set defaults -- 32 processes and 128K buffers was found to provide
393 good utilization of four cores (about 97%) and balanced the overall
394 execution time impact of more threads against more dictionary
395 processing for a fixed amount of memory -- the memory usage for these
396 settings and full use of all work units (at least 4 MB of input) is
397 16.2 MB
398 */
399 level = Z_DEFAULT_COMPRESSION;
400 procs = 32;
401 size = 131072UL;
402
403 /* process command-line arguments */
404 get = 0;
405 for (n = 1; n < argc; n++) {
406 arg = argv[n];
407 if (*arg == '-') {
408 while (*++arg)
409 if (*arg >= '0' && *arg <= '9') /* compression level */
410 level = *arg - '0';
411 else if (*arg == 'b') /* chunk size in K */
412 get |= 1;
413 else if (*arg == 'p') /* number of processes */
414 get |= 2;
415 else if (*arg == 'h') { /* help */
416 fputs("usage: pigz [-0..9] [-b blocksizeinK]", stderr);
417 fputs(" [-p processes] < foo > foo.gz\n", stderr);
418 return 0;
419 }
420 else
421 bail("invalid option");
422 }
423 else if (get & 1) {
424 if (get & 2)
425 bail("you need to separate the -b and -p options");
426 size = (size_t)(atol(arg)) << 10; /* chunk size */
427 if (size < DICT)
428 bail("invalid option");
429 get = 0;
430 }
431 else if (get & 2) {
432 procs = atoi(arg); /* processes */
433 if (procs < 2)
434 bail("invalid option");
435 get = 0;
436 }
437 else
438 bail("invalid option (you need to pipe input and output)");
439 }
440 if (get)
441 bail("missing option argument");
442
443 /* do parallel compression from stdin to stdout (the read thread starts up
444 the write thread and the compression threads, and they all join before
445 the read thread returns) */
446 ind = 0;
447 outd = 1;
448 read_thread();
449
450 /* done */
451 return 0;
452}