diff options
| author | Mark Adler <madler@alumni.caltech.edu> | 2018-08-01 01:49:45 -0700 |
|---|---|---|
| committer | Mark Adler <madler@alumni.caltech.edu> | 2018-08-01 19:51:56 -0700 |
| commit | 8ba2cdb6bdbee95703e1c6632fd8519a5e4a206e (patch) | |
| tree | 4c7be828258ab03d4585ba46714c2b858563f7e4 /examples | |
| parent | 4c14b51587b3bf484f7ca26a92dcfc0f8559201b (diff) | |
| download | zlib-8ba2cdb6bdbee95703e1c6632fd8519a5e4a206e.tar.gz zlib-8ba2cdb6bdbee95703e1c6632fd8519a5e4a206e.tar.bz2 zlib-8ba2cdb6bdbee95703e1c6632fd8519a5e4a206e.zip | |
Clean up code style in enough.c, update version.
Diffstat (limited to 'examples')
| -rw-r--r-- | examples/enough.c | 413 |
1 files changed, 202 insertions, 211 deletions
diff --git a/examples/enough.c b/examples/enough.c index d6f8e25..56ad63e 100644 --- a/examples/enough.c +++ b/examples/enough.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* enough.c -- determine the maximum size of inflate's Huffman code tables over | 1 | /* enough.c -- determine the maximum size of inflate's Huffman code tables over |
| 2 | * all possible valid and complete Huffman codes, subject to a length limit. | 2 | * all possible valid and complete Huffman codes, subject to a length limit. |
| 3 | * Copyright (C) 2007, 2008, 2012 Mark Adler | 3 | * Copyright (C) 2007, 2008, 2012, 2018 Mark Adler |
| 4 | * Version 1.4 18 August 2012 Mark Adler | 4 | * Version 1.5 1 August 2018 Mark Adler |
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | /* Version history: | 7 | /* Version history: |
| @@ -17,86 +17,87 @@ | |||
| 17 | 1.4 18 Aug 2012 Avoid shifts more than bits in type (caused endless loop!) | 17 | 1.4 18 Aug 2012 Avoid shifts more than bits in type (caused endless loop!) |
| 18 | Clean up comparisons of different types | 18 | Clean up comparisons of different types |
| 19 | Clean up code indentation | 19 | Clean up code indentation |
| 20 | 1.5 1 Aug 2018 Clean up code style and formatting | ||
| 20 | */ | 21 | */ |
| 21 | 22 | ||
| 22 | /* | 23 | /* |
| 23 | Examine all possible Huffman codes for a given number of symbols and a | 24 | Examine all possible Huffman codes for a given number of symbols and a |
| 24 | maximum code length in bits to determine the maximum table size for zilb's | 25 | maximum code length in bits to determine the maximum table size for zlib's |
| 25 | inflate. Only complete Huffman codes are counted. | 26 | inflate. Only complete Huffman codes are counted. |
| 26 | 27 | ||
| 27 | Two codes are considered distinct if the vectors of the number of codes per | 28 | Two codes are considered distinct if the vectors of the number of codes per |
| 28 | length are not identical. So permutations of the symbol assignments result | 29 | length are not identical. So permutations of the symbol assignments result |
| 29 | in the same code for the counting, as do permutations of the assignments of | 30 | in the same code for the counting, as do permutations of the assignments of |
| 30 | the bit values to the codes (i.e. only canonical codes are counted). | 31 | the bit values to the codes (i.e. only canonical codes are counted). |
| 31 | 32 | ||
| 32 | We build a code from shorter to longer lengths, determining how many symbols | 33 | We build a code from shorter to longer lengths, determining how many symbols |
| 33 | are coded at each length. At each step, we have how many symbols remain to | 34 | are coded at each length. At each step, we have how many symbols remain to |
| 34 | be coded, what the last code length used was, and how many bit patterns of | 35 | be coded, what the last code length used was, and how many bit patterns of |
| 35 | that length remain unused. Then we add one to the code length and double the | 36 | that length remain unused. Then we add one to the code length and double the |
| 36 | number of unused patterns to graduate to the next code length. We then | 37 | number of unused patterns to graduate to the next code length. We then |
| 37 | assign all portions of the remaining symbols to that code length that | 38 | assign all portions of the remaining symbols to that code length that |
| 38 | preserve the properties of a correct and eventually complete code. Those | 39 | preserve the properties of a correct and eventually complete code. Those |
| 39 | properties are: we cannot use more bit patterns than are available; and when | 40 | properties are: we cannot use more bit patterns than are available; and when |
| 40 | all the symbols are used, there are exactly zero possible bit patterns | 41 | all the symbols are used, there are exactly zero possible bit patterns |
| 41 | remaining. | 42 | remaining. |
| 42 | 43 | ||
| 43 | The inflate Huffman decoding algorithm uses two-level lookup tables for | 44 | The inflate Huffman decoding algorithm uses two-level lookup tables for |
| 44 | speed. There is a single first-level table to decode codes up to root bits | 45 | speed. There is a single first-level table to decode codes up to root bits |
| 45 | in length (root == 9 in the current inflate implementation). The table | 46 | in length (root == 9 in the current inflate implementation). The table has 1 |
| 46 | has 1 << root entries and is indexed by the next root bits of input. Codes | 47 | << root entries and is indexed by the next root bits of input. Codes shorter |
| 47 | shorter than root bits have replicated table entries, so that the correct | 48 | than root bits have replicated table entries, so that the correct entry is |
| 48 | entry is pointed to regardless of the bits that follow the short code. If | 49 | pointed to regardless of the bits that follow the short code. If the code is |
| 49 | the code is longer than root bits, then the table entry points to a second- | 50 | longer than root bits, then the table entry points to a second- level table. |
| 50 | level table. The size of that table is determined by the longest code with | 51 | The size of that table is determined by the longest code with that root-bit |
| 51 | that root-bit prefix. If that longest code has length len, then the table | 52 | prefix. If that longest code has length len, then the table has size 1 << |
| 52 | has size 1 << (len - root), to index the remaining bits in that set of | 53 | (len - root), to index the remaining bits in that set of codes. Each |
| 53 | codes. Each subsequent root-bit prefix then has its own sub-table. The | 54 | subsequent root-bit prefix then has its own sub-table. The total number of |
| 54 | total number of table entries required by the code is calculated | 55 | table entries required by the code is calculated incrementally as the number |
| 55 | incrementally as the number of codes at each bit length is populated. When | 56 | of codes at each bit length is populated. When all of the codes are shorter |
| 56 | all of the codes are shorter than root bits, then root is reduced to the | 57 | than root bits, then root is reduced to the longest code length, resulting |
| 57 | longest code length, resulting in a single, smaller, one-level table. | 58 | in a single, smaller, one-level table. |
| 58 | 59 | ||
| 59 | The inflate algorithm also provides for small values of root (relative to | 60 | The inflate algorithm also provides for small values of root (relative to |
| 60 | the log2 of the number of symbols), where the shortest code has more bits | 61 | the log2 of the number of symbols), where the shortest code has more bits |
| 61 | than root. In that case, root is increased to the length of the shortest | 62 | than root. In that case, root is increased to the length of the shortest |
| 62 | code. This program, by design, does not handle that case, so it is verified | 63 | code. This program, by design, does not handle that case, so it is verified |
| 63 | that the number of symbols is less than 2^(root + 1). | 64 | that the number of symbols is less than 2^(root + 1). |
| 64 | 65 | ||
| 65 | In order to speed up the examination (by about ten orders of magnitude for | 66 | In order to speed up the examination (by about ten orders of magnitude for |
| 66 | the default arguments), the intermediate states in the build-up of a code | 67 | the default arguments), the intermediate states in the build-up of a code |
| 67 | are remembered and previously visited branches are pruned. The memory | 68 | are remembered and previously visited branches are pruned. The memory |
| 68 | required for this will increase rapidly with the total number of symbols and | 69 | required for this will increase rapidly with the total number of symbols and |
| 69 | the maximum code length in bits. However this is a very small price to pay | 70 | the maximum code length in bits. However this is a very small price to pay |
| 70 | for the vast speedup. | 71 | for the vast speedup. |
| 71 | 72 | ||
| 72 | First, all of the possible Huffman codes are counted, and reachable | 73 | First, all of the possible Huffman codes are counted, and reachable |
| 73 | intermediate states are noted by a non-zero count in a saved-results array. | 74 | intermediate states are noted by a non-zero count in a saved-results array. |
| 74 | Second, the intermediate states that lead to (root + 1) bit or longer codes | 75 | Second, the intermediate states that lead to (root + 1) bit or longer codes |
| 75 | are used to look at all sub-codes from those junctures for their inflate | 76 | are used to look at all sub-codes from those junctures for their inflate |
| 76 | memory usage. (The amount of memory used is not affected by the number of | 77 | memory usage. (The amount of memory used is not affected by the number of |
| 77 | codes of root bits or less in length.) Third, the visited states in the | 78 | codes of root bits or less in length.) Third, the visited states in the |
| 78 | construction of those sub-codes and the associated calculation of the table | 79 | construction of those sub-codes and the associated calculation of the table |
| 79 | size is recalled in order to avoid recalculating from the same juncture. | 80 | size is recalled in order to avoid recalculating from the same juncture. |
| 80 | Beginning the code examination at (root + 1) bit codes, which is enabled by | 81 | Beginning the code examination at (root + 1) bit codes, which is enabled by |
| 81 | identifying the reachable nodes, accounts for about six of the orders of | 82 | identifying the reachable nodes, accounts for about six of the orders of |
| 82 | magnitude of improvement for the default arguments. About another four | 83 | magnitude of improvement for the default arguments. About another four |
| 83 | orders of magnitude come from not revisiting previous states. Out of | 84 | orders of magnitude come from not revisiting previous states. Out of |
| 84 | approximately 2x10^16 possible Huffman codes, only about 2x10^6 sub-codes | 85 | approximately 2x10^16 possible Huffman codes, only about 2x10^6 sub-codes |
| 85 | need to be examined to cover all of the possible table memory usage cases | 86 | need to be examined to cover all of the possible table memory usage cases |
| 86 | for the default arguments of 286 symbols limited to 15-bit codes. | 87 | for the default arguments of 286 symbols limited to 15-bit codes. |
| 87 | 88 | ||
| 88 | Note that an unsigned long long type is used for counting. It is quite easy | 89 | Note that an unsigned long long type is used for counting. It is quite easy |
| 89 | to exceed the capacity of an eight-byte integer with a large number of | 90 | to exceed the capacity of an eight-byte integer with a large number of |
| 90 | symbols and a large maximum code length, so multiple-precision arithmetic | 91 | symbols and a large maximum code length, so multiple-precision arithmetic |
| 91 | would need to replace the unsigned long long arithmetic in that case. This | 92 | would need to replace the unsigned long long arithmetic in that case. This |
| 92 | program will abort if an overflow occurs. The big_t type identifies where | 93 | program will abort if an overflow occurs. The big_t type identifies where |
| 93 | the counting takes place. | 94 | the counting takes place. |
| 94 | 95 | ||
| 95 | An unsigned long long type is also used for calculating the number of | 96 | An unsigned long long type is also used for calculating the number of |
| 96 | possible codes remaining at the maximum length. This limits the maximum | 97 | possible codes remaining at the maximum length. This limits the maximum code |
| 97 | code length to the number of bits in a long long minus the number of bits | 98 | length to the number of bits in a long long minus the number of bits needed |
| 98 | needed to represent the symbols in a flat code. The code_t type identifies | 99 | to represent the symbols in a flat code. The code_t type identifies where |
| 99 | where the bit pattern counting takes place. | 100 | the bit pattern counting takes place. |
| 100 | */ | 101 | */ |
| 101 | 102 | ||
| 102 | #include <stdio.h> | 103 | #include <stdio.h> |
| @@ -106,13 +107,13 @@ | |||
| 106 | 107 | ||
| 107 | #define local static | 108 | #define local static |
| 108 | 109 | ||
| 109 | /* special data types */ | 110 | // Special data types. |
| 110 | typedef unsigned long long big_t; /* type for code counting */ | 111 | typedef unsigned long long big_t; // type for code counting |
| 111 | #define PRIbig "llu" /* printf format for big_t */ | 112 | #define PRIbig "llu" // printf format for big_t |
| 112 | typedef unsigned long long code_t; /* type for bit pattern counting */ | 113 | typedef unsigned long long code_t; // type for bit pattern counting |
| 113 | struct tab { /* type for been here check */ | 114 | struct tab { // type for been here check |
| 114 | size_t len; /* length of bit vector in char's */ | 115 | size_t len; // length of bit vector in char's |
| 115 | char *vec; /* allocated bit vector */ | 116 | char *vec; // allocated bit vector |
| 116 | }; | 117 | }; |
| 117 | 118 | ||
| 118 | /* The array for saving results, num[], is indexed with this triplet: | 119 | /* The array for saving results, num[], is indexed with this triplet: |
| @@ -127,24 +128,23 @@ struct tab { /* type for been here check */ | |||
| 127 | left: 2..syms - 1, but only the evens (so syms == 8 -> 2, 4, 6) | 128 | left: 2..syms - 1, but only the evens (so syms == 8 -> 2, 4, 6) |
| 128 | len: 1..max - 1 (max == maximum code length in bits) | 129 | len: 1..max - 1 (max == maximum code length in bits) |
| 129 | 130 | ||
| 130 | syms == 2 is not saved since that immediately leads to a single code. left | 131 | syms == 2 is not saved since that immediately leads to a single code. left |
| 131 | must be even, since it represents the number of available bit patterns at | 132 | must be even, since it represents the number of available bit patterns at |
| 132 | the current length, which is double the number at the previous length. | 133 | the current length, which is double the number at the previous length. left |
| 133 | left ends at syms-1 since left == syms immediately results in a single code. | 134 | ends at syms-1 since left == syms immediately results in a single code. |
| 134 | (left > sym is not allowed since that would result in an incomplete code.) | 135 | (left > sym is not allowed since that would result in an incomplete code.) |
| 135 | len is less than max, since the code completes immediately when len == max. | 136 | len is less than max, since the code completes immediately when len == max. |
| 136 | 137 | ||
| 137 | The offset into the array is calculated for the three indices with the | 138 | The offset into the array is calculated for the three indices with the first |
| 138 | first one (syms) being outermost, and the last one (len) being innermost. | 139 | one (syms) being outermost, and the last one (len) being innermost. We build |
| 139 | We build the array with length max-1 lists for the len index, with syms-3 | 140 | the array with length max-1 lists for the len index, with syms-3 of those |
| 140 | of those for each symbol. There are totsym-2 of those, with each one | 141 | for each symbol. There are totsym-2 of those, with each one varying in |
| 141 | varying in length as a function of sym. See the calculation of index in | 142 | length as a function of sym. See the calculation of index in map() for the |
| 142 | count() for the index, and the calculation of size in main() for the size | 143 | index, and the calculation of size in main() for the size of the array. |
| 143 | of the array. | ||
| 144 | 144 | ||
| 145 | For the deflate example of 286 symbols limited to 15-bit codes, the array | 145 | For the deflate example of 286 symbols limited to 15-bit codes, the array |
| 146 | has 284,284 entries, taking up 2.17 MB for an 8-byte big_t. More than | 146 | has 284,284 entries, taking up 2.17 MB for an 8-byte big_t. More than half |
| 147 | half of the space allocated for saved results is actually used -- not all | 147 | of the space allocated for saved results is actually used -- not all |
| 148 | possible triplets are reached in the generation of valid Huffman codes. | 148 | possible triplets are reached in the generation of valid Huffman codes. |
| 149 | */ | 149 | */ |
| 150 | 150 | ||
| @@ -152,14 +152,14 @@ struct tab { /* type for been here check */ | |||
| 152 | to the num[] array as described above for the (syms, left, len) triplet. | 152 | to the num[] array as described above for the (syms, left, len) triplet. |
| 153 | Each element in the array is further indexed by the (mem, rem) doublet, | 153 | Each element in the array is further indexed by the (mem, rem) doublet, |
| 154 | where mem is the amount of inflate table space used so far, and rem is the | 154 | where mem is the amount of inflate table space used so far, and rem is the |
| 155 | remaining unused entries in the current inflate sub-table. Each indexed | 155 | remaining unused entries in the current inflate sub-table. Each indexed |
| 156 | element is simply one bit indicating whether the state has been visited or | 156 | element is simply one bit indicating whether the state has been visited or |
| 157 | not. Since the ranges for mem and rem are not known a priori, each bit | 157 | not. Since the ranges for mem and rem are not known a priori, each bit |
| 158 | vector is of a variable size, and grows as needed to accommodate the visited | 158 | vector is of a variable size, and grows as needed to accommodate the visited |
| 159 | states. mem and rem are used to calculate a single index in a triangular | 159 | states. mem and rem are used to calculate a single index in a triangular |
| 160 | array. Since the range of mem is expected in the default case to be about | 160 | array. Since the range of mem is expected in the default case to be about |
| 161 | ten times larger than the range of rem, the array is skewed to reduce the | 161 | ten times larger than the range of rem, the array is skewed to reduce the |
| 162 | memory usage, with eight times the range for mem than for rem. See the | 162 | memory usage, with eight times the range for mem than for rem. See the |
| 163 | calculations for offset and bit in beenhere() for the details. | 163 | calculations for offset and bit in beenhere() for the details. |
| 164 | 164 | ||
| 165 | For the deflate example of 286 symbols limited to 15-bit codes, the bit | 165 | For the deflate example of 286 symbols limited to 15-bit codes, the bit |
| @@ -167,23 +167,22 @@ struct tab { /* type for been here check */ | |||
| 167 | array itself. | 167 | array itself. |
| 168 | */ | 168 | */ |
| 169 | 169 | ||
| 170 | /* Globals to avoid propagating constants or constant pointers recursively */ | 170 | // Globals to avoid propagating constants or constant pointers recursively. |
| 171 | struct { | 171 | struct { |
| 172 | int max; /* maximum allowed bit length for the codes */ | 172 | int max; // maximum allowed bit length for the codes |
| 173 | int root; /* size of base code table in bits */ | 173 | int root; // size of base code table in bits |
| 174 | int large; /* largest code table so far */ | 174 | int large; // largest code table so far |
| 175 | size_t size; /* number of elements in num and done */ | 175 | size_t size; // number of elements in num and done |
| 176 | int *code; /* number of symbols assigned to each bit length */ | 176 | int *code; // number of symbols assigned to each bit length |
| 177 | big_t *num; /* saved results array for code counting */ | 177 | big_t *num; // saved results array for code counting |
| 178 | struct tab *done; /* states already evaluated array */ | 178 | struct tab *done; // states already evaluated array |
| 179 | } g; | 179 | } g; |
| 180 | 180 | ||
| 181 | /* Index function for num[] and done[] */ | 181 | // Index function for num[] and done[]. |
| 182 | #define INDEX(i,j,k) (((size_t)((i-1)>>1)*((i-2)>>1)+(j>>1)-1)*(g.max-1)+k-1) | 182 | #define INDEX(i,j,k) (((size_t)((i-1)>>1)*((i-2)>>1)+(j>>1)-1)*(g.max-1)+k-1) |
| 183 | 183 | ||
| 184 | /* Free allocated space. Uses globals code, num, and done. */ | 184 | // Free allocated space. Uses globals code, num, and done. |
| 185 | local void cleanup(void) | 185 | local void cleanup(void) { |
| 186 | { | ||
| 187 | size_t n; | 186 | size_t n; |
| 188 | 187 | ||
| 189 | if (g.done != NULL) { | 188 | if (g.done != NULL) { |
| @@ -198,91 +197,89 @@ local void cleanup(void) | |||
| 198 | free(g.code); | 197 | free(g.code); |
| 199 | } | 198 | } |
| 200 | 199 | ||
| 201 | /* Return the number of possible Huffman codes using bit patterns of lengths | 200 | // Return the number of possible Huffman codes using bit patterns of lengths |
| 202 | len through max inclusive, coding syms symbols, with left bit patterns of | 201 | // len through max inclusive, coding syms symbols, with left bit patterns of |
| 203 | length len unused -- return -1 if there is an overflow in the counting. | 202 | // length len unused -- return -1 if there is an overflow in the counting. Keep |
| 204 | Keep a record of previous results in num to prevent repeating the same | 203 | // a record of previous results in num to prevent repeating the same |
| 205 | calculation. Uses the globals max and num. */ | 204 | // calculation. Uses the globals max and num. |
| 206 | local big_t count(int syms, int len, int left) | 205 | local big_t count(int syms, int len, int left) { |
| 207 | { | 206 | big_t sum; // number of possible codes from this juncture |
| 208 | big_t sum; /* number of possible codes from this juncture */ | 207 | big_t got; // value returned from count() |
| 209 | big_t got; /* value returned from count() */ | 208 | int least; // least number of syms to use at this juncture |
| 210 | int least; /* least number of syms to use at this juncture */ | 209 | int most; // most number of syms to use at this juncture |
| 211 | int most; /* most number of syms to use at this juncture */ | 210 | int use; // number of bit patterns to use in next call |
| 212 | int use; /* number of bit patterns to use in next call */ | 211 | size_t index; // index of this case in *num |
| 213 | size_t index; /* index of this case in *num */ | 212 | |
| 214 | 213 | // see if only one possible code | |
| 215 | /* see if only one possible code */ | ||
| 216 | if (syms == left) | 214 | if (syms == left) |
| 217 | return 1; | 215 | return 1; |
| 218 | 216 | ||
| 219 | /* note and verify the expected state */ | 217 | // note and verify the expected state |
| 220 | assert(syms > left && left > 0 && len < g.max); | 218 | assert(syms > left && left > 0 && len < g.max); |
| 221 | 219 | ||
| 222 | /* see if we've done this one already */ | 220 | // see if we've done this one already |
| 223 | index = INDEX(syms, left, len); | 221 | index = INDEX(syms, left, len); |
| 224 | got = g.num[index]; | 222 | got = g.num[index]; |
| 225 | if (got) | 223 | if (got) |
| 226 | return got; /* we have -- return the saved result */ | 224 | return got; // we have -- return the saved result |
| 227 | 225 | ||
| 228 | /* we need to use at least this many bit patterns so that the code won't be | 226 | // we need to use at least this many bit patterns so that the code won't be |
| 229 | incomplete at the next length (more bit patterns than symbols) */ | 227 | // incomplete at the next length (more bit patterns than symbols) |
| 230 | least = (left << 1) - syms; | 228 | least = (left << 1) - syms; |
| 231 | if (least < 0) | 229 | if (least < 0) |
| 232 | least = 0; | 230 | least = 0; |
| 233 | 231 | ||
| 234 | /* we can use at most this many bit patterns, lest there not be enough | 232 | // we can use at most this many bit patterns, lest there not be enough |
| 235 | available for the remaining symbols at the maximum length (if there were | 233 | // available for the remaining symbols at the maximum length (if there were |
| 236 | no limit to the code length, this would become: most = left - 1) */ | 234 | // no limit to the code length, this would become: most = left - 1) |
| 237 | most = (((code_t)left << (g.max - len)) - syms) / | 235 | most = (((code_t)left << (g.max - len)) - syms) / |
| 238 | (((code_t)1 << (g.max - len)) - 1); | 236 | (((code_t)1 << (g.max - len)) - 1); |
| 239 | 237 | ||
| 240 | /* count all possible codes from this juncture and add them up */ | 238 | // count all possible codes from this juncture and add them up |
| 241 | sum = 0; | 239 | sum = 0; |
| 242 | for (use = least; use <= most; use++) { | 240 | for (use = least; use <= most; use++) { |
| 243 | got = count(syms - use, len + 1, (left - use) << 1); | 241 | got = count(syms - use, len + 1, (left - use) << 1); |
| 244 | sum += got; | 242 | sum += got; |
| 245 | if (got == (big_t)0 - 1 || sum < got) /* overflow */ | 243 | if (got == (big_t)0 - 1 || sum < got) // overflow |
| 246 | return (big_t)0 - 1; | 244 | return (big_t)0 - 1; |
| 247 | } | 245 | } |
| 248 | 246 | ||
| 249 | /* verify that all recursive calls are productive */ | 247 | // verify that all recursive calls are productive |
| 250 | assert(sum != 0); | 248 | assert(sum != 0); |
| 251 | 249 | ||
| 252 | /* save the result and return it */ | 250 | // save the result and return it |
| 253 | g.num[index] = sum; | 251 | g.num[index] = sum; |
| 254 | return sum; | 252 | return sum; |
| 255 | } | 253 | } |
| 256 | 254 | ||
| 257 | /* Return true if we've been here before, set to true if not. Set a bit in a | 255 | // Return true if we've been here before, set to true if not. Set a bit in a |
| 258 | bit vector to indicate visiting this state. Each (syms,len,left) state | 256 | // bit vector to indicate visiting this state. Each (syms,len,left) state has a |
| 259 | has a variable size bit vector indexed by (mem,rem). The bit vector is | 257 | // variable size bit vector indexed by (mem,rem). The bit vector is lengthened |
| 260 | lengthened if needed to allow setting the (mem,rem) bit. */ | 258 | // if needed to allow setting the (mem,rem) bit. |
| 261 | local int beenhere(int syms, int len, int left, int mem, int rem) | 259 | local int beenhere(int syms, int len, int left, int mem, int rem) { |
| 262 | { | 260 | size_t index; // index for this state's bit vector |
| 263 | size_t index; /* index for this state's bit vector */ | 261 | size_t offset; // offset in this state's bit vector |
| 264 | size_t offset; /* offset in this state's bit vector */ | 262 | int bit; // mask for this state's bit |
| 265 | int bit; /* mask for this state's bit */ | 263 | size_t length; // length of the bit vector in bytes |
| 266 | size_t length; /* length of the bit vector in bytes */ | 264 | char *vector; // new or enlarged bit vector |
| 267 | char *vector; /* new or enlarged bit vector */ | 265 | |
| 268 | 266 | // point to vector for (syms,left,len), bit in vector for (mem,rem) | |
| 269 | /* point to vector for (syms,left,len), bit in vector for (mem,rem) */ | ||
| 270 | index = INDEX(syms, left, len); | 267 | index = INDEX(syms, left, len); |
| 271 | mem -= 1 << g.root; | 268 | mem -= 1 << g.root; |
| 272 | offset = (mem >> 3) + rem; | 269 | offset = (mem >> 3) + rem; |
| 273 | offset = ((offset * (offset + 1)) >> 1) + rem; | 270 | offset = ((offset * (offset + 1)) >> 1) + rem; |
| 274 | bit = 1 << (mem & 7); | 271 | bit = 1 << (mem & 7); |
| 275 | 272 | ||
| 276 | /* see if we've been here */ | 273 | // see if we've been here |
| 277 | length = g.done[index].len; | 274 | length = g.done[index].len; |
| 278 | if (offset < length && (g.done[index].vec[offset] & bit) != 0) | 275 | if (offset < length && (g.done[index].vec[offset] & bit) != 0) |
| 279 | return 1; /* done this! */ | 276 | return 1; // done this! |
| 280 | 277 | ||
| 281 | /* we haven't been here before -- set the bit to show we have now */ | 278 | // we haven't been here before -- set the bit to show we have now |
| 282 | 279 | ||
| 283 | /* see if we need to lengthen the vector in order to set the bit */ | 280 | // see if we need to lengthen the vector in order to set the bit |
| 284 | if (length <= offset) { | 281 | if (length <= offset) { |
| 285 | /* if we have one already, enlarge it, zero out the appended space */ | 282 | // if we have one already, enlarge it, zero out the appended space |
| 286 | if (length) { | 283 | if (length) { |
| 287 | do { | 284 | do { |
| 288 | length <<= 1; | 285 | length <<= 1; |
| @@ -293,7 +290,7 @@ local int beenhere(int syms, int len, int left, int mem, int rem) | |||
| 293 | length - g.done[index].len); | 290 | length - g.done[index].len); |
| 294 | } | 291 | } |
| 295 | 292 | ||
| 296 | /* otherwise we need to make a new vector and zero it out */ | 293 | // otherwise we need to make a new vector and zero it out |
| 297 | else { | 294 | else { |
| 298 | length = 1 << (len - g.root); | 295 | length = 1 << (len - g.root); |
| 299 | while (length <= offset) | 296 | while (length <= offset) |
| @@ -301,40 +298,39 @@ local int beenhere(int syms, int len, int left, int mem, int rem) | |||
| 301 | vector = calloc(length, sizeof(char)); | 298 | vector = calloc(length, sizeof(char)); |
| 302 | } | 299 | } |
| 303 | 300 | ||
| 304 | /* in either case, bail if we can't get the memory */ | 301 | // in either case, bail if we can't get the memory |
| 305 | if (vector == NULL) { | 302 | if (vector == NULL) { |
| 306 | fputs("abort: unable to allocate enough memory\n", stderr); | 303 | fputs("abort: unable to allocate enough memory\n", stderr); |
| 307 | cleanup(); | 304 | cleanup(); |
| 308 | exit(1); | 305 | exit(1); |
| 309 | } | 306 | } |
| 310 | 307 | ||
| 311 | /* install the new vector */ | 308 | // install the new vector |
| 312 | g.done[index].len = length; | 309 | g.done[index].len = length; |
| 313 | g.done[index].vec = vector; | 310 | g.done[index].vec = vector; |
| 314 | } | 311 | } |
| 315 | 312 | ||
| 316 | /* set the bit */ | 313 | // set the bit |
| 317 | g.done[index].vec[offset] |= bit; | 314 | g.done[index].vec[offset] |= bit; |
| 318 | return 0; | 315 | return 0; |
| 319 | } | 316 | } |
| 320 | 317 | ||
| 321 | /* Examine all possible codes from the given node (syms, len, left). Compute | 318 | // Examine all possible codes from the given node (syms, len, left). Compute |
| 322 | the amount of memory required to build inflate's decoding tables, where the | 319 | // the amount of memory required to build inflate's decoding tables, where the |
| 323 | number of code structures used so far is mem, and the number remaining in | 320 | // number of code structures used so far is mem, and the number remaining in |
| 324 | the current sub-table is rem. Uses the globals max, code, root, large, and | 321 | // the current sub-table is rem. Uses the globals max, code, root, large, and |
| 325 | done. */ | 322 | // done. |
| 326 | local void examine(int syms, int len, int left, int mem, int rem) | 323 | local void examine(int syms, int len, int left, int mem, int rem) { |
| 327 | { | 324 | int least; // least number of syms to use at this juncture |
| 328 | int least; /* least number of syms to use at this juncture */ | 325 | int most; // most number of syms to use at this juncture |
| 329 | int most; /* most number of syms to use at this juncture */ | 326 | int use; // number of bit patterns to use in next call |
| 330 | int use; /* number of bit patterns to use in next call */ | 327 | |
| 331 | 328 | // see if we have a complete code | |
| 332 | /* see if we have a complete code */ | ||
| 333 | if (syms == left) { | 329 | if (syms == left) { |
| 334 | /* set the last code entry */ | 330 | // set the last code entry |
| 335 | g.code[len] = left; | 331 | g.code[len] = left; |
| 336 | 332 | ||
| 337 | /* complete computation of memory used by this code */ | 333 | // complete computation of memory used by this code |
| 338 | while (rem < left) { | 334 | while (rem < left) { |
| 339 | left -= rem; | 335 | left -= rem; |
| 340 | rem = 1 << (len - g.root); | 336 | rem = 1 << (len - g.root); |
| @@ -342,7 +338,7 @@ local void examine(int syms, int len, int left, int mem, int rem) | |||
| 342 | } | 338 | } |
| 343 | assert(rem == left); | 339 | assert(rem == left); |
| 344 | 340 | ||
| 345 | /* if this is a new maximum, show the entries used and the sub-code */ | 341 | // if this is a new maximum, show the entries used and the sub-code |
| 346 | if (mem > g.large) { | 342 | if (mem > g.large) { |
| 347 | g.large = mem; | 343 | g.large = mem; |
| 348 | printf("max %d: ", mem); | 344 | printf("max %d: ", mem); |
| @@ -353,28 +349,28 @@ local void examine(int syms, int len, int left, int mem, int rem) | |||
| 353 | fflush(stdout); | 349 | fflush(stdout); |
| 354 | } | 350 | } |
| 355 | 351 | ||
| 356 | /* remove entries as we drop back down in the recursion */ | 352 | // remove entries as we drop back down in the recursion |
| 357 | g.code[len] = 0; | 353 | g.code[len] = 0; |
| 358 | return; | 354 | return; |
| 359 | } | 355 | } |
| 360 | 356 | ||
| 361 | /* prune the tree if we can */ | 357 | // prune the tree if we can |
| 362 | if (beenhere(syms, len, left, mem, rem)) | 358 | if (beenhere(syms, len, left, mem, rem)) |
| 363 | return; | 359 | return; |
| 364 | 360 | ||
| 365 | /* we need to use at least this many bit patterns so that the code won't be | 361 | // we need to use at least this many bit patterns so that the code won't be |
| 366 | incomplete at the next length (more bit patterns than symbols) */ | 362 | // incomplete at the next length (more bit patterns than symbols) |
| 367 | least = (left << 1) - syms; | 363 | least = (left << 1) - syms; |
| 368 | if (least < 0) | 364 | if (least < 0) |
| 369 | least = 0; | 365 | least = 0; |
| 370 | 366 | ||
| 371 | /* we can use at most this many bit patterns, lest there not be enough | 367 | // we can use at most this many bit patterns, lest there not be enough |
| 372 | available for the remaining symbols at the maximum length (if there were | 368 | // available for the remaining symbols at the maximum length (if there were |
| 373 | no limit to the code length, this would become: most = left - 1) */ | 369 | // no limit to the code length, this would become: most = left - 1) |
| 374 | most = (((code_t)left << (g.max - len)) - syms) / | 370 | most = (((code_t)left << (g.max - len)) - syms) / |
| 375 | (((code_t)1 << (g.max - len)) - 1); | 371 | (((code_t)1 << (g.max - len)) - 1); |
| 376 | 372 | ||
| 377 | /* occupy least table spaces, creating new sub-tables as needed */ | 373 | // occupy least table spaces, creating new sub-tables as needed |
| 378 | use = least; | 374 | use = least; |
| 379 | while (rem < use) { | 375 | while (rem < use) { |
| 380 | use -= rem; | 376 | use -= rem; |
| @@ -383,7 +379,7 @@ local void examine(int syms, int len, int left, int mem, int rem) | |||
| 383 | } | 379 | } |
| 384 | rem -= use; | 380 | rem -= use; |
| 385 | 381 | ||
| 386 | /* examine codes from here, updating table space as we go */ | 382 | // examine codes from here, updating table space as we go |
| 387 | for (use = least; use <= most; use++) { | 383 | for (use = least; use <= most; use++) { |
| 388 | g.code[len] = use; | 384 | g.code[len] = use; |
| 389 | examine(syms - use, len + 1, (left - use) << 1, | 385 | examine(syms - use, len + 1, (left - use) << 1, |
| @@ -395,79 +391,74 @@ local void examine(int syms, int len, int left, int mem, int rem) | |||
| 395 | rem--; | 391 | rem--; |
| 396 | } | 392 | } |
| 397 | 393 | ||
| 398 | /* remove entries as we drop back down in the recursion */ | 394 | // remove entries as we drop back down in the recursion |
| 399 | g.code[len] = 0; | 395 | g.code[len] = 0; |
| 400 | } | 396 | } |
| 401 | 397 | ||
| 402 | /* Look at all sub-codes starting with root + 1 bits. Look at only the valid | 398 | // Look at all sub-codes starting with root + 1 bits. Look at only the valid |
| 403 | intermediate code states (syms, left, len). For each completed code, | 399 | // intermediate code states (syms, left, len). For each completed code, |
| 404 | calculate the amount of memory required by inflate to build the decoding | 400 | // calculate the amount of memory required by inflate to build the decoding |
| 405 | tables. Find the maximum amount of memory required and show the code that | 401 | // tables. Find the maximum amount of memory required and show the code that |
| 406 | requires that maximum. Uses the globals max, root, and num. */ | 402 | // requires that maximum. Uses the globals max, root, and num. |
| 407 | local void enough(int syms) | 403 | local void enough(int syms) { |
| 408 | { | 404 | int n; // number of remaing symbols for this node |
| 409 | int n; /* number of remaing symbols for this node */ | 405 | int left; // number of unused bit patterns at this length |
| 410 | int left; /* number of unused bit patterns at this length */ | 406 | size_t index; // index of this case in *num |
| 411 | size_t index; /* index of this case in *num */ | 407 | |
| 412 | 408 | // clear code | |
| 413 | /* clear code */ | ||
| 414 | for (n = 0; n <= g.max; n++) | 409 | for (n = 0; n <= g.max; n++) |
| 415 | g.code[n] = 0; | 410 | g.code[n] = 0; |
| 416 | 411 | ||
| 417 | /* look at all (root + 1) bit and longer codes */ | 412 | // look at all (root + 1) bit and longer codes |
| 418 | g.large = 1 << g.root; /* base table */ | 413 | g.large = 1 << g.root; // base table |
| 419 | if (g.root < g.max) /* otherwise, there's only a base table */ | 414 | if (g.root < g.max) // otherwise, there's only a base table |
| 420 | for (n = 3; n <= syms; n++) | 415 | for (n = 3; n <= syms; n++) |
| 421 | for (left = 2; left < n; left += 2) | 416 | for (left = 2; left < n; left += 2) { |
| 422 | { | 417 | // look at all reachable (root + 1) bit nodes, and the |
| 423 | /* look at all reachable (root + 1) bit nodes, and the | 418 | // resulting codes (complete at root + 2 or more) |
| 424 | resulting codes (complete at root + 2 or more) */ | ||
| 425 | index = INDEX(n, left, g.root + 1); | 419 | index = INDEX(n, left, g.root + 1); |
| 426 | if (g.root + 1 < g.max && g.num[index]) /* reachable node */ | 420 | if (g.root + 1 < g.max && g.num[index]) // reachable node |
| 427 | examine(n, g.root + 1, left, 1 << g.root, 0); | 421 | examine(n, g.root + 1, left, 1 << g.root, 0); |
| 428 | 422 | ||
| 429 | /* also look at root bit codes with completions at root + 1 | 423 | // also look at root bit codes with completions at root + 1 |
| 430 | bits (not saved in num, since complete), just in case */ | 424 | // bits (not saved in num, since complete), just in case |
| 431 | if (g.num[index - 1] && n <= left << 1) | 425 | if (g.num[index - 1] && n <= left << 1) |
| 432 | examine((n - left) << 1, g.root + 1, (n - left) << 1, | 426 | examine((n - left) << 1, g.root + 1, (n - left) << 1, |
| 433 | 1 << g.root, 0); | 427 | 1 << g.root, 0); |
| 434 | } | 428 | } |
| 435 | 429 | ||
| 436 | /* done */ | 430 | // done |
| 437 | printf("done: maximum of %d table entries\n", g.large); | 431 | printf("done: maximum of %d table entries\n", g.large); |
| 438 | } | 432 | } |
| 439 | 433 | ||
| 440 | /* | 434 | // Examine and show the total number of possible Huffman codes for a given |
| 441 | Examine and show the total number of possible Huffman codes for a given | 435 | // maximum number of symbols, initial root table size, and maximum code length |
| 442 | maximum number of symbols, initial root table size, and maximum code length | 436 | // in bits -- those are the command arguments in that order. The default values |
| 443 | in bits -- those are the command arguments in that order. The default | 437 | // are 286, 9, and 15 respectively, for the deflate literal/length code. The |
| 444 | values are 286, 9, and 15 respectively, for the deflate literal/length code. | 438 | // possible codes are counted for each number of coded symbols from two to the |
| 445 | The possible codes are counted for each number of coded symbols from two to | 439 | // maximum. The counts for each of those and the total number of codes are |
| 446 | the maximum. The counts for each of those and the total number of codes are | 440 | // shown. The maximum number of inflate table entires is then calculated across |
| 447 | shown. The maximum number of inflate table entires is then calculated | 441 | // all possible codes. Each new maximum number of table entries and the |
| 448 | across all possible codes. Each new maximum number of table entries and the | 442 | // associated sub-code (starting at root + 1 == 10 bits) is shown. |
| 449 | associated sub-code (starting at root + 1 == 10 bits) is shown. | 443 | // |
| 450 | 444 | // To count and examine Huffman codes that are not length-limited, provide a | |
| 451 | To count and examine Huffman codes that are not length-limited, provide a | 445 | // maximum length equal to the number of symbols minus one. |
| 452 | maximum length equal to the number of symbols minus one. | 446 | // |
| 453 | 447 | // For the deflate literal/length code, use "enough". For the deflate distance | |
| 454 | For the deflate literal/length code, use "enough". For the deflate distance | 448 | // code, use "enough 30 6". |
| 455 | code, use "enough 30 6". | 449 | int main(int argc, char **argv) { |
| 456 | */ | 450 | int syms; // total number of symbols to code |
| 457 | int main(int argc, char **argv) | 451 | int n; // number of symbols to code for this run |
| 458 | { | 452 | big_t got; // return value of count() |
| 459 | int syms; /* total number of symbols to code */ | 453 | big_t sum; // accumulated number of codes over n |
| 460 | int n; /* number of symbols to code for this run */ | 454 | code_t word; // for counting bits in code_t |
| 461 | big_t got; /* return value of count() */ | 455 | |
| 462 | big_t sum; /* accumulated number of codes over n */ | 456 | // set up globals for cleanup() |
| 463 | code_t word; /* for counting bits in code_t */ | ||
| 464 | |||
| 465 | /* set up globals for cleanup() */ | ||
| 466 | g.code = NULL; | 457 | g.code = NULL; |
| 467 | g.num = NULL; | 458 | g.num = NULL; |
| 468 | g.done = NULL; | 459 | g.done = NULL; |
| 469 | 460 | ||
| 470 | /* get arguments -- default to the deflate literal/length code */ | 461 | // get arguments -- default to the deflate literal/length code |
| 471 | syms = 286; | 462 | syms = 286; |
| 472 | g.root = 9; | 463 | g.root = 9; |
| 473 | g.max = 15; | 464 | g.max = 15; |
| @@ -485,38 +476,38 @@ int main(int argc, char **argv) | |||
| 485 | return 1; | 476 | return 1; |
| 486 | } | 477 | } |
| 487 | 478 | ||
| 488 | /* if not restricting the code length, the longest is syms - 1 */ | 479 | // if not restricting the code length, the longest is syms - 1 |
| 489 | if (g.max > syms - 1) | 480 | if (g.max > syms - 1) |
| 490 | g.max = syms - 1; | 481 | g.max = syms - 1; |
| 491 | 482 | ||
| 492 | /* determine the number of bits in a code_t */ | 483 | // determine the number of bits in a code_t |
| 493 | for (n = 0, word = 1; word; n++, word <<= 1) | 484 | for (n = 0, word = 1; word; n++, word <<= 1) |
| 494 | ; | 485 | ; |
| 495 | 486 | ||
| 496 | /* make sure that the calculation of most will not overflow */ | 487 | // make sure that the calculation of most will not overflow |
| 497 | if (g.max > n || (code_t)(syms - 2) >= (((code_t)0 - 1) >> (g.max - 1))) { | 488 | if (g.max > n || (code_t)(syms - 2) >= (((code_t)0 - 1) >> (g.max - 1))) { |
| 498 | fputs("abort: code length too long for internal types\n", stderr); | 489 | fputs("abort: code length too long for internal types\n", stderr); |
| 499 | return 1; | 490 | return 1; |
| 500 | } | 491 | } |
| 501 | 492 | ||
| 502 | /* reject impossible code requests */ | 493 | // reject impossible code requests |
| 503 | if ((code_t)(syms - 1) > ((code_t)1 << g.max) - 1) { | 494 | if ((code_t)(syms - 1) > ((code_t)1 << g.max) - 1) { |
| 504 | fprintf(stderr, "%d symbols cannot be coded in %d bits\n", | 495 | fprintf(stderr, "%d symbols cannot be coded in %d bits\n", |
| 505 | syms, g.max); | 496 | syms, g.max); |
| 506 | return 1; | 497 | return 1; |
| 507 | } | 498 | } |
| 508 | 499 | ||
| 509 | /* allocate code vector */ | 500 | // allocate code vector |
| 510 | g.code = calloc(g.max + 1, sizeof(int)); | 501 | g.code = calloc(g.max + 1, sizeof(int)); |
| 511 | if (g.code == NULL) { | 502 | if (g.code == NULL) { |
| 512 | fputs("abort: unable to allocate enough memory\n", stderr); | 503 | fputs("abort: unable to allocate enough memory\n", stderr); |
| 513 | return 1; | 504 | return 1; |
| 514 | } | 505 | } |
| 515 | 506 | ||
| 516 | /* determine size of saved results array, checking for overflows, | 507 | // determine size of saved results array, checking for overflows, |
| 517 | allocate and clear the array (set all to zero with calloc()) */ | 508 | // allocate and clear the array (set all to zero with calloc()) |
| 518 | if (syms == 2) /* iff max == 1 */ | 509 | if (syms == 2) // iff max == 1 |
| 519 | g.num = NULL; /* won't be saving any results */ | 510 | g.num = NULL; // won't be saving any results |
| 520 | else { | 511 | else { |
| 521 | g.size = syms >> 1; | 512 | g.size = syms >> 1; |
| 522 | if (g.size > ((size_t)0 - 1) / (n = (syms - 1) >> 1) || | 513 | if (g.size > ((size_t)0 - 1) / (n = (syms - 1) >> 1) || |
| @@ -529,12 +520,12 @@ int main(int argc, char **argv) | |||
| 529 | } | 520 | } |
| 530 | } | 521 | } |
| 531 | 522 | ||
| 532 | /* count possible codes for all numbers of symbols, add up counts */ | 523 | // count possible codes for all numbers of symbols, add up counts |
| 533 | sum = 0; | 524 | sum = 0; |
| 534 | for (n = 2; n <= syms; n++) { | 525 | for (n = 2; n <= syms; n++) { |
| 535 | got = count(n, 1, 2); | 526 | got = count(n, 1, 2); |
| 536 | sum += got; | 527 | sum += got; |
| 537 | if (got == (big_t)0 - 1 || sum < got) { /* overflow */ | 528 | if (got == (big_t)0 - 1 || sum < got) { // overflow |
| 538 | fputs("abort: can't count that high!\n", stderr); | 529 | fputs("abort: can't count that high!\n", stderr); |
| 539 | cleanup(); | 530 | cleanup(); |
| 540 | return 1; | 531 | return 1; |
| @@ -547,7 +538,7 @@ int main(int argc, char **argv) | |||
| 547 | else | 538 | else |
| 548 | puts(" (no length limit)"); | 539 | puts(" (no length limit)"); |
| 549 | 540 | ||
| 550 | /* allocate and clear done array for beenhere() */ | 541 | // allocate and clear done array for beenhere() |
| 551 | if (syms == 2) | 542 | if (syms == 2) |
| 552 | g.done = NULL; | 543 | g.done = NULL; |
| 553 | else if (g.size > ((size_t)0 - 1) / sizeof(struct tab) || | 544 | else if (g.size > ((size_t)0 - 1) / sizeof(struct tab) || |
| @@ -557,15 +548,15 @@ int main(int argc, char **argv) | |||
| 557 | return 1; | 548 | return 1; |
| 558 | } | 549 | } |
| 559 | 550 | ||
| 560 | /* find and show maximum inflate table usage */ | 551 | // find and show maximum inflate table usage |
| 561 | if (g.root > g.max) /* reduce root to max length */ | 552 | if (g.root > g.max) // reduce root to max length |
| 562 | g.root = g.max; | 553 | g.root = g.max; |
| 563 | if ((code_t)syms < ((code_t)1 << (g.root + 1))) | 554 | if ((code_t)syms < ((code_t)1 << (g.root + 1))) |
| 564 | enough(syms); | 555 | enough(syms); |
| 565 | else | 556 | else |
| 566 | puts("cannot handle minimum code lengths > root"); | 557 | puts("cannot handle minimum code lengths > root"); |
| 567 | 558 | ||
| 568 | /* done */ | 559 | // done |
| 569 | cleanup(); | 560 | cleanup(); |
| 570 | return 0; | 561 | return 0; |
| 571 | } | 562 | } |
