diff options
-rw-r--r-- | CHANGES | 22 | ||||
-rw-r--r-- | LICENSE | 7 | ||||
-rw-r--r-- | Makefile | 51 | ||||
-rw-r--r-- | Makefile-libbz2_so | 12 | ||||
-rw-r--r-- | README | 40 | ||||
-rw-r--r-- | README.COMPILATION.PROBLEMS | 103 | ||||
-rw-r--r-- | README.XML.STUFF | 31 | ||||
-rw-r--r-- | blocksort.c | 4 | ||||
-rw-r--r-- | bz-common.xsl | 39 | ||||
-rw-r--r-- | bz-fo.xsl | 257 | ||||
-rw-r--r-- | bz-html.xsl | 20 | ||||
-rw-r--r-- | bzip.css | 74 | ||||
-rw-r--r-- | bzip2.1 | 17 | ||||
-rw-r--r-- | bzip2.1.preformatted | 247 | ||||
-rw-r--r-- | bzip2.c | 20 | ||||
-rw-r--r-- | bzip2.txt | 119 | ||||
-rw-r--r-- | bzip2recover.c | 12 | ||||
-rw-r--r-- | bzlib.c | 59 | ||||
-rw-r--r-- | bzlib.h | 6 | ||||
-rw-r--r-- | bzlib_private.h | 6 | ||||
-rw-r--r-- | compress.c | 16 | ||||
-rw-r--r-- | crctable.c | 4 | ||||
-rw-r--r-- | decompress.c | 20 | ||||
-rw-r--r-- | entities.xml | 9 | ||||
-rwxr-xr-x | format.pl | 53 | ||||
-rw-r--r-- | huffman.c | 23 | ||||
-rw-r--r-- | manual.texi | 2243 | ||||
-rw-r--r-- | manual.xml | 2966 | ||||
-rw-r--r-- | randtable.c | 4 | ||||
-rwxr-xr-x | xmlproc.sh | 99 |
30 files changed, 3947 insertions, 2636 deletions
@@ -251,3 +251,25 @@ of bzip2: | |||
251 | 251 | ||
252 | * added --fast and --best aliases for -1 -9 for gzip compatibility. | 252 | * added --fast and --best aliases for -1 -9 for gzip compatibility. |
253 | 253 | ||
254 | |||
255 | 1.0.3 (15 Feb 05) | ||
256 | ~~~~~~~~~~~~~~~~~ | ||
257 | Fixes some minor bugs since the last version, 1.0.2. | ||
258 | |||
259 | * Further robustification against corrupted compressed data. | ||
260 | There are currently no known bitstreams which can cause the | ||
261 | decompressor to crash, loop or access memory which does not | ||
262 | belong to it. If you are using bzip2 or the library to | ||
263 | decompress bitstreams from untrusted sources, an upgrade | ||
264 | to 1.0.3 is recommended. | ||
265 | |||
266 | * The documentation has been converted to XML, from which html | ||
267 | and pdf can be derived. | ||
268 | |||
269 | * Various minor bugs in the documentation have been fixed. | ||
270 | |||
271 | * Fixes for various compilation warnings with newer versions of | ||
272 | gcc, and on 64-bit platforms. | ||
273 | |||
274 | * The BZ_NO_STDIO cpp symbol was not properly observed in 1.0.2. | ||
275 | This has been fixed. | ||
@@ -1,6 +1,7 @@ | |||
1 | 1 | ||
2 | This program, "bzip2" and associated library "libbzip2", are | 2 | This program, "bzip2", the associated library "libbzip2", and all |
3 | copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 3 | documentation, are copyright (C) 1996-2005 Julian R Seward. All |
4 | rights reserved. | ||
4 | 5 | ||
5 | Redistribution and use in source and binary forms, with or without | 6 | Redistribution and use in source and binary forms, with or without |
6 | modification, are permitted provided that the following conditions | 7 | modification, are permitted provided that the following conditions |
@@ -35,5 +36,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
35 | 36 | ||
36 | Julian Seward, Cambridge, UK. | 37 | Julian Seward, Cambridge, UK. |
37 | jseward@acm.org | 38 | jseward@acm.org |
38 | bzip2/libbzip2 version 1.0.2 of 30 December 2001 | 39 | bzip2/libbzip2 version 1.0.3 of 15 February 2005 |
39 | 40 | ||
@@ -7,9 +7,8 @@ AR=ar | |||
7 | RANLIB=ranlib | 7 | RANLIB=ranlib |
8 | LDFLAGS= | 8 | LDFLAGS= |
9 | 9 | ||
10 | # Suitably paranoid flags to avoid bugs in gcc-2.7 | ||
11 | BIGFILES=-D_FILE_OFFSET_BITS=64 | 10 | BIGFILES=-D_FILE_OFFSET_BITS=64 |
12 | CFLAGS=-Wall -Winline -O2 -fomit-frame-pointer -fno-strength-reduce $(BIGFILES) | 11 | CFLAGS=-Wall -Winline -O -g $(BIGFILES) |
13 | 12 | ||
14 | # Where you want it installed when you do 'make install' | 13 | # Where you want it installed when you do 'make install' |
15 | PREFIX=/usr | 14 | PREFIX=/usr |
@@ -96,7 +95,6 @@ install: bzip2 bzip2recover | |||
96 | echo ".so man1/bzmore.1" > $(PREFIX)/man/man1/bzless.1 | 95 | echo ".so man1/bzmore.1" > $(PREFIX)/man/man1/bzless.1 |
97 | echo ".so man1/bzdiff.1" > $(PREFIX)/man/man1/bzcmp.1 | 96 | echo ".so man1/bzdiff.1" > $(PREFIX)/man/man1/bzcmp.1 |
98 | 97 | ||
99 | distclean: clean | ||
100 | clean: | 98 | clean: |
101 | rm -f *.o libbz2.a bzip2 bzip2recover \ | 99 | rm -f *.o libbz2.a bzip2 bzip2recover \ |
102 | sample1.rb2 sample2.rb2 sample3.rb2 \ | 100 | sample1.rb2 sample2.rb2 sample3.rb2 \ |
@@ -122,8 +120,12 @@ bzip2.o: bzip2.c | |||
122 | bzip2recover.o: bzip2recover.c | 120 | bzip2recover.o: bzip2recover.c |
123 | $(CC) $(CFLAGS) -c bzip2recover.c | 121 | $(CC) $(CFLAGS) -c bzip2recover.c |
124 | 122 | ||
125 | DISTNAME=bzip2-1.0.2 | 123 | |
126 | tarfile: | 124 | distclean: clean |
125 | rm -f manual.ps manual.html manual.pdf | ||
126 | |||
127 | DISTNAME=bzip2-1.0.3 | ||
128 | dist: check manual | ||
127 | rm -f $(DISTNAME) | 129 | rm -f $(DISTNAME) |
128 | ln -sf . $(DISTNAME) | 130 | ln -sf . $(DISTNAME) |
129 | tar cvf $(DISTNAME).tar \ | 131 | tar cvf $(DISTNAME).tar \ |
@@ -139,9 +141,6 @@ tarfile: | |||
139 | $(DISTNAME)/bzlib.h \ | 141 | $(DISTNAME)/bzlib.h \ |
140 | $(DISTNAME)/bzlib_private.h \ | 142 | $(DISTNAME)/bzlib_private.h \ |
141 | $(DISTNAME)/Makefile \ | 143 | $(DISTNAME)/Makefile \ |
142 | $(DISTNAME)/manual.texi \ | ||
143 | $(DISTNAME)/manual.ps \ | ||
144 | $(DISTNAME)/manual.pdf \ | ||
145 | $(DISTNAME)/LICENSE \ | 144 | $(DISTNAME)/LICENSE \ |
146 | $(DISTNAME)/bzip2.1 \ | 145 | $(DISTNAME)/bzip2.1 \ |
147 | $(DISTNAME)/bzip2.1.preformatted \ | 146 | $(DISTNAME)/bzip2.1.preformatted \ |
@@ -157,9 +156,12 @@ tarfile: | |||
157 | $(DISTNAME)/sample2.bz2 \ | 156 | $(DISTNAME)/sample2.bz2 \ |
158 | $(DISTNAME)/sample3.bz2 \ | 157 | $(DISTNAME)/sample3.bz2 \ |
159 | $(DISTNAME)/dlltest.c \ | 158 | $(DISTNAME)/dlltest.c \ |
160 | $(DISTNAME)/*.html \ | 159 | $(DISTNAME)/manual.html \ |
160 | $(DISTNAME)/manual.pdf \ | ||
161 | $(DISTNAME)/manual.ps \ | ||
161 | $(DISTNAME)/README \ | 162 | $(DISTNAME)/README \ |
162 | $(DISTNAME)/README.COMPILATION.PROBLEMS \ | 163 | $(DISTNAME)/README.COMPILATION.PROBLEMS \ |
164 | $(DISTNAME)/README.XML.STUFF \ | ||
163 | $(DISTNAME)/CHANGES \ | 165 | $(DISTNAME)/CHANGES \ |
164 | $(DISTNAME)/libbz2.def \ | 166 | $(DISTNAME)/libbz2.def \ |
165 | $(DISTNAME)/libbz2.dsp \ | 167 | $(DISTNAME)/libbz2.dsp \ |
@@ -175,18 +177,29 @@ tarfile: | |||
175 | $(DISTNAME)/bzmore.1 \ | 177 | $(DISTNAME)/bzmore.1 \ |
176 | $(DISTNAME)/bzgrep \ | 178 | $(DISTNAME)/bzgrep \ |
177 | $(DISTNAME)/bzgrep.1 \ | 179 | $(DISTNAME)/bzgrep.1 \ |
178 | $(DISTNAME)/Makefile-libbz2_so | 180 | $(DISTNAME)/Makefile-libbz2_so \ |
181 | $(DISTNAME)/bz-common.xsl \ | ||
182 | $(DISTNAME)/bz-fo.xsl \ | ||
183 | $(DISTNAME)/bz-html.xsl \ | ||
184 | $(DISTNAME)/bzip.css \ | ||
185 | $(DISTNAME)/entities.xml \ | ||
186 | $(DISTNAME)/manual.xml \ | ||
187 | $(DISTNAME)/format.pl \ | ||
188 | $(DISTNAME)/xmlproc.sh | ||
179 | gzip -v $(DISTNAME).tar | 189 | gzip -v $(DISTNAME).tar |
180 | 190 | ||
181 | # For rebuilding the manual from sources on my RedHat 7.2 box | 191 | # For rebuilding the manual from sources on my SuSE 9.1 box |
182 | manual: manual.ps manual.pdf manual.html | 192 | |
193 | MANUAL_SRCS= bz-common.xsl bz-fo.xsl bz-html.xsl bzip.css \ | ||
194 | entities.xml manual.xml | ||
195 | |||
196 | manual: manual.html manual.ps manual.pdf | ||
183 | 197 | ||
184 | manual.ps: manual.texi | 198 | manual.ps: $(MANUAL_SRCS) |
185 | tex manual.texi | 199 | ./xmlproc.sh -ps manual.xml |
186 | dvips -o manual.ps manual.dvi | ||
187 | 200 | ||
188 | manual.pdf: manual.ps | 201 | manual.pdf: $(MANUAL_SRCS) |
189 | ps2pdf manual.ps | 202 | ./xmlproc.sh -pdf manual.xml |
190 | 203 | ||
191 | manual.html: manual.texi | 204 | manual.html: $(MANUAL_SRCS) |
192 | texi2html -split_chapter manual.texi | 205 | ./xmlproc.sh -html manual.xml |
diff --git a/Makefile-libbz2_so b/Makefile-libbz2_so index 4986fe2..458c5a1 100644 --- a/Makefile-libbz2_so +++ b/Makefile-libbz2_so | |||
@@ -1,6 +1,6 @@ | |||
1 | 1 | ||
2 | # This Makefile builds a shared version of the library, | 2 | # This Makefile builds a shared version of the library, |
3 | # libbz2.so.1.0.2, with soname libbz2.so.1.0, | 3 | # libbz2.so.1.0.3, with soname libbz2.so.1.0, |
4 | # at least on x86-Linux (RedHat 7.2), | 4 | # at least on x86-Linux (RedHat 7.2), |
5 | # with gcc-2.96 20000731 (Red Hat Linux 7.1 2.96-98). | 5 | # with gcc-2.96 20000731 (Red Hat Linux 7.1 2.96-98). |
6 | # Please see the README file for some | 6 | # Please see the README file for some |
@@ -9,7 +9,7 @@ | |||
9 | SHELL=/bin/sh | 9 | SHELL=/bin/sh |
10 | CC=gcc | 10 | CC=gcc |
11 | BIGFILES=-D_FILE_OFFSET_BITS=64 | 11 | BIGFILES=-D_FILE_OFFSET_BITS=64 |
12 | CFLAGS=-fpic -fPIC -Wall -Winline -O2 -fomit-frame-pointer -fno-strength-reduce $(BIGFILES) | 12 | CFLAGS=-fpic -fPIC -Wall -Winline -O -g |
13 | 13 | ||
14 | OBJS= blocksort.o \ | 14 | OBJS= blocksort.o \ |
15 | huffman.o \ | 15 | huffman.o \ |
@@ -20,13 +20,13 @@ OBJS= blocksort.o \ | |||
20 | bzlib.o | 20 | bzlib.o |
21 | 21 | ||
22 | all: $(OBJS) | 22 | all: $(OBJS) |
23 | $(CC) -shared -Wl,-soname -Wl,libbz2.so.1.0 -o libbz2.so.1.0.2 $(OBJS) | 23 | $(CC) -shared -Wl,-soname -Wl,libbz2.so.1.0 -o libbz2.so.1.0.3 $(OBJS) |
24 | $(CC) $(CFLAGS) -o bzip2-shared bzip2.c libbz2.so.1.0.2 | 24 | $(CC) $(CFLAGS) -o bzip2-shared bzip2.c libbz2.so.1.0.3 |
25 | rm -f libbz2.so.1.0 | 25 | rm -f libbz2.so.1.0 |
26 | ln -s libbz2.so.1.0.2 libbz2.so.1.0 | 26 | ln -s libbz2.so.1.0.3 libbz2.so.1.0 |
27 | 27 | ||
28 | clean: | 28 | clean: |
29 | rm -f $(OBJS) bzip2.o libbz2.so.1.0.2 libbz2.so.1.0 bzip2-shared | 29 | rm -f $(OBJS) bzip2.o libbz2.so.1.0.3 libbz2.so.1.0 bzip2-shared |
30 | 30 | ||
31 | blocksort.o: blocksort.c | 31 | blocksort.o: blocksort.c |
32 | $(CC) $(CFLAGS) -c blocksort.c | 32 | $(CC) $(CFLAGS) -c blocksort.c |
@@ -1,15 +1,15 @@ | |||
1 | 1 | ||
2 | This is the README for bzip2, a block-sorting file compressor, version | 2 | This is the README for bzip2, a block-sorting file compressor, version |
3 | 1.0.2. This version is fully compatible with the previous public | 3 | 1.0.3. This version is fully compatible with the previous public |
4 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0 and 1.0.1. | 4 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0, 1.0.1 and 1.0.2. |
5 | 5 | ||
6 | bzip2-1.0.2 is distributed under a BSD-style license. For details, | 6 | bzip2-1.0.3 is distributed under a BSD-style license. For details, |
7 | see the file LICENSE. | 7 | see the file LICENSE. |
8 | 8 | ||
9 | Complete documentation is available in Postscript form (manual.ps), | 9 | Complete documentation is available in Postscript form (manual.ps), |
10 | PDF (manual.pdf, amazingly enough) or html (manual_toc.html). A | 10 | PDF (manual.pdf) or html (manual.html). A plain-text version of the |
11 | plain-text version of the manual page is available as bzip2.txt. | 11 | manual page is available as bzip2.txt. A statement about Y2K issues |
12 | A statement about Y2K issues is now included in the file Y2K_INFO. | 12 | is now included in the file Y2K_INFO. |
13 | 13 | ||
14 | 14 | ||
15 | HOW TO BUILD -- UNIX | 15 | HOW TO BUILD -- UNIX |
@@ -78,8 +78,7 @@ importance. To validate bzip2, I used a modified version of Mark | |||
78 | Nelson's churn program. Churn is an automated test driver which | 78 | Nelson's churn program. Churn is an automated test driver which |
79 | recursively traverses a directory structure, using bzip2 to compress | 79 | recursively traverses a directory structure, using bzip2 to compress |
80 | and then decompress each file it encounters, and checking that the | 80 | and then decompress each file it encounters, and checking that the |
81 | decompressed data is the same as the original. There are more details | 81 | decompressed data is the same as the original. |
82 | in Section 4 of the user guide. | ||
83 | 82 | ||
84 | 83 | ||
85 | 84 | ||
@@ -119,9 +118,9 @@ DISCLAIMER: | |||
119 | PATENTS: | 118 | PATENTS: |
120 | 119 | ||
121 | To the best of my knowledge, bzip2 does not use any patented | 120 | To the best of my knowledge, bzip2 does not use any patented |
122 | algorithms. However, I do not have the resources available to | 121 | algorithms. However, I do not have the resources to carry out |
123 | carry out a full patent search. Therefore I cannot give any | 122 | a patent search. Therefore I cannot give any guarantee of the |
124 | guarantee of the above statement. | 123 | above statement. |
125 | 124 | ||
126 | End of legalities. | 125 | End of legalities. |
127 | 126 | ||
@@ -153,22 +152,26 @@ WHAT'S NEW IN 1.0.2 ? | |||
153 | 152 | ||
154 | See the CHANGES file. | 153 | See the CHANGES file. |
155 | 154 | ||
155 | WHAT'S NEW IN 1.0.3 ? | ||
156 | |||
157 | See the CHANGES file. | ||
158 | |||
156 | 159 | ||
157 | I hope you find bzip2 useful. Feel free to contact me at | 160 | I hope you find bzip2 useful. Feel free to contact me at |
158 | jseward@acm.org | 161 | jseward@bzip.org |
159 | if you have any suggestions or queries. Many people mailed me with | 162 | if you have any suggestions or queries. Many people mailed me with |
160 | comments, suggestions and patches after the releases of bzip-0.15, | 163 | comments, suggestions and patches after the releases of bzip-0.15, |
161 | bzip-0.21, and bzip2 versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0 and 1.0.1, | 164 | bzip-0.21, and bzip2 versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0, 1.0.1 and |
162 | and the changes in bzip2 are largely a result of this feedback. | 165 | 1.0.2, and the changes in bzip2 are largely a result of this feedback. |
163 | I thank you for your comments. | 166 | I thank you for your comments. |
164 | 167 | ||
165 | At least for the time being, bzip2's "home" is (or can be reached via) | 168 | At least for the time being, bzip2's "home" is (or can be reached via) |
166 | http://sources.redhat.com/bzip2. | 169 | http://www.bzip.org |
167 | 170 | ||
168 | Julian Seward | 171 | Julian Seward |
169 | jseward@acm.org | 172 | jseward@bzip.org |
170 | 173 | ||
171 | Cambridge, UK (and what a great town this is!) | 174 | Cambridge, UK. |
172 | 175 | ||
173 | 18 July 1996 (version 0.15) | 176 | 18 July 1996 (version 0.15) |
174 | 25 August 1996 (version 0.21) | 177 | 25 August 1996 (version 0.21) |
@@ -178,4 +181,5 @@ Cambridge, UK (and what a great town this is!) | |||
178 | 8 June 1999 (bzip2, version 0.9.5) | 181 | 8 June 1999 (bzip2, version 0.9.5) |
179 | 4 Sept 1999 (bzip2, version 0.9.5d) | 182 | 4 Sept 1999 (bzip2, version 0.9.5d) |
180 | 5 May 2000 (bzip2, version 1.0pre8) | 183 | 5 May 2000 (bzip2, version 1.0pre8) |
181 | 30 December 2001 (bzip2, version 1.0.2pre1) \ No newline at end of file | 184 | 30 December 2001 (bzip2, version 1.0.2pre1) |
185 | 15 February 2005 (bzip2, version 1.0.3) | ||
diff --git a/README.COMPILATION.PROBLEMS b/README.COMPILATION.PROBLEMS index bd1822d..f1bc396 100644 --- a/README.COMPILATION.PROBLEMS +++ b/README.COMPILATION.PROBLEMS | |||
@@ -1,11 +1,10 @@ | |||
1 | 1 | ||
2 | bzip2-1.0 should compile without problems on the vast majority of | 2 | bzip2-1.0.3 should compile without problems on the vast majority of |
3 | platforms. Using the supplied Makefile, I've built and tested it | 3 | platforms. Using the supplied Makefile, I've built and tested it |
4 | myself for x86-linux, sparc-solaris, alpha-linux, x86-cygwin32 and | 4 | myself for x86-linux and x86_64-linux. With makefile.msc, Visual C++ |
5 | alpha-tru64unix. With makefile.msc, Visual C++ 6.0 and nmake, you can | 5 | 6.0 and nmake, you can build a native Win32 version too. Large file |
6 | build a native Win32 version too. Large file support seems to work | 6 | support seems to work correctly on at least alpha-tru64unix and |
7 | correctly on at least alpha-tru64unix and x86-cygwin32 (on Windows | 7 | x86-cygwin32 (on Windows 2000). |
8 | 2000). | ||
9 | 8 | ||
10 | When I say "large file" I mean a file of size 2,147,483,648 (2^31) | 9 | When I say "large file" I mean a file of size 2,147,483,648 (2^31) |
11 | bytes or above. Many older OSs can't handle files above this size, | 10 | bytes or above. Many older OSs can't handle files above this size, |
@@ -22,7 +21,7 @@ The technique of adding -D_FILE_OFFSET_BITS=64 to get large file | |||
22 | support is, as far as I know, the Recommended Way to get correct large | 21 | support is, as far as I know, the Recommended Way to get correct large |
23 | file support. For more details, see the Large File Support | 22 | file support. For more details, see the Large File Support |
24 | Specification, published by the Large File Summit, at | 23 | Specification, published by the Large File Summit, at |
25 | http://www.sas.com/standard/large.file/ | 24 | http://ftp.sas.com/standards/large.file |
26 | 25 | ||
27 | As a general comment, if you get compilation errors which you think | 26 | As a general comment, if you get compilation errors which you think |
28 | are related to large file support, try removing the above define from | 27 | are related to large file support, try removing the above define from |
@@ -38,93 +37,3 @@ You can use the spewG.c program to generate huge files to test bzip2's | |||
38 | large file support, if you are feeling paranoid. Be aware though that | 37 | large file support, if you are feeling paranoid. Be aware though that |
39 | any compilation problems which affect bzip2 will also affect spewG.c, | 38 | any compilation problems which affect bzip2 will also affect spewG.c, |
40 | alas. | 39 | alas. |
41 | |||
42 | |||
43 | Known problems as of 1.0pre8: | ||
44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
45 | |||
46 | * HP/UX 10.20 and 11.00, using gcc (2.7.2.3 and 2.95.2): A large | ||
47 | number of warnings appear, including the following: | ||
48 | |||
49 | /usr/include/sys/resource.h: In function `getrlimit': | ||
50 | /usr/include/sys/resource.h:168: | ||
51 | warning: implicit declaration of function `__getrlimit64' | ||
52 | /usr/include/sys/resource.h: In function `setrlimit': | ||
53 | /usr/include/sys/resource.h:170: | ||
54 | warning: implicit declaration of function `__setrlimit64' | ||
55 | |||
56 | This would appear to be a problem with large file support, header | ||
57 | files and gcc. gcc may or may not give up at this point. If it | ||
58 | fails, you might be able to improve matters by adding | ||
59 | -D__STDC_EXT__=1 | ||
60 | to the BIGFILES variable in the Makefile (ie, change its definition | ||
61 | to | ||
62 | BIGFILES=-D_FILE_OFFSET_BITS=64 -D__STDC_EXT__=1 | ||
63 | |||
64 | Even if gcc does produce a binary which appears to work (ie passes | ||
65 | its self-tests), you might want to test it to see if it works properly | ||
66 | on large files. | ||
67 | |||
68 | |||
69 | * HP/UX 10.20 and 11.00, using HP's cc compiler. | ||
70 | |||
71 | No specific problems for this combination, except that you'll need to | ||
72 | specify the -Ae flag, and zap the gcc-specific stuff | ||
73 | -Wall -Winline -O2 -fomit-frame-pointer -fno-strength-reduce. | ||
74 | You should retain -D_FILE_OFFSET_BITS=64 in order to get large | ||
75 | file support -- which is reported to work ok for this HP/UX + cc | ||
76 | combination. | ||
77 | |||
78 | |||
79 | * SunOS 4.1.X. | ||
80 | |||
81 | Amazingly, there are still people out there using this venerable old | ||
82 | banger. I shouldn't be too rude -- I started life on SunOS, and | ||
83 | it was a pretty darn good OS, way back then. Anyway: | ||
84 | |||
85 | SunOS doesn't seem to have strerror(), so you'll have to use | ||
86 | perror(), perhaps by adding this (warning: UNTESTED CODE): | ||
87 | |||
88 | char* strerror ( int errnum ) | ||
89 | { | ||
90 | if (errnum < 0 || errnum >= sys_nerr) | ||
91 | return "Unknown error"; | ||
92 | else | ||
93 | return sys_errlist[errnum]; | ||
94 | } | ||
95 | |||
96 | Or you could comment out the relevant calls to strerror; they're | ||
97 | not mission-critical. Or you could upgrade to Solaris. Ha ha ha! | ||
98 | (what?? you think I've got Bad Attitude?) | ||
99 | |||
100 | |||
101 | * Making a shared library on Solaris. (Not really a compilation | ||
102 | problem, but many people ask ...) | ||
103 | |||
104 | Firstly, if you have Solaris 8, either you have libbz2.so already | ||
105 | on your system, or you can install it from the Solaris CD. | ||
106 | |||
107 | Secondly, be aware that there are potential naming conflicts | ||
108 | between the .so file supplied with Solaris 8, and the .so file | ||
109 | which Makefile-libbz2_so will make. Makefile-libbz2_so creates | ||
110 | a .so which has the names which I intend to be "official" as | ||
111 | of version 1.0.0 and onwards. Unfortunately, the .so in | ||
112 | Solaris 8 appeared before I decided on the final names, so | ||
113 | the two libraries are incompatible. We have since communicated | ||
114 | and I hope that the problems will have been solved in the next | ||
115 | version of Solaris, whenever that might appear. | ||
116 | |||
117 | All that said: you might be able to get somewhere | ||
118 | by finding the line in Makefile-libbz2_so which says | ||
119 | |||
120 | $(CC) -shared -Wl,-soname -Wl,libbz2.so.1.0 -o libbz2.so.1.0.2 $(OBJS) | ||
121 | |||
122 | and replacing with | ||
123 | |||
124 | $(CC) -G -shared -o libbz2.so.1.0.2 -h libbz2.so.1.0 $(OBJS) | ||
125 | |||
126 | If gcc objects to the combination -fpic -fPIC, get rid of | ||
127 | the second one, leaving just "-fpic". | ||
128 | |||
129 | |||
130 | That's the end of the currently known compilation problems. | ||
diff --git a/README.XML.STUFF b/README.XML.STUFF new file mode 100644 index 0000000..0ff209f --- /dev/null +++ b/README.XML.STUFF | |||
@@ -0,0 +1,31 @@ | |||
1 | The script xmlproc.sh takes an xml file as input, | ||
2 | and processes it to create .pdf, .html or .ps output. | ||
3 | It uses format.pl, a perl script to format <pre> blocks nicely, | ||
4 | and add CDATA tags so writers do not have to use eg. &lt; | ||
5 | |||
6 | The file "entities.xml" must be edited to reflect current | ||
7 | version, year, etc. | ||
8 | |||
9 | |||
10 | Usage: | ||
11 | |||
12 | xmlproc.sh -v manual.xml | ||
13 | Validates an xml file to ensure no dtd-compliance errors | ||
14 | |||
15 | xmlproc.sh -html manual.xml | ||
16 | Output: manual.html | ||
17 | |||
18 | xmlproc.sh -pdf manual.xml | ||
19 | Output: manual.pdf | ||
20 | |||
21 | xmlproc.sh -ps manual.xml | ||
22 | Output: manual.ps | ||
23 | |||
24 | |||
25 | Nota bene: | ||
26 | - pdfxmltex barfs if given a filename with an underscore in it | ||
27 | |||
28 | - xmltex won't work yet - there's a bug in passivetex | ||
29 | which we are all waiting for Sebastian to fix. | ||
30 | So we are going the xml -> pdf -> ps route for the time being, | ||
31 | using pdfxmltex. | ||
diff --git a/blocksort.c b/blocksort.c index aba3efc..33ec9f5 100644 --- a/blocksort.c +++ b/blocksort.c | |||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
diff --git a/bz-common.xsl b/bz-common.xsl new file mode 100644 index 0000000..66fcd6f --- /dev/null +++ b/bz-common.xsl | |||
@@ -0,0 +1,39 @@ | |||
1 | <?xml version="1.0"?> <!-- -*- sgml -*- --> | ||
2 | <xsl:stylesheet | ||
3 | xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> | ||
4 | |||
5 | <!-- we like '1.2 Title' --> | ||
6 | <xsl:param name="section.autolabel" select="'1'"/> | ||
7 | <xsl:param name="section.label.includes.component.label" select="'1'"/> | ||
8 | |||
9 | <!-- Do not put 'Chapter' at the start of eg 'Chapter 1. Doing This' --> | ||
10 | <xsl:param name="local.l10n.xml" select="document('')"/> | ||
11 | <l:i18n xmlns:l="http://docbook.sourceforge.net/xmlns/l10n/1.0"> | ||
12 | <l:l10n language="en"> | ||
13 | <l:context name="title-numbered"> | ||
14 | <l:template name="chapter" text="%n. %t"/> | ||
15 | </l:context> | ||
16 | </l:l10n> | ||
17 | </l:i18n> | ||
18 | |||
19 | <!-- don't generate sub-tocs for qanda sets --> | ||
20 | <xsl:param name="generate.toc"> | ||
21 | set toc,title | ||
22 | book toc,title,figure,table,example,equation | ||
23 | chapter toc,title | ||
24 | section toc | ||
25 | sect1 toc | ||
26 | sect2 toc | ||
27 | sect3 toc | ||
28 | sect4 nop | ||
29 | sect5 nop | ||
30 | qandaset toc | ||
31 | qandadiv nop | ||
32 | appendix toc,title | ||
33 | article/appendix nop | ||
34 | article toc,title | ||
35 | preface toc,title | ||
36 | reference toc,title | ||
37 | </xsl:param> | ||
38 | |||
39 | </xsl:stylesheet> | ||
diff --git a/bz-fo.xsl b/bz-fo.xsl new file mode 100644 index 0000000..7f2a767 --- /dev/null +++ b/bz-fo.xsl | |||
@@ -0,0 +1,257 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?> <!-- -*- sgml -*- --> | ||
2 | <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | ||
3 | xmlns:fo="http://www.w3.org/1999/XSL/Format" version="1.0"> | ||
4 | |||
5 | <xsl:import href="http://docbook.sourceforge.net/release/xsl/current/fo/docbook.xsl"/> | ||
6 | <xsl:import href="bz-common.xsl"/> | ||
7 | |||
8 | <!-- set indent = yes while debugging, then change to NO --> | ||
9 | <xsl:output method="xml" indent="yes"/> | ||
10 | |||
11 | <!-- ensure only passivetex extensions are on --> | ||
12 | <xsl:param name="stylesheet.result.type" select="'fo'"/> | ||
13 | <!-- fo extensions: PDF bookmarks and index terms --> | ||
14 | <xsl:param name="use.extensions" select="'1'"/> | ||
15 | <xsl:param name="xep.extensions" select="0"/> | ||
16 | <xsl:param name="fop.extensions" select="0"/> | ||
17 | <xsl:param name="saxon.extensions" select="0"/> | ||
18 | <xsl:param name="passivetex.extensions" select="1"/> | ||
19 | <xsl:param name="tablecolumns.extension" select="'1'"/> | ||
20 | |||
21 | <!-- ensure we are using single sided --> | ||
22 | <xsl:param name="double.sided" select="'0'"/> | ||
23 | |||
24 | <!-- insert cross references to page numbers --> | ||
25 | <xsl:param name="insert.xref.page.number" select="1"/> | ||
26 | |||
27 | <!-- <?custom-pagebreak?> inserts a page break at this point --> | ||
28 | <xsl:template match="processing-instruction('custom-pagebreak')"> | ||
29 | <fo:block break-before='page'/> | ||
30 | </xsl:template> | ||
31 | |||
32 | <!-- show links in color --> | ||
33 | <xsl:attribute-set name="xref.properties"> | ||
34 | <xsl:attribute name="color">blue</xsl:attribute> | ||
35 | </xsl:attribute-set> | ||
36 | |||
37 | <!-- make pre listings indented a bit + a bg colour --> | ||
38 | <xsl:template match="programlisting | screen"> | ||
39 | <fo:block start-indent="0.25in" wrap-option="no-wrap" | ||
40 | white-space-collapse="false" text-align="start" | ||
41 | font-family="monospace" background-color="#f2f2f9" | ||
42 | linefeed-treatment="preserve" | ||
43 | xsl:use-attribute-sets="normal.para.spacing"> | ||
44 | <xsl:apply-templates/> | ||
45 | </fo:block> | ||
46 | </xsl:template> | ||
47 | <!-- make verbatim output prettier --> | ||
48 | <xsl:template match="literallayout"> | ||
49 | <fo:block start-indent="0.25in" wrap-option="no-wrap" | ||
50 | white-space-collapse="false" text-align="start" | ||
51 | font-family="monospace" background-color="#edf7f4" | ||
52 | linefeed-treatment="preserve" | ||
53 | space-before="0em" space-after="0em"> | ||
54 | <xsl:apply-templates/> | ||
55 | </fo:block> | ||
56 | </xsl:template> | ||
57 | |||
58 | <!-- workaround bug in passivetex fo output for itemizedlist --> | ||
59 | <xsl:template match="itemizedlist/listitem"> | ||
60 | <xsl:variable name="id"> | ||
61 | <xsl:call-template name="object.id"/></xsl:variable> | ||
62 | <xsl:variable name="itemsymbol"> | ||
63 | <xsl:call-template name="list.itemsymbol"> | ||
64 | <xsl:with-param name="node" select="parent::itemizedlist"/> | ||
65 | </xsl:call-template> | ||
66 | </xsl:variable> | ||
67 | <xsl:variable name="item.contents"> | ||
68 | <fo:list-item-label end-indent="label-end()"> | ||
69 | <fo:block> | ||
70 | <xsl:choose> | ||
71 | <xsl:when test="$itemsymbol='disc'">•</xsl:when> | ||
72 | <xsl:when test="$itemsymbol='bullet'">•</xsl:when> | ||
73 | <xsl:otherwise>•</xsl:otherwise> | ||
74 | </xsl:choose> | ||
75 | </fo:block> | ||
76 | </fo:list-item-label> | ||
77 | <fo:list-item-body start-indent="body-start()"> | ||
78 | <xsl:apply-templates/> <!-- removed extra block wrapper --> | ||
79 | </fo:list-item-body> | ||
80 | </xsl:variable> | ||
81 | <xsl:choose> | ||
82 | <xsl:when test="parent::*/@spacing = 'compact'"> | ||
83 | <fo:list-item id="{$id}" | ||
84 | xsl:use-attribute-sets="compact.list.item.spacing"> | ||
85 | <xsl:copy-of select="$item.contents"/> | ||
86 | </fo:list-item> | ||
87 | </xsl:when> | ||
88 | <xsl:otherwise> | ||
89 | <fo:list-item id="{$id}" xsl:use-attribute-sets="list.item.spacing"> | ||
90 | <xsl:copy-of select="$item.contents"/> | ||
91 | </fo:list-item> | ||
92 | </xsl:otherwise> | ||
93 | </xsl:choose> | ||
94 | </xsl:template> | ||
95 | |||
96 | <!-- workaround bug in passivetex fo output for orderedlist --> | ||
97 | <xsl:template match="orderedlist/listitem"> | ||
98 | <xsl:variable name="id"> | ||
99 | <xsl:call-template name="object.id"/></xsl:variable> | ||
100 | <xsl:variable name="item.contents"> | ||
101 | <fo:list-item-label end-indent="label-end()"> | ||
102 | <fo:block> | ||
103 | <xsl:apply-templates select="." mode="item-number"/> | ||
104 | </fo:block> | ||
105 | </fo:list-item-label> | ||
106 | <fo:list-item-body start-indent="body-start()"> | ||
107 | <xsl:apply-templates/> <!-- removed extra block wrapper --> | ||
108 | </fo:list-item-body> | ||
109 | </xsl:variable> | ||
110 | <xsl:choose> | ||
111 | <xsl:when test="parent::*/@spacing = 'compact'"> | ||
112 | <fo:list-item id="{$id}" | ||
113 | xsl:use-attribute-sets="compact.list.item.spacing"> | ||
114 | <xsl:copy-of select="$item.contents"/> | ||
115 | </fo:list-item> | ||
116 | </xsl:when> | ||
117 | <xsl:otherwise> | ||
118 | <fo:list-item id="{$id}" xsl:use-attribute-sets="list.item.spacing"> | ||
119 | <xsl:copy-of select="$item.contents"/> | ||
120 | </fo:list-item> | ||
121 | </xsl:otherwise> | ||
122 | </xsl:choose> | ||
123 | </xsl:template> | ||
124 | |||
125 | <!-- workaround bug in passivetex fo output for variablelist --> | ||
126 | <xsl:param name="variablelist.as.blocks" select="1"/> | ||
127 | <xsl:template match="varlistentry" mode="vl.as.blocks"> | ||
128 | <xsl:variable name="id"> | ||
129 | <xsl:call-template name="object.id"/></xsl:variable> | ||
130 | <fo:block id="{$id}" xsl:use-attribute-sets="list.item.spacing" | ||
131 | keep-together.within-column="always" | ||
132 | keep-with-next.within-column="always"> | ||
133 | <xsl:apply-templates select="term"/> | ||
134 | </fo:block> | ||
135 | <fo:block start-indent="0.5in" end-indent="0in" | ||
136 | space-after.minimum="0.2em" | ||
137 | space-after.optimum="0.4em" | ||
138 | space-after.maximum="0.6em"> | ||
139 | <fo:block> | ||
140 | <xsl:apply-templates select="listitem"/> | ||
141 | </fo:block> | ||
142 | </fo:block> | ||
143 | </xsl:template> | ||
144 | |||
145 | |||
146 | <!-- workaround bug in footers: force right-align w/two 80|20 cols --> | ||
147 | <xsl:template name="footer.table"> | ||
148 | <xsl:param name="pageclass" select="''"/> | ||
149 | <xsl:param name="sequence" select="''"/> | ||
150 | <xsl:param name="gentext-key" select="''"/> | ||
151 | <xsl:choose> | ||
152 | <xsl:when test="$pageclass = 'index'"> | ||
153 | <xsl:attribute name="margin-left">0pt</xsl:attribute> | ||
154 | </xsl:when> | ||
155 | </xsl:choose> | ||
156 | <xsl:variable name="candidate"> | ||
157 | <fo:table table-layout="fixed" width="100%"> | ||
158 | <fo:table-column column-number="1" column-width="80%"/> | ||
159 | <fo:table-column column-number="2" column-width="20%"/> | ||
160 | <fo:table-body> | ||
161 | <fo:table-row height="14pt"> | ||
162 | <fo:table-cell text-align="left" display-align="after"> | ||
163 | <xsl:attribute name="relative-align">baseline</xsl:attribute> | ||
164 | <fo:block> | ||
165 | <fo:block> </fo:block><!-- empty cell --> | ||
166 | </fo:block> | ||
167 | </fo:table-cell> | ||
168 | <fo:table-cell text-align="center" display-align="after"> | ||
169 | <xsl:attribute name="relative-align">baseline</xsl:attribute> | ||
170 | <fo:block> | ||
171 | <xsl:call-template name="footer.content"> | ||
172 | <xsl:with-param name="pageclass" select="$pageclass"/> | ||
173 | <xsl:with-param name="sequence" select="$sequence"/> | ||
174 | <xsl:with-param name="position" select="'center'"/> | ||
175 | <xsl:with-param name="gentext-key" select="$gentext-key"/> | ||
176 | </xsl:call-template> | ||
177 | </fo:block> | ||
178 | </fo:table-cell> | ||
179 | </fo:table-row> | ||
180 | </fo:table-body> | ||
181 | </fo:table> | ||
182 | </xsl:variable> | ||
183 | <!-- Really output a footer? --> | ||
184 | <xsl:choose> | ||
185 | <xsl:when test="$pageclass='titlepage' and $gentext-key='book' | ||
186 | and $sequence='first'"> | ||
187 | <!-- no, book titlepages have no footers at all --> | ||
188 | </xsl:when> | ||
189 | <xsl:when test="$sequence = 'blank' and $footers.on.blank.pages = 0"> | ||
190 | <!-- no output --> | ||
191 | </xsl:when> | ||
192 | <xsl:otherwise> | ||
193 | <xsl:copy-of select="$candidate"/> | ||
194 | </xsl:otherwise> | ||
195 | </xsl:choose> | ||
196 | </xsl:template> | ||
197 | |||
198 | |||
199 | <!-- fix bug in headers: force right-align w/two 40|60 cols --> | ||
200 | <xsl:template name="header.table"> | ||
201 | <xsl:param name="pageclass" select="''"/> | ||
202 | <xsl:param name="sequence" select="''"/> | ||
203 | <xsl:param name="gentext-key" select="''"/> | ||
204 | <xsl:choose> | ||
205 | <xsl:when test="$pageclass = 'index'"> | ||
206 | <xsl:attribute name="margin-left">0pt</xsl:attribute> | ||
207 | </xsl:when> | ||
208 | </xsl:choose> | ||
209 | <xsl:variable name="candidate"> | ||
210 | <fo:table table-layout="fixed" width="100%"> | ||
211 | <xsl:call-template name="head.sep.rule"> | ||
212 | <xsl:with-param name="pageclass" select="$pageclass"/> | ||
213 | <xsl:with-param name="sequence" select="$sequence"/> | ||
214 | <xsl:with-param name="gentext-key" select="$gentext-key"/> | ||
215 | </xsl:call-template> | ||
216 | <fo:table-column column-number="1" column-width="40%"/> | ||
217 | <fo:table-column column-number="2" column-width="60%"/> | ||
218 | <fo:table-body> | ||
219 | <fo:table-row height="14pt"> | ||
220 | <fo:table-cell text-align="left" display-align="before"> | ||
221 | <xsl:attribute name="relative-align">baseline</xsl:attribute> | ||
222 | <fo:block> | ||
223 | <fo:block> </fo:block><!-- empty cell --> | ||
224 | </fo:block> | ||
225 | </fo:table-cell> | ||
226 | <fo:table-cell text-align="center" display-align="before"> | ||
227 | <xsl:attribute name="relative-align">baseline</xsl:attribute> | ||
228 | <fo:block> | ||
229 | <xsl:call-template name="header.content"> | ||
230 | <xsl:with-param name="pageclass" select="$pageclass"/> | ||
231 | <xsl:with-param name="sequence" select="$sequence"/> | ||
232 | <xsl:with-param name="position" select="'center'"/> | ||
233 | <xsl:with-param name="gentext-key" select="$gentext-key"/> | ||
234 | </xsl:call-template> | ||
235 | </fo:block> | ||
236 | </fo:table-cell> | ||
237 | </fo:table-row> | ||
238 | </fo:table-body> | ||
239 | </fo:table> | ||
240 | </xsl:variable> | ||
241 | <!-- Really output a header? --> | ||
242 | <xsl:choose> | ||
243 | <xsl:when test="$pageclass = 'titlepage' and $gentext-key = 'book' | ||
244 | and $sequence='first'"> | ||
245 | <!-- no, book titlepages have no headers at all --> | ||
246 | </xsl:when> | ||
247 | <xsl:when test="$sequence = 'blank' and $headers.on.blank.pages = 0"> | ||
248 | <!-- no output --> | ||
249 | </xsl:when> | ||
250 | <xsl:otherwise> | ||
251 | <xsl:copy-of select="$candidate"/> | ||
252 | </xsl:otherwise> | ||
253 | </xsl:choose> | ||
254 | </xsl:template> | ||
255 | |||
256 | |||
257 | </xsl:stylesheet> | ||
diff --git a/bz-html.xsl b/bz-html.xsl new file mode 100644 index 0000000..1785fff --- /dev/null +++ b/bz-html.xsl | |||
@@ -0,0 +1,20 @@ | |||
1 | <?xml version="1.0"?> <!-- -*- sgml -*- --> | ||
2 | <!DOCTYPE xsl:stylesheet [ <!ENTITY bz-css SYSTEM "./bzip.css"> ]> | ||
3 | |||
4 | <xsl:stylesheet | ||
5 | xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> | ||
6 | |||
7 | <xsl:import href="http://docbook.sourceforge.net/release/xsl/current/html/docbook.xsl"/> | ||
8 | <xsl:import href="bz-common.xsl"/> | ||
9 | |||
10 | <!-- use 8859-1 encoding --> | ||
11 | <xsl:output method="html" encoding="ISO-8859-1" indent="yes"/> | ||
12 | |||
13 | <!-- we include the css directly when generating one large file --> | ||
14 | <xsl:template name="user.head.content"> | ||
15 | <style type="text/css" media="screen"> | ||
16 | <xsl:text>&bz-css;</xsl:text> | ||
17 | </style> | ||
18 | </xsl:template> | ||
19 | |||
20 | </xsl:stylesheet> | ||
diff --git a/bzip.css b/bzip.css new file mode 100644 index 0000000..43193d8 --- /dev/null +++ b/bzip.css | |||
@@ -0,0 +1,74 @@ | |||
1 | /* Colours: | ||
2 | #74240f dark brown h1, h2, h3, h4 | ||
3 | #336699 medium blue links | ||
4 | #339999 turquoise link hover colour | ||
5 | #202020 almost black general text | ||
6 | #761596 purple md5sum text | ||
7 | #626262 dark gray pre border | ||
8 | #eeeeee very light gray pre background | ||
9 | #f2f2f9 very light blue nav table background | ||
10 | #3366cc medium blue nav table border | ||
11 | */ | ||
12 | |||
13 | a, a:link, a:visited, a:active { color: #336699; } | ||
14 | a:hover { color: #339999; } | ||
15 | |||
16 | body { font: 80%/126% sans-serif; } | ||
17 | h1, h2, h3, h4 { color: #74240f; } | ||
18 | |||
19 | dt { color: #336699; font-weight: bold } | ||
20 | dd { | ||
21 | margin-left: 1.5em; | ||
22 | padding-bottom: 0.8em; | ||
23 | } | ||
24 | |||
25 | /* -- ruler -- */ | ||
26 | div.hr_blue { | ||
27 | height: 3px; | ||
28 | background:#ffffff url("/images/hr_blue.png") repeat-x; } | ||
29 | div.hr_blue hr { display:none; } | ||
30 | |||
31 | /* release styles */ | ||
32 | #release p { margin-top: 0.4em; } | ||
33 | #release .md5sum { color: #761596; } | ||
34 | |||
35 | |||
36 | /* ------ styles for docs|manuals|howto ------ */ | ||
37 | /* -- lists -- */ | ||
38 | ul { | ||
39 | margin: 0px 4px 16px 16px; | ||
40 | padding: 0px; | ||
41 | list-style: url("/images/li-blue.png"); | ||
42 | } | ||
43 | ul li { | ||
44 | margin-bottom: 10px; | ||
45 | } | ||
46 | ul ul { | ||
47 | list-style-type: none; | ||
48 | list-style-image: none; | ||
49 | margin-left: 0px; | ||
50 | } | ||
51 | |||
52 | /* header / footer nav tables */ | ||
53 | table.nav { | ||
54 | border: solid 1px #3366cc; | ||
55 | background: #f2f2f9; | ||
56 | background-color: #f2f2f9; | ||
57 | margin-bottom: 0.5em; | ||
58 | } | ||
59 | /* don't have underlined links in chunked nav menus */ | ||
60 | table.nav a { text-decoration: none; } | ||
61 | table.nav a:hover { text-decoration: underline; } | ||
62 | table.nav td { font-size: 85%; } | ||
63 | |||
64 | code, tt, pre { font-size: 120%; } | ||
65 | code, tt { color: #761596; } | ||
66 | |||
67 | div.literallayout, pre.programlisting, pre.screen { | ||
68 | color: #000000; | ||
69 | padding: 0.5em; | ||
70 | background: #eeeeee; | ||
71 | border: 1px solid #626262; | ||
72 | background-color: #eeeeee; | ||
73 | margin: 4px 0px 4px 0px; | ||
74 | } | ||
@@ -1,7 +1,7 @@ | |||
1 | .PU | 1 | .PU |
2 | .TH bzip2 1 | 2 | .TH bzip2 1 |
3 | .SH NAME | 3 | .SH NAME |
4 | bzip2, bunzip2 \- a block-sorting file compressor, v1.0.2 | 4 | bzip2, bunzip2 \- a block-sorting file compressor, v1.0.3 |
5 | .br | 5 | .br |
6 | bzcat \- decompresses files to stdout | 6 | bzcat \- decompresses files to stdout |
7 | .br | 7 | .br |
@@ -405,19 +405,19 @@ I/O error messages are not as helpful as they could be. | |||
405 | tries hard to detect I/O errors and exit cleanly, but the details of | 405 | tries hard to detect I/O errors and exit cleanly, but the details of |
406 | what the problem is sometimes seem rather misleading. | 406 | what the problem is sometimes seem rather misleading. |
407 | 407 | ||
408 | This manual page pertains to version 1.0.2 of | 408 | This manual page pertains to version 1.0.3 of |
409 | .I bzip2. | 409 | .I bzip2. |
410 | Compressed data created by this version is entirely forwards and | 410 | Compressed data created by this version is entirely forwards and |
411 | backwards compatible with the previous public releases, versions | 411 | backwards compatible with the previous public releases, versions |
412 | 0.1pl2, 0.9.0, 0.9.5, 1.0.0 and 1.0.1, but with the following | 412 | 0.1pl2, 0.9.0, 0.9.5, 1.0.0, 1.0.1 and 1.0.2, but with the following |
413 | exception: 0.9.0 and above can correctly decompress multiple | 413 | exception: 0.9.0 and above can correctly decompress multiple |
414 | concatenated compressed files. 0.1pl2 cannot do this; it will stop | 414 | concatenated compressed files. 0.1pl2 cannot do this; it will stop |
415 | after decompressing just the first file in the stream. | 415 | after decompressing just the first file in the stream. |
416 | 416 | ||
417 | .I bzip2recover | 417 | .I bzip2recover |
418 | versions prior to this one, 1.0.2, used 32-bit integers to represent | 418 | versions prior to 1.0.2 used 32-bit integers to represent |
419 | bit positions in compressed files, so it could not handle compressed | 419 | bit positions in compressed files, so they could not handle compressed |
420 | files more than 512 megabytes long. Version 1.0.2 and above uses | 420 | files more than 512 megabytes long. Versions 1.0.2 and above use |
421 | 64-bit ints on some platforms which support them (GNU supported | 421 | 64-bit ints on some platforms which support them (GNU supported |
422 | targets, and Windows). To establish whether or not bzip2recover was | 422 | targets, and Windows). To establish whether or not bzip2recover was |
423 | built with such a limitation, run it without arguments. In any event | 423 | built with such a limitation, run it without arguments. In any event |
@@ -427,9 +427,9 @@ with MaybeUInt64 set to be an unsigned 64-bit integer. | |||
427 | 427 | ||
428 | 428 | ||
429 | .SH AUTHOR | 429 | .SH AUTHOR |
430 | Julian Seward, jseward@acm.org. | 430 | Julian Seward, jseward@bzip.org. |
431 | 431 | ||
432 | http://sources.redhat.com/bzip2 | 432 | http://www.bzip.org |
433 | 433 | ||
434 | The ideas embodied in | 434 | The ideas embodied in |
435 | .I bzip2 | 435 | .I bzip2 |
@@ -447,6 +447,7 @@ source distribution for pointers to sources of documentation. Christian | |||
447 | von Roques encouraged me to look for faster sorting algorithms, so as to | 447 | von Roques encouraged me to look for faster sorting algorithms, so as to |
448 | speed up compression. Bela Lubkin encouraged me to improve the | 448 | speed up compression. Bela Lubkin encouraged me to improve the |
449 | worst-case compression performance. | 449 | worst-case compression performance. |
450 | Donna Robinson XMLised the documentation. | ||
450 | The bz* scripts are derived from those of GNU gzip. | 451 | The bz* scripts are derived from those of GNU gzip. |
451 | Many people sent patches, helped | 452 | Many people sent patches, helped |
452 | with portability problems, lent machines, gave advice and were generally | 453 | with portability problems, lent machines, gave advice and were generally |
diff --git a/bzip2.1.preformatted b/bzip2.1.preformatted index 0f20cb5..129ca83 100644 --- a/bzip2.1.preformatted +++ b/bzip2.1.preformatted | |||
@@ -3,43 +3,43 @@ bzip2(1) bzip2(1) | |||
3 | 3 | ||
4 | 4 | ||
5 | NNAAMMEE | 5 | NNAAMMEE |
6 | bzip2, bunzip2 - a block-sorting file compressor, v1.0.2 | 6 | bzip2, bunzip2 − a block‐sorting file compressor, v1.0.3 |
7 | bzcat - decompresses files to stdout | 7 | bzcat − decompresses files to stdout |
8 | bzip2recover - recovers data from damaged bzip2 files | 8 | bzip2recover − recovers data from damaged bzip2 files |
9 | 9 | ||
10 | 10 | ||
11 | SSYYNNOOPPSSIISS | 11 | SSYYNNOOPPSSIISS |
12 | bbzziipp22 [ --ccddffkkqqssttvvzzVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ] | 12 | bbzziipp22 [ −−ccddffkkqqssttvvzzVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ] |
13 | bbuunnzziipp22 [ --ffkkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ] | 13 | bbuunnzziipp22 [ −−ffkkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ] |
14 | bbzzccaatt [ --ss ] [ _f_i_l_e_n_a_m_e_s _._._. ] | 14 | bbzzccaatt [ −−ss ] [ _f_i_l_e_n_a_m_e_s _._._. ] |
15 | bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e | 15 | bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e |
16 | 16 | ||
17 | 17 | ||
18 | DDEESSCCRRIIPPTTIIOONN | 18 | DDEESSCCRRIIPPTTIIOONN |
19 | _b_z_i_p_2 compresses files using the Burrows-Wheeler block | 19 | _b_z_i_p_2 compresses files using the Burrows‐Wheeler block |
20 | sorting text compression algorithm, and Huffman coding. | 20 | sorting text compression algorithm, and Huffman coding. |
21 | Compression is generally considerably better than that | 21 | Compression is generally considerably better than that |
22 | achieved by more conventional LZ77/LZ78-based compressors, | 22 | achieved by more conventional LZ77/LZ78‐based compressors, |
23 | and approaches the performance of the PPM family of sta | 23 | and approaches the performance of the PPM family of sta |
24 | tistical compressors. | 24 | tistical compressors. |
25 | 25 | ||
26 | The command-line options are deliberately very similar to | 26 | The command‐line options are deliberately very similar to |
27 | those of _G_N_U _g_z_i_p_, but they are not identical. | 27 | those of _G_N_U _g_z_i_p_, but they are not identical. |
28 | 28 | ||
29 | _b_z_i_p_2 expects a list of file names to accompany the com | 29 | _b_z_i_p_2 expects a list of file names to accompany the com |
30 | mand-line flags. Each file is replaced by a compressed | 30 | mand‐line flags. Each file is replaced by a compressed |
31 | version of itself, with the name "original_name.bz2". | 31 | version of itself, with the name "original_name.bz2". |
32 | Each compressed file has the same modification date, per | 32 | Each compressed file has the same modification date, per |
33 | missions, and, when possible, ownership as the correspond | 33 | missions, and, when possible, ownership as the correspond |
34 | ing original, so that these properties can be correctly | 34 | ing original, so that these properties can be correctly |
35 | restored at decompression time. File name handling is | 35 | restored at decompression time. File name handling is |
36 | naive in the sense that there is no mechanism for preserv | 36 | naive in the sense that there is no mechanism for preserv |
37 | ing original file names, permissions, ownerships or dates | 37 | ing original file names, permissions, ownerships or dates |
38 | in filesystems which lack these concepts, or have serious | 38 | in filesystems which lack these concepts, or have serious |
39 | file name length restrictions, such as MS-DOS. | 39 | file name length restrictions, such as MS‐DOS. |
40 | 40 | ||
41 | _b_z_i_p_2 and _b_u_n_z_i_p_2 will by default not overwrite existing | 41 | _b_z_i_p_2 and _b_u_n_z_i_p_2 will by default not overwrite existing |
42 | files. If you want this to happen, specify the -f flag. | 42 | files. If you want this to happen, specify the −f flag. |
43 | 43 | ||
44 | If no file names are specified, _b_z_i_p_2 compresses from | 44 | If no file names are specified, _b_z_i_p_2 compresses from |
45 | standard input to standard output. In this case, _b_z_i_p_2 | 45 | standard input to standard output. In this case, _b_z_i_p_2 |
@@ -47,7 +47,7 @@ DDEESSCCRRIIPPTTIIOONN | |||
47 | this would be entirely incomprehensible and therefore | 47 | this would be entirely incomprehensible and therefore |
48 | pointless. | 48 | pointless. |
49 | 49 | ||
50 | _b_u_n_z_i_p_2 (or _b_z_i_p_2 _-_d_) decompresses all specified files. | 50 | _b_u_n_z_i_p_2 (or _b_z_i_p_2 _−_d_) decompresses all specified files. |
51 | Files which were not created by _b_z_i_p_2 will be detected and | 51 | Files which were not created by _b_z_i_p_2 will be detected and |
52 | ignored, and a warning issued. _b_z_i_p_2 attempts to guess | 52 | ignored, and a warning issued. _b_z_i_p_2 attempts to guess |
53 | the filename for the decompressed file from that of the | 53 | the filename for the decompressed file from that of the |
@@ -64,26 +64,26 @@ DDEESSCCRRIIPPTTIIOONN | |||
64 | guess the name of the original file, and uses the original | 64 | guess the name of the original file, and uses the original |
65 | name with _._o_u_t appended. | 65 | name with _._o_u_t appended. |
66 | 66 | ||
67 | As with compression, supplying no filenames causes decom | 67 | As with compression, supplying no filenames causes decom |
68 | pression from standard input to standard output. | 68 | pression from standard input to standard output. |
69 | 69 | ||
70 | _b_u_n_z_i_p_2 will correctly decompress a file which is the con | 70 | _b_u_n_z_i_p_2 will correctly decompress a file which is the con |
71 | catenation of two or more compressed files. The result is | 71 | catenation of two or more compressed files. The result is |
72 | the concatenation of the corresponding uncompressed files. | 72 | the concatenation of the corresponding uncompressed files. |
73 | Integrity testing (-t) of concatenated compressed files is | 73 | Integrity testing (−t) of concatenated compressed files is |
74 | also supported. | 74 | also supported. |
75 | 75 | ||
76 | You can also compress or decompress files to the standard | 76 | You can also compress or decompress files to the standard |
77 | output by giving the -c flag. Multiple files may be com | 77 | output by giving the −c flag. Multiple files may be com |
78 | pressed and decompressed like this. The resulting outputs | 78 | pressed and decompressed like this. The resulting outputs |
79 | are fed sequentially to stdout. Compression of multiple | 79 | are fed sequentially to stdout. Compression of multiple |
80 | files in this manner generates a stream containing multi | 80 | files in this manner generates a stream containing multi |
81 | ple compressed file representations. Such a stream can be | 81 | ple compressed file representations. Such a stream can be |
82 | decompressed correctly only by _b_z_i_p_2 version 0.9.0 or | 82 | decompressed correctly only by _b_z_i_p_2 version 0.9.0 or |
83 | later. Earlier versions of _b_z_i_p_2 will stop after decom | 83 | later. Earlier versions of _b_z_i_p_2 will stop after decom |
84 | pressing the first file in the stream. | 84 | pressing the first file in the stream. |
85 | 85 | ||
86 | _b_z_c_a_t (or _b_z_i_p_2 _-_d_c_) decompresses all specified files to | 86 | _b_z_c_a_t (or _b_z_i_p_2 _‐_d_c_) decompresses all specified files to |
87 | the standard output. | 87 | the standard output. |
88 | 88 | ||
89 | _b_z_i_p_2 will read arguments from the environment variables | 89 | _b_z_i_p_2 will read arguments from the environment variables |
@@ -99,15 +99,15 @@ DDEESSCCRRIIPPTTIIOONN | |||
99 | most file compressors) is coded at about 8.05 bits per | 99 | most file compressors) is coded at about 8.05 bits per |
100 | byte, giving an expansion of around 0.5%. | 100 | byte, giving an expansion of around 0.5%. |
101 | 101 | ||
102 | As a self-check for your protection, _b_z_i_p_2 uses 32-bit | 102 | As a self‐check for your protection, _b_z_i_p_2 uses 32‐bit |
103 | CRCs to make sure that the decompressed version of a file | 103 | CRCs to make sure that the decompressed version of a file |
104 | is identical to the original. This guards against corrup | 104 | is identical to the original. This guards against corrup |
105 | tion of the compressed data, and against undetected bugs | 105 | tion of the compressed data, and against undetected bugs |
106 | in _b_z_i_p_2 (hopefully very unlikely). The chances of data | 106 | in _b_z_i_p_2 (hopefully very unlikely). The chances of data |
107 | corruption going undetected is microscopic, about one | 107 | corruption going undetected is microscopic, about one |
108 | chance in four billion for each file processed. Be aware, | 108 | chance in four billion for each file processed. Be aware, |
109 | though, that the check occurs upon decompression, so it | 109 | though, that the check occurs upon decompression, so it |
110 | can only tell you that something is wrong. It can't help | 110 | can only tell you that something is wrong. It can’t help |
111 | you recover the original uncompressed data. You can use | 111 | you recover the original uncompressed data. You can use |
112 | _b_z_i_p_2_r_e_c_o_v_e_r to try to recover data from damaged files. | 112 | _b_z_i_p_2_r_e_c_o_v_e_r to try to recover data from damaged files. |
113 | 113 | ||
@@ -118,41 +118,41 @@ DDEESSCCRRIIPPTTIIOONN | |||
118 | 118 | ||
119 | 119 | ||
120 | OOPPTTIIOONNSS | 120 | OOPPTTIIOONNSS |
121 | --cc ----ssttddoouutt | 121 | −−cc ‐‐‐‐ssttddoouutt |
122 | Compress or decompress to standard output. | 122 | Compress or decompress to standard output. |
123 | 123 | ||
124 | --dd ----ddeeccoommpprreessss | 124 | −−dd ‐‐‐‐ddeeccoommpprreessss |
125 | Force decompression. _b_z_i_p_2_, _b_u_n_z_i_p_2 and _b_z_c_a_t are | 125 | Force decompression. _b_z_i_p_2_, _b_u_n_z_i_p_2 and _b_z_c_a_t are |
126 | really the same program, and the decision about | 126 | really the same program, and the decision about |
127 | what actions to take is done on the basis of which | 127 | what actions to take is done on the basis of which |
128 | name is used. This flag overrides that mechanism, | 128 | name is used. This flag overrides that mechanism, |
129 | and forces _b_z_i_p_2 to decompress. | 129 | and forces _b_z_i_p_2 to decompress. |
130 | 130 | ||
131 | --zz ----ccoommpprreessss | 131 | −−zz ‐‐‐‐ccoommpprreessss |
132 | The complement to -d: forces compression, | 132 | The complement to −d: forces compression, |
133 | regardless of the invocation name. | 133 | regardless of the invocation name. |
134 | 134 | ||
135 | --tt ----tteesstt | 135 | −−tt ‐‐‐‐tteesstt |
136 | Check integrity of the specified file(s), but don't | 136 | Check integrity of the specified file(s), but don’t |
137 | decompress them. This really performs a trial | 137 | decompress them. This really performs a trial |
138 | decompression and throws away the result. | 138 | decompression and throws away the result. |
139 | 139 | ||
140 | --ff ----ffoorrccee | 140 | −−ff ‐‐‐‐ffoorrccee |
141 | Force overwrite of output files. Normally, _b_z_i_p_2 | 141 | Force overwrite of output files. Normally, _b_z_i_p_2 |
142 | will not overwrite existing output files. Also | 142 | will not overwrite existing output files. Also |
143 | forces _b_z_i_p_2 to break hard links to files, which it | 143 | forces _b_z_i_p_2 to break hard links to files, which it |
144 | otherwise wouldn't do. | 144 | otherwise wouldn’t do. |
145 | 145 | ||
146 | bzip2 normally declines to decompress files which | 146 | bzip2 normally declines to decompress files which |
147 | don't have the correct magic header bytes. If | 147 | don’t have the correct magic header bytes. If |
148 | forced (-f), however, it will pass such files | 148 | forced (‐f), however, it will pass such files |
149 | through unmodified. This is how GNU gzip behaves. | 149 | through unmodified. This is how GNU gzip behaves. |
150 | 150 | ||
151 | --kk ----kkeeeepp | 151 | −−kk ‐‐‐‐kkeeeepp |
152 | Keep (don't delete) input files during compression | 152 | Keep (don’t delete) input files during compression |
153 | or decompression. | 153 | or decompression. |
154 | 154 | ||
155 | --ss ----ssmmaallll | 155 | −−ss ‐‐‐‐ssmmaallll |
156 | Reduce memory usage, for compression, decompression | 156 | Reduce memory usage, for compression, decompression |
157 | and testing. Files are decompressed and tested | 157 | and testing. Files are decompressed and tested |
158 | using a modified algorithm which only requires 2.5 | 158 | using a modified algorithm which only requires 2.5 |
@@ -160,46 +160,46 @@ OOPPTTIIOONNSS | |||
160 | decompressed in 2300k of memory, albeit at about | 160 | decompressed in 2300k of memory, albeit at about |
161 | half the normal speed. | 161 | half the normal speed. |
162 | 162 | ||
163 | During compression, -s selects a block size of | 163 | During compression, −s selects a block size of |
164 | 200k, which limits memory use to around the same | 164 | 200k, which limits memory use to around the same |
165 | figure, at the expense of your compression ratio. | 165 | figure, at the expense of your compression ratio. |
166 | In short, if your machine is low on memory (8 | 166 | In short, if your machine is low on memory (8 |
167 | megabytes or less), use -s for everything. See | 167 | megabytes or less), use −s for everything. See |
168 | MEMORY MANAGEMENT below. | 168 | MEMORY MANAGEMENT below. |
169 | 169 | ||
170 | --qq ----qquuiieett | 170 | −−qq ‐‐‐‐qquuiieett |
171 | Suppress non-essential warning messages. Messages | 171 | Suppress non‐essential warning messages. Messages |
172 | pertaining to I/O errors and other critical events | 172 | pertaining to I/O errors and other critical events |
173 | will not be suppressed. | 173 | will not be suppressed. |
174 | 174 | ||
175 | --vv ----vveerrbboossee | 175 | −−vv ‐‐‐‐vveerrbboossee |
176 | Verbose mode -- show the compression ratio for each | 176 | Verbose mode ‐‐ show the compression ratio for each |
177 | file processed. Further -v's increase the ver | 177 | file processed. Further −v’s increase the ver |
178 | bosity level, spewing out lots of information which | 178 | bosity level, spewing out lots of information which |
179 | is primarily of interest for diagnostic purposes. | 179 | is primarily of interest for diagnostic purposes. |
180 | 180 | ||
181 | --LL ----lliicceennssee --VV ----vveerrssiioonn | 181 | −−LL ‐‐‐‐lliicceennssee ‐‐VV ‐‐‐‐vveerrssiioonn |
182 | Display the software version, license terms and | 182 | Display the software version, license terms and |
183 | conditions. | 183 | conditions. |
184 | 184 | ||
185 | --11 ((oorr ----ffaasstt)) ttoo --99 ((oorr ----bbeesstt)) | 185 | −−11 ((oorr −−−−ffaasstt)) ttoo −−99 ((oorr −−−−bbeesstt)) |
186 | Set the block size to 100 k, 200 k .. 900 k when | 186 | Set the block size to 100 k, 200 k .. 900 k when |
187 | compressing. Has no effect when decompressing. | 187 | compressing. Has no effect when decompressing. |
188 | See MEMORY MANAGEMENT below. The --fast and --best | 188 | See MEMORY MANAGEMENT below. The −−fast and −−best |
189 | aliases are primarily for GNU gzip compatibility. | 189 | aliases are primarily for GNU gzip compatibility. |
190 | In particular, --fast doesn't make things signifi | 190 | In particular, −−fast doesn’t make things signifi |
191 | cantly faster. And --best merely selects the | 191 | cantly faster. And −−best merely selects the |
192 | default behaviour. | 192 | default behaviour. |
193 | 193 | ||
194 | ---- Treats all subsequent arguments as file names, even | 194 | −−‐‐ Treats all subsequent arguments as file names, even |
195 | if they start with a dash. This is so you can han | 195 | if they start with a dash. This is so you can han |
196 | dle files with names beginning with a dash, for | 196 | dle files with names beginning with a dash, for |
197 | example: bzip2 -- -myfilename. | 197 | example: bzip2 −‐ −myfilename. |
198 | 198 | ||
199 | ----rreeppeettiittiivvee--ffaasstt ----rreeppeettiittiivvee--bbeesstt | 199 | −−‐‐rreeppeettiittiivvee‐‐ffaasstt ‐‐‐‐rreeppeettiittiivvee‐‐bbeesstt |
200 | These flags are redundant in versions 0.9.5 and | 200 | These flags are redundant in versions 0.9.5 and |
201 | above. They provided some coarse control over the | 201 | above. They provided some coarse control over the |
202 | behaviour of the sorting algorithm in earlier ver | 202 | behaviour of the sorting algorithm in earlier ver |
203 | sions, which was sometimes useful. 0.9.5 and above | 203 | sions, which was sometimes useful. 0.9.5 and above |
204 | have an improved algorithm which renders these | 204 | have an improved algorithm which renders these |
205 | flags irrelevant. | 205 | flags irrelevant. |
@@ -209,13 +209,13 @@ MMEEMMOORRYY MMAANNAAGGEEMMEENNTT | |||
209 | _b_z_i_p_2 compresses large files in blocks. The block size | 209 | _b_z_i_p_2 compresses large files in blocks. The block size |
210 | affects both the compression ratio achieved, and the | 210 | affects both the compression ratio achieved, and the |
211 | amount of memory needed for compression and decompression. | 211 | amount of memory needed for compression and decompression. |
212 | The flags -1 through -9 specify the block size to be | 212 | The flags −1 through −9 specify the block size to be |
213 | 100,000 bytes through 900,000 bytes (the default) respec | 213 | 100,000 bytes through 900,000 bytes (the default) respec |
214 | tively. At decompression time, the block size used for | 214 | tively. At decompression time, the block size used for |
215 | compression is read from the header of the compressed | 215 | compression is read from the header of the compressed |
216 | file, and _b_u_n_z_i_p_2 then allocates itself just enough memory | 216 | file, and _b_u_n_z_i_p_2 then allocates itself just enough memory |
217 | to decompress the file. Since block sizes are stored in | 217 | to decompress the file. Since block sizes are stored in |
218 | compressed files, it follows that the flags -1 to -9 are | 218 | compressed files, it follows that the flags −1 to −9 are |
219 | irrelevant to and so ignored during decompression. | 219 | irrelevant to and so ignored during decompression. |
220 | 220 | ||
221 | Compression and decompression requirements, in bytes, can | 221 | Compression and decompression requirements, in bytes, can |
@@ -238,21 +238,21 @@ MMEEMMOORRYY MMAANNAAGGEEMMEENNTT | |||
238 | _b_u_n_z_i_p_2 will require about 3700 kbytes to decompress. To | 238 | _b_u_n_z_i_p_2 will require about 3700 kbytes to decompress. To |
239 | support decompression of any file on a 4 megabyte machine, | 239 | support decompression of any file on a 4 megabyte machine, |
240 | _b_u_n_z_i_p_2 has an option to decompress using approximately | 240 | _b_u_n_z_i_p_2 has an option to decompress using approximately |
241 | half this amount of memory, about 2300 kbytes. Decompres | 241 | half this amount of memory, about 2300 kbytes. Decompres |
242 | sion speed is also halved, so you should use this option | 242 | sion speed is also halved, so you should use this option |
243 | only where necessary. The relevant flag is -s. | 243 | only where necessary. The relevant flag is ‐s. |
244 | 244 | ||
245 | In general, try and use the largest block size memory con | 245 | In general, try and use the largest block size memory con |
246 | straints allow, since that maximises the compression | 246 | straints allow, since that maximises the compression |
247 | achieved. Compression and decompression speed are virtu | 247 | achieved. Compression and decompression speed are virtu |
248 | ally unaffected by block size. | 248 | ally unaffected by block size. |
249 | 249 | ||
250 | Another significant point applies to files which fit in a | 250 | Another significant point applies to files which fit in a |
251 | single block -- that means most files you'd encounter | 251 | single block ‐‐ that means most files you’d encounter |
252 | using a large block size. The amount of real memory | 252 | using a large block size. The amount of real memory |
253 | touched is proportional to the size of the file, since the | 253 | touched is proportional to the size of the file, since the |
254 | file is smaller than a block. For example, compressing a | 254 | file is smaller than a block. For example, compressing a |
255 | file 20,000 bytes long with the flag -9 will cause the | 255 | file 20,000 bytes long with the flag ‐9 will cause the |
256 | compressor to allocate around 7600k of memory, but only | 256 | compressor to allocate around 7600k of memory, but only |
257 | touch 400k + 20000 * 8 = 560 kbytes of it. Similarly, the | 257 | touch 400k + 20000 * 8 = 560 kbytes of it. Similarly, the |
258 | decompressor will allocate 3700k but only touch 100k + | 258 | decompressor will allocate 3700k but only touch 100k + |
@@ -260,59 +260,59 @@ MMEEMMOORRYY MMAANNAAGGEEMMEENNTT | |||
260 | 260 | ||
261 | Here is a table which summarises the maximum memory usage | 261 | Here is a table which summarises the maximum memory usage |
262 | for different block sizes. Also recorded is the total | 262 | for different block sizes. Also recorded is the total |
263 | compressed size for 14 files of the Calgary Text Compres | 263 | compressed size for 14 files of the Calgary Text Compres |
264 | sion Corpus totalling 3,141,622 bytes. This column gives | 264 | sion Corpus totalling 3,141,622 bytes. This column gives |
265 | some feel for how compression varies with block size. | 265 | some feel for how compression varies with block size. |
266 | These figures tend to understate the advantage of larger | 266 | These figures tend to understate the advantage of larger |
267 | block sizes for larger files, since the Corpus is domi | 267 | block sizes for larger files, since the Corpus is domi |
268 | nated by smaller files. | 268 | nated by smaller files. |
269 | 269 | ||
270 | Compress Decompress Decompress Corpus | 270 | Compress Decompress Decompress Corpus |
271 | Flag usage usage -s usage Size | 271 | Flag usage usage ‐s usage Size |
272 | 272 | ||
273 | -1 1200k 500k 350k 914704 | 273 | ‐1 1200k 500k 350k 914704 |
274 | -2 2000k 900k 600k 877703 | 274 | ‐2 2000k 900k 600k 877703 |
275 | -3 2800k 1300k 850k 860338 | 275 | ‐3 2800k 1300k 850k 860338 |
276 | -4 3600k 1700k 1100k 846899 | 276 | ‐4 3600k 1700k 1100k 846899 |
277 | -5 4400k 2100k 1350k 845160 | 277 | ‐5 4400k 2100k 1350k 845160 |
278 | -6 5200k 2500k 1600k 838626 | 278 | ‐6 5200k 2500k 1600k 838626 |
279 | -7 6100k 2900k 1850k 834096 | 279 | ‐7 6100k 2900k 1850k 834096 |
280 | -8 6800k 3300k 2100k 828642 | 280 | ‐8 6800k 3300k 2100k 828642 |
281 | -9 7600k 3700k 2350k 828642 | 281 | ‐9 7600k 3700k 2350k 828642 |
282 | 282 | ||
283 | 283 | ||
284 | RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD FFIILLEESS | 284 | RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD FFIILLEESS |
285 | _b_z_i_p_2 compresses files in blocks, usually 900kbytes long. | 285 | _b_z_i_p_2 compresses files in blocks, usually 900kbytes long. |
286 | Each block is handled independently. If a media or trans | 286 | Each block is handled independently. If a media or trans |
287 | mission error causes a multi-block .bz2 file to become | 287 | mission error causes a multi‐block .bz2 file to become |
288 | damaged, it may be possible to recover data from the | 288 | damaged, it may be possible to recover data from the |
289 | undamaged blocks in the file. | 289 | undamaged blocks in the file. |
290 | 290 | ||
291 | The compressed representation of each block is delimited | 291 | The compressed representation of each block is delimited |
292 | by a 48-bit pattern, which makes it possible to find the | 292 | by a 48‐bit pattern, which makes it possible to find the |
293 | block boundaries with reasonable certainty. Each block | 293 | block boundaries with reasonable certainty. Each block |
294 | also carries its own 32-bit CRC, so damaged blocks can be | 294 | also carries its own 32‐bit CRC, so damaged blocks can be |
295 | distinguished from undamaged ones. | 295 | distinguished from undamaged ones. |
296 | 296 | ||
297 | _b_z_i_p_2_r_e_c_o_v_e_r is a simple program whose purpose is to | 297 | _b_z_i_p_2_r_e_c_o_v_e_r is a simple program whose purpose is to |
298 | search for blocks in .bz2 files, and write each block out | 298 | search for blocks in .bz2 files, and write each block out |
299 | into its own .bz2 file. You can then use _b_z_i_p_2 -t to test | 299 | into its own .bz2 file. You can then use _b_z_i_p_2 −t to test |
300 | the integrity of the resulting files, and decompress those | 300 | the integrity of the resulting files, and decompress those |
301 | which are undamaged. | 301 | which are undamaged. |
302 | 302 | ||
303 | _b_z_i_p_2_r_e_c_o_v_e_r takes a single argument, the name of the dam | 303 | _b_z_i_p_2_r_e_c_o_v_e_r takes a single argument, the name of the dam |
304 | aged file, and writes a number of files | 304 | aged file, and writes a number of files |
305 | "rec00001file.bz2", "rec00002file.bz2", etc, containing | 305 | "rec00001file.bz2", "rec00002file.bz2", etc, containing |
306 | the extracted blocks. The output filenames are | 306 | the extracted blocks. The output filenames are |
307 | designed so that the use of wildcards in subsequent pro | 307 | designed so that the use of wildcards in subsequent pro |
308 | cessing -- for example, "bzip2 -dc rec*file.bz2 > recov | 308 | cessing ‐‐ for example, "bzip2 ‐dc rec*file.bz2 > recov |
309 | ered_data" -- processes the files in the correct order. | 309 | ered_data" ‐‐ processes the files in the correct order. |
310 | 310 | ||
311 | _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2 | 311 | _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2 |
312 | files, as these will contain many blocks. It is clearly | 312 | files, as these will contain many blocks. It is clearly |
313 | futile to use it on damaged single-block files, since a | 313 | futile to use it on damaged single‐block files, since a |
314 | damaged block cannot be recovered. If you wish to min | 314 | damaged block cannot be recovered. If you wish to min |
315 | imise any potential data loss through media or transmis | 315 | imise any potential data loss through media or transmis |
316 | sion errors, you might consider compressing with a smaller | 316 | sion errors, you might consider compressing with a smaller |
317 | block size. | 317 | block size. |
318 | 318 | ||
@@ -324,21 +324,21 @@ PPEERRFFOORRMMAANNCCEE NNOOTTEESS | |||
324 | ..." (repeated several hundred times) may compress more | 324 | ..." (repeated several hundred times) may compress more |
325 | slowly than normal. Versions 0.9.5 and above fare much | 325 | slowly than normal. Versions 0.9.5 and above fare much |
326 | better than previous versions in this respect. The ratio | 326 | better than previous versions in this respect. The ratio |
327 | between worst-case and average-case compression time is in | 327 | between worst‐case and average‐case compression time is in |
328 | the region of 10:1. For previous versions, this figure | 328 | the region of 10:1. For previous versions, this figure |
329 | was more like 100:1. You can use the -vvvv option to mon | 329 | was more like 100:1. You can use the −vvvv option to mon |
330 | itor progress in great detail, if you want. | 330 | itor progress in great detail, if you want. |
331 | 331 | ||
332 | Decompression speed is unaffected by these phenomena. | 332 | Decompression speed is unaffected by these phenomena. |
333 | 333 | ||
334 | _b_z_i_p_2 usually allocates several megabytes of memory to | 334 | _b_z_i_p_2 usually allocates several megabytes of memory to |
335 | operate in, and then charges all over it in a fairly ran | 335 | operate in, and then charges all over it in a fairly ran |
336 | dom fashion. This means that performance, both for com | 336 | dom fashion. This means that performance, both for com |
337 | pressing and decompressing, is largely determined by the | 337 | pressing and decompressing, is largely determined by the |
338 | speed at which your machine can service cache misses. | 338 | speed at which your machine can service cache misses. |
339 | Because of this, small changes to the code to reduce the | 339 | Because of this, small changes to the code to reduce the |
340 | miss rate have been observed to give disproportionately | 340 | miss rate have been observed to give disproportionately |
341 | large performance improvements. I imagine _b_z_i_p_2 will per | 341 | large performance improvements. I imagine _b_z_i_p_2 will per |
342 | form best on machines with very large caches. | 342 | form best on machines with very large caches. |
343 | 343 | ||
344 | 344 | ||
@@ -348,50 +348,51 @@ CCAAVVEEAATTSS | |||
348 | but the details of what the problem is sometimes seem | 348 | but the details of what the problem is sometimes seem |
349 | rather misleading. | 349 | rather misleading. |
350 | 350 | ||
351 | This manual page pertains to version 1.0.2 of _b_z_i_p_2_. Com | 351 | This manual page pertains to version 1.0.3 of _b_z_i_p_2_. Com |
352 | pressed data created by this version is entirely forwards | 352 | pressed data created by this version is entirely forwards |
353 | and backwards compatible with the previous public | 353 | and backwards compatible with the previous public |
354 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0 and 1.0.1, | 354 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0, 1.0.1 and |
355 | but with the following exception: 0.9.0 and above can cor | 355 | 1.0.2, but with the following exception: 0.9.0 and above |
356 | rectly decompress multiple concatenated compressed files. | 356 | can correctly decompress multiple concatenated compressed |
357 | 0.1pl2 cannot do this; it will stop after decompressing | 357 | files. 0.1pl2 cannot do this; it will stop after decom |
358 | just the first file in the stream. | 358 | pressing just the first file in the stream. |
359 | 359 | ||
360 | _b_z_i_p_2_r_e_c_o_v_e_r versions prior to this one, 1.0.2, used | 360 | _b_z_i_p_2_r_e_c_o_v_e_r versions prior to 1.0.2 used 32‐bit integers |
361 | 32-bit integers to represent bit positions in compressed | 361 | to represent bit positions in compressed files, so they |
362 | files, so it could not handle compressed files more than | 362 | could not handle compressed files more than 512 megabytes |
363 | 512 megabytes long. Version 1.0.2 and above uses 64-bit | 363 | long. Versions 1.0.2 and above use 64‐bit ints on some |
364 | ints on some platforms which support them (GNU supported | 364 | platforms which support them (GNU supported targets, and |
365 | targets, and Windows). To establish whether or not | 365 | Windows). To establish whether or not bzip2recover was |
366 | bzip2recover was built with such a limitation, run it | 366 | built with such a limitation, run it without arguments. |
367 | without arguments. In any event you can build yourself an | 367 | In any event you can build yourself an unlimited version |
368 | unlimited version if you can recompile it with MaybeUInt64 | 368 | if you can recompile it with MaybeUInt64 set to be an |
369 | set to be an unsigned 64-bit integer. | 369 | unsigned 64‐bit integer. |
370 | 370 | ||
371 | 371 | ||
372 | 372 | ||
373 | 373 | ||
374 | AAUUTTHHOORR | 374 | AAUUTTHHOORR |
375 | Julian Seward, jseward@acm.org. | 375 | Julian Seward, jseward@bzip.org. |
376 | 376 | ||
377 | http://sources.redhat.com/bzip2 | 377 | http://www.bzip.org |
378 | 378 | ||
379 | The ideas embodied in _b_z_i_p_2 are due to (at least) the fol | 379 | The ideas embodied in _b_z_i_p_2 are due to (at least) the fol |
380 | lowing people: Michael Burrows and David Wheeler (for the | 380 | lowing people: Michael Burrows and David Wheeler (for the |
381 | block sorting transformation), David Wheeler (again, for | 381 | block sorting transformation), David Wheeler (again, for |
382 | the Huffman coder), Peter Fenwick (for the structured cod | 382 | the Huffman coder), Peter Fenwick (for the structured cod |
383 | ing model in the original _b_z_i_p_, and many refinements), and | 383 | ing model in the original _b_z_i_p_, and many refinements), and |
384 | Alistair Moffat, Radford Neal and Ian Witten (for the | 384 | Alistair Moffat, Radford Neal and Ian Witten (for the |
385 | arithmetic coder in the original _b_z_i_p_)_. I am much | 385 | arithmetic coder in the original _b_z_i_p_)_. I am much |
386 | indebted for their help, support and advice. See the man | 386 | indebted for their help, support and advice. See the man |
387 | ual in the source distribution for pointers to sources of | 387 | ual in the source distribution for pointers to sources of |
388 | documentation. Christian von Roques encouraged me to look | 388 | documentation. Christian von Roques encouraged me to look |
389 | for faster sorting algorithms, so as to speed up compres | 389 | for faster sorting algorithms, so as to speed up compres |
390 | sion. Bela Lubkin encouraged me to improve the worst-case | 390 | sion. Bela Lubkin encouraged me to improve the worst‐case |
391 | compression performance. The bz* scripts are derived from | 391 | compression performance. Donna Robinson XMLised the docu |
392 | those of GNU gzip. Many people sent patches, helped with | 392 | mentation. The bz* scripts are derived from those of GNU |
393 | portability problems, lent machines, gave advice and were | 393 | gzip. Many people sent patches, helped with portability |
394 | generally helpful. | 394 | problems, lent machines, gave advice and were generally |
395 | helpful. | ||
395 | 396 | ||
396 | 397 | ||
397 | 398 | ||
@@ -7,7 +7,7 @@ | |||
7 | This file is a part of bzip2 and/or libbzip2, a program and | 7 | This file is a part of bzip2 and/or libbzip2, a program and |
8 | library for lossless, block-sorting data compression. | 8 | library for lossless, block-sorting data compression. |
9 | 9 | ||
10 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 10 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
11 | 11 | ||
12 | Redistribution and use in source and binary forms, with or without | 12 | Redistribution and use in source and binary forms, with or without |
13 | modification, are permitted provided that the following conditions | 13 | modification, are permitted provided that the following conditions |
@@ -41,7 +41,7 @@ | |||
41 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 41 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
42 | 42 | ||
43 | Julian Seward, Cambridge, UK. | 43 | Julian Seward, Cambridge, UK. |
44 | jseward@acm.org | 44 | jseward@bzip.org |
45 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 45 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
46 | 46 | ||
47 | This program is based on (at least) the work of: | 47 | This program is based on (at least) the work of: |
@@ -525,6 +525,7 @@ Bool uncompressStream ( FILE *zStream, FILE *stream ) | |||
525 | UChar obuf[5000]; | 525 | UChar obuf[5000]; |
526 | UChar unused[BZ_MAX_UNUSED]; | 526 | UChar unused[BZ_MAX_UNUSED]; |
527 | Int32 nUnused; | 527 | Int32 nUnused; |
528 | void* unusedTmpV; | ||
528 | UChar* unusedTmp; | 529 | UChar* unusedTmp; |
529 | 530 | ||
530 | nUnused = 0; | 531 | nUnused = 0; |
@@ -554,9 +555,10 @@ Bool uncompressStream ( FILE *zStream, FILE *stream ) | |||
554 | } | 555 | } |
555 | if (bzerr != BZ_STREAM_END) goto errhandler; | 556 | if (bzerr != BZ_STREAM_END) goto errhandler; |
556 | 557 | ||
557 | BZ2_bzReadGetUnused ( &bzerr, bzf, (void**)(&unusedTmp), &nUnused ); | 558 | BZ2_bzReadGetUnused ( &bzerr, bzf, &unusedTmpV, &nUnused ); |
558 | if (bzerr != BZ_OK) panic ( "decompress:bzReadGetUnused" ); | 559 | if (bzerr != BZ_OK) panic ( "decompress:bzReadGetUnused" ); |
559 | 560 | ||
561 | unusedTmp = (UChar*)unusedTmpV; | ||
560 | for (i = 0; i < nUnused; i++) unused[i] = unusedTmp[i]; | 562 | for (i = 0; i < nUnused; i++) unused[i] = unusedTmp[i]; |
561 | 563 | ||
562 | BZ2_bzReadClose ( &bzerr, bzf ); | 564 | BZ2_bzReadClose ( &bzerr, bzf ); |
@@ -639,6 +641,7 @@ Bool testStream ( FILE *zStream ) | |||
639 | UChar obuf[5000]; | 641 | UChar obuf[5000]; |
640 | UChar unused[BZ_MAX_UNUSED]; | 642 | UChar unused[BZ_MAX_UNUSED]; |
641 | Int32 nUnused; | 643 | Int32 nUnused; |
644 | void* unusedTmpV; | ||
642 | UChar* unusedTmp; | 645 | UChar* unusedTmp; |
643 | 646 | ||
644 | nUnused = 0; | 647 | nUnused = 0; |
@@ -662,9 +665,10 @@ Bool testStream ( FILE *zStream ) | |||
662 | } | 665 | } |
663 | if (bzerr != BZ_STREAM_END) goto errhandler; | 666 | if (bzerr != BZ_STREAM_END) goto errhandler; |
664 | 667 | ||
665 | BZ2_bzReadGetUnused ( &bzerr, bzf, (void**)(&unusedTmp), &nUnused ); | 668 | BZ2_bzReadGetUnused ( &bzerr, bzf, &unusedTmpV, &nUnused ); |
666 | if (bzerr != BZ_OK) panic ( "test:bzReadGetUnused" ); | 669 | if (bzerr != BZ_OK) panic ( "test:bzReadGetUnused" ); |
667 | 670 | ||
671 | unusedTmp = (UChar*)unusedTmpV; | ||
668 | for (i = 0; i < nUnused; i++) unused[i] = unusedTmp[i]; | 672 | for (i = 0; i < nUnused; i++) unused[i] = unusedTmp[i]; |
669 | 673 | ||
670 | BZ2_bzReadClose ( &bzerr, bzf ); | 674 | BZ2_bzReadClose ( &bzerr, bzf ); |
@@ -828,7 +832,7 @@ void panic ( Char* s ) | |||
828 | "\n%s: PANIC -- internal consistency error:\n" | 832 | "\n%s: PANIC -- internal consistency error:\n" |
829 | "\t%s\n" | 833 | "\t%s\n" |
830 | "\tThis is a BUG. Please report it to me at:\n" | 834 | "\tThis is a BUG. Please report it to me at:\n" |
831 | "\tjseward@acm.org\n", | 835 | "\tjseward@bzip.org\n", |
832 | progName, s ); | 836 | progName, s ); |
833 | showFileNames(); | 837 | showFileNames(); |
834 | cleanUpAndFail( 3 ); | 838 | cleanUpAndFail( 3 ); |
@@ -908,7 +912,7 @@ void mySIGSEGVorSIGBUScatcher ( IntNative n ) | |||
908 | " The user's manual, Section 4.3, has more info on (1) and (2).\n" | 912 | " The user's manual, Section 4.3, has more info on (1) and (2).\n" |
909 | " \n" | 913 | " \n" |
910 | " If you suspect this is a bug in bzip2, or are unsure about (1)\n" | 914 | " If you suspect this is a bug in bzip2, or are unsure about (1)\n" |
911 | " or (2), feel free to report it to me at: jseward@acm.org.\n" | 915 | " or (2), feel free to report it to me at: jseward@bzip.org.\n" |
912 | " Section 4.3 of the user's manual describes the info a useful\n" | 916 | " Section 4.3 of the user's manual describes the info a useful\n" |
913 | " bug report should have. If the manual is available on your\n" | 917 | " bug report should have. If the manual is available on your\n" |
914 | " system, please try and read it before mailing me. If you don't\n" | 918 | " system, please try and read it before mailing me. If you don't\n" |
@@ -931,7 +935,7 @@ void mySIGSEGVorSIGBUScatcher ( IntNative n ) | |||
931 | " The user's manual, Section 4.3, has more info on (2) and (3).\n" | 935 | " The user's manual, Section 4.3, has more info on (2) and (3).\n" |
932 | " \n" | 936 | " \n" |
933 | " If you suspect this is a bug in bzip2, or are unsure about (2)\n" | 937 | " If you suspect this is a bug in bzip2, or are unsure about (2)\n" |
934 | " or (3), feel free to report it to me at: jseward@acm.org.\n" | 938 | " or (3), feel free to report it to me at: jseward@bzip.org.\n" |
935 | " Section 4.3 of the user's manual describes the info a useful\n" | 939 | " Section 4.3 of the user's manual describes the info a useful\n" |
936 | " bug report should have. If the manual is available on your\n" | 940 | " bug report should have. If the manual is available on your\n" |
937 | " system, please try and read it before mailing me. If you don't\n" | 941 | " system, please try and read it before mailing me. If you don't\n" |
@@ -1674,7 +1678,7 @@ void license ( void ) | |||
1674 | "bzip2, a block-sorting file compressor. " | 1678 | "bzip2, a block-sorting file compressor. " |
1675 | "Version %s.\n" | 1679 | "Version %s.\n" |
1676 | " \n" | 1680 | " \n" |
1677 | " Copyright (C) 1996-2002 by Julian Seward.\n" | 1681 | " Copyright (C) 1996-2005 by Julian Seward.\n" |
1678 | " \n" | 1682 | " \n" |
1679 | " This program is free software; you can redistribute it and/or modify\n" | 1683 | " This program is free software; you can redistribute it and/or modify\n" |
1680 | " it under the terms set out in the LICENSE file, which is included\n" | 1684 | " it under the terms set out in the LICENSE file, which is included\n" |
@@ -1,6 +1,6 @@ | |||
1 | 1 | ||
2 | NAME | 2 | NAME |
3 | bzip2, bunzip2 - a block-sorting file compressor, v1.0.2 | 3 | bzip2, bunzip2 - a block-sorting file compressor, v1.0.3 |
4 | bzcat - decompresses files to stdout | 4 | bzcat - decompresses files to stdout |
5 | bzip2recover - recovers data from damaged bzip2 files | 5 | bzip2recover - recovers data from damaged bzip2 files |
6 | 6 | ||
@@ -17,20 +17,20 @@ DESCRIPTION | |||
17 | sorting text compression algorithm, and Huffman coding. | 17 | sorting text compression algorithm, and Huffman coding. |
18 | Compression is generally considerably better than that | 18 | Compression is generally considerably better than that |
19 | achieved by more conventional LZ77/LZ78-based compressors, | 19 | achieved by more conventional LZ77/LZ78-based compressors, |
20 | and approaches the performance of the PPM family of sta | 20 | and approaches the performance of the PPM family of sta- |
21 | tistical compressors. | 21 | tistical compressors. |
22 | 22 | ||
23 | The command-line options are deliberately very similar to | 23 | The command-line options are deliberately very similar to |
24 | those of GNU gzip, but they are not identical. | 24 | those of GNU gzip, but they are not identical. |
25 | 25 | ||
26 | bzip2 expects a list of file names to accompany the com | 26 | bzip2 expects a list of file names to accompany the com- |
27 | mand-line flags. Each file is replaced by a compressed | 27 | mand-line flags. Each file is replaced by a compressed |
28 | version of itself, with the name "original_name.bz2". | 28 | version of itself, with the name "original_name.bz2". |
29 | Each compressed file has the same modification date, per | 29 | Each compressed file has the same modification date, per- |
30 | missions, and, when possible, ownership as the correspond | 30 | missions, and, when possible, ownership as the correspond- |
31 | ing original, so that these properties can be correctly | 31 | ing original, so that these properties can be correctly |
32 | restored at decompression time. File name handling is | 32 | restored at decompression time. File name handling is |
33 | naive in the sense that there is no mechanism for preserv | 33 | naive in the sense that there is no mechanism for preserv- |
34 | ing original file names, permissions, ownerships or dates | 34 | ing original file names, permissions, ownerships or dates |
35 | in filesystems which lack these concepts, or have serious | 35 | in filesystems which lack these concepts, or have serious |
36 | file name length restrictions, such as MS-DOS. | 36 | file name length restrictions, such as MS-DOS. |
@@ -61,23 +61,23 @@ DESCRIPTION | |||
61 | guess the name of the original file, and uses the original | 61 | guess the name of the original file, and uses the original |
62 | name with .out appended. | 62 | name with .out appended. |
63 | 63 | ||
64 | As with compression, supplying no filenames causes decom | 64 | As with compression, supplying no filenames causes decom- |
65 | pression from standard input to standard output. | 65 | pression from standard input to standard output. |
66 | 66 | ||
67 | bunzip2 will correctly decompress a file which is the con | 67 | bunzip2 will correctly decompress a file which is the con- |
68 | catenation of two or more compressed files. The result is | 68 | catenation of two or more compressed files. The result is |
69 | the concatenation of the corresponding uncompressed files. | 69 | the concatenation of the corresponding uncompressed files. |
70 | Integrity testing (-t) of concatenated compressed files is | 70 | Integrity testing (-t) of concatenated compressed files is |
71 | also supported. | 71 | also supported. |
72 | 72 | ||
73 | You can also compress or decompress files to the standard | 73 | You can also compress or decompress files to the standard |
74 | output by giving the -c flag. Multiple files may be com | 74 | output by giving the -c flag. Multiple files may be com- |
75 | pressed and decompressed like this. The resulting outputs | 75 | pressed and decompressed like this. The resulting outputs |
76 | are fed sequentially to stdout. Compression of multiple | 76 | are fed sequentially to stdout. Compression of multiple |
77 | files in this manner generates a stream containing multi | 77 | files in this manner generates a stream containing multi- |
78 | ple compressed file representations. Such a stream can be | 78 | ple compressed file representations. Such a stream can be |
79 | decompressed correctly only by bzip2 version 0.9.0 or | 79 | decompressed correctly only by bzip2 version 0.9.0 or |
80 | later. Earlier versions of bzip2 will stop after decom | 80 | later. Earlier versions of bzip2 will stop after decom- |
81 | pressing the first file in the stream. | 81 | pressing the first file in the stream. |
82 | 82 | ||
83 | bzcat (or bzip2 -dc) decompresses all specified files to | 83 | bzcat (or bzip2 -dc) decompresses all specified files to |
@@ -98,7 +98,7 @@ DESCRIPTION | |||
98 | 98 | ||
99 | As a self-check for your protection, bzip2 uses 32-bit | 99 | As a self-check for your protection, bzip2 uses 32-bit |
100 | CRCs to make sure that the decompressed version of a file | 100 | CRCs to make sure that the decompressed version of a file |
101 | is identical to the original. This guards against corrup | 101 | is identical to the original. This guards against corrup- |
102 | tion of the compressed data, and against undetected bugs | 102 | tion of the compressed data, and against undetected bugs |
103 | in bzip2 (hopefully very unlikely). The chances of data | 103 | in bzip2 (hopefully very unlikely). The chances of data |
104 | corruption going undetected is microscopic, about one | 104 | corruption going undetected is microscopic, about one |
@@ -171,7 +171,7 @@ OPTIONS | |||
171 | 171 | ||
172 | -v --verbose | 172 | -v --verbose |
173 | Verbose mode -- show the compression ratio for each | 173 | Verbose mode -- show the compression ratio for each |
174 | file processed. Further -v's increase the ver | 174 | file processed. Further -v's increase the ver- |
175 | bosity level, spewing out lots of information which | 175 | bosity level, spewing out lots of information which |
176 | is primarily of interest for diagnostic purposes. | 176 | is primarily of interest for diagnostic purposes. |
177 | 177 | ||
@@ -184,19 +184,19 @@ OPTIONS | |||
184 | compressing. Has no effect when decompressing. | 184 | compressing. Has no effect when decompressing. |
185 | See MEMORY MANAGEMENT below. The --fast and --best | 185 | See MEMORY MANAGEMENT below. The --fast and --best |
186 | aliases are primarily for GNU gzip compatibility. | 186 | aliases are primarily for GNU gzip compatibility. |
187 | In particular, --fast doesn't make things signifi | 187 | In particular, --fast doesn't make things signifi- |
188 | cantly faster. And --best merely selects the | 188 | cantly faster. And --best merely selects the |
189 | default behaviour. | 189 | default behaviour. |
190 | 190 | ||
191 | -- Treats all subsequent arguments as file names, even | 191 | -- Treats all subsequent arguments as file names, even |
192 | if they start with a dash. This is so you can han | 192 | if they start with a dash. This is so you can han- |
193 | dle files with names beginning with a dash, for | 193 | dle files with names beginning with a dash, for |
194 | example: bzip2 -- -myfilename. | 194 | example: bzip2 -- -myfilename. |
195 | 195 | ||
196 | --repetitive-fast --repetitive-best | 196 | --repetitive-fast --repetitive-best |
197 | These flags are redundant in versions 0.9.5 and | 197 | These flags are redundant in versions 0.9.5 and |
198 | above. They provided some coarse control over the | 198 | above. They provided some coarse control over the |
199 | behaviour of the sorting algorithm in earlier ver | 199 | behaviour of the sorting algorithm in earlier ver- |
200 | sions, which was sometimes useful. 0.9.5 and above | 200 | sions, which was sometimes useful. 0.9.5 and above |
201 | have an improved algorithm which renders these | 201 | have an improved algorithm which renders these |
202 | flags irrelevant. | 202 | flags irrelevant. |
@@ -207,7 +207,7 @@ MEMORY MANAGEMENT | |||
207 | affects both the compression ratio achieved, and the | 207 | affects both the compression ratio achieved, and the |
208 | amount of memory needed for compression and decompression. | 208 | amount of memory needed for compression and decompression. |
209 | The flags -1 through -9 specify the block size to be | 209 | The flags -1 through -9 specify the block size to be |
210 | 100,000 bytes through 900,000 bytes (the default) respec | 210 | 100,000 bytes through 900,000 bytes (the default) respec- |
211 | tively. At decompression time, the block size used for | 211 | tively. At decompression time, the block size used for |
212 | compression is read from the header of the compressed | 212 | compression is read from the header of the compressed |
213 | file, and bunzip2 then allocates itself just enough memory | 213 | file, and bunzip2 then allocates itself just enough memory |
@@ -235,13 +235,13 @@ MEMORY MANAGEMENT | |||
235 | bunzip2 will require about 3700 kbytes to decompress. To | 235 | bunzip2 will require about 3700 kbytes to decompress. To |
236 | support decompression of any file on a 4 megabyte machine, | 236 | support decompression of any file on a 4 megabyte machine, |
237 | bunzip2 has an option to decompress using approximately | 237 | bunzip2 has an option to decompress using approximately |
238 | half this amount of memory, about 2300 kbytes. Decompres | 238 | half this amount of memory, about 2300 kbytes. Decompres- |
239 | sion speed is also halved, so you should use this option | 239 | sion speed is also halved, so you should use this option |
240 | only where necessary. The relevant flag is -s. | 240 | only where necessary. The relevant flag is -s. |
241 | 241 | ||
242 | In general, try and use the largest block size memory con | 242 | In general, try and use the largest block size memory con- |
243 | straints allow, since that maximises the compression | 243 | straints allow, since that maximises the compression |
244 | achieved. Compression and decompression speed are virtu | 244 | achieved. Compression and decompression speed are virtu- |
245 | ally unaffected by block size. | 245 | ally unaffected by block size. |
246 | 246 | ||
247 | Another significant point applies to files which fit in a | 247 | Another significant point applies to files which fit in a |
@@ -257,11 +257,11 @@ MEMORY MANAGEMENT | |||
257 | 257 | ||
258 | Here is a table which summarises the maximum memory usage | 258 | Here is a table which summarises the maximum memory usage |
259 | for different block sizes. Also recorded is the total | 259 | for different block sizes. Also recorded is the total |
260 | compressed size for 14 files of the Calgary Text Compres | 260 | compressed size for 14 files of the Calgary Text Compres- |
261 | sion Corpus totalling 3,141,622 bytes. This column gives | 261 | sion Corpus totalling 3,141,622 bytes. This column gives |
262 | some feel for how compression varies with block size. | 262 | some feel for how compression varies with block size. |
263 | These figures tend to understate the advantage of larger | 263 | These figures tend to understate the advantage of larger |
264 | block sizes for larger files, since the Corpus is domi | 264 | block sizes for larger files, since the Corpus is domi- |
265 | nated by smaller files. | 265 | nated by smaller files. |
266 | 266 | ||
267 | Compress Decompress Decompress Corpus | 267 | Compress Decompress Decompress Corpus |
@@ -280,7 +280,7 @@ MEMORY MANAGEMENT | |||
280 | 280 | ||
281 | RECOVERING DATA FROM DAMAGED FILES | 281 | RECOVERING DATA FROM DAMAGED FILES |
282 | bzip2 compresses files in blocks, usually 900kbytes long. | 282 | bzip2 compresses files in blocks, usually 900kbytes long. |
283 | Each block is handled independently. If a media or trans | 283 | Each block is handled independently. If a media or trans- |
284 | mission error causes a multi-block .bz2 file to become | 284 | mission error causes a multi-block .bz2 file to become |
285 | damaged, it may be possible to recover data from the | 285 | damaged, it may be possible to recover data from the |
286 | undamaged blocks in the file. | 286 | undamaged blocks in the file. |
@@ -297,19 +297,19 @@ RECOVERING DATA FROM DAMAGED FILES | |||
297 | the integrity of the resulting files, and decompress those | 297 | the integrity of the resulting files, and decompress those |
298 | which are undamaged. | 298 | which are undamaged. |
299 | 299 | ||
300 | bzip2recover takes a single argument, the name of the dam | 300 | bzip2recover takes a single argument, the name of the dam- |
301 | aged file, and writes a number of files | 301 | aged file, and writes a number of files |
302 | "rec00001file.bz2", "rec00002file.bz2", etc, containing | 302 | "rec00001file.bz2", "rec00002file.bz2", etc, containing |
303 | the extracted blocks. The output filenames are | 303 | the extracted blocks. The output filenames are |
304 | designed so that the use of wildcards in subsequent pro | 304 | designed so that the use of wildcards in subsequent pro- |
305 | cessing -- for example, "bzip2 -dc rec*file.bz2 > recov | 305 | cessing -- for example, "bzip2 -dc rec*file.bz2 > recov- |
306 | ered_data" -- processes the files in the correct order. | 306 | ered_data" -- processes the files in the correct order. |
307 | 307 | ||
308 | bzip2recover should be of most use dealing with large .bz2 | 308 | bzip2recover should be of most use dealing with large .bz2 |
309 | files, as these will contain many blocks. It is clearly | 309 | files, as these will contain many blocks. It is clearly |
310 | futile to use it on damaged single-block files, since a | 310 | futile to use it on damaged single-block files, since a |
311 | damaged block cannot be recovered. If you wish to min | 311 | damaged block cannot be recovered. If you wish to min- |
312 | imise any potential data loss through media or transmis | 312 | imise any potential data loss through media or transmis- |
313 | sion errors, you might consider compressing with a smaller | 313 | sion errors, you might consider compressing with a smaller |
314 | block size. | 314 | block size. |
315 | 315 | ||
@@ -323,19 +323,19 @@ PERFORMANCE NOTES | |||
323 | better than previous versions in this respect. The ratio | 323 | better than previous versions in this respect. The ratio |
324 | between worst-case and average-case compression time is in | 324 | between worst-case and average-case compression time is in |
325 | the region of 10:1. For previous versions, this figure | 325 | the region of 10:1. For previous versions, this figure |
326 | was more like 100:1. You can use the -vvvv option to mon | 326 | was more like 100:1. You can use the -vvvv option to mon- |
327 | itor progress in great detail, if you want. | 327 | itor progress in great detail, if you want. |
328 | 328 | ||
329 | Decompression speed is unaffected by these phenomena. | 329 | Decompression speed is unaffected by these phenomena. |
330 | 330 | ||
331 | bzip2 usually allocates several megabytes of memory to | 331 | bzip2 usually allocates several megabytes of memory to |
332 | operate in, and then charges all over it in a fairly ran | 332 | operate in, and then charges all over it in a fairly ran- |
333 | dom fashion. This means that performance, both for com | 333 | dom fashion. This means that performance, both for com- |
334 | pressing and decompressing, is largely determined by the | 334 | pressing and decompressing, is largely determined by the |
335 | speed at which your machine can service cache misses. | 335 | speed at which your machine can service cache misses. |
336 | Because of this, small changes to the code to reduce the | 336 | Because of this, small changes to the code to reduce the |
337 | miss rate have been observed to give disproportionately | 337 | miss rate have been observed to give disproportionately |
338 | large performance improvements. I imagine bzip2 will per | 338 | large performance improvements. I imagine bzip2 will per- |
339 | form best on machines with very large caches. | 339 | form best on machines with very large caches. |
340 | 340 | ||
341 | 341 | ||
@@ -345,46 +345,47 @@ CAVEATS | |||
345 | but the details of what the problem is sometimes seem | 345 | but the details of what the problem is sometimes seem |
346 | rather misleading. | 346 | rather misleading. |
347 | 347 | ||
348 | This manual page pertains to version 1.0.2 of bzip2. Com | 348 | This manual page pertains to version 1.0.3 of bzip2. Com- |
349 | pressed data created by this version is entirely forwards | 349 | pressed data created by this version is entirely forwards |
350 | and backwards compatible with the previous public | 350 | and backwards compatible with the previous public |
351 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0 and 1.0.1, | 351 | releases, versions 0.1pl2, 0.9.0, 0.9.5, 1.0.0, 1.0.1 and |
352 | but with the following exception: 0.9.0 and above can cor | 352 | 1.0.2, but with the following exception: 0.9.0 and above |
353 | rectly decompress multiple concatenated compressed files. | 353 | can correctly decompress multiple concatenated compressed |
354 | 0.1pl2 cannot do this; it will stop after decompressing | 354 | files. 0.1pl2 cannot do this; it will stop after decom- |
355 | just the first file in the stream. | 355 | pressing just the first file in the stream. |
356 | 356 | ||
357 | bzip2recover versions prior to this one, 1.0.2, used | 357 | bzip2recover versions prior to 1.0.2 used 32-bit integers |
358 | 32-bit integers to represent bit positions in compressed | 358 | to represent bit positions in compressed files, so they |
359 | files, so it could not handle compressed files more than | 359 | could not handle compressed files more than 512 megabytes |
360 | 512 megabytes long. Version 1.0.2 and above uses 64-bit | 360 | long. Versions 1.0.2 and above use 64-bit ints on some |
361 | ints on some platforms which support them (GNU supported | 361 | platforms which support them (GNU supported targets, and |
362 | targets, and Windows). To establish whether or not | 362 | Windows). To establish whether or not bzip2recover was |
363 | bzip2recover was built with such a limitation, run it | 363 | built with such a limitation, run it without arguments. |
364 | without arguments. In any event you can build yourself an | 364 | In any event you can build yourself an unlimited version |
365 | unlimited version if you can recompile it with MaybeUInt64 | 365 | if you can recompile it with MaybeUInt64 set to be an |
366 | set to be an unsigned 64-bit integer. | 366 | unsigned 64-bit integer. |
367 | 367 | ||
368 | 368 | ||
369 | AUTHOR | 369 | AUTHOR |
370 | Julian Seward, jseward@acm.org. | 370 | Julian Seward, jseward@bzip.org. |
371 | 371 | ||
372 | http://sources.redhat.com/bzip2 | 372 | http://www.bzip.org |
373 | 373 | ||
374 | The ideas embodied in bzip2 are due to (at least) the fol | 374 | The ideas embodied in bzip2 are due to (at least) the fol- |
375 | lowing people: Michael Burrows and David Wheeler (for the | 375 | lowing people: Michael Burrows and David Wheeler (for the |
376 | block sorting transformation), David Wheeler (again, for | 376 | block sorting transformation), David Wheeler (again, for |
377 | the Huffman coder), Peter Fenwick (for the structured cod | 377 | the Huffman coder), Peter Fenwick (for the structured cod- |
378 | ing model in the original bzip, and many refinements), and | 378 | ing model in the original bzip, and many refinements), and |
379 | Alistair Moffat, Radford Neal and Ian Witten (for the | 379 | Alistair Moffat, Radford Neal and Ian Witten (for the |
380 | arithmetic coder in the original bzip). I am much | 380 | arithmetic coder in the original bzip). I am much |
381 | indebted for their help, support and advice. See the man | 381 | indebted for their help, support and advice. See the man- |
382 | ual in the source distribution for pointers to sources of | 382 | ual in the source distribution for pointers to sources of |
383 | documentation. Christian von Roques encouraged me to look | 383 | documentation. Christian von Roques encouraged me to look |
384 | for faster sorting algorithms, so as to speed up compres | 384 | for faster sorting algorithms, so as to speed up compres- |
385 | sion. Bela Lubkin encouraged me to improve the worst-case | 385 | sion. Bela Lubkin encouraged me to improve the worst-case |
386 | compression performance. The bz* scripts are derived from | 386 | compression performance. Donna Robinson XMLised the docu- |
387 | those of GNU gzip. Many people sent patches, helped with | 387 | mentation. The bz* scripts are derived from those of GNU |
388 | portability problems, lent machines, gave advice and were | 388 | gzip. Many people sent patches, helped with portability |
389 | generally helpful. | 389 | problems, lent machines, gave advice and were generally |
390 | helpful. | ||
390 | 391 | ||
diff --git a/bzip2recover.c b/bzip2recover.c index 286873b..5cd405d 100644 --- a/bzip2recover.c +++ b/bzip2recover.c | |||
@@ -7,9 +7,9 @@ | |||
7 | /*-- | 7 | /*-- |
8 | This program is bzip2recover, a program to attempt data | 8 | This program is bzip2recover, a program to attempt data |
9 | salvage from damaged files created by the accompanying | 9 | salvage from damaged files created by the accompanying |
10 | bzip2-1.0 program. | 10 | bzip2-1.0.3 program. |
11 | 11 | ||
12 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 12 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
13 | 13 | ||
14 | Redistribution and use in source and binary forms, with or without | 14 | Redistribution and use in source and binary forms, with or without |
15 | modification, are permitted provided that the following conditions | 15 | modification, are permitted provided that the following conditions |
@@ -43,8 +43,8 @@ | |||
43 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 43 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
44 | 44 | ||
45 | Julian Seward, Cambridge, UK. | 45 | Julian Seward, Cambridge, UK. |
46 | jseward@acm.org | 46 | jseward@bzip.org |
47 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 47 | bzip2/libbzip2 version 1.0.3 of 15 February 2005 |
48 | --*/ | 48 | --*/ |
49 | 49 | ||
50 | /*-- | 50 | /*-- |
@@ -345,7 +345,7 @@ Int32 main ( Int32 argc, Char** argv ) | |||
345 | inFileName[0] = outFileName[0] = 0; | 345 | inFileName[0] = outFileName[0] = 0; |
346 | 346 | ||
347 | fprintf ( stderr, | 347 | fprintf ( stderr, |
348 | "bzip2recover 1.0.2: extracts blocks from damaged .bz2 files.\n" ); | 348 | "bzip2recover 1.0.3: extracts blocks from damaged .bz2 files.\n" ); |
349 | 349 | ||
350 | if (argc != 2) { | 350 | if (argc != 2) { |
351 | fprintf ( stderr, "%s: usage is `%s damaged_file_name'.\n", | 351 | fprintf ( stderr, "%s: usage is `%s damaged_file_name'.\n", |
@@ -374,7 +374,7 @@ Int32 main ( Int32 argc, Char** argv ) | |||
374 | if (strlen(argv[1]) >= BZ_MAX_FILENAME-20) { | 374 | if (strlen(argv[1]) >= BZ_MAX_FILENAME-20) { |
375 | fprintf ( stderr, | 375 | fprintf ( stderr, |
376 | "%s: supplied filename is suspiciously (>= %d chars) long. Bye!\n", | 376 | "%s: supplied filename is suspiciously (>= %d chars) long. Bye!\n", |
377 | progName, strlen(argv[1]) ); | 377 | progName, (int)strlen(argv[1]) ); |
378 | exit(1); | 378 | exit(1); |
379 | } | 379 | } |
380 | 380 | ||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -88,12 +88,12 @@ void BZ2_bz__AssertH__fail ( int errcode ) | |||
88 | fprintf(stderr, | 88 | fprintf(stderr, |
89 | "\n\nbzip2/libbzip2: internal error number %d.\n" | 89 | "\n\nbzip2/libbzip2: internal error number %d.\n" |
90 | "This is a bug in bzip2/libbzip2, %s.\n" | 90 | "This is a bug in bzip2/libbzip2, %s.\n" |
91 | "Please report it to me at: jseward@acm.org. If this happened\n" | 91 | "Please report it to me at: jseward@bzip.org. If this happened\n" |
92 | "when you were using some program which uses libbzip2 as a\n" | 92 | "when you were using some program which uses libbzip2 as a\n" |
93 | "component, you should also report this bug to the author(s)\n" | 93 | "component, you should also report this bug to the author(s)\n" |
94 | "of that program. Please make an effort to report this bug;\n" | 94 | "of that program. Please make an effort to report this bug;\n" |
95 | "timely and accurate bug reports eventually lead to higher\n" | 95 | "timely and accurate bug reports eventually lead to higher\n" |
96 | "quality software. Thanks. Julian Seward, 30 December 2001.\n\n", | 96 | "quality software. Thanks. Julian Seward, 15 February 2005.\n\n", |
97 | errcode, | 97 | errcode, |
98 | BZ2_bzlibVersion() | 98 | BZ2_bzlibVersion() |
99 | ); | 99 | ); |
@@ -574,8 +574,11 @@ int BZ_API(BZ2_bzDecompressInit) | |||
574 | 574 | ||
575 | 575 | ||
576 | /*---------------------------------------------------*/ | 576 | /*---------------------------------------------------*/ |
577 | /* Return True iff data corruption is discovered. | ||
578 | Returns False if there is no problem. | ||
579 | */ | ||
577 | static | 580 | static |
578 | void unRLE_obuf_to_output_FAST ( DState* s ) | 581 | Bool unRLE_obuf_to_output_FAST ( DState* s ) |
579 | { | 582 | { |
580 | UChar k1; | 583 | UChar k1; |
581 | 584 | ||
@@ -584,7 +587,7 @@ void unRLE_obuf_to_output_FAST ( DState* s ) | |||
584 | while (True) { | 587 | while (True) { |
585 | /* try to finish existing run */ | 588 | /* try to finish existing run */ |
586 | while (True) { | 589 | while (True) { |
587 | if (s->strm->avail_out == 0) return; | 590 | if (s->strm->avail_out == 0) return False; |
588 | if (s->state_out_len == 0) break; | 591 | if (s->state_out_len == 0) break; |
589 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; | 592 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; |
590 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); | 593 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); |
@@ -594,10 +597,13 @@ void unRLE_obuf_to_output_FAST ( DState* s ) | |||
594 | s->strm->total_out_lo32++; | 597 | s->strm->total_out_lo32++; |
595 | if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++; | 598 | if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++; |
596 | } | 599 | } |
597 | 600 | ||
598 | /* can a new run be started? */ | 601 | /* can a new run be started? */ |
599 | if (s->nblock_used == s->save_nblock+1) return; | 602 | if (s->nblock_used == s->save_nblock+1) return False; |
600 | 603 | ||
604 | /* Only caused by corrupt data stream? */ | ||
605 | if (s->nblock_used > s->save_nblock+1) | ||
606 | return True; | ||
601 | 607 | ||
602 | s->state_out_len = 1; | 608 | s->state_out_len = 1; |
603 | s->state_out_ch = s->k0; | 609 | s->state_out_ch = s->k0; |
@@ -667,6 +673,10 @@ void unRLE_obuf_to_output_FAST ( DState* s ) | |||
667 | cs_avail_out--; | 673 | cs_avail_out--; |
668 | } | 674 | } |
669 | } | 675 | } |
676 | /* Only caused by corrupt data stream? */ | ||
677 | if (c_nblock_used > s_save_nblockPP) | ||
678 | return True; | ||
679 | |||
670 | /* can a new run be started? */ | 680 | /* can a new run be started? */ |
671 | if (c_nblock_used == s_save_nblockPP) { | 681 | if (c_nblock_used == s_save_nblockPP) { |
672 | c_state_out_len = 0; goto return_notr; | 682 | c_state_out_len = 0; goto return_notr; |
@@ -712,6 +722,7 @@ void unRLE_obuf_to_output_FAST ( DState* s ) | |||
712 | s->strm->avail_out = cs_avail_out; | 722 | s->strm->avail_out = cs_avail_out; |
713 | /* end save */ | 723 | /* end save */ |
714 | } | 724 | } |
725 | return False; | ||
715 | } | 726 | } |
716 | 727 | ||
717 | 728 | ||
@@ -732,8 +743,11 @@ __inline__ Int32 BZ2_indexIntoF ( Int32 indx, Int32 *cftab ) | |||
732 | 743 | ||
733 | 744 | ||
734 | /*---------------------------------------------------*/ | 745 | /*---------------------------------------------------*/ |
746 | /* Return True iff data corruption is discovered. | ||
747 | Returns False if there is no problem. | ||
748 | */ | ||
735 | static | 749 | static |
736 | void unRLE_obuf_to_output_SMALL ( DState* s ) | 750 | Bool unRLE_obuf_to_output_SMALL ( DState* s ) |
737 | { | 751 | { |
738 | UChar k1; | 752 | UChar k1; |
739 | 753 | ||
@@ -742,7 +756,7 @@ void unRLE_obuf_to_output_SMALL ( DState* s ) | |||
742 | while (True) { | 756 | while (True) { |
743 | /* try to finish existing run */ | 757 | /* try to finish existing run */ |
744 | while (True) { | 758 | while (True) { |
745 | if (s->strm->avail_out == 0) return; | 759 | if (s->strm->avail_out == 0) return False; |
746 | if (s->state_out_len == 0) break; | 760 | if (s->state_out_len == 0) break; |
747 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; | 761 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; |
748 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); | 762 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); |
@@ -754,8 +768,11 @@ void unRLE_obuf_to_output_SMALL ( DState* s ) | |||
754 | } | 768 | } |
755 | 769 | ||
756 | /* can a new run be started? */ | 770 | /* can a new run be started? */ |
757 | if (s->nblock_used == s->save_nblock+1) return; | 771 | if (s->nblock_used == s->save_nblock+1) return False; |
758 | 772 | ||
773 | /* Only caused by corrupt data stream? */ | ||
774 | if (s->nblock_used > s->save_nblock+1) | ||
775 | return True; | ||
759 | 776 | ||
760 | s->state_out_len = 1; | 777 | s->state_out_len = 1; |
761 | s->state_out_ch = s->k0; | 778 | s->state_out_ch = s->k0; |
@@ -788,7 +805,7 @@ void unRLE_obuf_to_output_SMALL ( DState* s ) | |||
788 | while (True) { | 805 | while (True) { |
789 | /* try to finish existing run */ | 806 | /* try to finish existing run */ |
790 | while (True) { | 807 | while (True) { |
791 | if (s->strm->avail_out == 0) return; | 808 | if (s->strm->avail_out == 0) return False; |
792 | if (s->state_out_len == 0) break; | 809 | if (s->state_out_len == 0) break; |
793 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; | 810 | *( (UChar*)(s->strm->next_out) ) = s->state_out_ch; |
794 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); | 811 | BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch ); |
@@ -800,7 +817,11 @@ void unRLE_obuf_to_output_SMALL ( DState* s ) | |||
800 | } | 817 | } |
801 | 818 | ||
802 | /* can a new run be started? */ | 819 | /* can a new run be started? */ |
803 | if (s->nblock_used == s->save_nblock+1) return; | 820 | if (s->nblock_used == s->save_nblock+1) return False; |
821 | |||
822 | /* Only caused by corrupt data stream? */ | ||
823 | if (s->nblock_used > s->save_nblock+1) | ||
824 | return True; | ||
804 | 825 | ||
805 | s->state_out_len = 1; | 826 | s->state_out_len = 1; |
806 | s->state_out_ch = s->k0; | 827 | s->state_out_ch = s->k0; |
@@ -830,6 +851,7 @@ void unRLE_obuf_to_output_SMALL ( DState* s ) | |||
830 | /*---------------------------------------------------*/ | 851 | /*---------------------------------------------------*/ |
831 | int BZ_API(BZ2_bzDecompress) ( bz_stream *strm ) | 852 | int BZ_API(BZ2_bzDecompress) ( bz_stream *strm ) |
832 | { | 853 | { |
854 | Bool corrupt; | ||
833 | DState* s; | 855 | DState* s; |
834 | if (strm == NULL) return BZ_PARAM_ERROR; | 856 | if (strm == NULL) return BZ_PARAM_ERROR; |
835 | s = strm->state; | 857 | s = strm->state; |
@@ -840,12 +862,13 @@ int BZ_API(BZ2_bzDecompress) ( bz_stream *strm ) | |||
840 | if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR; | 862 | if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR; |
841 | if (s->state == BZ_X_OUTPUT) { | 863 | if (s->state == BZ_X_OUTPUT) { |
842 | if (s->smallDecompress) | 864 | if (s->smallDecompress) |
843 | unRLE_obuf_to_output_SMALL ( s ); else | 865 | corrupt = unRLE_obuf_to_output_SMALL ( s ); else |
844 | unRLE_obuf_to_output_FAST ( s ); | 866 | corrupt = unRLE_obuf_to_output_FAST ( s ); |
867 | if (corrupt) return BZ_DATA_ERROR; | ||
845 | if (s->nblock_used == s->save_nblock+1 && s->state_out_len == 0) { | 868 | if (s->nblock_used == s->save_nblock+1 && s->state_out_len == 0) { |
846 | BZ_FINALISE_CRC ( s->calculatedBlockCRC ); | 869 | BZ_FINALISE_CRC ( s->calculatedBlockCRC ); |
847 | if (s->verbosity >= 3) | 870 | if (s->verbosity >= 3) |
848 | VPrintf2 ( " {0x%x, 0x%x}", s->storedBlockCRC, | 871 | VPrintf2 ( " {0x%08x, 0x%08x}", s->storedBlockCRC, |
849 | s->calculatedBlockCRC ); | 872 | s->calculatedBlockCRC ); |
850 | if (s->verbosity >= 2) VPrintf0 ( "]" ); | 873 | if (s->verbosity >= 2) VPrintf0 ( "]" ); |
851 | if (s->calculatedBlockCRC != s->storedBlockCRC) | 874 | if (s->calculatedBlockCRC != s->storedBlockCRC) |
@@ -863,7 +886,7 @@ int BZ_API(BZ2_bzDecompress) ( bz_stream *strm ) | |||
863 | Int32 r = BZ2_decompress ( s ); | 886 | Int32 r = BZ2_decompress ( s ); |
864 | if (r == BZ_STREAM_END) { | 887 | if (r == BZ_STREAM_END) { |
865 | if (s->verbosity >= 3) | 888 | if (s->verbosity >= 3) |
866 | VPrintf2 ( "\n combined CRCs: stored = 0x%x, computed = 0x%x", | 889 | VPrintf2 ( "\n combined CRCs: stored = 0x%08x, computed = 0x%08x", |
867 | s->storedCombinedCRC, s->calculatedCombinedCRC ); | 890 | s->storedCombinedCRC, s->calculatedCombinedCRC ); |
868 | if (s->calculatedCombinedCRC != s->storedCombinedCRC) | 891 | if (s->calculatedCombinedCRC != s->storedCombinedCRC) |
869 | return BZ_DATA_ERROR; | 892 | return BZ_DATA_ERROR; |
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -110,8 +110,10 @@ typedef | |||
110 | #define BZ_EXPORT | 110 | #define BZ_EXPORT |
111 | #endif | 111 | #endif |
112 | 112 | ||
113 | #ifndef BZ_NO_STDIO | ||
113 | /* Need a definition for FILE */ | 114 | /* Need a definition for FILE */ |
114 | #include <stdio.h> | 115 | #include <stdio.h> |
116 | #endif | ||
115 | 117 | ||
116 | #ifdef _WIN32 | 118 | #ifdef _WIN32 |
117 | # include <windows.h> | 119 | # include <windows.h> |
diff --git a/bzlib_private.h b/bzlib_private.h index ff973c3..ca76fe6 100644 --- a/bzlib_private.h +++ b/bzlib_private.h | |||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -76,7 +76,7 @@ | |||
76 | 76 | ||
77 | /*-- General stuff. --*/ | 77 | /*-- General stuff. --*/ |
78 | 78 | ||
79 | #define BZ_VERSION "1.0.2, 30-Dec-2001" | 79 | #define BZ_VERSION "1.0.3, 15-Feb-2005" |
80 | 80 | ||
81 | typedef char Char; | 81 | typedef char Char; |
82 | typedef unsigned char Bool; | 82 | typedef unsigned char Bool; |
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -488,9 +488,11 @@ void sendMTFValues ( EState* s ) | |||
488 | /*-- | 488 | /*-- |
489 | Recompute the tables based on the accumulated frequencies. | 489 | Recompute the tables based on the accumulated frequencies. |
490 | --*/ | 490 | --*/ |
491 | /* maxLen was changed from 20 to 17 in bzip2-1.0.3. See | ||
492 | comment in huffman.c for details. */ | ||
491 | for (t = 0; t < nGroups; t++) | 493 | for (t = 0; t < nGroups; t++) |
492 | BZ2_hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]), | 494 | BZ2_hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]), |
493 | alphaSize, 20 ); | 495 | alphaSize, 17 /*20*/ ); |
494 | } | 496 | } |
495 | 497 | ||
496 | 498 | ||
@@ -527,7 +529,7 @@ void sendMTFValues ( EState* s ) | |||
527 | if (s->len[t][i] > maxLen) maxLen = s->len[t][i]; | 529 | if (s->len[t][i] > maxLen) maxLen = s->len[t][i]; |
528 | if (s->len[t][i] < minLen) minLen = s->len[t][i]; | 530 | if (s->len[t][i] < minLen) minLen = s->len[t][i]; |
529 | } | 531 | } |
530 | AssertH ( !(maxLen > 20), 3004 ); | 532 | AssertH ( !(maxLen > 17 /*20*/ ), 3004 ); |
531 | AssertH ( !(minLen < 1), 3005 ); | 533 | AssertH ( !(minLen < 1), 3005 ); |
532 | BZ2_hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]), | 534 | BZ2_hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]), |
533 | minLen, maxLen, alphaSize ); | 535 | minLen, maxLen, alphaSize ); |
@@ -651,8 +653,8 @@ void BZ2_compressBlock ( EState* s, Bool is_last_block ) | |||
651 | if (s->blockNo > 1) s->numZ = 0; | 653 | if (s->blockNo > 1) s->numZ = 0; |
652 | 654 | ||
653 | if (s->verbosity >= 2) | 655 | if (s->verbosity >= 2) |
654 | VPrintf4( " block %d: crc = 0x%8x, " | 656 | VPrintf4( " block %d: crc = 0x%08x, " |
655 | "combined CRC = 0x%8x, size = %d\n", | 657 | "combined CRC = 0x%08x, size = %d\n", |
656 | s->blockNo, s->blockCRC, s->combinedCRC, s->nblock ); | 658 | s->blockNo, s->blockCRC, s->combinedCRC, s->nblock ); |
657 | 659 | ||
658 | BZ2_blockSort ( s ); | 660 | BZ2_blockSort ( s ); |
@@ -703,7 +705,7 @@ void BZ2_compressBlock ( EState* s, Bool is_last_block ) | |||
703 | bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 ); | 705 | bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 ); |
704 | bsPutUInt32 ( s, s->combinedCRC ); | 706 | bsPutUInt32 ( s, s->combinedCRC ); |
705 | if (s->verbosity >= 2) | 707 | if (s->verbosity >= 2) |
706 | VPrintf1( " final combined CRC = 0x%x\n ", s->combinedCRC ); | 708 | VPrintf1( " final combined CRC = 0x%08x\n ", s->combinedCRC ); |
707 | bsFinishWrite ( s ); | 709 | bsFinishWrite ( s ); |
708 | } | 710 | } |
709 | } | 711 | } |
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
diff --git a/decompress.c b/decompress.c index e921347..81c3d2c 100644 --- a/decompress.c +++ b/decompress.c | |||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -524,17 +524,23 @@ Int32 BZ2_decompress ( DState* s ) | |||
524 | if (s->origPtr < 0 || s->origPtr >= nblock) | 524 | if (s->origPtr < 0 || s->origPtr >= nblock) |
525 | RETURN(BZ_DATA_ERROR); | 525 | RETURN(BZ_DATA_ERROR); |
526 | 526 | ||
527 | /*-- Set up cftab to facilitate generation of T^(-1) --*/ | ||
528 | s->cftab[0] = 0; | ||
529 | for (i = 1; i <= 256; i++) s->cftab[i] = s->unzftab[i-1]; | ||
530 | for (i = 1; i <= 256; i++) s->cftab[i] += s->cftab[i-1]; | ||
531 | for (i = 0; i <= 256; i++) { | ||
532 | if (s->cftab[i] < 0 || s->cftab[i] > nblock) { | ||
533 | /* s->cftab[i] can legitimately be == nblock */ | ||
534 | RETURN(BZ_DATA_ERROR); | ||
535 | } | ||
536 | } | ||
537 | |||
527 | s->state_out_len = 0; | 538 | s->state_out_len = 0; |
528 | s->state_out_ch = 0; | 539 | s->state_out_ch = 0; |
529 | BZ_INITIALISE_CRC ( s->calculatedBlockCRC ); | 540 | BZ_INITIALISE_CRC ( s->calculatedBlockCRC ); |
530 | s->state = BZ_X_OUTPUT; | 541 | s->state = BZ_X_OUTPUT; |
531 | if (s->verbosity >= 2) VPrintf0 ( "rt+rld" ); | 542 | if (s->verbosity >= 2) VPrintf0 ( "rt+rld" ); |
532 | 543 | ||
533 | /*-- Set up cftab to facilitate generation of T^(-1) --*/ | ||
534 | s->cftab[0] = 0; | ||
535 | for (i = 1; i <= 256; i++) s->cftab[i] = s->unzftab[i-1]; | ||
536 | for (i = 1; i <= 256; i++) s->cftab[i] += s->cftab[i-1]; | ||
537 | |||
538 | if (s->smallDecompress) { | 544 | if (s->smallDecompress) { |
539 | 545 | ||
540 | /*-- Make a copy of cftab, used in generation of T --*/ | 546 | /*-- Make a copy of cftab, used in generation of T --*/ |
diff --git a/entities.xml b/entities.xml new file mode 100644 index 0000000..6d0975f --- /dev/null +++ b/entities.xml | |||
@@ -0,0 +1,9 @@ | |||
1 | <!-- misc. strings --> | ||
2 | <!ENTITY bz-url "http://www.bzip.org"> | ||
3 | <!ENTITY bz-email "jseward@bzip.org"> | ||
4 | <!ENTITY bz-lifespan "1996-2005"> | ||
5 | |||
6 | <!ENTITY bz-version "1.0.3"> | ||
7 | <!ENTITY bz-date "15 February 2005"> | ||
8 | |||
9 | <!ENTITY manual-title "bzip2 Manual"> | ||
diff --git a/format.pl b/format.pl new file mode 100755 index 0000000..8ab47ac --- /dev/null +++ b/format.pl | |||
@@ -0,0 +1,53 @@ | |||
1 | #!/usr/bin/perl -w | ||
2 | use strict; | ||
3 | |||
4 | # get command line values: | ||
5 | if ( $#ARGV !=1 ) { | ||
6 | die "Usage: $0 xml_infile xml_outfile\n"; | ||
7 | } | ||
8 | |||
9 | my $infile = shift; | ||
10 | # check infile exists | ||
11 | die "Can't find file \"$infile\"" | ||
12 | unless -f $infile; | ||
13 | # check we can read infile | ||
14 | if (! -r $infile) { | ||
15 | die "Can't read input $infile\n"; | ||
16 | } | ||
17 | # check we can open infile | ||
18 | open( INFILE,"<$infile" ) or | ||
19 | die "Can't input $infile $!"; | ||
20 | |||
21 | #my $outfile = 'fmt-manual.xml'; | ||
22 | my $outfile = shift; | ||
23 | #print "Infile: $infile, Outfile: $outfile\n"; | ||
24 | # check we can write to outfile | ||
25 | open( OUTFILE,">$outfile" ) or | ||
26 | die "Can't output $outfile $! for writing"; | ||
27 | |||
28 | my ($prev, $curr, $str); | ||
29 | $prev = ''; $curr = ''; | ||
30 | while ( <INFILE> ) { | ||
31 | |||
32 | print OUTFILE $prev; | ||
33 | $prev = $curr; | ||
34 | $curr = $_; | ||
35 | $str = ''; | ||
36 | |||
37 | if ( $prev =~ /<programlisting>$|<screen>$/ ) { | ||
38 | chomp $prev; | ||
39 | $curr = join( '', $prev, "<![CDATA[", $curr ); | ||
40 | $prev = ''; | ||
41 | next; | ||
42 | } | ||
43 | elsif ( $curr =~ /<\/programlisting>|<\/screen>/ ) { | ||
44 | chomp $prev; | ||
45 | $curr = join( '', $prev, "]]>", $curr ); | ||
46 | $prev = ''; | ||
47 | next; | ||
48 | } | ||
49 | } | ||
50 | print OUTFILE $curr; | ||
51 | close INFILE; | ||
52 | close OUTFILE; | ||
53 | exit; | ||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
@@ -162,7 +162,24 @@ void BZ2_hbMakeCodeLengths ( UChar *len, | |||
162 | 162 | ||
163 | if (! tooLong) break; | 163 | if (! tooLong) break; |
164 | 164 | ||
165 | for (i = 1; i < alphaSize; i++) { | 165 | /* 17 Oct 04: keep-going condition for the following loop used |
166 | to be 'i < alphaSize', which missed the last element, | ||
167 | theoretically leading to the possibility of the compressor | ||
168 | looping. However, this count-scaling step is only needed if | ||
169 | one of the generated Huffman code words is longer than | ||
170 | maxLen, which up to and including version 1.0.2 was 20 bits, | ||
171 | which is extremely unlikely. In version 1.0.3 maxLen was | ||
172 | changed to 17 bits, which has minimal effect on compression | ||
173 | ratio, but does mean this scaling step is used from time to | ||
174 | time, enough to verify that it works. | ||
175 | |||
176 | This means that bzip2-1.0.3 and later will only produce | ||
177 | Huffman codes with a maximum length of 17 bits. However, in | ||
178 | order to preserve backwards compatibility with bitstreams | ||
179 | produced by versions pre-1.0.3, the decompressor must still | ||
180 | handle lengths of up to 20. */ | ||
181 | |||
182 | for (i = 1; i <= alphaSize; i++) { | ||
166 | j = weight[i] >> 8; | 183 | j = weight[i] >> 8; |
167 | j = 1 + (j / 2); | 184 | j = 1 + (j / 2); |
168 | weight[i] = j << 8; | 185 | weight[i] = j << 8; |
diff --git a/manual.texi b/manual.texi deleted file mode 100644 index 5bc27d5..0000000 --- a/manual.texi +++ /dev/null | |||
@@ -1,2243 +0,0 @@ | |||
1 | \input texinfo @c -*- Texinfo -*- | ||
2 | @setfilename bzip2.info | ||
3 | |||
4 | @ignore | ||
5 | This file documents bzip2 version 1.0.2, and associated library | ||
6 | libbzip2, written by Julian Seward (jseward@acm.org). | ||
7 | |||
8 | Copyright (C) 1996-2002 Julian R Seward | ||
9 | |||
10 | Permission is granted to make and distribute verbatim copies of | ||
11 | this manual provided the copyright notice and this permission notice | ||
12 | are preserved on all copies. | ||
13 | |||
14 | Permission is granted to copy and distribute translations of this manual | ||
15 | into another language, under the above conditions for verbatim copies. | ||
16 | @end ignore | ||
17 | |||
18 | @ifinfo | ||
19 | @format | ||
20 | START-INFO-DIR-ENTRY | ||
21 | * Bzip2: (bzip2). A program and library for data compression. | ||
22 | END-INFO-DIR-ENTRY | ||
23 | @end format | ||
24 | |||
25 | @end ifinfo | ||
26 | |||
27 | @iftex | ||
28 | @c @finalout | ||
29 | @settitle bzip2 and libbzip2 | ||
30 | @titlepage | ||
31 | @title bzip2 and libbzip2 | ||
32 | @subtitle a program and library for data compression | ||
33 | @subtitle copyright (C) 1996-2002 Julian Seward | ||
34 | @subtitle version 1.0.2 of 30 December 2001 | ||
35 | @author Julian Seward | ||
36 | |||
37 | @end titlepage | ||
38 | |||
39 | @parindent 0mm | ||
40 | @parskip 2mm | ||
41 | |||
42 | @end iftex | ||
43 | @node Top,,, (dir) | ||
44 | |||
45 | The following text is the License for this software. You should | ||
46 | find it identical to that contained in the file LICENSE in the | ||
47 | source distribution. | ||
48 | |||
49 | @bf{------------------ START OF THE LICENSE ------------------} | ||
50 | |||
51 | This program, @code{bzip2}, | ||
52 | and associated library @code{libbzip2}, are | ||
53 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | ||
54 | |||
55 | Redistribution and use in source and binary forms, with or without | ||
56 | modification, are permitted provided that the following conditions | ||
57 | are met: | ||
58 | @itemize @bullet | ||
59 | @item | ||
60 | Redistributions of source code must retain the above copyright | ||
61 | notice, this list of conditions and the following disclaimer. | ||
62 | @item | ||
63 | The origin of this software must not be misrepresented; you must | ||
64 | not claim that you wrote the original software. If you use this | ||
65 | software in a product, an acknowledgment in the product | ||
66 | documentation would be appreciated but is not required. | ||
67 | @item | ||
68 | Altered source versions must be plainly marked as such, and must | ||
69 | not be misrepresented as being the original software. | ||
70 | @item | ||
71 | The name of the author may not be used to endorse or promote | ||
72 | products derived from this software without specific prior written | ||
73 | permission. | ||
74 | @end itemize | ||
75 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS | ||
76 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
77 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
78 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | ||
79 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
80 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | ||
81 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
82 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | ||
83 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
84 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
85 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
86 | |||
87 | Julian Seward, Cambridge, UK. | ||
88 | |||
89 | @code{jseward@@acm.org} | ||
90 | |||
91 | @code{bzip2}/@code{libbzip2} version 1.0.2 of 30 December 2001. | ||
92 | |||
93 | @bf{------------------ END OF THE LICENSE ------------------} | ||
94 | |||
95 | Web sites: | ||
96 | |||
97 | @code{http://sources.redhat.com/bzip2} | ||
98 | |||
99 | @code{http://www.cacheprof.org} | ||
100 | |||
101 | PATENTS: To the best of my knowledge, @code{bzip2} does not use any patented | ||
102 | algorithms. However, I do not have the resources available to carry out | ||
103 | a full patent search. Therefore I cannot give any guarantee of the | ||
104 | above statement. | ||
105 | |||
106 | |||
107 | |||
108 | |||
109 | |||
110 | |||
111 | |||
112 | @chapter Introduction | ||
113 | |||
114 | @code{bzip2} compresses files using the Burrows-Wheeler | ||
115 | block-sorting text compression algorithm, and Huffman coding. | ||
116 | Compression is generally considerably better than that | ||
117 | achieved by more conventional LZ77/LZ78-based compressors, | ||
118 | and approaches the performance of the PPM family of statistical compressors. | ||
119 | |||
120 | @code{bzip2} is built on top of @code{libbzip2}, a flexible library | ||
121 | for handling compressed data in the @code{bzip2} format. This manual | ||
122 | describes both how to use the program and | ||
123 | how to work with the library interface. Most of the | ||
124 | manual is devoted to this library, not the program, | ||
125 | which is good news if your interest is only in the program. | ||
126 | |||
127 | Chapter 2 describes how to use @code{bzip2}; this is the only part | ||
128 | you need to read if you just want to know how to operate the program. | ||
129 | Chapter 3 describes the programming interfaces in detail, and | ||
130 | Chapter 4 records some miscellaneous notes which I thought | ||
131 | ought to be recorded somewhere. | ||
132 | |||
133 | |||
134 | @chapter How to use @code{bzip2} | ||
135 | |||
136 | This chapter contains a copy of the @code{bzip2} man page, | ||
137 | and nothing else. | ||
138 | |||
139 | @quotation | ||
140 | |||
141 | @unnumberedsubsubsec NAME | ||
142 | @itemize | ||
143 | @item @code{bzip2}, @code{bunzip2} | ||
144 | - a block-sorting file compressor, v1.0.2 | ||
145 | @item @code{bzcat} | ||
146 | - decompresses files to stdout | ||
147 | @item @code{bzip2recover} | ||
148 | - recovers data from damaged bzip2 files | ||
149 | @end itemize | ||
150 | |||
151 | @unnumberedsubsubsec SYNOPSIS | ||
152 | @itemize | ||
153 | @item @code{bzip2} [ -cdfkqstvzVL123456789 ] [ filenames ... ] | ||
154 | @item @code{bunzip2} [ -fkvsVL ] [ filenames ... ] | ||
155 | @item @code{bzcat} [ -s ] [ filenames ... ] | ||
156 | @item @code{bzip2recover} filename | ||
157 | @end itemize | ||
158 | |||
159 | @unnumberedsubsubsec DESCRIPTION | ||
160 | |||
161 | @code{bzip2} compresses files using the Burrows-Wheeler block sorting | ||
162 | text compression algorithm, and Huffman coding. Compression is | ||
163 | generally considerably better than that achieved by more conventional | ||
164 | LZ77/LZ78-based compressors, and approaches the performance of the PPM | ||
165 | family of statistical compressors. | ||
166 | |||
167 | The command-line options are deliberately very similar to those of GNU | ||
168 | @code{gzip}, but they are not identical. | ||
169 | |||
170 | @code{bzip2} expects a list of file names to accompany the command-line | ||
171 | flags. Each file is replaced by a compressed version of itself, with | ||
172 | the name @code{original_name.bz2}. Each compressed file has the same | ||
173 | modification date, permissions, and, when possible, ownership as the | ||
174 | corresponding original, so that these properties can be correctly | ||
175 | restored at decompression time. File name handling is naive in the | ||
176 | sense that there is no mechanism for preserving original file names, | ||
177 | permissions, ownerships or dates in filesystems which lack these | ||
178 | concepts, or have serious file name length restrictions, such as MS-DOS. | ||
179 | |||
180 | @code{bzip2} and @code{bunzip2} will by default not overwrite existing | ||
181 | files. If you want this to happen, specify the @code{-f} flag. | ||
182 | |||
183 | If no file names are specified, @code{bzip2} compresses from standard | ||
184 | input to standard output. In this case, @code{bzip2} will decline to | ||
185 | write compressed output to a terminal, as this would be entirely | ||
186 | incomprehensible and therefore pointless. | ||
187 | |||
188 | @code{bunzip2} (or @code{bzip2 -d}) decompresses all | ||
189 | specified files. Files which were not created by @code{bzip2} | ||
190 | will be detected and ignored, and a warning issued. | ||
191 | @code{bzip2} attempts to guess the filename for the decompressed file | ||
192 | from that of the compressed file as follows: | ||
193 | @itemize | ||
194 | @item @code{filename.bz2 } becomes @code{filename} | ||
195 | @item @code{filename.bz } becomes @code{filename} | ||
196 | @item @code{filename.tbz2} becomes @code{filename.tar} | ||
197 | @item @code{filename.tbz } becomes @code{filename.tar} | ||
198 | @item @code{anyothername } becomes @code{anyothername.out} | ||
199 | @end itemize | ||
200 | If the file does not end in one of the recognised endings, | ||
201 | @code{.bz2}, @code{.bz}, | ||
202 | @code{.tbz2} or @code{.tbz}, @code{bzip2} complains that it cannot | ||
203 | guess the name of the original file, and uses the original name | ||
204 | with @code{.out} appended. | ||
205 | |||
206 | As with compression, supplying no | ||
207 | filenames causes decompression from standard input to standard output. | ||
208 | |||
209 | @code{bunzip2} will correctly decompress a file which is the | ||
210 | concatenation of two or more compressed files. The result is the | ||
211 | concatenation of the corresponding uncompressed files. Integrity | ||
212 | testing (@code{-t}) of concatenated compressed files is also supported. | ||
213 | |||
214 | You can also compress or decompress files to the standard output by | ||
215 | giving the @code{-c} flag. Multiple files may be compressed and | ||
216 | decompressed like this. The resulting outputs are fed sequentially to | ||
217 | stdout. Compression of multiple files in this manner generates a stream | ||
218 | containing multiple compressed file representations. Such a stream | ||
219 | can be decompressed correctly only by @code{bzip2} version 0.9.0 or | ||
220 | later. Earlier versions of @code{bzip2} will stop after decompressing | ||
221 | the first file in the stream. | ||
222 | |||
223 | @code{bzcat} (or @code{bzip2 -dc}) decompresses all specified files to | ||
224 | the standard output. | ||
225 | |||
226 | @code{bzip2} will read arguments from the environment variables | ||
227 | @code{BZIP2} and @code{BZIP}, in that order, and will process them | ||
228 | before any arguments read from the command line. This gives a | ||
229 | convenient way to supply default arguments. | ||
230 | |||
231 | Compression is always performed, even if the compressed file is slightly | ||
232 | larger than the original. Files of less than about one hundred bytes | ||
233 | tend to get larger, since the compression mechanism has a constant | ||
234 | overhead in the region of 50 bytes. Random data (including the output | ||
235 | of most file compressors) is coded at about 8.05 bits per byte, giving | ||
236 | an expansion of around 0.5%. | ||
237 | |||
238 | As a self-check for your protection, @code{bzip2} uses 32-bit CRCs to | ||
239 | make sure that the decompressed version of a file is identical to the | ||
240 | original. This guards against corruption of the compressed data, and | ||
241 | against undetected bugs in @code{bzip2} (hopefully very unlikely). The | ||
242 | chances of data corruption going undetected are microscopic, about one | ||
243 | chance in four billion for each file processed. Be aware, though, that | ||
244 | the check occurs upon decompression, so it can only tell you that | ||
245 | something is wrong. It can't help you recover the original uncompressed | ||
246 | data. You can use @code{bzip2recover} to try to recover data from | ||
247 | damaged files. | ||
248 | |||
249 | Return values: 0 for a normal exit, 1 for environmental problems (file | ||
250 | not found, invalid flags, I/O errors, &c), 2 to indicate a corrupt | ||
251 | compressed file, 3 for an internal consistency error (eg, bug) which | ||
252 | caused @code{bzip2} to panic. | ||
253 | |||
254 | |||
255 | @unnumberedsubsubsec OPTIONS | ||
256 | @table @code | ||
257 | @item -c --stdout | ||
258 | Compress or decompress to standard output. | ||
259 | @item -d --decompress | ||
260 | Force decompression. @code{bzip2}, @code{bunzip2} and @code{bzcat} are | ||
261 | really the same program, and the decision about what actions to take is | ||
262 | done on the basis of which name is used. This flag overrides that | ||
263 | mechanism, and forces bzip2 to decompress. | ||
264 | @item -z --compress | ||
265 | The complement to @code{-d}: forces compression, regardless of the | ||
266 | invocation name. | ||
267 | @item -t --test | ||
268 | Check integrity of the specified file(s), but don't decompress them. | ||
269 | This really performs a trial decompression and throws away the result. | ||
270 | @item -f --force | ||
271 | Force overwrite of output files. Normally, @code{bzip2} will not overwrite | ||
272 | existing output files. Also forces @code{bzip2} to break hard links | ||
273 | to files, which it otherwise wouldn't do. | ||
274 | |||
275 | @code{bzip2} normally declines to decompress files which don't have the | ||
276 | correct magic header bytes. If forced (@code{-f}), however, it will | ||
277 | pass such files through unmodified. This is how GNU @code{gzip} | ||
278 | behaves. | ||
279 | @item -k --keep | ||
280 | Keep (don't delete) input files during compression | ||
281 | or decompression. | ||
282 | @item -s --small | ||
283 | Reduce memory usage, for compression, decompression and testing. Files | ||
284 | are decompressed and tested using a modified algorithm which only | ||
285 | requires 2.5 bytes per block byte. This means any file can be | ||
286 | decompressed in 2300k of memory, albeit at about half the normal speed. | ||
287 | |||
288 | During compression, @code{-s} selects a block size of 200k, which limits | ||
289 | memory use to around the same figure, at the expense of your compression | ||
290 | ratio. In short, if your machine is low on memory (8 megabytes or | ||
291 | less), use -s for everything. See MEMORY MANAGEMENT below. | ||
292 | @item -q --quiet | ||
293 | Suppress non-essential warning messages. Messages pertaining to | ||
294 | I/O errors and other critical events will not be suppressed. | ||
295 | @item -v --verbose | ||
296 | Verbose mode -- show the compression ratio for each file processed. | ||
297 | Further @code{-v}'s increase the verbosity level, spewing out lots of | ||
298 | information which is primarily of interest for diagnostic purposes. | ||
299 | @item -L --license -V --version | ||
300 | Display the software version, license terms and conditions. | ||
301 | @item -1 (or --fast) to -9 (or --best) | ||
302 | Set the block size to 100 k, 200 k .. 900 k when compressing. Has no | ||
303 | effect when decompressing. See MEMORY MANAGEMENT below. | ||
304 | The @code{--fast} and @code{--best} aliases are primarily for GNU | ||
305 | @code{gzip} compatibility. In particular, @code{--fast} doesn't make | ||
306 | things significantly faster. And @code{--best} merely selects the | ||
307 | default behaviour. | ||
308 | @item -- | ||
309 | Treats all subsequent arguments as file names, even if they start | ||
310 | with a dash. This is so you can handle files with names beginning | ||
311 | with a dash, for example: @code{bzip2 -- -myfilename}. | ||
312 | @item --repetitive-fast | ||
313 | @item --repetitive-best | ||
314 | These flags are redundant in versions 0.9.5 and above. They provided | ||
315 | some coarse control over the behaviour of the sorting algorithm in | ||
316 | earlier versions, which was sometimes useful. 0.9.5 and above have an | ||
317 | improved algorithm which renders these flags irrelevant. | ||
318 | @end table | ||
319 | |||
320 | |||
321 | @unnumberedsubsubsec MEMORY MANAGEMENT | ||
322 | |||
323 | @code{bzip2} compresses large files in blocks. The block size affects | ||
324 | both the compression ratio achieved, and the amount of memory needed for | ||
325 | compression and decompression. The flags @code{-1} through @code{-9} | ||
326 | specify the block size to be 100,000 bytes through 900,000 bytes (the | ||
327 | default) respectively. At decompression time, the block size used for | ||
328 | compression is read from the header of the compressed file, and | ||
329 | @code{bunzip2} then allocates itself just enough memory to decompress | ||
330 | the file. Since block sizes are stored in compressed files, it follows | ||
331 | that the flags @code{-1} to @code{-9} are irrelevant to and so ignored | ||
332 | during decompression. | ||
333 | |||
334 | Compression and decompression requirements, in bytes, can be estimated | ||
335 | as: | ||
336 | @example | ||
337 | Compression: 400k + ( 8 x block size ) | ||
338 | |||
339 | Decompression: 100k + ( 4 x block size ), or | ||
340 | 100k + ( 2.5 x block size ) | ||
341 | @end example | ||
342 | Larger block sizes give rapidly diminishing marginal returns. Most of | ||
343 | the compression comes from the first two or three hundred k of block | ||
344 | size, a fact worth bearing in mind when using @code{bzip2} on small machines. | ||
345 | It is also important to appreciate that the decompression memory | ||
346 | requirement is set at compression time by the choice of block size. | ||
347 | |||
348 | For files compressed with the default 900k block size, @code{bunzip2} | ||
349 | will require about 3700 kbytes to decompress. To support decompression | ||
350 | of any file on a 4 megabyte machine, @code{bunzip2} has an option to | ||
351 | decompress using approximately half this amount of memory, about 2300 | ||
352 | kbytes. Decompression speed is also halved, so you should use this | ||
353 | option only where necessary. The relevant flag is @code{-s}. | ||
354 | |||
355 | In general, try and use the largest block size memory constraints allow, | ||
356 | since that maximises the compression achieved. Compression and | ||
357 | decompression speed are virtually unaffected by block size. | ||
358 | |||
359 | Another significant point applies to files which fit in a single block | ||
360 | -- that means most files you'd encounter using a large block size. The | ||
361 | amount of real memory touched is proportional to the size of the file, | ||
362 | since the file is smaller than a block. For example, compressing a file | ||
363 | 20,000 bytes long with the flag @code{-9} will cause the compressor to | ||
364 | allocate around 7600k of memory, but only touch 400k + 20000 * 8 = 560 | ||
365 | kbytes of it. Similarly, the decompressor will allocate 3700k but only | ||
366 | touch 100k + 20000 * 4 = 180 kbytes. | ||
367 | |||
368 | Here is a table which summarises the maximum memory usage for different | ||
369 | block sizes. Also recorded is the total compressed size for 14 files of | ||
370 | the Calgary Text Compression Corpus totalling 3,141,622 bytes. This | ||
371 | column gives some feel for how compression varies with block size. | ||
372 | These figures tend to understate the advantage of larger block sizes for | ||
373 | larger files, since the Corpus is dominated by smaller files. | ||
374 | @example | ||
375 | Compress Decompress Decompress Corpus | ||
376 | Flag usage usage -s usage Size | ||
377 | |||
378 | -1 1200k 500k 350k 914704 | ||
379 | -2 2000k 900k 600k 877703 | ||
380 | -3 2800k 1300k 850k 860338 | ||
381 | -4 3600k 1700k 1100k 846899 | ||
382 | -5 4400k 2100k 1350k 845160 | ||
383 | -6 5200k 2500k 1600k 838626 | ||
384 | -7 6100k 2900k 1850k 834096 | ||
385 | -8 6800k 3300k 2100k 828642 | ||
386 | -9 7600k 3700k 2350k 828642 | ||
387 | @end example | ||
388 | |||
389 | @unnumberedsubsubsec RECOVERING DATA FROM DAMAGED FILES | ||
390 | |||
391 | @code{bzip2} compresses files in blocks, usually 900kbytes long. Each | ||
392 | block is handled independently. If a media or transmission error causes | ||
393 | a multi-block @code{.bz2} file to become damaged, it may be possible to | ||
394 | recover data from the undamaged blocks in the file. | ||
395 | |||
396 | The compressed representation of each block is delimited by a 48-bit | ||
397 | pattern, which makes it possible to find the block boundaries with | ||
398 | reasonable certainty. Each block also carries its own 32-bit CRC, so | ||
399 | damaged blocks can be distinguished from undamaged ones. | ||
400 | |||
401 | @code{bzip2recover} is a simple program whose purpose is to search for | ||
402 | blocks in @code{.bz2} files, and write each block out into its own | ||
403 | @code{.bz2} file. You can then use @code{bzip2 -t} to test the | ||
404 | integrity of the resulting files, and decompress those which are | ||
405 | undamaged. | ||
406 | |||
407 | @code{bzip2recover} | ||
408 | takes a single argument, the name of the damaged file, and writes a | ||
409 | number of files @code{rec00001file.bz2}, @code{rec00002file.bz2}, etc, | ||
410 | containing the extracted blocks. The output filenames are designed so | ||
411 | that the use of wildcards in subsequent processing -- for example, | ||
412 | @code{bzip2 -dc rec*file.bz2 > recovered_data} -- processes the files in | ||
413 | the correct order. | ||
414 | |||
415 | @code{bzip2recover} should be of most use dealing with large @code{.bz2} | ||
416 | files, as these will contain many blocks. It is clearly futile to use | ||
417 | it on damaged single-block files, since a damaged block cannot be | ||
418 | recovered. If you wish to minimise any potential data loss through | ||
419 | media or transmission errors, you might consider compressing with a | ||
420 | smaller block size. | ||
421 | |||
422 | |||
423 | @unnumberedsubsubsec PERFORMANCE NOTES | ||
424 | |||
425 | The sorting phase of compression gathers together similar strings in the | ||
426 | file. Because of this, files containing very long runs of repeated | ||
427 | symbols, like "aabaabaabaab ..." (repeated several hundred times) may | ||
428 | compress more slowly than normal. Versions 0.9.5 and above fare much | ||
429 | better than previous versions in this respect. The ratio between | ||
430 | worst-case and average-case compression time is in the region of 10:1. | ||
431 | For previous versions, this figure was more like 100:1. You can use the | ||
432 | @code{-vvvv} option to monitor progress in great detail, if you want. | ||
433 | |||
434 | Decompression speed is unaffected by these phenomena. | ||
435 | |||
436 | @code{bzip2} usually allocates several megabytes of memory to operate | ||
437 | in, and then charges all over it in a fairly random fashion. This means | ||
438 | that performance, both for compressing and decompressing, is largely | ||
439 | determined by the speed at which your machine can service cache misses. | ||
440 | Because of this, small changes to the code to reduce the miss rate have | ||
441 | been observed to give disproportionately large performance improvements. | ||
442 | I imagine @code{bzip2} will perform best on machines with very large | ||
443 | caches. | ||
444 | |||
445 | |||
446 | @unnumberedsubsubsec CAVEATS | ||
447 | |||
448 | I/O error messages are not as helpful as they could be. @code{bzip2} | ||
449 | tries hard to detect I/O errors and exit cleanly, but the details of | ||
450 | what the problem is sometimes seem rather misleading. | ||
451 | |||
452 | This manual page pertains to version 1.0.2 of @code{bzip2}. Compressed | ||
453 | data created by this version is entirely forwards and backwards | ||
454 | compatible with the previous public releases, versions 0.1pl2, 0.9.0, | ||
455 | 0.9.5, 1.0.0 and 1.0.1, but with the following exception: 0.9.0 and | ||
456 | above can correctly decompress multiple concatenated compressed files. | ||
457 | 0.1pl2 cannot do this; it will stop after decompressing just the first | ||
458 | file in the stream. | ||
459 | |||
460 | @code{bzip2recover} versions prior to this one, 1.0.2, used 32-bit | ||
461 | integers to represent bit positions in compressed files, so it could not | ||
462 | handle compressed files more than 512 megabytes long. Versions 1.0.2 and | ||
463 | above use 64-bit ints on some platforms which support them (GNU | ||
464 | supported targets, and Windows). To establish whether or not | ||
465 | @code{bzip2recover} was built with such a limitation, run it without | ||
466 | arguments. In any event you can build yourself an unlimited version if | ||
467 | you can recompile it with @code{MaybeUInt64} set to be an unsigned | ||
468 | 64-bit integer. | ||
469 | |||
470 | |||
471 | |||
472 | @unnumberedsubsubsec AUTHOR | ||
473 | Julian Seward, @code{jseward@@acm.org}. | ||
474 | |||
475 | @code{http://sources.redhat.com/bzip2} | ||
476 | |||
477 | The ideas embodied in @code{bzip2} are due to (at least) the following | ||
478 | people: Michael Burrows and David Wheeler (for the block sorting | ||
479 | transformation), David Wheeler (again, for the Huffman coder), Peter | ||
480 | Fenwick (for the structured coding model in the original @code{bzip}, | ||
481 | and many refinements), and Alistair Moffat, Radford Neal and Ian Witten | ||
482 | (for the arithmetic coder in the original @code{bzip}). I am much | ||
483 | indebted for their help, support and advice. See the manual in the | ||
484 | source distribution for pointers to sources of documentation. Christian | ||
485 | von Roques encouraged me to look for faster sorting algorithms, so as to | ||
486 | speed up compression. Bela Lubkin encouraged me to improve the | ||
487 | worst-case compression performance. The @code{bz*} scripts are derived | ||
488 | from those of GNU @code{gzip}. Many people sent patches, helped with | ||
489 | portability problems, lent machines, gave advice and were generally | ||
490 | helpful. | ||
491 | |||
492 | @end quotation | ||
493 | |||
494 | |||
495 | |||
496 | |||
497 | @chapter Programming with @code{libbzip2} | ||
498 | |||
499 | This chapter describes the programming interface to @code{libbzip2}. | ||
500 | |||
501 | For general background information, particularly about memory | ||
502 | use and performance aspects, you'd be well advised to read Chapter 2 | ||
503 | as well. | ||
504 | |||
505 | @section Top-level structure | ||
506 | |||
507 | @code{libbzip2} is a flexible library for compressing and decompressing | ||
508 | data in the @code{bzip2} data format. Although packaged as a single | ||
509 | entity, it helps to regard the library as three separate parts: the low | ||
510 | level interface, the high level interface, and some utility | ||
511 | functions. | ||
512 | |||
513 | The structure of @code{libbzip2}'s interfaces is similar to | ||
514 | that of Jean-loup Gailly's and Mark Adler's excellent @code{zlib} | ||
515 | library. | ||
516 | |||
517 | All externally visible symbols have names beginning @code{BZ2_}. | ||
518 | This is new in version 1.0. The intention is to minimise pollution | ||
519 | of the namespaces of library clients. | ||
520 | |||
521 | @subsection Low-level summary | ||
522 | |||
523 | This interface provides services for compressing and decompressing | ||
524 | data in memory. There's no provision for dealing with files, streams | ||
525 | or any other I/O mechanisms, just straight memory-to-memory work. | ||
526 | In fact, this part of the library can be compiled without inclusion | ||
527 | of @code{stdio.h}, which may be helpful for embedded applications. | ||
528 | |||
529 | The low-level part of the library has no global variables and | ||
530 | is therefore thread-safe. | ||
531 | |||
532 | Six routines make up the low level interface: | ||
533 | @code{BZ2_bzCompressInit}, @code{BZ2_bzCompress}, and @* @code{BZ2_bzCompressEnd} | ||
534 | for compression, | ||
535 | and a corresponding trio @code{BZ2_bzDecompressInit}, @* @code{BZ2_bzDecompress} | ||
536 | and @code{BZ2_bzDecompressEnd} for decompression. | ||
537 | The @code{*Init} functions allocate | ||
538 | memory for compression/decompression and do other | ||
539 | initialisations, whilst the @code{*End} functions close down operations | ||
540 | and release memory. | ||
541 | |||
542 | The real work is done by @code{BZ2_bzCompress} and @code{BZ2_bzDecompress}. | ||
543 | These compress and decompress data from a user-supplied input buffer | ||
544 | to a user-supplied output buffer. These buffers can be any size; | ||
545 | arbitrary quantities of data are handled by making repeated calls | ||
546 | to these functions. This is a flexible mechanism allowing a | ||
547 | consumer-pull style of activity, or producer-push, or a mixture of | ||
548 | both. | ||
549 | |||
550 | |||
551 | |||
552 | @subsection High-level summary | ||
553 | |||
554 | This interface provides some handy wrappers around the low-level | ||
555 | interface to facilitate reading and writing @code{bzip2} format | ||
556 | files (@code{.bz2} files). The routines provide hooks to facilitate | ||
557 | reading files in which the @code{bzip2} data stream is embedded | ||
558 | within some larger-scale file structure, or where there are | ||
559 | multiple @code{bzip2} data streams concatenated end-to-end. | ||
560 | |||
561 | For reading files, @code{BZ2_bzReadOpen}, @code{BZ2_bzRead}, | ||
562 | @code{BZ2_bzReadClose} and @* @code{BZ2_bzReadGetUnused} are supplied. For | ||
563 | writing files, @code{BZ2_bzWriteOpen}, @code{BZ2_bzWrite} and | ||
564 | @code{BZ2_bzWriteClose} are available. | ||
565 | |||
566 | As with the low-level library, no global variables are used | ||
567 | so the library is per se thread-safe. However, if I/O errors | ||
568 | occur whilst reading or writing the underlying compressed files, | ||
569 | you may have to consult @code{errno} to determine the cause of | ||
570 | the error. In that case, you'd need a C library which correctly | ||
571 | supports @code{errno} in a multithreaded environment. | ||
572 | |||
573 | To make the library a little simpler and more portable, | ||
574 | @code{BZ2_bzReadOpen} and @code{BZ2_bzWriteOpen} require you to pass them file | ||
575 | handles (@code{FILE*}s) which have previously been opened for reading or | ||
576 | writing respectively. That avoids portability problems associated with | ||
577 | file operations and file attributes, whilst not being much of an | ||
578 | imposition on the programmer. | ||
579 | |||
580 | |||
581 | |||
582 | @subsection Utility functions summary | ||
583 | For very simple needs, @code{BZ2_bzBuffToBuffCompress} and | ||
584 | @code{BZ2_bzBuffToBuffDecompress} are provided. These compress | ||
585 | data in memory from one buffer to another buffer in a single | ||
586 | function call. You should assess whether these functions | ||
587 | fulfill your memory-to-memory compression/decompression | ||
588 | requirements before investing effort in understanding the more | ||
589 | general but more complex low-level interface. | ||
590 | |||
591 | Yoshioka Tsuneo (@code{QWF00133@@niftyserve.or.jp} / | ||
592 | @code{tsuneo-y@@is.aist-nara.ac.jp}) has contributed some functions to | ||
593 | give better @code{zlib} compatibility. These functions are | ||
594 | @code{BZ2_bzopen}, @code{BZ2_bzread}, @code{BZ2_bzwrite}, @code{BZ2_bzflush}, | ||
595 | @code{BZ2_bzclose}, | ||
596 | @code{BZ2_bzerror} and @code{BZ2_bzlibVersion}. You may find these functions | ||
597 | more convenient for simple file reading and writing, than those in the | ||
598 | high-level interface. These functions are not (yet) officially part of | ||
599 | the library, and are minimally documented here. If they break, you | ||
600 | get to keep all the pieces. I hope to document them properly when time | ||
601 | permits. | ||
602 | |||
603 | Yoshioka also contributed modifications to allow the library to be | ||
604 | built as a Windows DLL. | ||
605 | |||
606 | |||
607 | @section Error handling | ||
608 | |||
609 | The library is designed to recover cleanly in all situations, including | ||
610 | the worst-case situation of decompressing random data. I'm not | ||
611 | 100% sure that it can always do this, so you might want to add | ||
612 | a signal handler to catch segmentation violations during decompression | ||
613 | if you are feeling especially paranoid. I would be interested in | ||
614 | hearing more about the robustness of the library to corrupted | ||
615 | compressed data. | ||
616 | |||
617 | Version 1.0 is much more robust in this respect than | ||
618 | 0.9.0 or 0.9.5. Investigations with Checker (a tool for | ||
619 | detecting problems with memory management, similar to Purify) | ||
620 | indicate that, at least for the few files I tested, all single-bit | ||
621 | errors in the compressed data are caught properly, with no | ||
622 | segmentation faults, no reads of uninitialised data and no | ||
623 | out of range reads or writes. So it's certainly much improved, | ||
624 | although I wouldn't claim it to be totally bombproof. | ||
625 | |||
626 | The file @code{bzlib.h} contains all definitions needed to use | ||
627 | the library. In particular, you should definitely not include | ||
628 | @code{bzlib_private.h}. | ||
629 | |||
630 | In @code{bzlib.h}, the various return values are defined. The following | ||
631 | list is not intended as an exhaustive description of the circumstances | ||
632 | in which a given value may be returned -- those descriptions are given | ||
633 | later. Rather, it is intended to convey the rough meaning of each | ||
634 | return value. The first five values are normal and not intended to | ||
635 | denote an error situation. | ||
636 | @table @code | ||
637 | @item BZ_OK | ||
638 | The requested action was completed successfully. | ||
639 | @item BZ_RUN_OK | ||
640 | @itemx BZ_FLUSH_OK | ||
641 | @itemx BZ_FINISH_OK | ||
642 | In @code{BZ2_bzCompress}, the requested flush/finish/nothing-special action | ||
643 | was completed successfully. | ||
644 | @item BZ_STREAM_END | ||
645 | Compression of data was completed, or the logical stream end was | ||
646 | detected during decompression. | ||
647 | @end table | ||
648 | |||
649 | The following return values indicate an error of some kind. | ||
650 | @table @code | ||
651 | @item BZ_CONFIG_ERROR | ||
652 | Indicates that the library has been improperly compiled on your | ||
653 | platform -- a major configuration error. Specifically, it means | ||
654 | that @code{sizeof(char)}, @code{sizeof(short)} and @code{sizeof(int)} | ||
655 | are not 1, 2 and 4 respectively, as they should be. Note that the | ||
656 | library should still work properly on 64-bit platforms which follow | ||
657 | the LP64 programming model -- that is, where @code{sizeof(long)} | ||
658 | and @code{sizeof(void*)} are 8. Under LP64, @code{sizeof(int)} is | ||
659 | still 4, so @code{libbzip2}, which doesn't use the @code{long} type, | ||
660 | is OK. | ||
661 | @item BZ_SEQUENCE_ERROR | ||
662 | When using the library, it is important to call the functions in the | ||
663 | correct sequence and with data structures (buffers etc) in the correct | ||
664 | states. @code{libbzip2} checks as much as it can to ensure this is | ||
665 | happening, and returns @code{BZ_SEQUENCE_ERROR} if not. Code which | ||
666 | complies precisely with the function semantics, as detailed below, | ||
667 | should never receive this value; such an event denotes buggy code | ||
668 | which you should investigate. | ||
669 | @item BZ_PARAM_ERROR | ||
670 | Returned when a parameter to a function call is out of range | ||
671 | or otherwise manifestly incorrect. As with @code{BZ_SEQUENCE_ERROR}, | ||
672 | this denotes a bug in the client code. The distinction between | ||
673 | @code{BZ_PARAM_ERROR} and @code{BZ_SEQUENCE_ERROR} is a bit hazy, but still worth | ||
674 | making. | ||
675 | @item BZ_MEM_ERROR | ||
676 | Returned when a request to allocate memory failed. Note that the | ||
677 | quantity of memory needed to decompress a stream cannot be determined | ||
678 | until the stream's header has been read. So @code{BZ2_bzDecompress} and | ||
679 | @code{BZ2_bzRead} may return @code{BZ_MEM_ERROR} even though some of | ||
680 | the compressed data has been read. The same is not true for | ||
681 | compression; once @code{BZ2_bzCompressInit} or @code{BZ2_bzWriteOpen} have | ||
682 | successfully completed, @code{BZ_MEM_ERROR} cannot occur. | ||
683 | @item BZ_DATA_ERROR | ||
684 | Returned when a data integrity error is detected during decompression. | ||
685 | Most importantly, this means when stored and computed CRCs for the | ||
686 | data do not match. This value is also returned upon detection of any | ||
687 | other anomaly in the compressed data. | ||
688 | @item BZ_DATA_ERROR_MAGIC | ||
689 | As a special case of @code{BZ_DATA_ERROR}, it is sometimes useful to | ||
690 | know when the compressed stream does not start with the correct | ||
691 | magic bytes (@code{'B' 'Z' 'h'}). | ||
692 | @item BZ_IO_ERROR | ||
693 | Returned by @code{BZ2_bzRead} and @code{BZ2_bzWrite} when there is an error | ||
694 | reading or writing in the compressed file, and by @code{BZ2_bzReadOpen} | ||
695 | and @code{BZ2_bzWriteOpen} for attempts to use a file for which the | ||
696 | error indicator (viz, @code{ferror(f)}) is set. | ||
697 | On receipt of @code{BZ_IO_ERROR}, the caller should consult | ||
698 | @code{errno} and/or @code{perror} to acquire operating-system | ||
699 | specific information about the problem. | ||
700 | @item BZ_UNEXPECTED_EOF | ||
701 | Returned by @code{BZ2_bzRead} when the compressed file finishes | ||
702 | before the logical end of stream is detected. | ||
703 | @item BZ_OUTBUFF_FULL | ||
704 | Returned by @code{BZ2_bzBuffToBuffCompress} and | ||
705 | @code{BZ2_bzBuffToBuffDecompress} to indicate that the output data | ||
706 | will not fit into the output buffer provided. | ||
707 | @end table | ||
708 | |||
709 | |||
710 | |||
711 | @section Low-level interface | ||
712 | |||
713 | @subsection @code{BZ2_bzCompressInit} | ||
714 | @example | ||
715 | typedef | ||
716 | struct @{ | ||
717 | char *next_in; | ||
718 | unsigned int avail_in; | ||
719 | unsigned int total_in_lo32; | ||
720 | unsigned int total_in_hi32; | ||
721 | |||
722 | char *next_out; | ||
723 | unsigned int avail_out; | ||
724 | unsigned int total_out_lo32; | ||
725 | unsigned int total_out_hi32; | ||
726 | |||
727 | void *state; | ||
728 | |||
729 | void *(*bzalloc)(void *,int,int); | ||
730 | void (*bzfree)(void *,void *); | ||
731 | void *opaque; | ||
732 | @} | ||
733 | bz_stream; | ||
734 | |||
735 | int BZ2_bzCompressInit ( bz_stream *strm, | ||
736 | int blockSize100k, | ||
737 | int verbosity, | ||
738 | int workFactor ); | ||
739 | |||
740 | @end example | ||
741 | |||
742 | Prepares for compression. The @code{bz_stream} structure | ||
743 | holds all data pertaining to the compression activity. | ||
744 | A @code{bz_stream} structure should be allocated and initialised | ||
745 | prior to the call. | ||
746 | The fields of @code{bz_stream} | ||
747 | comprise the entirety of the user-visible data. @code{state} | ||
748 | is a pointer to the private data structures required for compression. | ||
749 | |||
750 | Custom memory allocators are supported, via fields @code{bzalloc}, | ||
751 | @code{bzfree}, | ||
752 | and @code{opaque}. The value | ||
753 | @code{opaque} is passed as the first argument to | ||
754 | all calls to @code{bzalloc} and @code{bzfree}, but is | ||
755 | otherwise ignored by the library. | ||
756 | The call @code{bzalloc ( opaque, n, m )} is expected to return a | ||
757 | pointer @code{p} to | ||
758 | @code{n * m} bytes of memory, and @code{bzfree ( opaque, p )} | ||
759 | should free | ||
760 | that memory. | ||
761 | |||
762 | If you don't want to use a custom memory allocator, set @code{bzalloc}, | ||
763 | @code{bzfree} and | ||
764 | @code{opaque} to @code{NULL}, | ||
765 | and the library will then use the standard @code{malloc}/@code{free} | ||
766 | routines. | ||
767 | |||
768 | Before calling @code{BZ2_bzCompressInit}, fields @code{bzalloc}, | ||
769 | @code{bzfree} and @code{opaque} should | ||
770 | be filled appropriately, as just described. Upon return, the internal | ||
771 | state will have been allocated and initialised, and @code{total_in_lo32}, | ||
772 | @code{total_in_hi32}, @code{total_out_lo32} and | ||
773 | @code{total_out_hi32} will have been set to zero. | ||
774 | These four fields are used by the library | ||
775 | to inform the caller of the total amount of data passed into and out of | ||
776 | the library, respectively. You should not try to change them. | ||
777 | As of version 1.0, 64-bit counts are maintained, even on 32-bit | ||
778 | platforms, using the @code{_hi32} fields to store the upper 32 bits | ||
779 | of the count. So, for example, the total amount of data in | ||
780 | is @code{(total_in_hi32 << 32) + total_in_lo32}. | ||
781 | |||
782 | Parameter @code{blockSize100k} specifies the block size to be used for | ||
783 | compression. It should be a value between 1 and 9 inclusive, and the | ||
784 | actual block size used is 100000 x this figure. 9 gives the best | ||
785 | compression but takes most memory. | ||
786 | |||
787 | Parameter @code{verbosity} should be set to a number between 0 and 4 | ||
788 | inclusive. 0 is silent, and greater numbers give increasingly verbose | ||
789 | monitoring/debugging output. If the library has been compiled with | ||
790 | @code{-DBZ_NO_STDIO}, no such output will appear for any verbosity | ||
791 | setting. | ||
792 | |||
793 | Parameter @code{workFactor} controls how the compression phase behaves | ||
794 | when presented with worst case, highly repetitive, input data. If | ||
795 | compression runs into difficulties caused by repetitive data, the | ||
796 | library switches from the standard sorting algorithm to a fallback | ||
797 | algorithm. The fallback is slower than the standard algorithm by | ||
798 | perhaps a factor of three, but always behaves reasonably, no matter how | ||
799 | bad the input. | ||
800 | |||
801 | Lower values of @code{workFactor} reduce the amount of effort the | ||
802 | standard algorithm will expend before resorting to the fallback. You | ||
803 | should set this parameter carefully; too low, and many inputs will be | ||
804 | handled by the fallback algorithm and so compress rather slowly, too | ||
805 | high, and your average-to-worst case compression times can become very | ||
806 | large. The default value of 30 gives reasonable behaviour over a wide | ||
807 | range of circumstances. | ||
808 | |||
809 | Allowable values range from 0 to 250 inclusive. 0 is a special case, | ||
810 | equivalent to using the default value of 30. | ||
811 | |||
812 | Note that the compressed output generated is the same regardless of | ||
813 | whether or not the fallback algorithm is used. | ||
814 | |||
815 | Be aware also that this parameter may disappear entirely in future | ||
816 | versions of the library. In principle it should be possible to devise a | ||
817 | good way to automatically choose which algorithm to use. Such a | ||
818 | mechanism would render the parameter obsolete. | ||
819 | |||
820 | Possible return values: | ||
821 | @display | ||
822 | @code{BZ_CONFIG_ERROR} | ||
823 | if the library has been mis-compiled | ||
824 | @code{BZ_PARAM_ERROR} | ||
825 | if @code{strm} is @code{NULL} | ||
826 | or @code{blockSize100k} < 1 or @code{blockSize100k} > 9 | ||
827 | or @code{verbosity} < 0 or @code{verbosity} > 4 | ||
828 | or @code{workFactor} < 0 or @code{workFactor} > 250 | ||
829 | @code{BZ_MEM_ERROR} | ||
830 | if not enough memory is available | ||
831 | @code{BZ_OK} | ||
832 | otherwise | ||
833 | @end display | ||
834 | Allowable next actions: | ||
835 | @display | ||
836 | @code{BZ2_bzCompress} | ||
837 | if @code{BZ_OK} is returned | ||
838 | no specific action needed in case of error | ||
839 | @end display | ||
840 | |||
841 | @subsection @code{BZ2_bzCompress} | ||
842 | @example | ||
843 | int BZ2_bzCompress ( bz_stream *strm, int action ); | ||
844 | @end example | ||
845 | Provides more input and/or output buffer space for the library. The | ||
846 | caller maintains input and output buffers, and calls @code{BZ2_bzCompress} to | ||
847 | transfer data between them. | ||
848 | |||
849 | Before each call to @code{BZ2_bzCompress}, @code{next_in} should point at | ||
850 | the data to be compressed, and @code{avail_in} should indicate how many | ||
851 | bytes the library may read. @code{BZ2_bzCompress} updates @code{next_in}, | ||
852 | @code{avail_in} and @code{total_in} to reflect the number of bytes it | ||
853 | has read. | ||
854 | |||
855 | Similarly, @code{next_out} should point to a buffer in which the | ||
856 | compressed data is to be placed, with @code{avail_out} indicating how | ||
857 | much output space is available. @code{BZ2_bzCompress} updates | ||
858 | @code{next_out}, @code{avail_out} and @code{total_out} to reflect the | ||
859 | number of bytes output. | ||
860 | |||
861 | You may provide and remove as little or as much data as you like on each | ||
862 | call of @code{BZ2_bzCompress}. In the limit, it is acceptable to supply and | ||
863 | remove data one byte at a time, although this would be terribly | ||
864 | inefficient. You should always ensure that at least one byte of output | ||
865 | space is available at each call. | ||
866 | |||
867 | A second purpose of @code{BZ2_bzCompress} is to request a change of mode of the | ||
868 | compressed stream. | ||
869 | |||
870 | Conceptually, a compressed stream can be in one of four states: IDLE, | ||
871 | RUNNING, FLUSHING and FINISHING. Before initialisation | ||
872 | (@code{BZ2_bzCompressInit}) and after termination (@code{BZ2_bzCompressEnd}), a | ||
873 | stream is regarded as IDLE. | ||
874 | |||
875 | Upon initialisation (@code{BZ2_bzCompressInit}), the stream is placed in the | ||
876 | RUNNING state. Subsequent calls to @code{BZ2_bzCompress} should pass | ||
877 | @code{BZ_RUN} as the requested action; other actions are illegal and | ||
878 | will result in @code{BZ_SEQUENCE_ERROR}. | ||
879 | |||
880 | At some point, the calling program will have provided all the input data | ||
881 | it wants to. It will then want to finish up -- in effect, asking the | ||
882 | library to process any data it might have buffered internally. In this | ||
883 | state, @code{BZ2_bzCompress} will no longer attempt to read data from | ||
884 | @code{next_in}, but it will want to write data to @code{next_out}. | ||
885 | Because the output buffer supplied by the user can be arbitrarily small, | ||
886 | the finishing-up operation cannot necessarily be done with a single call | ||
887 | of @code{BZ2_bzCompress}. | ||
888 | |||
889 | Instead, the calling program passes @code{BZ_FINISH} as an action to | ||
890 | @code{BZ2_bzCompress}. This changes the stream's state to FINISHING. Any | ||
891 | remaining input (ie, @code{next_in[0 .. avail_in-1]}) is compressed and | ||
892 | transferred to the output buffer. To do this, @code{BZ2_bzCompress} must be | ||
893 | called repeatedly until all the output has been consumed. At that | ||
894 | point, @code{BZ2_bzCompress} returns @code{BZ_STREAM_END}, and the stream's | ||
895 | state is set back to IDLE. @code{BZ2_bzCompressEnd} should then be | ||
896 | called. | ||
897 | |||
898 | Just to make sure the calling program does not cheat, the library makes | ||
899 | a note of @code{avail_in} at the time of the first call to | ||
900 | @code{BZ2_bzCompress} which has @code{BZ_FINISH} as an action (ie, at the | ||
901 | time the program has announced its intention to not supply any more | ||
902 | input). By comparing this value with that of @code{avail_in} over | ||
903 | subsequent calls to @code{BZ2_bzCompress}, the library can detect any | ||
904 | attempts to slip in more data to compress. Any calls for which this is | ||
905 | detected will return @code{BZ_SEQUENCE_ERROR}. This indicates a | ||
906 | programming mistake which should be corrected. | ||
907 | |||
908 | Instead of asking to finish, the calling program may ask | ||
909 | @code{BZ2_bzCompress} to take all the remaining input, compress it and | ||
910 | terminate the current (Burrows-Wheeler) compression block. This could | ||
911 | be useful for error control purposes. The mechanism is analogous to | ||
912 | that for finishing: call @code{BZ2_bzCompress} with an action of | ||
913 | @code{BZ_FLUSH}, remove output data, and persist with the | ||
914 | @code{BZ_FLUSH} action until the value @code{BZ_RUN_OK} is returned. As | ||
915 | with finishing, @code{BZ2_bzCompress} detects any attempt to provide more | ||
916 | input data once the flush has begun. | ||
917 | |||
918 | Once the flush is complete, the stream returns to the normal RUNNING | ||
919 | state. | ||
920 | |||
921 | This all sounds pretty complex, but isn't really. Here's a table | ||
922 | which shows which actions are allowable in each state, what action | ||
923 | will be taken, what the next state is, and what the non-error return | ||
924 | values are. Note that you can't explicitly ask what state the | ||
925 | stream is in, but nor do you need to -- it can be inferred from the | ||
926 | values returned by @code{BZ2_bzCompress}. | ||
927 | @display | ||
928 | IDLE/@code{any} | ||
929 | Illegal. IDLE state only exists after @code{BZ2_bzCompressEnd} or | ||
930 | before @code{BZ2_bzCompressInit}. | ||
931 | Return value = @code{BZ_SEQUENCE_ERROR} | ||
932 | |||
933 | RUNNING/@code{BZ_RUN} | ||
934 | Compress from @code{next_in} to @code{next_out} as much as possible. | ||
935 | Next state = RUNNING | ||
936 | Return value = @code{BZ_RUN_OK} | ||
937 | |||
938 | RUNNING/@code{BZ_FLUSH} | ||
939 | Remember current value of @code{avail_in}. Compress from @code{next_in} | ||
940 | to @code{next_out} as much as possible, but do not accept any more input. | ||
941 | Next state = FLUSHING | ||
942 | Return value = @code{BZ_FLUSH_OK} | ||
943 | |||
944 | RUNNING/@code{BZ_FINISH} | ||
945 | Remember current value of @code{avail_in}. Compress from @code{next_in} | ||
946 | to @code{next_out} as much as possible, but do not accept any more input. | ||
947 | Next state = FINISHING | ||
948 | Return value = @code{BZ_FINISH_OK} | ||
949 | |||
950 | FLUSHING/@code{BZ_FLUSH} | ||
951 | Compress from @code{next_in} to @code{next_out} as much as possible, | ||
952 | but do not accept any more input. | ||
953 | If all the existing input has been used up and all compressed | ||
954 | output has been removed | ||
955 | Next state = RUNNING; Return value = @code{BZ_RUN_OK} | ||
956 | else | ||
957 | Next state = FLUSHING; Return value = @code{BZ_FLUSH_OK} | ||
958 | |||
959 | FLUSHING/other | ||
960 | Illegal. | ||
961 | Return value = @code{BZ_SEQUENCE_ERROR} | ||
962 | |||
963 | FINISHING/@code{BZ_FINISH} | ||
964 | Compress from @code{next_in} to @code{next_out} as much as possible, | ||
965 | but do not accept any more input. | ||
966 | If all the existing input has been used up and all compressed | ||
967 | output has been removed | ||
968 | Next state = IDLE; Return value = @code{BZ_STREAM_END} | ||
969 | else | ||
970 | Next state = FINISHING; Return value = @code{BZ_FINISH_OK} | ||
971 | |||
972 | FINISHING/other | ||
973 | Illegal. | ||
974 | Return value = @code{BZ_SEQUENCE_ERROR} | ||
975 | @end display | ||
976 | |||
977 | That still looks complicated? Well, fair enough. The usual sequence | ||
978 | of calls for compressing a load of data is: | ||
979 | @itemize @bullet | ||
980 | @item Get started with @code{BZ2_bzCompressInit}. | ||
981 | @item Shovel data in and shlurp out its compressed form using zero or more | ||
982 | calls of @code{BZ2_bzCompress} with action = @code{BZ_RUN}. | ||
983 | @item Finish up. | ||
984 | Repeatedly call @code{BZ2_bzCompress} with action = @code{BZ_FINISH}, | ||
985 | copying out the compressed output, until @code{BZ_STREAM_END} is returned. | ||
986 | @item Close up and go home. Call @code{BZ2_bzCompressEnd}. | ||
987 | @end itemize | ||
988 | If the data you want to compress fits into your input buffer all | ||
989 | at once, you can skip the calls of @code{BZ2_bzCompress ( ..., BZ_RUN )} and | ||
990 | just do the @code{BZ2_bzCompress ( ..., BZ_FINISH )} calls. | ||
991 | |||
992 | All required memory is allocated by @code{BZ2_bzCompressInit}. The | ||
993 | compression library can accept any data at all (obviously). So you | ||
994 | shouldn't get any error return values from the @code{BZ2_bzCompress} calls. | ||
995 | If you do, they will be @code{BZ_SEQUENCE_ERROR}, and indicate a bug in | ||
996 | your programming. | ||
997 | |||
998 | Trivial other possible return values: | ||
999 | @display | ||
1000 | @code{BZ_PARAM_ERROR} | ||
1001 | if @code{strm} is @code{NULL}, or @code{strm->state} is @code{NULL} | ||
1002 | @end display | ||
1003 | |||
1004 | @subsection @code{BZ2_bzCompressEnd} | ||
1005 | @example | ||
1006 | int BZ2_bzCompressEnd ( bz_stream *strm ); | ||
1007 | @end example | ||
1008 | Releases all memory associated with a compression stream. | ||
1009 | |||
1010 | Possible return values: | ||
1011 | @display | ||
1012 | @code{BZ_PARAM_ERROR} if @code{strm} is @code{NULL} or @code{strm->state} is @code{NULL} | ||
1013 | @code{BZ_OK} otherwise | ||
1014 | @end display | ||
1015 | |||
1016 | |||
1017 | @subsection @code{BZ2_bzDecompressInit} | ||
1018 | @example | ||
1019 | int BZ2_bzDecompressInit ( bz_stream *strm, int verbosity, int small ); | ||
1020 | @end example | ||
1021 | Prepares for decompression. As with @code{BZ2_bzCompressInit}, a | ||
1022 | @code{bz_stream} record should be allocated and initialised before the | ||
1023 | call. Fields @code{bzalloc}, @code{bzfree} and @code{opaque} should be | ||
1024 | set if a custom memory allocator is required, or made @code{NULL} for | ||
1025 | the normal @code{malloc}/@code{free} routines. Upon return, the internal | ||
1026 | state will have been initialised, and @code{total_in} and | ||
1027 | @code{total_out} will be zero. | ||
1028 | |||
1029 | For the meaning of parameter @code{verbosity}, see @code{BZ2_bzCompressInit}. | ||
1030 | |||
1031 | If @code{small} is nonzero, the library will use an alternative | ||
1032 | decompression algorithm which uses less memory but at the cost of | ||
1033 | decompressing more slowly (roughly speaking, half the speed, but the | ||
1034 | maximum memory requirement drops to around 2300k). See Chapter 2 for | ||
1035 | more information on memory management. | ||
1036 | |||
1037 | Note that the amount of memory needed to decompress | ||
1038 | a stream cannot be determined until the stream's header has been read, | ||
1039 | so even if @code{BZ2_bzDecompressInit} succeeds, a subsequent | ||
1040 | @code{BZ2_bzDecompress} could fail with @code{BZ_MEM_ERROR}. | ||
1041 | |||
1042 | Possible return values: | ||
1043 | @display | ||
1044 | @code{BZ_CONFIG_ERROR} | ||
1045 | if the library has been mis-compiled | ||
1046 | @code{BZ_PARAM_ERROR} | ||
1047 | if @code{(small != 0 && small != 1)} | ||
1048 | or @code{(verbosity < 0 || verbosity > 4)} | ||
1049 | @code{BZ_MEM_ERROR} | ||
1050 | if insufficient memory is available | ||
1051 | @end display | ||
1052 | |||
1053 | Allowable next actions: | ||
1054 | @display | ||
1055 | @code{BZ2_bzDecompress} | ||
1056 | if @code{BZ_OK} was returned | ||
1057 | no specific action required in case of error | ||
1058 | @end display | ||
1059 | |||
1060 | |||
1061 | |||
1062 | @subsection @code{BZ2_bzDecompress} | ||
1063 | @example | ||
1064 | int BZ2_bzDecompress ( bz_stream *strm ); | ||
1065 | @end example | ||
1066 | Provides more input and/or output buffer space for the library. The | ||
1067 | caller maintains input and output buffers, and uses @code{BZ2_bzDecompress} | ||
1068 | to transfer data between them. | ||
1069 | |||
1070 | Before each call to @code{BZ2_bzDecompress}, @code{next_in} | ||
1071 | should point at the compressed data, | ||
1072 | and @code{avail_in} should indicate how many bytes the library | ||
1073 | may read. @code{BZ2_bzDecompress} updates @code{next_in}, @code{avail_in} | ||
1074 | and @code{total_in} | ||
1075 | to reflect the number of bytes it has read. | ||
1076 | |||
1077 | Similarly, @code{next_out} should point to a buffer in which the uncompressed | ||
1078 | output is to be placed, with @code{avail_out} indicating how much output space | ||
1079 | is available. @code{BZ2_bzDecompress} updates @code{next_out}, | ||
1080 | @code{avail_out} and @code{total_out} to reflect | ||
1081 | the number of bytes output. | ||
1082 | |||
1083 | You may provide and remove as little or as much data as you like on | ||
1084 | each call of @code{BZ2_bzDecompress}. | ||
1085 | In the limit, it is acceptable to | ||
1086 | supply and remove data one byte at a time, although this would be | ||
1087 | terribly inefficient. You should always ensure that at least one | ||
1088 | byte of output space is available at each call. | ||
1089 | |||
1090 | Use of @code{BZ2_bzDecompress} is simpler than @code{BZ2_bzCompress}. | ||
1091 | |||
1092 | You should provide input and remove output as described above, and | ||
1093 | repeatedly call @code{BZ2_bzDecompress} until @code{BZ_STREAM_END} is | ||
1094 | returned. Appearance of @code{BZ_STREAM_END} denotes that | ||
1095 | @code{BZ2_bzDecompress} has detected the logical end of the compressed | ||
1096 | stream. @code{BZ2_bzDecompress} will not produce @code{BZ_STREAM_END} until | ||
1097 | all output data has been placed into the output buffer, so once | ||
1098 | @code{BZ_STREAM_END} appears, you are guaranteed to have available all | ||
1099 | the decompressed output, and @code{BZ2_bzDecompressEnd} can safely be | ||
1100 | called. | ||
1101 | |||
1102 | In case of an error return value, you should call @code{BZ2_bzDecompressEnd} | ||
1103 | to clean up and release memory. | ||
1104 | |||
1105 | Possible return values: | ||
1106 | @display | ||
1107 | @code{BZ_PARAM_ERROR} | ||
1108 | if @code{strm} is @code{NULL} or @code{strm->s} is @code{NULL} | ||
1109 | or @code{strm->avail_out < 1} | ||
1110 | @code{BZ_DATA_ERROR} | ||
1111 | if a data integrity error is detected in the compressed stream | ||
1112 | @code{BZ_DATA_ERROR_MAGIC} | ||
1113 | if the compressed stream doesn't begin with the right magic bytes | ||
1114 | @code{BZ_MEM_ERROR} | ||
1115 | if there wasn't enough memory available | ||
1116 | @code{BZ_STREAM_END} | ||
1117 | if the logical end of the data stream was detected and all | ||
1118 | output has been consumed, eg @code{strm->avail_out > 0} | ||
1119 | @code{BZ_OK} | ||
1120 | otherwise | ||
1121 | @end display | ||
1122 | Allowable next actions: | ||
1123 | @display | ||
1124 | @code{BZ2_bzDecompress} | ||
1125 | if @code{BZ_OK} was returned | ||
1126 | @code{BZ2_bzDecompressEnd} | ||
1127 | otherwise | ||
1128 | @end display | ||
1129 | |||
1130 | |||
1131 | @subsection @code{BZ2_bzDecompressEnd} | ||
1132 | @example | ||
1133 | int BZ2_bzDecompressEnd ( bz_stream *strm ); | ||
1134 | @end example | ||
1135 | Releases all memory associated with a decompression stream. | ||
1136 | |||
1137 | Possible return values: | ||
1138 | @display | ||
1139 | @code{BZ_PARAM_ERROR} | ||
1140 | if @code{strm} is @code{NULL} or @code{strm->s} is @code{NULL} | ||
1141 | @code{BZ_OK} | ||
1142 | otherwise | ||
1143 | @end display | ||
1144 | |||
1145 | Allowable next actions: | ||
1146 | @display | ||
1147 | None. | ||
1148 | @end display | ||
1149 | |||
1150 | |||
1151 | @section High-level interface | ||
1152 | |||
1153 | This interface provides functions for reading and writing | ||
1154 | @code{bzip2} format files. First, some general points. | ||
1155 | |||
1156 | @itemize @bullet | ||
1157 | @item All of the functions take an @code{int*} first argument, | ||
1158 | @code{bzerror}. | ||
1159 | After each call, @code{bzerror} should be consulted first to determine | ||
1160 | the outcome of the call. If @code{bzerror} is @code{BZ_OK}, | ||
1161 | the call completed | ||
1162 | successfully, and only then should the return value of the function | ||
1163 | (if any) be consulted. If @code{bzerror} is @code{BZ_IO_ERROR}, | ||
1164 | there was an error | ||
1165 | reading/writing the underlying compressed file, and you should | ||
1166 | then consult @code{errno}/@code{perror} to determine the | ||
1167 | cause of the difficulty. | ||
1168 | @code{bzerror} may also be set to various other values; precise details are | ||
1169 | given on a per-function basis below. | ||
1170 | @item If @code{bzerror} indicates an error | ||
1171 | (ie, anything except @code{BZ_OK} and @code{BZ_STREAM_END}), | ||
1172 | you should immediately call @code{BZ2_bzReadClose} (or @code{BZ2_bzWriteClose}, | ||
1173 | depending on whether you are attempting to read or to write) | ||
1174 | to free up all resources associated | ||
1175 | with the stream. Once an error has been indicated, behaviour of all calls | ||
1176 | except @code{BZ2_bzReadClose} (@code{BZ2_bzWriteClose}) is undefined. | ||
1177 | The implication is that (1) @code{bzerror} should | ||
1178 | be checked after each call, and (2) if @code{bzerror} indicates an error, | ||
1179 | @code{BZ2_bzReadClose} (@code{BZ2_bzWriteClose}) should then be called to clean up. | ||
1180 | @item The @code{FILE*} arguments passed to | ||
1181 | @code{BZ2_bzReadOpen}/@code{BZ2_bzWriteOpen} | ||
1182 | should be set to binary mode. | ||
1183 | Most Unix systems will do this by default, but other platforms, | ||
1184 | including Windows and Mac, will not. If you omit this, you may | ||
1185 | encounter problems when moving code to new platforms. | ||
1186 | @item Memory allocation requests are handled by | ||
1187 | @code{malloc}/@code{free}. | ||
1188 | At present | ||
1189 | there is no facility for user-defined memory allocators in the file I/O | ||
1190 | functions (could easily be added, though). | ||
1191 | @end itemize | ||
1192 | |||
1193 | |||
1194 | |||
1195 | @subsection @code{BZ2_bzReadOpen} | ||
1196 | @example | ||
1197 | typedef void BZFILE; | ||
1198 | |||
1199 | BZFILE *BZ2_bzReadOpen ( int *bzerror, FILE *f, | ||
1200 | int small, int verbosity, | ||
1201 | void *unused, int nUnused ); | ||
1202 | @end example | ||
1203 | Prepare to read compressed data from file handle @code{f}. @code{f} | ||
1204 | should refer to a file which has been opened for reading, and for which | ||
1205 | the error indicator (@code{ferror(f)}) is not set. If @code{small} is 1, | ||
1206 | the library will try to decompress using less memory, at the expense of | ||
1207 | speed. | ||
1208 | |||
1209 | For reasons explained below, @code{BZ2_bzRead} will decompress the | ||
1210 | @code{nUnused} bytes starting at @code{unused}, before starting to read | ||
1211 | from the file @code{f}. At most @code{BZ_MAX_UNUSED} bytes may be | ||
1212 | supplied like this. If this facility is not required, you should pass | ||
1213 | @code{NULL} and @code{0} for @code{unused} and @code{nUnused} | ||
1214 | respectively. | ||
1215 | |||
1216 | For the meaning of parameters @code{small} and @code{verbosity}, | ||
1217 | see @code{BZ2_bzDecompressInit}. | ||
1218 | |||
1219 | The amount of memory needed to decompress a file cannot be determined | ||
1220 | until the file's header has been read. So it is possible that | ||
1221 | @code{BZ2_bzReadOpen} returns @code{BZ_OK} but a subsequent call of | ||
1222 | @code{BZ2_bzRead} will return @code{BZ_MEM_ERROR}. | ||
1223 | |||
1224 | Possible assignments to @code{bzerror}: | ||
1225 | @display | ||
1226 | @code{BZ_CONFIG_ERROR} | ||
1227 | if the library has been mis-compiled | ||
1228 | @code{BZ_PARAM_ERROR} | ||
1229 | if @code{f} is @code{NULL} | ||
1230 | or @code{small} is neither @code{0} nor @code{1} | ||
1231 | or @code{(unused == NULL && nUnused != 0)} | ||
1232 | or @code{(unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED))} | ||
1233 | @code{BZ_IO_ERROR} | ||
1234 | if @code{ferror(f)} is nonzero | ||
1235 | @code{BZ_MEM_ERROR} | ||
1236 | if insufficient memory is available | ||
1237 | @code{BZ_OK} | ||
1238 | otherwise. | ||
1239 | @end display | ||
1240 | |||
1241 | Possible return values: | ||
1242 | @display | ||
1243 | Pointer to an abstract @code{BZFILE} | ||
1244 | if @code{bzerror} is @code{BZ_OK} | ||
1245 | @code{NULL} | ||
1246 | otherwise | ||
1247 | @end display | ||
1248 | |||
1249 | Allowable next actions: | ||
1250 | @display | ||
1251 | @code{BZ2_bzRead} | ||
1252 | if @code{bzerror} is @code{BZ_OK} | ||
1253 | @code{BZ2_bzReadClose} | ||
1254 | otherwise | ||
1255 | @end display | ||
1256 | |||
1257 | |||
1258 | @subsection @code{BZ2_bzRead} | ||
1259 | @example | ||
1260 | int BZ2_bzRead ( int *bzerror, BZFILE *b, void *buf, int len ); | ||
1261 | @end example | ||
1262 | Reads up to @code{len} (uncompressed) bytes from the compressed file | ||
1263 | @code{b} into | ||
1264 | the buffer @code{buf}. If the read was successful, | ||
1265 | @code{bzerror} is set to @code{BZ_OK} | ||
1266 | and the number of bytes read is returned. If the logical end-of-stream | ||
1267 | was detected, @code{bzerror} will be set to @code{BZ_STREAM_END}, | ||
1268 | and the number | ||
1269 | of bytes read is returned. All other @code{bzerror} values denote an error. | ||
1270 | |||
1271 | @code{BZ2_bzRead} will supply @code{len} bytes, | ||
1272 | unless the logical stream end is detected | ||
1273 | or an error occurs. Because of this, it is possible to detect the | ||
1274 | stream end by observing when the number of bytes returned is | ||
1275 | less than the number | ||
1276 | requested. Nevertheless, this is regarded as inadvisable; you should | ||
1277 | instead check @code{bzerror} after every call and watch out for | ||
1278 | @code{BZ_STREAM_END}. | ||
1279 | |||
1280 | Internally, @code{BZ2_bzRead} copies data from the compressed file in chunks | ||
1281 | of size @code{BZ_MAX_UNUSED} bytes | ||
1282 | before decompressing it. If the file contains more bytes than strictly | ||
1283 | needed to reach the logical end-of-stream, @code{BZ2_bzRead} will almost certainly | ||
1284 | read some of the trailing data before signalling @code{BZ_STREAM_END}. | ||
1285 | To collect the read but unused data once @code{BZ_STREAM_END} has | ||
1286 | appeared, call @code{BZ2_bzReadGetUnused} immediately before @code{BZ2_bzReadClose}. | ||
1287 | |||
1288 | Possible assignments to @code{bzerror}: | ||
1289 | @display | ||
1290 | @code{BZ_PARAM_ERROR} | ||
1291 | if @code{b} is @code{NULL} or @code{buf} is @code{NULL} or @code{len < 0} | ||
1292 | @code{BZ_SEQUENCE_ERROR} | ||
1293 | if @code{b} was opened with @code{BZ2_bzWriteOpen} | ||
1294 | @code{BZ_IO_ERROR} | ||
1295 | if there is an error reading from the compressed file | ||
1296 | @code{BZ_UNEXPECTED_EOF} | ||
1297 | if the compressed file ended before the logical end-of-stream was detected | ||
1298 | @code{BZ_DATA_ERROR} | ||
1299 | if a data integrity error was detected in the compressed stream | ||
1300 | @code{BZ_DATA_ERROR_MAGIC} | ||
1301 | if the stream does not begin with the requisite header bytes (ie, is not | ||
1302 | a @code{bzip2} data file). This is really a special case of @code{BZ_DATA_ERROR}. | ||
1303 | @code{BZ_MEM_ERROR} | ||
1304 | if insufficient memory was available | ||
1305 | @code{BZ_STREAM_END} | ||
1306 | if the logical end of stream was detected. | ||
1307 | @code{BZ_OK} | ||
1308 | otherwise. | ||
1309 | @end display | ||
1310 | |||
1311 | Possible return values: | ||
1312 | @display | ||
1313 | number of bytes read | ||
1314 | if @code{bzerror} is @code{BZ_OK} or @code{BZ_STREAM_END} | ||
1315 | undefined | ||
1316 | otherwise | ||
1317 | @end display | ||
1318 | |||
1319 | Allowable next actions: | ||
1320 | @display | ||
1321 | collect data from @code{buf}, then @code{BZ2_bzRead} or @code{BZ2_bzReadClose} | ||
1322 | if @code{bzerror} is @code{BZ_OK} | ||
1323 | collect data from @code{buf}, then @code{BZ2_bzReadClose} or @code{BZ2_bzReadGetUnused} | ||
1324 | if @code{bzerror} is @code{BZ_STREAM_END} | ||
1325 | @code{BZ2_bzReadClose} | ||
1326 | otherwise | ||
1327 | @end display | ||
1328 | |||
1329 | |||
1330 | |||
1331 | @subsection @code{BZ2_bzReadGetUnused} | ||
1332 | @example | ||
1333 | void BZ2_bzReadGetUnused ( int* bzerror, BZFILE *b, | ||
1334 | void** unused, int* nUnused ); | ||
1335 | @end example | ||
1336 | Returns data which was read from the compressed file but was not needed | ||
1337 | to get to the logical end-of-stream. @code{*unused} is set to the address | ||
1338 | of the data, and @code{*nUnused} to the number of bytes. @code{*nUnused} will | ||
1339 | be set to a value between @code{0} and @code{BZ_MAX_UNUSED} inclusive. | ||
1340 | |||
1341 | This function may only be called once @code{BZ2_bzRead} has signalled | ||
1342 | @code{BZ_STREAM_END} but before @code{BZ2_bzReadClose}. | ||
1343 | |||
1344 | Possible assignments to @code{bzerror}: | ||
1345 | @display | ||
1346 | @code{BZ_PARAM_ERROR} | ||
1347 | if @code{b} is @code{NULL} | ||
1348 | or @code{unused} is @code{NULL} or @code{nUnused} is @code{NULL} | ||
1349 | @code{BZ_SEQUENCE_ERROR} | ||
1350 | if @code{BZ_STREAM_END} has not been signalled | ||
1351 | or if @code{b} was opened with @code{BZ2_bzWriteOpen} | ||
1352 | @code{BZ_OK} | ||
1353 | otherwise | ||
1354 | @end display | ||
1355 | |||
1356 | Allowable next actions: | ||
1357 | @display | ||
1358 | @code{BZ2_bzReadClose} | ||
1359 | @end display | ||
1360 | |||
1361 | |||
1362 | @subsection @code{BZ2_bzReadClose} | ||
1363 | @example | ||
1364 | void BZ2_bzReadClose ( int *bzerror, BZFILE *b ); | ||
1365 | @end example | ||
1366 | Releases all memory pertaining to the compressed file @code{b}. | ||
1367 | @code{BZ2_bzReadClose} does not call @code{fclose} on the underlying file | ||
1368 | handle, so you should do that yourself if appropriate. | ||
1369 | @code{BZ2_bzReadClose} should be called to clean up after all error | ||
1370 | situations. | ||
1371 | |||
1372 | Possible assignments to @code{bzerror}: | ||
1373 | @display | ||
1374 | @code{BZ_SEQUENCE_ERROR} | ||
1375 | if @code{b} was opened with @code{BZ2_bzWriteOpen} | ||
1376 | @code{BZ_OK} | ||
1377 | otherwise | ||
1378 | @end display | ||
1379 | |||
1380 | Allowable next actions: | ||
1381 | @display | ||
1382 | none | ||
1383 | @end display | ||
1384 | |||
1385 | |||
1386 | |||
1387 | @subsection @code{BZ2_bzWriteOpen} | ||
1388 | @example | ||
1389 | BZFILE *BZ2_bzWriteOpen ( int *bzerror, FILE *f, | ||
1390 | int blockSize100k, int verbosity, | ||
1391 | int workFactor ); | ||
1392 | @end example | ||
1393 | Prepare to write compressed data to file handle @code{f}. | ||
1394 | @code{f} should refer to | ||
1395 | a file which has been opened for writing, and for which the error | ||
1396 | indicator (@code{ferror(f)}) is not set. | ||
1397 | |||
1398 | For the meaning of parameters @code{blockSize100k}, | ||
1399 | @code{verbosity} and @code{workFactor}, see | ||
1400 | @* @code{BZ2_bzCompressInit}. | ||
1401 | |||
1402 | All required memory is allocated at this stage, so if the call | ||
1403 | completes successfully, @code{BZ_MEM_ERROR} cannot be signalled by a | ||
1404 | subsequent call to @code{BZ2_bzWrite}. | ||
1405 | |||
1406 | Possible assignments to @code{bzerror}: | ||
1407 | @display | ||
1408 | @code{BZ_CONFIG_ERROR} | ||
1409 | if the library has been mis-compiled | ||
1410 | @code{BZ_PARAM_ERROR} | ||
1411 | if @code{f} is @code{NULL} | ||
1412 | or @code{blockSize100k < 1} or @code{blockSize100k > 9} | ||
1413 | @code{BZ_IO_ERROR} | ||
1414 | if @code{ferror(f)} is nonzero | ||
1415 | @code{BZ_MEM_ERROR} | ||
1416 | if insufficient memory is available | ||
1417 | @code{BZ_OK} | ||
1418 | otherwise | ||
1419 | @end display | ||
1420 | |||
1421 | Possible return values: | ||
1422 | @display | ||
1423 | Pointer to an abstract @code{BZFILE} | ||
1424 | if @code{bzerror} is @code{BZ_OK} | ||
1425 | @code{NULL} | ||
1426 | otherwise | ||
1427 | @end display | ||
1428 | |||
1429 | Allowable next actions: | ||
1430 | @display | ||
1431 | @code{BZ2_bzWrite} | ||
1432 | if @code{bzerror} is @code{BZ_OK} | ||
1433 | (you could go directly to @code{BZ2_bzWriteClose}, but this would be pretty pointless) | ||
1434 | @code{BZ2_bzWriteClose} | ||
1435 | otherwise | ||
1436 | @end display | ||
1437 | |||
1438 | |||
1439 | |||
1440 | @subsection @code{BZ2_bzWrite} | ||
1441 | @example | ||
1442 | void BZ2_bzWrite ( int *bzerror, BZFILE *b, void *buf, int len ); | ||
1443 | @end example | ||
1444 | Absorbs @code{len} bytes from the buffer @code{buf}, eventually to be | ||
1445 | compressed and written to the file. | ||
1446 | |||
1447 | Possible assignments to @code{bzerror}: | ||
1448 | @display | ||
1449 | @code{BZ_PARAM_ERROR} | ||
1450 | if @code{b} is @code{NULL} or @code{buf} is @code{NULL} or @code{len < 0} | ||
1451 | @code{BZ_SEQUENCE_ERROR} | ||
1452 | if @code{b} was opened with @code{BZ2_bzReadOpen} | ||
1453 | @code{BZ_IO_ERROR} | ||
1454 | if there is an error writing the compressed file. | ||
1455 | @code{BZ_OK} | ||
1456 | otherwise | ||
1457 | @end display | ||
1458 | |||
1459 | |||
1460 | |||
1461 | |||
1462 | @subsection @code{BZ2_bzWriteClose} | ||
1463 | @example | ||
1464 | void BZ2_bzWriteClose ( int *bzerror, BZFILE* b, | ||
1465 | int abandon, | ||
1466 | unsigned int* nbytes_in, | ||
1467 | unsigned int* nbytes_out ); | ||
1468 | |||
1469 | void BZ2_bzWriteClose64 ( int *bzerror, BZFILE* b, | ||
1470 | int abandon, | ||
1471 | unsigned int* nbytes_in_lo32, | ||
1472 | unsigned int* nbytes_in_hi32, | ||
1473 | unsigned int* nbytes_out_lo32, | ||
1474 | unsigned int* nbytes_out_hi32 ); | ||
1475 | @end example | ||
1476 | |||
1477 | Compresses and flushes to the compressed file all data so far supplied | ||
1478 | by @code{BZ2_bzWrite}. The logical end-of-stream markers are also written, so | ||
1479 | subsequent calls to @code{BZ2_bzWrite} are illegal. All memory associated | ||
1480 | with the compressed file @code{b} is released. | ||
1481 | @code{fflush} is called on the | ||
1482 | compressed file, but it is not @code{fclose}'d. | ||
1483 | |||
1484 | If @code{BZ2_bzWriteClose} is called to clean up after an error, the only | ||
1485 | action is to release the memory. The library records the error codes | ||
1486 | issued by previous calls, so this situation will be detected | ||
1487 | automatically. There is no attempt to complete the compression | ||
1488 | operation, nor to @code{fflush} the compressed file. You can force this | ||
1489 | behaviour to happen even in the case of no error, by passing a nonzero | ||
1490 | value to @code{abandon}. | ||
1491 | |||
1492 | If @code{nbytes_in} is non-null, @code{*nbytes_in} will be set to be the | ||
1493 | total volume of uncompressed data handled. Similarly, @code{*nbytes_out} | ||
1494 | will be set to the total volume of compressed data written. For | ||
1495 | compatibility with older versions of the library, @code{BZ2_bzWriteClose} | ||
1496 | only yields the lower 32 bits of these counts. Use | ||
1497 | @code{BZ2_bzWriteClose64} if you want the full 64 bit counts. These | ||
1498 | two functions are otherwise absolutely identical. | ||
1499 | |||
1500 | |||
1501 | Possible assignments to @code{bzerror}: | ||
1502 | @display | ||
1503 | @code{BZ_SEQUENCE_ERROR} | ||
1504 | if @code{b} was opened with @code{BZ2_bzReadOpen} | ||
1505 | @code{BZ_IO_ERROR} | ||
1506 | if there is an error writing the compressed file | ||
1507 | @code{BZ_OK} | ||
1508 | otherwise | ||
1509 | @end display | ||
1510 | |||
1511 | @subsection Handling embedded compressed data streams | ||
1512 | |||
1513 | The high-level library facilitates use of | ||
1514 | @code{bzip2} data streams which form some part of a surrounding, larger | ||
1515 | data stream. | ||
1516 | @itemize @bullet | ||
1517 | @item For writing, the library takes an open file handle, writes | ||
1518 | compressed data to it, @code{fflush}es it but does not @code{fclose} it. | ||
1519 | The calling application can write its own data before and after the | ||
1520 | compressed data stream, using that same file handle. | ||
1521 | @item Reading is more complex, and the facilities are not as general | ||
1522 | as they could be since generality is hard to reconcile with efficiency. | ||
1523 | @code{BZ2_bzRead} reads from the compressed file in blocks of size | ||
1524 | @code{BZ_MAX_UNUSED} bytes, and in doing so probably will overshoot | ||
1525 | the logical end of compressed stream. | ||
1526 | To recover this data once decompression has | ||
1527 | ended, call @code{BZ2_bzReadGetUnused} after the last call of @code{BZ2_bzRead} | ||
1528 | (the one returning @code{BZ_STREAM_END}) but before calling | ||
1529 | @code{BZ2_bzReadClose}. | ||
1530 | @end itemize | ||
1531 | |||
1532 | This mechanism makes it easy to decompress multiple @code{bzip2} | ||
1533 | streams placed end-to-end. At the end of one stream, when @code{BZ2_bzRead} | ||
1534 | returns @code{BZ_STREAM_END}, call @code{BZ2_bzReadGetUnused} to collect the | ||
1535 | unused data (copy it into your own buffer somewhere). | ||
1536 | That data forms the start of the next compressed stream. | ||
1537 | To start uncompressing that next stream, call @code{BZ2_bzReadOpen} again, | ||
1538 | feeding in the unused data via the @code{unused}/@code{nUnused} | ||
1539 | parameters. | ||
1540 | Keep doing this until @code{BZ_STREAM_END} return coincides with the | ||
1541 | physical end of file (@code{feof(f)}). In this situation | ||
1542 | @code{BZ2_bzReadGetUnused} | ||
1543 | will of course return no data. | ||
1544 | |||
1545 | This should give some feel for how the high-level interface can be used. | ||
1546 | If you require extra flexibility, you'll have to bite the bullet and get | ||
1547 | to grips with the low-level interface. | ||
1548 | |||
1549 | @subsection Standard file-reading/writing code | ||
1550 | Here's how you'd write data to a compressed file: | ||
1551 | @example @code | ||
1552 | FILE* f; | ||
1553 | BZFILE* b; | ||
1554 | int nBuf; | ||
1555 | char buf[ /* whatever size you like */ ]; | ||
1556 | int bzerror; | ||
1557 | int nWritten; | ||
1558 | |||
1559 | f = fopen ( "myfile.bz2", "w" ); | ||
1560 | if (!f) @{ | ||
1561 | /* handle error */ | ||
1562 | @} | ||
1563 | b = BZ2_bzWriteOpen ( &bzerror, f, 9, 0, 30 ); | ||
1564 | if (bzerror != BZ_OK) @{ | ||
1565 | BZ2_bzWriteClose ( &bzerror, b, 0, NULL, NULL ); | ||
1566 | /* handle error */ | ||
1567 | @} | ||
1568 | |||
1569 | while ( /* condition */ ) @{ | ||
1570 | /* get data to write into buf, and set nBuf appropriately */ | ||
1571 | BZ2_bzWrite ( &bzerror, b, buf, nBuf ); | ||
1572 | if (bzerror == BZ_IO_ERROR) @{ | ||
1573 | BZ2_bzWriteClose ( &bzerror, b, 0, NULL, NULL ); | ||
1574 | /* handle error */ | ||
1575 | @} | ||
1576 | @} | ||
1577 | |||
1578 | BZ2_bzWriteClose ( &bzerror, b, 0, NULL, NULL ); | ||
1579 | if (bzerror == BZ_IO_ERROR) @{ | ||
1580 | /* handle error */ | ||
1581 | @} | ||
1582 | @end example | ||
1583 | And to read from a compressed file: | ||
1584 | @example | ||
1585 | FILE* f; | ||
1586 | BZFILE* b; | ||
1587 | int nBuf; | ||
1588 | char buf[ /* whatever size you like */ ]; | ||
1589 | int bzerror; | ||
1590 | int nWritten; | ||
1591 | |||
1592 | f = fopen ( "myfile.bz2", "r" ); | ||
1593 | if (!f) @{ | ||
1594 | /* handle error */ | ||
1595 | @} | ||
1596 | b = BZ2_bzReadOpen ( &bzerror, f, 0, 0, NULL, 0 ); | ||
1597 | if (bzerror != BZ_OK) @{ | ||
1598 | BZ2_bzReadClose ( &bzerror, b ); | ||
1599 | /* handle error */ | ||
1600 | @} | ||
1601 | |||
1602 | bzerror = BZ_OK; | ||
1603 | while (bzerror == BZ_OK && /* arbitrary other conditions */) @{ | ||
1604 | nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); | ||
1605 | if (bzerror == BZ_OK) @{ | ||
1606 | /* do something with buf[0 .. nBuf-1] */ | ||
1607 | @} | ||
1608 | @} | ||
1609 | if (bzerror != BZ_STREAM_END) @{ | ||
1610 | BZ2_bzReadClose ( &bzerror, b ); | ||
1611 | /* handle error */ | ||
1612 | @} else @{ | ||
1613 | BZ2_bzReadClose ( &bzerror, b ); | ||
1614 | @} | ||
1615 | @end example | ||
1616 | |||
1617 | |||
1618 | |||
1619 | @section Utility functions | ||
1620 | @subsection @code{BZ2_bzBuffToBuffCompress} | ||
1621 | @example | ||
1622 | int BZ2_bzBuffToBuffCompress( char* dest, | ||
1623 | unsigned int* destLen, | ||
1624 | char* source, | ||
1625 | unsigned int sourceLen, | ||
1626 | int blockSize100k, | ||
1627 | int verbosity, | ||
1628 | int workFactor ); | ||
1629 | @end example | ||
1630 | Attempts to compress the data in @code{source[0 .. sourceLen-1]} | ||
1631 | into the destination buffer, @code{dest[0 .. *destLen-1]}. | ||
1632 | If the destination buffer is big enough, @code{*destLen} is | ||
1633 | set to the size of the compressed data, and @code{BZ_OK} is | ||
1634 | returned. If the compressed data won't fit, @code{*destLen} | ||
1635 | is unchanged, and @code{BZ_OUTBUFF_FULL} is returned. | ||
1636 | |||
1637 | Compression in this manner is a one-shot event, done with a single call | ||
1638 | to this function. The resulting compressed data is a complete | ||
1639 | @code{bzip2} format data stream. There is no mechanism for making | ||
1640 | additional calls to provide extra input data. If you want that kind of | ||
1641 | mechanism, use the low-level interface. | ||
1642 | |||
1643 | For the meaning of parameters @code{blockSize100k}, @code{verbosity} | ||
1644 | and @code{workFactor}, @* see @code{BZ2_bzCompressInit}. | ||
1645 | |||
1646 | To guarantee that the compressed data will fit in its buffer, allocate | ||
1647 | an output buffer of size 1% larger than the uncompressed data, plus | ||
1648 | six hundred extra bytes. | ||
1649 | |||
1650 | @code{BZ2_bzBuffToBuffCompress} will not write data at or | ||
1651 | beyond @code{dest[*destLen]}, even in case of buffer overflow. | ||
1652 | |||
1653 | Possible return values: | ||
1654 | @display | ||
1655 | @code{BZ_CONFIG_ERROR} | ||
1656 | if the library has been mis-compiled | ||
1657 | @code{BZ_PARAM_ERROR} | ||
1658 | if @code{dest} is @code{NULL} or @code{destLen} is @code{NULL} | ||
1659 | or @code{blockSize100k < 1} or @code{blockSize100k > 9} | ||
1660 | or @code{verbosity < 0} or @code{verbosity > 4} | ||
1661 | or @code{workFactor < 0} or @code{workFactor > 250} | ||
1662 | @code{BZ_MEM_ERROR} | ||
1663 | if insufficient memory is available | ||
1664 | @code{BZ_OUTBUFF_FULL} | ||
1665 | if the size of the compressed data exceeds @code{*destLen} | ||
1666 | @code{BZ_OK} | ||
1667 | otherwise | ||
1668 | @end display | ||
1669 | |||
1670 | |||
1671 | |||
1672 | @subsection @code{BZ2_bzBuffToBuffDecompress} | ||
1673 | @example | ||
1674 | int BZ2_bzBuffToBuffDecompress ( char* dest, | ||
1675 | unsigned int* destLen, | ||
1676 | char* source, | ||
1677 | unsigned int sourceLen, | ||
1678 | int small, | ||
1679 | int verbosity ); | ||
1680 | @end example | ||
1681 | Attempts to decompress the data in @code{source[0 .. sourceLen-1]} | ||
1682 | into the destination buffer, @code{dest[0 .. *destLen-1]}. | ||
1683 | If the destination buffer is big enough, @code{*destLen} is | ||
1684 | set to the size of the uncompressed data, and @code{BZ_OK} is | ||
1685 | returned. If the uncompressed data won't fit, @code{*destLen} | ||
1686 | is unchanged, and @code{BZ_OUTBUFF_FULL} is returned. | ||
1687 | |||
1688 | @code{source} is assumed to hold a complete @code{bzip2} format | ||
1689 | data stream. @* @code{BZ2_bzBuffToBuffDecompress} tries to decompress | ||
1690 | the entirety of the stream into the output buffer. | ||
1691 | |||
1692 | For the meaning of parameters @code{small} and @code{verbosity}, | ||
1693 | see @code{BZ2_bzDecompressInit}. | ||
1694 | |||
1695 | Because the compression ratio of the compressed data cannot be known in | ||
1696 | advance, there is no easy way to guarantee that the output buffer will | ||
1697 | be big enough. You may of course make arrangements in your code to | ||
1698 | record the size of the uncompressed data, but such a mechanism is beyond | ||
1699 | the scope of this library. | ||
1700 | |||
1701 | @code{BZ2_bzBuffToBuffDecompress} will not write data at or | ||
1702 | beyond @code{dest[*destLen]}, even in case of buffer overflow. | ||
1703 | |||
1704 | Possible return values: | ||
1705 | @display | ||
1706 | @code{BZ_CONFIG_ERROR} | ||
1707 | if the library has been mis-compiled | ||
1708 | @code{BZ_PARAM_ERROR} | ||
1709 | if @code{dest} is @code{NULL} or @code{destLen} is @code{NULL} | ||
1710 | or @code{small != 0 && small != 1} | ||
1711 | or @code{verbosity < 0} or @code{verbosity > 4} | ||
1712 | @code{BZ_MEM_ERROR} | ||
1713 | if insufficient memory is available | ||
1714 | @code{BZ_OUTBUFF_FULL} | ||
1715 | if the size of the uncompressed data exceeds @code{*destLen} | ||
1716 | @code{BZ_DATA_ERROR} | ||
1717 | if a data integrity error was detected in the compressed data | ||
1718 | @code{BZ_DATA_ERROR_MAGIC} | ||
1719 | if the compressed data doesn't begin with the right magic bytes | ||
1720 | @code{BZ_UNEXPECTED_EOF} | ||
1721 | if the compressed data ends unexpectedly | ||
1722 | @code{BZ_OK} | ||
1723 | otherwise | ||
1724 | @end display | ||
1725 | |||
1726 | |||
1727 | |||
1728 | @section @code{zlib} compatibility functions | ||
1729 | Yoshioka Tsuneo has contributed some functions to | ||
1730 | give better @code{zlib} compatibility. These functions are | ||
1731 | @code{BZ2_bzopen}, @code{BZ2_bzread}, @code{BZ2_bzwrite}, @code{BZ2_bzflush}, | ||
1732 | @code{BZ2_bzclose}, | ||
1733 | @code{BZ2_bzerror} and @code{BZ2_bzlibVersion}. | ||
1734 | These functions are not (yet) officially part of | ||
1735 | the library. If they break, you get to keep all the pieces. | ||
1736 | Nevertheless, I think they work ok. | ||
1737 | @example | ||
1738 | typedef void BZFILE; | ||
1739 | |||
1740 | const char * BZ2_bzlibVersion ( void ); | ||
1741 | @end example | ||
1742 | Returns a string indicating the library version. | ||
1743 | @example | ||
1744 | BZFILE * BZ2_bzopen ( const char *path, const char *mode ); | ||
1745 | BZFILE * BZ2_bzdopen ( int fd, const char *mode ); | ||
1746 | @end example | ||
1747 | Opens a @code{.bz2} file for reading or writing, using either its name | ||
1748 | or a pre-existing file descriptor. | ||
1749 | Analogous to @code{fopen} and @code{fdopen}. | ||
1750 | @example | ||
1751 | int BZ2_bzread ( BZFILE* b, void* buf, int len ); | ||
1752 | int BZ2_bzwrite ( BZFILE* b, void* buf, int len ); | ||
1753 | @end example | ||
1754 | Reads/writes data from/to a previously opened @code{BZFILE}. | ||
1755 | Analogous to @code{fread} and @code{fwrite}. | ||
1756 | @example | ||
1757 | int BZ2_bzflush ( BZFILE* b ); | ||
1758 | void BZ2_bzclose ( BZFILE* b ); | ||
1759 | @end example | ||
1760 | Flushes/closes a @code{BZFILE}. @code{BZ2_bzflush} doesn't actually do | ||
1761 | anything. Analogous to @code{fflush} and @code{fclose}. | ||
1762 | |||
1763 | @example | ||
1764 | const char * BZ2_bzerror ( BZFILE *b, int *errnum ); | ||
1765 | @end example | ||
1766 | Returns a string describing the most recent error status of | ||
1767 | @code{b}, and also sets @code{*errnum} to its numerical value. | ||
1768 | |||
1769 | |||
1770 | @section Using the library in a @code{stdio}-free environment | ||
1771 | |||
1772 | @subsection Getting rid of @code{stdio} | ||
1773 | |||
1774 | In a deeply embedded application, you might want to use just | ||
1775 | the memory-to-memory functions. You can do this conveniently | ||
1776 | by compiling the library with preprocessor symbol @code{BZ_NO_STDIO} | ||
1777 | defined. Doing this gives you a library containing only the following | ||
1778 | eight functions: | ||
1779 | |||
1780 | @code{BZ2_bzCompressInit}, @code{BZ2_bzCompress}, @code{BZ2_bzCompressEnd} @* | ||
1781 | @code{BZ2_bzDecompressInit}, @code{BZ2_bzDecompress}, @code{BZ2_bzDecompressEnd} @* | ||
1782 | @code{BZ2_bzBuffToBuffCompress}, @code{BZ2_bzBuffToBuffDecompress} | ||
1783 | |||
1784 | When compiled like this, all functions will ignore @code{verbosity} | ||
1785 | settings. | ||
1786 | |||
1787 | @subsection Critical error handling | ||
1788 | @code{libbzip2} contains a number of internal assertion checks which | ||
1789 | should, needless to say, never be activated. Nevertheless, if an | ||
1790 | assertion should fail, behaviour depends on whether or not the library | ||
1791 | was compiled with @code{BZ_NO_STDIO} set. | ||
1792 | |||
1793 | For a normal compile, an assertion failure yields the message | ||
1794 | @example | ||
1795 | bzip2/libbzip2: internal error number N. | ||
1796 | This is a bug in bzip2/libbzip2, 1.0.2, 30-Dec-2001. | ||
1797 | Please report it to me at: jseward@@acm.org. If this happened | ||
1798 | when you were using some program which uses libbzip2 as a | ||
1799 | component, you should also report this bug to the author(s) | ||
1800 | of that program. Please make an effort to report this bug; | ||
1801 | timely and accurate bug reports eventually lead to higher | ||
1802 | quality software. Thanks. Julian Seward, 30 December 2001. | ||
1803 | @end example | ||
1804 | where @code{N} is some error code number. If @code{N == 1007}, it also | ||
1805 | prints some extra text advising the reader that unreliable memory is | ||
1806 | often associated with internal error 1007. (This is a | ||
1807 | frequently-observed-phenomenon with versions 1.0.0/1.0.1). | ||
1808 | |||
1809 | @code{exit(3)} is then called. | ||
1810 | |||
1811 | For a @code{stdio}-free library, assertion failures result | ||
1812 | in a call to a function declared as: | ||
1813 | @example | ||
1814 | extern void bz_internal_error ( int errcode ); | ||
1815 | @end example | ||
1816 | The relevant code is passed as a parameter. You should supply | ||
1817 | such a function. | ||
1818 | |||
1819 | In either case, once an assertion failure has occurred, any | ||
1820 | @code{bz_stream} records involved can be regarded as invalid. | ||
1821 | You should not attempt to resume normal operation with them. | ||
1822 | |||
1823 | You may, of course, change critical error handling to suit | ||
1824 | your needs. As I said above, critical errors indicate bugs | ||
1825 | in the library and should not occur. All "normal" error | ||
1826 | situations are indicated via error return codes from functions, | ||
1827 | and can be recovered from. | ||
1828 | |||
1829 | |||
1830 | @section Making a Windows DLL | ||
1831 | Everything related to Windows has been contributed by Yoshioka Tsuneo | ||
1832 | @* (@code{QWF00133@@niftyserve.or.jp} / | ||
1833 | @code{tsuneo-y@@is.aist-nara.ac.jp}), so you should send your queries to | ||
1834 | him (but perhaps Cc: me, @code{jseward@@acm.org}). | ||
1835 | |||
1836 | My vague understanding of what to do is: using Visual C++ 5.0, | ||
1837 | open the project file @code{libbz2.dsp}, and build. That's all. | ||
1838 | |||
1839 | If you can't | ||
1840 | open the project file for some reason, make a new one, naming these files: | ||
1841 | @code{blocksort.c}, @code{bzlib.c}, @code{compress.c}, | ||
1842 | @code{crctable.c}, @code{decompress.c}, @code{huffman.c}, @* | ||
1843 | @code{randtable.c} and @code{libbz2.def}. You will also need | ||
1844 | to name the header files @code{bzlib.h} and @code{bzlib_private.h}. | ||
1845 | |||
1846 | If you don't use VC++, you may need to define the preprocessor symbol | ||
1847 | @code{_WIN32}. | ||
1848 | |||
1849 | Finally, @code{dlltest.c} is a sample program using the DLL. It has a | ||
1850 | project file, @code{dlltest.dsp}. | ||
1851 | |||
1852 | If you just want a makefile for Visual C, have a look at | ||
1853 | @code{makefile.msc}. | ||
1854 | |||
1855 | Be aware that if you compile @code{bzip2} itself on Win32, you must set | ||
1856 | @code{BZ_UNIX} to 0 and @code{BZ_LCCWIN32} to 1, in the file | ||
1857 | @code{bzip2.c}, before compiling. Otherwise the resulting binary won't | ||
1858 | work correctly. | ||
1859 | |||
1860 | I haven't tried any of this stuff myself, but it all looks plausible. | ||
1861 | |||
1862 | |||
1863 | |||
1864 | @chapter Miscellanea | ||
1865 | |||
1866 | These are just some random thoughts of mine. Your mileage may | ||
1867 | vary. | ||
1868 | |||
1869 | @section Limitations of the compressed file format | ||
1870 | @code{bzip2-1.0}, @code{0.9.5} and @code{0.9.0} | ||
1871 | use exactly the same file format as the previous | ||
1872 | version, @code{bzip2-0.1}. This decision was made in the interests of | ||
1873 | stability. Creating yet another incompatible compressed file format | ||
1874 | would create further confusion and disruption for users. | ||
1875 | |||
1876 | Nevertheless, this is not a painless decision. Development | ||
1877 | work since the release of @code{bzip2-0.1} in August 1997 | ||
1878 | has shown complexities in the file format which slow down | ||
1879 | decompression and, in retrospect, are unnecessary. These are: | ||
1880 | @itemize @bullet | ||
1881 | @item The run-length encoder, which is the first of the | ||
1882 | compression transformations, is entirely irrelevant. | ||
1883 | The original purpose was to protect the sorting algorithm | ||
1884 | from the very worst case input: a string of repeated | ||
1885 | symbols. But algorithm steps Q6a and Q6b in the original | ||
1886 | Burrows-Wheeler technical report (SRC-124) show how | ||
1887 | repeats can be handled without difficulty in block | ||
1888 | sorting. | ||
1889 | @item The randomisation mechanism doesn't really need to be | ||
1890 | there. Udi Manber and Gene Myers published a suffix | ||
1891 | array construction algorithm a few years back, which | ||
1892 | can be employed to sort any block, no matter how | ||
1893 | repetitive, in O(N log N) time. Subsequent work by | ||
1894 | Kunihiko Sadakane has produced a derivative O(N (log N)^2) | ||
1895 | algorithm which usually outperforms the Manber-Myers | ||
1896 | algorithm. | ||
1897 | |||
1898 | I could have changed to Sadakane's algorithm, but I find | ||
1899 | it to be slower than @code{bzip2}'s existing algorithm for | ||
1900 | most inputs, and the randomisation mechanism protects | ||
1901 | adequately against bad cases. I didn't think it was | ||
1902 | a good tradeoff to make. Partly this is due to the fact | ||
1903 | that I was not flooded with email complaints about | ||
1904 | @code{bzip2-0.1}'s performance on repetitive data, so | ||
1905 | perhaps it isn't a problem for real inputs. | ||
1906 | |||
1907 | Probably the best long-term solution, | ||
1908 | and the one I have incorporated into 0.9.5 and above, | ||
1909 | is to use the existing sorting | ||
1910 | algorithm initially, and fall back to an O(N (log N)^2) | ||
1911 | algorithm if the standard algorithm gets into difficulties. | ||
1912 | @item The compressed file format was never designed to be | ||
1913 | handled by a library, and I have had to jump through | ||
1914 | some hoops to produce an efficient implementation of | ||
1915 | decompression. It's a bit hairy. Try passing | ||
1916 | @code{decompress.c} through the C preprocessor | ||
1917 | and you'll see what I mean. Much of this complexity | ||
1918 | could have been avoided if the compressed size of | ||
1919 | each block of data was recorded in the data stream. | ||
1920 | @item An Adler-32 checksum, rather than a CRC32 checksum, | ||
1921 | would be faster to compute. | ||
1922 | @end itemize | ||
1923 | It would be fair to say that the @code{bzip2} format was frozen | ||
1924 | before I properly and fully understood the performance | ||
1925 | consequences of doing so. | ||
1926 | |||
1927 | Improvements which I was able to incorporate into | ||
1928 | 0.9.0, despite using the same file format, are: | ||
1929 | @itemize @bullet | ||
1930 | @item Single array implementation of the inverse BWT. This | ||
1931 | significantly speeds up decompression, presumably | ||
1932 | because it reduces the number of cache misses. | ||
1933 | @item Faster inverse MTF transform for large MTF values. The | ||
1934 | new implementation is based on the notion of sliding blocks | ||
1935 | of values. | ||
1936 | @item @code{bzip2-0.9.0} now reads and writes files with @code{fread} | ||
1937 | and @code{fwrite}; version 0.1 used @code{putc} and @code{getc}. | ||
1938 | Duh! Well, you live and learn. | ||
1939 | |||
1940 | @end itemize | ||
1941 | Further ahead, it would be nice | ||
1942 | to be able to do random access into files. This will | ||
1943 | require some careful design of compressed file formats. | ||
1944 | |||
1945 | |||
1946 | |||
1947 | @section Portability issues | ||
1948 | After some consideration, I have decided not to use | ||
1949 | GNU @code{autoconf} to configure 0.9.5 or 1.0. | ||
1950 | |||
1951 | @code{autoconf}, admirable and wonderful though it is, | ||
1952 | mainly assists with portability problems between Unix-like | ||
1953 | platforms. But @code{bzip2} doesn't have much in the way | ||
1954 | of portability problems on Unix; most of the difficulties appear | ||
1955 | when porting to the Mac, or to Microsoft's operating systems. | ||
1956 | @code{autoconf} doesn't help in those cases, and brings in a | ||
1957 | whole load of new complexity. | ||
1958 | |||
1959 | Most people should be able to compile the library and program | ||
1960 | under Unix straight out-of-the-box, so to speak, especially | ||
1961 | if you have a version of GNU C available. | ||
1962 | |||
1963 | There are a couple of @code{__inline__} directives in the code. GNU C | ||
1964 | (@code{gcc}) should be able to handle them. If you're not using | ||
1965 | GNU C, your C compiler shouldn't see them at all. | ||
1966 | If your compiler does, for some reason, see them and doesn't | ||
1967 | like them, just @code{#define} @code{__inline__} to be @code{/* */}. One | ||
1968 | easy way to do this is to compile with the flag @code{-D__inline__=}, | ||
1969 | which should be understood by most Unix compilers. | ||
1970 | |||
1971 | If you still have difficulties, try compiling with the macro | ||
1972 | @code{BZ_STRICT_ANSI} defined. This should enable you to build the | ||
1973 | library in a strictly ANSI compliant environment. Building the program | ||
1974 | itself like this is dangerous and not supported, since you remove | ||
1975 | @code{bzip2}'s checks against compressing directories, symbolic links, | ||
1976 | devices, and other not-really-a-file entities. This could cause | ||
1977 | filesystem corruption! | ||
1978 | |||
1979 | One other thing: if you create a @code{bzip2} binary for public | ||
1980 | distribution, please try and link it statically (@code{gcc -static}). This | ||
1981 | avoids all sorts of library-version issues that others may encounter | ||
1982 | later on. | ||
1983 | |||
1984 | If you build @code{bzip2} on Win32, you must set @code{BZ_UNIX} to 0 and | ||
1985 | @code{BZ_LCCWIN32} to 1, in the file @code{bzip2.c}, before compiling. | ||
1986 | Otherwise the resulting binary won't work correctly. | ||
1987 | |||
1988 | |||
1989 | |||
1990 | @section Reporting bugs | ||
1991 | I tried pretty hard to make sure @code{bzip2} is | ||
1992 | bug free, both by design and by testing. Hopefully | ||
1993 | you'll never need to read this section for real. | ||
1994 | |||
1995 | Nevertheless, if @code{bzip2} dies with a segmentation | ||
1996 | fault, a bus error or an internal assertion failure, it | ||
1997 | will ask you to email me a bug report. Experience with | ||
1998 | version 0.1 shows that almost all these problems can | ||
1999 | be traced to either compiler bugs or hardware problems. | ||
2000 | @itemize @bullet | ||
2001 | @item | ||
2002 | Recompile the program with no optimisation, and see if it | ||
2003 | works. And/or try a different compiler. | ||
2004 | I heard all sorts of stories about various flavours | ||
2005 | of GNU C (and other compilers) generating bad code for | ||
2006 | @code{bzip2}, and I've run across two such examples myself. | ||
2007 | |||
2008 | 2.7.X versions of GNU C are known to generate bad code from | ||
2009 | time to time, at high optimisation levels. | ||
2010 | If you get problems, try using the flags | ||
2011 | @code{-O2} @code{-fomit-frame-pointer} @code{-fno-strength-reduce}. | ||
2012 | You should specifically @emph{not} use @code{-funroll-loops}. | ||
2013 | |||
2014 | You may notice that the Makefile runs six tests as part of | ||
2015 | the build process. If the program passes all of these, it's | ||
2016 | a pretty good (but not 100%) indication that the compiler has | ||
2017 | done its job correctly. | ||
2018 | @item | ||
2019 | If @code{bzip2} crashes randomly, and the crashes are not | ||
2020 | repeatable, you may have a flaky memory subsystem. @code{bzip2} | ||
2021 | really hammers your memory hierarchy, and if it's a bit marginal, | ||
2022 | you may get these problems. Ditto if your disk or I/O subsystem | ||
2023 | is slowly failing. Yup, this really does happen. | ||
2024 | |||
2025 | Try using a different machine of the same type, and see if | ||
2026 | you can repeat the problem. | ||
2027 | @item This isn't really a bug, but ... If @code{bzip2} tells | ||
2028 | you your file is corrupted on decompression, and you | ||
2029 | obtained the file via FTP, there is a possibility that you | ||
2030 | forgot to tell FTP to do a binary mode transfer. That absolutely | ||
2031 | will cause the file to be non-decompressible. You'll have to transfer | ||
2032 | it again. | ||
2033 | @end itemize | ||
2034 | |||
2035 | If you've incorporated @code{libbzip2} into your own program | ||
2036 | and are getting problems, please, please, please, check that the | ||
2037 | parameters you are passing in calls to the library, are | ||
2038 | correct, and in accordance with what the documentation says | ||
2039 | is allowable. I have tried to make the library robust against | ||
2040 | such problems, but I'm sure I haven't succeeded. | ||
2041 | |||
2042 | Finally, if the above comments don't help, you'll have to send | ||
2043 | me a bug report. Now, it's just amazing how many people will | ||
2044 | send me a bug report saying something like | ||
2045 | @display | ||
2046 | bzip2 crashed with segmentation fault on my machine | ||
2047 | @end display | ||
2048 | and absolutely nothing else. Needless to say, such a report | ||
2049 | is @emph{totally, utterly, completely and comprehensively 100% useless; | ||
2050 | a waste of your time, my time, and net bandwidth}. | ||
2051 | With no details at all, there's no way I can possibly begin | ||
2052 | to figure out what the problem is. | ||
2053 | |||
2054 | The rules of the game are: facts, facts, facts. Don't omit | ||
2055 | them because "oh, they won't be relevant". At the bare | ||
2056 | minimum: | ||
2057 | @display | ||
2058 | Machine type. Operating system version. | ||
2059 | Exact version of @code{bzip2} (do @code{bzip2 -V}). | ||
2060 | Exact version of the compiler used. | ||
2061 | Flags passed to the compiler. | ||
2062 | @end display | ||
2063 | However, the most important single thing that will help me is | ||
2064 | the file that you were trying to compress or decompress at the | ||
2065 | time the problem happened. Without that, my ability to do anything | ||
2066 | more than speculate about the cause, is limited. | ||
2067 | |||
2068 | Please remember that I connect to the Internet with a modem, so | ||
2069 | you should contact me before mailing me huge files. | ||
2070 | |||
2071 | |||
2072 | @section Did you get the right package? | ||
2073 | |||
2074 | @code{bzip2} is a resource hog. It soaks up large amounts of CPU cycles | ||
2075 | and memory. Also, it gives very large latencies. In the worst case, you | ||
2076 | can feed many megabytes of uncompressed data into the library before | ||
2077 | getting any compressed output, so this probably rules out applications | ||
2078 | requiring interactive behaviour. | ||
2079 | |||
2080 | These aren't faults of my implementation, I hope, but more | ||
2081 | an intrinsic property of the Burrows-Wheeler transform (unfortunately). | ||
2082 | Maybe this isn't what you want. | ||
2083 | |||
2084 | If you want a compressor and/or library which is faster, uses less | ||
2085 | memory but gets pretty good compression, and has minimal latency, | ||
2086 | consider Jean-loup | ||
2087 | Gailly's and Mark Adler's work, @code{zlib-1.1.3} and | ||
2088 | @code{gzip-1.2.4}. Look for them at | ||
2089 | |||
2090 | @code{http://www.zlib.org} and | ||
2091 | @code{http://www.gzip.org} respectively. | ||
2092 | |||
2093 | For something faster and lighter still, you might try Markus F X J | ||
2094 | Oberhumer's @code{LZO} real-time compression/decompression library, at | ||
2095 | @* @code{http://wildsau.idv.uni-linz.ac.at/mfx/lzo.html}. | ||
2096 | |||
2097 | If you want to use the @code{bzip2} algorithms to compress small blocks | ||
2098 | of data, 64k bytes or smaller, for example on an on-the-fly disk | ||
2099 | compressor, you'd be well advised not to use this library. Instead, | ||
2100 | I've made a special library tuned for that kind of use. It's part of | ||
2101 | @code{e2compr-0.40}, an on-the-fly disk compressor for the Linux | ||
2102 | @code{ext2} filesystem. Look at | ||
2103 | @code{http://www.netspace.net.au/~reiter/e2compr}. | ||
2104 | |||
2105 | |||
2106 | |||
2107 | @section Testing | ||
2108 | |||
2109 | A record of the tests I've done. | ||
2110 | |||
2111 | First, some data sets: | ||
2112 | @itemize @bullet | ||
2113 | @item B: a directory containing 6001 files, one for every length in the | ||
2114 | range 0 to 6000 bytes. The files contain random lowercase | ||
2115 | letters. 18.7 megabytes. | ||
2116 | @item H: my home directory tree. Documents, source code, mail files, | ||
2117 | compressed data. H contains B, and also a directory of | ||
2118 | files designed as boundary cases for the sorting; mostly very | ||
2119 | repetitive, nasty files. 565 megabytes. | ||
2120 | @item A: directory tree holding various applications built from source: | ||
2121 | @code{egcs}, @code{gcc-2.8.1}, KDE, GTK, Octave, etc. | ||
2122 | 2200 megabytes. | ||
2123 | @end itemize | ||
2124 | The tests conducted are as follows. Each test means compressing | ||
2125 | (a copy of) each file in the data set, decompressing it and | ||
2126 | comparing it against the original. | ||
2127 | |||
2128 | First, a bunch of tests with block sizes and internal buffer | ||
2129 | sizes set very small, | ||
2130 | to detect any problems with the | ||
2131 | blocking and buffering mechanisms. | ||
2132 | This required modifying the source code so as to try to | ||
2133 | break it. | ||
2134 | @enumerate | ||
2135 | @item Data set H, with | ||
2136 | buffer size of 1 byte, and block size of 23 bytes. | ||
2137 | @item Data set B, buffer sizes 1 byte, block size 1 byte. | ||
2138 | @item As (2) but small-mode decompression. | ||
2139 | @item As (2) with block size 2 bytes. | ||
2140 | @item As (2) with block size 3 bytes. | ||
2141 | @item As (2) with block size 4 bytes. | ||
2142 | @item As (2) with block size 5 bytes. | ||
2143 | @item As (2) with block size 6 bytes and small-mode decompression. | ||
2144 | @item H with buffer size of 1 byte, but normal block | ||
2145 | size (up to 900000 bytes). | ||
2146 | @end enumerate | ||
2147 | Then some tests with unmodified source code. | ||
2148 | @enumerate | ||
2149 | @item H, all settings normal. | ||
2150 | @item As (1), with small-mode decompress. | ||
2151 | @item H, compress with flag @code{-1}. | ||
2152 | @item H, compress with flag @code{-s}, decompress with flag @code{-s}. | ||
2153 | @item Forwards compatibility: H, @code{bzip2-0.1pl2} compressing, | ||
2154 | @code{bzip2-0.9.5} decompressing, all settings normal. | ||
2155 | @item Backwards compatibility: H, @code{bzip2-0.9.5} compressing, | ||
2156 | @code{bzip2-0.1pl2} decompressing, all settings normal. | ||
2157 | @item Bigger tests: A, all settings normal. | ||
2158 | @item As (7), using the fallback (Sadakane-like) sorting algorithm. | ||
2159 | @item As (8), compress with flag @code{-1}, decompress with flag | ||
2160 | @code{-s}. | ||
2161 | @item H, using the fallback sorting algorithm. | ||
2162 | @item Forwards compatibility: A, @code{bzip2-0.1pl2} compressing, | ||
2163 | @code{bzip2-0.9.5} decompressing, all settings normal. | ||
2164 | @item Backwards compatibility: A, @code{bzip2-0.9.5} compressing, | ||
2165 | @code{bzip2-0.1pl2} decompressing, all settings normal. | ||
2166 | @item Misc test: about 400 megabytes of @code{.tar} files with | ||
2167 | @code{bzip2} compiled with Checker (a memory access error | ||
2168 | detector, like Purify). | ||
2169 | @item Misc tests to make sure it builds and runs ok on non-Linux/x86 | ||
2170 | platforms. | ||
2171 | @end enumerate | ||
2172 | These tests were conducted on a 225 MHz IDT WinChip machine, running | ||
2173 | Linux 2.0.36. They represent nearly a week of continuous computation. | ||
2174 | All tests completed successfully. | ||
2175 | |||
2176 | |||
2177 | @section Further reading | ||
2178 | @code{bzip2} is not research work, in the sense that it doesn't present | ||
2179 | any new ideas. Rather, it's an engineering exercise based on existing | ||
2180 | ideas. | ||
2181 | |||
2182 | Four documents describe essentially all the ideas behind @code{bzip2}: | ||
2183 | @example | ||
2184 | Michael Burrows and D. J. Wheeler: | ||
2185 | "A block-sorting lossless data compression algorithm" | ||
2186 | 10th May 1994. | ||
2187 | Digital SRC Research Report 124. | ||
2188 | ftp://ftp.digital.com/pub/DEC/SRC/research-reports/SRC-124.ps.gz | ||
2189 | If you have trouble finding it, try searching at the | ||
2190 | New Zealand Digital Library, http://www.nzdl.org. | ||
2191 | |||
2192 | Daniel S. Hirschberg and Debra A. LeLewer | ||
2193 | "Efficient Decoding of Prefix Codes" | ||
2194 | Communications of the ACM, April 1990, Vol 33, Number 4. | ||
2195 | You might be able to get an electronic copy of this | ||
2196 | from the ACM Digital Library. | ||
2197 | |||
2198 | David J. Wheeler | ||
2199 | Program bred3.c and accompanying document bred3.ps. | ||
2200 | This contains the idea behind the multi-table Huffman | ||
2201 | coding scheme. | ||
2202 | ftp://ftp.cl.cam.ac.uk/users/djw3/ | ||
2203 | |||
2204 | Jon L. Bentley and Robert Sedgewick | ||
2205 | "Fast Algorithms for Sorting and Searching Strings" | ||
2206 | Available from Sedgewick's web page, | ||
2207 | www.cs.princeton.edu/~rs | ||
2208 | @end example | ||
2209 | The following paper gives valuable additional insights into the | ||
2210 | algorithm, but is not immediately the basis of any code | ||
2211 | used in bzip2. | ||
2212 | @example | ||
2213 | Peter Fenwick: | ||
2214 | Block Sorting Text Compression | ||
2215 | Proceedings of the 19th Australasian Computer Science Conference, | ||
2216 | Melbourne, Australia. Jan 31 - Feb 2, 1996. | ||
2217 | ftp://ftp.cs.auckland.ac.nz/pub/peter-f/ACSC96paper.ps | ||
2218 | @end example | ||
2219 | Kunihiko Sadakane's sorting algorithm, mentioned above, | ||
2220 | is available from: | ||
2221 | @example | ||
2222 | http://naomi.is.s.u-tokyo.ac.jp/~sada/papers/Sada98b.ps.gz | ||
2223 | @end example | ||
2224 | The Manber-Myers suffix array construction | ||
2225 | algorithm is described in a paper | ||
2226 | available from: | ||
2227 | @example | ||
2228 | http://www.cs.arizona.edu/people/gene/PAPERS/suffix.ps | ||
2229 | @end example | ||
2230 | Finally, the following paper documents some recent investigations | ||
2231 | I made into the performance of sorting algorithms: | ||
2232 | @example | ||
2233 | Julian Seward: | ||
2234 | On the Performance of BWT Sorting Algorithms | ||
2235 | Proceedings of the IEEE Data Compression Conference 2000 | ||
2236 | Snowbird, Utah. 28-30 March 2000. | ||
2237 | @end example | ||
2238 | |||
2239 | |||
2240 | @contents | ||
2241 | |||
2242 | @bye | ||
2243 | |||
diff --git a/manual.xml b/manual.xml new file mode 100644 index 0000000..1ab5bd7 --- /dev/null +++ b/manual.xml | |||
@@ -0,0 +1,2966 @@ | |||
1 | <?xml version="1.0"?> <!-- -*- sgml -*- --> | ||
2 | <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" | ||
3 | "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd"[ | ||
4 | |||
5 | <!-- various strings, dates etc. common to all docs --> | ||
6 | <!ENTITY % common-ents SYSTEM "entities.xml"> %common-ents; | ||
7 | ]> | ||
8 | |||
9 | <book lang="en" id="userman" xreflabel="bzip2 Manual"> | ||
10 | |||
11 | <bookinfo> | ||
12 | <title>bzip2 and libbzip2, version 1.0.3</title> | ||
13 | <subtitle>A program and library for data compression</subtitle> | ||
14 | <copyright> | ||
15 | <year>&bz-lifespan;</year> | ||
16 | <holder>Julian Seward</holder> | ||
17 | </copyright> | ||
18 | <releaseinfo>Version &bz-version; of &bz-date;</releaseinfo> | ||
19 | |||
20 | <authorgroup> | ||
21 | <author> | ||
22 | <firstname>Julian</firstname> | ||
23 | <surname>Seward</surname> | ||
24 | <affiliation> | ||
25 | <orgname>&bz-url;</orgname> | ||
26 | </affiliation> | ||
27 | </author> | ||
28 | </authorgroup> | ||
29 | |||
30 | <legalnotice> | ||
31 | |||
32 | <para>This program, <computeroutput>bzip2</computeroutput>, the | ||
33 | associated library <computeroutput>libbzip2</computeroutput>, and | ||
34 | all documentation, are copyright © &bz-lifespan; Julian Seward. | ||
35 | All rights reserved.</para> | ||
36 | |||
37 | <para>Redistribution and use in source and binary forms, with | ||
38 | or without modification, are permitted provided that the | ||
39 | following conditions are met:</para> | ||
40 | |||
41 | <itemizedlist mark='bullet'> | ||
42 | |||
43 | <listitem><para>Redistributions of source code must retain the | ||
44 | above copyright notice, this list of conditions and the | ||
45 | following disclaimer.</para></listitem> | ||
46 | |||
47 | <listitem><para>The origin of this software must not be | ||
48 | misrepresented; you must not claim that you wrote the original | ||
49 | software. If you use this software in a product, an | ||
50 | acknowledgment in the product documentation would be | ||
51 | appreciated but is not required.</para></listitem> | ||
52 | |||
53 | <listitem><para>Altered source versions must be plainly marked | ||
54 | as such, and must not be misrepresented as being the original | ||
55 | software.</para></listitem> | ||
56 | |||
57 | <listitem><para>The name of the author may not be used to | ||
58 | endorse or promote products derived from this software without | ||
59 | specific prior written permission.</para></listitem> | ||
60 | |||
61 | </itemizedlist> | ||
62 | |||
63 | <para>THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY | ||
64 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, | ||
65 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
66 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | ||
67 | AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
68 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | ||
69 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
70 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | ||
71 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
72 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING | ||
73 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
74 | THE POSSIBILITY OF SUCH DAMAGE.</para> | ||
75 | |||
76 | <para>PATENTS: To the best of my knowledge, | ||
77 | <computeroutput>bzip2</computeroutput> and | ||
78 | <computeroutput>libbzip2</computeroutput> do not use any patented | ||
79 | algorithms. However, I do not have the resources to carry | ||
80 | out a patent search. Therefore I cannot give any guarantee of | ||
81 | the above statement. | ||
82 | </para> | ||
83 | |||
84 | </legalnotice> | ||
85 | |||
86 | </bookinfo> | ||
87 | |||
88 | |||
89 | |||
90 | <chapter id="intro" xreflabel="Introduction"> | ||
91 | <title>Introduction</title> | ||
92 | |||
93 | <para><computeroutput>bzip2</computeroutput> compresses files | ||
94 | using the Burrows-Wheeler block-sorting text compression | ||
95 | algorithm, and Huffman coding. Compression is generally | ||
96 | considerably better than that achieved by more conventional | ||
97 | LZ77/LZ78-based compressors, and approaches the performance of | ||
98 | the PPM family of statistical compressors.</para> | ||
99 | |||
100 | <para><computeroutput>bzip2</computeroutput> is built on top of | ||
101 | <computeroutput>libbzip2</computeroutput>, a flexible library for | ||
102 | handling compressed data in the | ||
103 | <computeroutput>bzip2</computeroutput> format. This manual | ||
104 | describes both how to use the program and how to work with the | ||
105 | library interface. Most of the manual is devoted to this | ||
106 | library, not the program, which is good news if your interest is | ||
107 | only in the program.</para> | ||
108 | |||
109 | <itemizedlist mark='bullet'> | ||
110 | |||
111 | <listitem><para><xref linkend="using"/> describes how to use | ||
112 | <computeroutput>bzip2</computeroutput>; this is the only part | ||
113 | you need to read if you just want to know how to operate the | ||
114 | program.</para></listitem> | ||
115 | |||
116 | <listitem><para><xref linkend="libprog"/> describes the | ||
117 | programming interfaces in detail, and</para></listitem> | ||
118 | |||
119 | <listitem><para><xref linkend="misc"/> records some | ||
120 | miscellaneous notes which I thought ought to be recorded | ||
121 | somewhere.</para></listitem> | ||
122 | |||
123 | </itemizedlist> | ||
124 | |||
125 | </chapter> | ||
126 | |||
127 | |||
128 | <chapter id="using" xreflabel="How to use bzip2"> | ||
129 | <title>How to use bzip2</title> | ||
130 | |||
131 | <para>This chapter contains a copy of the | ||
132 | <computeroutput>bzip2</computeroutput> man page, and nothing | ||
133 | else.</para> | ||
134 | |||
135 | <sect1 id="name" xreflabel="NAME"> | ||
136 | <title>NAME</title> | ||
137 | |||
138 | <itemizedlist mark='bullet'> | ||
139 | |||
140 | <listitem><para><computeroutput>bzip2</computeroutput>, | ||
141 | <computeroutput>bunzip2</computeroutput> - a block-sorting file | ||
142 | compressor, v1.0.3</para></listitem> | ||
143 | |||
144 | <listitem><para><computeroutput>bzcat</computeroutput> - | ||
145 | decompresses files to stdout</para></listitem> | ||
146 | |||
147 | <listitem><para><computeroutput>bzip2recover</computeroutput> - | ||
148 | recovers data from damaged bzip2 files</para></listitem> | ||
149 | |||
150 | </itemizedlist> | ||
151 | |||
152 | </sect1> | ||
153 | |||
154 | |||
155 | <sect1 id="synopsis" xreflabel="SYNOPSIS"> | ||
156 | <title>SYNOPSIS</title> | ||
157 | |||
158 | <itemizedlist mark='bullet'> | ||
159 | |||
160 | <listitem><para><computeroutput>bzip2</computeroutput> [ | ||
161 | -cdfkqstvzVL123456789 ] [ filenames ... ]</para></listitem> | ||
162 | |||
163 | <listitem><para><computeroutput>bunzip2</computeroutput> [ | ||
164 | -fkvsVL ] [ filenames ... ]</para></listitem> | ||
165 | |||
166 | <listitem><para><computeroutput>bzcat</computeroutput> [ -s ] [ | ||
167 | filenames ... ]</para></listitem> | ||
168 | |||
169 | <listitem><para><computeroutput>bzip2recover</computeroutput> | ||
170 | filename</para></listitem> | ||
171 | |||
172 | </itemizedlist> | ||
173 | |||
174 | </sect1> | ||
175 | |||
176 | |||
177 | <sect1 id="description" xreflabel="DESCRIPTION"> | ||
178 | <title>DESCRIPTION</title> | ||
179 | |||
180 | <para><computeroutput>bzip2</computeroutput> compresses files | ||
181 | using the Burrows-Wheeler block sorting text compression | ||
182 | algorithm, and Huffman coding. Compression is generally | ||
183 | considerably better than that achieved by more conventional | ||
184 | LZ77/LZ78-based compressors, and approaches the performance of | ||
185 | the PPM family of statistical compressors.</para> | ||
186 | |||
187 | <para>The command-line options are deliberately very similar to | ||
188 | those of GNU <computeroutput>gzip</computeroutput>, but they are | ||
189 | not identical.</para> | ||
190 | |||
191 | <para><computeroutput>bzip2</computeroutput> expects a list of | ||
192 | file names to accompany the command-line flags. Each file is | ||
193 | replaced by a compressed version of itself, with the name | ||
194 | <computeroutput>original_name.bz2</computeroutput>. Each | ||
195 | compressed file has the same modification date, permissions, and, | ||
196 | when possible, ownership as the corresponding original, so that | ||
197 | these properties can be correctly restored at decompression time. | ||
198 | File name handling is naive in the sense that there is no | ||
199 | mechanism for preserving original file names, permissions, | ||
200 | ownerships or dates in filesystems which lack these concepts, or | ||
201 | have serious file name length restrictions, such as | ||
202 | MS-DOS.</para> | ||
203 | |||
204 | <para><computeroutput>bzip2</computeroutput> and | ||
205 | <computeroutput>bunzip2</computeroutput> will by default not | ||
206 | overwrite existing files. If you want this to happen, specify | ||
207 | the <computeroutput>-f</computeroutput> flag.</para> | ||
208 | |||
209 | <para>If no file names are specified, | ||
210 | <computeroutput>bzip2</computeroutput> compresses from standard | ||
211 | input to standard output. In this case, | ||
212 | <computeroutput>bzip2</computeroutput> will decline to write | ||
213 | compressed output to a terminal, as this would be entirely | ||
214 | incomprehensible and therefore pointless.</para> | ||
215 | |||
216 | <para><computeroutput>bunzip2</computeroutput> (or | ||
217 | <computeroutput>bzip2 -d</computeroutput>) decompresses all | ||
218 | specified files. Files which were not created by | ||
219 | <computeroutput>bzip2</computeroutput> will be detected and | ||
220 | ignored, and a warning issued. | ||
221 | <computeroutput>bzip2</computeroutput> attempts to guess the | ||
222 | filename for the decompressed file from that of the compressed | ||
223 | file as follows:</para> | ||
224 | |||
225 | <itemizedlist mark='bullet'> | ||
226 | |||
227 | <listitem><para><computeroutput>filename.bz2 </computeroutput> | ||
228 | becomes | ||
229 | <computeroutput>filename</computeroutput></para></listitem> | ||
230 | |||
231 | <listitem><para><computeroutput>filename.bz </computeroutput> | ||
232 | becomes | ||
233 | <computeroutput>filename</computeroutput></para></listitem> | ||
234 | |||
235 | <listitem><para><computeroutput>filename.tbz2</computeroutput> | ||
236 | becomes | ||
237 | <computeroutput>filename.tar</computeroutput></para></listitem> | ||
238 | |||
239 | <listitem><para><computeroutput>filename.tbz </computeroutput> | ||
240 | becomes | ||
241 | <computeroutput>filename.tar</computeroutput></para></listitem> | ||
242 | |||
243 | <listitem><para><computeroutput>anyothername </computeroutput> | ||
244 | becomes | ||
245 | <computeroutput>anyothername.out</computeroutput></para></listitem> | ||
246 | |||
247 | </itemizedlist> | ||
248 | |||
249 | <para>If the file does not end in one of the recognised endings, | ||
250 | <computeroutput>.bz2</computeroutput>, | ||
251 | <computeroutput>.bz</computeroutput>, | ||
252 | <computeroutput>.tbz2</computeroutput> or | ||
253 | <computeroutput>.tbz</computeroutput>, | ||
254 | <computeroutput>bzip2</computeroutput> complains that it cannot | ||
255 | guess the name of the original file, and uses the original name | ||
256 | with <computeroutput>.out</computeroutput> appended.</para> | ||
257 | |||
258 | <para>As with compression, supplying no filenames causes | ||
259 | decompression from standard input to standard output.</para> | ||
260 | |||
261 | <para><computeroutput>bunzip2</computeroutput> will correctly | ||
262 | decompress a file which is the concatenation of two or more | ||
263 | compressed files. The result is the concatenation of the | ||
264 | corresponding uncompressed files. Integrity testing | ||
265 | (<computeroutput>-t</computeroutput>) of concatenated compressed | ||
266 | files is also supported.</para> | ||
267 | |||
268 | <para>You can also compress or decompress files to the standard | ||
269 | output by giving the <computeroutput>-c</computeroutput> flag. | ||
270 | Multiple files may be compressed and decompressed like this. The | ||
271 | resulting outputs are fed sequentially to stdout. Compression of | ||
272 | multiple files in this manner generates a stream containing | ||
273 | multiple compressed file representations. Such a stream can be | ||
274 | decompressed correctly only by | ||
275 | <computeroutput>bzip2</computeroutput> version 0.9.0 or later. | ||
276 | Earlier versions of <computeroutput>bzip2</computeroutput> will | ||
277 | stop after decompressing the first file in the stream.</para> | ||
278 | |||
279 | <para><computeroutput>bzcat</computeroutput> (or | ||
280 | <computeroutput>bzip2 -dc</computeroutput>) decompresses all | ||
281 | specified files to the standard output.</para> | ||
282 | |||
283 | <para><computeroutput>bzip2</computeroutput> will read arguments | ||
284 | from the environment variables | ||
285 | <computeroutput>BZIP2</computeroutput> and | ||
286 | <computeroutput>BZIP</computeroutput>, in that order, and will | ||
287 | process them before any arguments read from the command line. | ||
288 | This gives a convenient way to supply default arguments.</para> | ||
289 | |||
290 | <para>Compression is always performed, even if the compressed | ||
291 | file is slightly larger than the original. Files of less than | ||
292 | about one hundred bytes tend to get larger, since the compression | ||
293 | mechanism has a constant overhead in the region of 50 bytes. | ||
294 | Random data (including the output of most file compressors) is | ||
295 | coded at about 8.05 bits per byte, giving an expansion of around | ||
296 | 0.5%.</para> | ||
297 | |||
298 | <para>As a self-check for your protection, | ||
299 | <computeroutput>bzip2</computeroutput> uses 32-bit CRCs to make | ||
300 | sure that the decompressed version of a file is identical to the | ||
301 | original. This guards against corruption of the compressed data, | ||
302 | and against undetected bugs in | ||
303 | <computeroutput>bzip2</computeroutput> (hopefully very unlikely). | ||
304 | The chance of data corruption going undetected is microscopic, | ||
305 | about one chance in four billion for each file processed. Be | ||
306 | aware, though, that the check occurs upon decompression, so it | ||
307 | can only tell you that something is wrong. It can't help you | ||
308 | recover the original uncompressed data. You can use | ||
309 | <computeroutput>bzip2recover</computeroutput> to try to recover | ||
310 | data from damaged files.</para> | ||
311 | |||
312 | <para>Return values: 0 for a normal exit, 1 for environmental | ||
313 | problems (file not found, invalid flags, I/O errors, etc.), 2 | ||
314 | to indicate a corrupt compressed file, 3 for an internal | ||
315 | consistency error (eg, bug) which caused | ||
316 | <computeroutput>bzip2</computeroutput> to panic.</para> | ||
317 | |||
318 | </sect1> | ||
319 | |||
320 | |||
321 | <sect1 id="options" xreflabel="OPTIONS"> | ||
322 | <title>OPTIONS</title> | ||
323 | |||
324 | <variablelist> | ||
325 | |||
326 | <varlistentry> | ||
327 | <term><computeroutput>-c --stdout</computeroutput></term> | ||
328 | <listitem><para>Compress or decompress to standard | ||
329 | output.</para></listitem> | ||
330 | </varlistentry> | ||
331 | |||
332 | <varlistentry> | ||
333 | <term><computeroutput>-d --decompress</computeroutput></term> | ||
334 | <listitem><para>Force decompression. | ||
335 | <computeroutput>bzip2</computeroutput>, | ||
336 | <computeroutput>bunzip2</computeroutput> and | ||
337 | <computeroutput>bzcat</computeroutput> are really the same | ||
338 | program, and the decision about what actions to take is done on | ||
339 | the basis of which name is used. This flag overrides that | ||
340 | mechanism, and forces bzip2 to decompress.</para></listitem> | ||
341 | </varlistentry> | ||
342 | |||
343 | <varlistentry> | ||
344 | <term><computeroutput>-z --compress</computeroutput></term> | ||
345 | <listitem><para>The complement to | ||
346 | <computeroutput>-d</computeroutput>: forces compression, | ||
347 | regardless of the invocation name.</para></listitem> | ||
348 | </varlistentry> | ||
349 | |||
350 | <varlistentry> | ||
351 | <term><computeroutput>-t --test</computeroutput></term> | ||
352 | <listitem><para>Check integrity of the specified file(s), but | ||
353 | don't decompress them. This really performs a trial | ||
354 | decompression and throws away the result.</para></listitem> | ||
355 | </varlistentry> | ||
356 | |||
357 | <varlistentry> | ||
358 | <term><computeroutput>-f --force</computeroutput></term> | ||
359 | <listitem><para>Force overwrite of output files. Normally, | ||
360 | <computeroutput>bzip2</computeroutput> will not overwrite | ||
361 | existing output files. Also forces | ||
362 | <computeroutput>bzip2</computeroutput> to break hard links to | ||
363 | files, which it otherwise wouldn't do.</para> | ||
364 | <para><computeroutput>bzip2</computeroutput> normally declines | ||
365 | to decompress files which don't have the correct magic header | ||
366 | bytes. If forced (<computeroutput>-f</computeroutput>), | ||
367 | however, it will pass such files through unmodified. This is | ||
368 | how GNU <computeroutput>gzip</computeroutput> behaves.</para> | ||
369 | </listitem> | ||
370 | </varlistentry> | ||
371 | |||
372 | <varlistentry> | ||
373 | <term><computeroutput>-k --keep</computeroutput></term> | ||
374 | <listitem><para>Keep (don't delete) input files during | ||
375 | compression or decompression.</para></listitem> | ||
376 | </varlistentry> | ||
377 | |||
378 | <varlistentry> | ||
379 | <term><computeroutput>-s --small</computeroutput></term> | ||
380 | <listitem><para>Reduce memory usage, for compression, | ||
381 | decompression and testing. Files are decompressed and tested | ||
382 | using a modified algorithm which only requires 2.5 bytes per | ||
383 | block byte. This means any file can be decompressed in 2300k | ||
384 | of memory, albeit at about half the normal speed.</para> | ||
385 | <para>During compression, <computeroutput>-s</computeroutput> | ||
386 | selects a block size of 200k, which limits memory use to around | ||
387 | the same figure, at the expense of your compression ratio. In | ||
388 | short, if your machine is low on memory (8 megabytes or less), | ||
389 | use <computeroutput>-s</computeroutput> for everything. See | ||
390 | <xref linkend="memory-management"/> below.</para></listitem> | ||
391 | </varlistentry> | ||
392 | |||
393 | <varlistentry> | ||
394 | <term><computeroutput>-q --quiet</computeroutput></term> | ||
395 | <listitem><para>Suppress non-essential warning messages. | ||
396 | Messages pertaining to I/O errors and other critical events | ||
397 | will not be suppressed.</para></listitem> | ||
398 | </varlistentry> | ||
399 | |||
400 | <varlistentry> | ||
401 | <term><computeroutput>-v --verbose</computeroutput></term> | ||
402 | <listitem><para>Verbose mode -- show the compression ratio for | ||
403 | each file processed. Further | ||
404 | <computeroutput>-v</computeroutput>'s increase the verbosity | ||
405 | level, spewing out lots of information which is primarily of | ||
406 | interest for diagnostic purposes.</para></listitem> | ||
407 | </varlistentry> | ||
408 | |||
409 | <varlistentry> | ||
410 | <term><computeroutput>-L --license -V --version</computeroutput></term> | ||
411 | <listitem><para>Display the software version, license terms and | ||
412 | conditions.</para></listitem> | ||
413 | </varlistentry> | ||
414 | |||
415 | <varlistentry> | ||
416 | <term><computeroutput>-1</computeroutput> (or | ||
417 | <computeroutput>--fast</computeroutput>) to | ||
418 | <computeroutput>-9</computeroutput> (or | ||
419 | <computeroutput>--best</computeroutput>)</term> | ||
420 | <listitem><para>Set the block size to 100 k, 200 k ... 900 k | ||
421 | when compressing. Has no effect when decompressing. See <xref | ||
422 | linkend="memory-management" /> below. The | ||
423 | <computeroutput>--fast</computeroutput> and | ||
424 | <computeroutput>--best</computeroutput> aliases are primarily | ||
425 | for GNU <computeroutput>gzip</computeroutput> compatibility. | ||
426 | In particular, <computeroutput>--fast</computeroutput> doesn't | ||
427 | make things significantly faster. And | ||
428 | <computeroutput>--best</computeroutput> merely selects the | ||
429 | default behaviour.</para></listitem> | ||
430 | </varlistentry> | ||
431 | |||
432 | <varlistentry> | ||
433 | <term><computeroutput>--</computeroutput></term> | ||
434 | <listitem><para>Treats all subsequent arguments as file names, | ||
435 | even if they start with a dash. This is so you can handle | ||
436 | files with names beginning with a dash, for example: | ||
437 | <computeroutput>bzip2 -- | ||
438 | -myfilename</computeroutput>.</para></listitem> | ||
439 | </varlistentry> | ||
440 | |||
441 | <varlistentry> | ||
442 | <term><computeroutput>--repetitive-fast</computeroutput></term> | ||
443 | <term><computeroutput>--repetitive-best</computeroutput></term> | ||
444 | <listitem><para>These flags are redundant in versions 0.9.5 and | ||
445 | above. They provided some coarse control over the behaviour of | ||
446 | the sorting algorithm in earlier versions, which was sometimes | ||
447 | useful. 0.9.5 and above have an improved algorithm which | ||
448 | renders these flags irrelevant.</para></listitem> | ||
449 | </varlistentry> | ||
450 | |||
451 | </variablelist> | ||
452 | |||
453 | </sect1> | ||
454 | |||
455 | |||
456 | <sect1 id="memory-management" xreflabel="MEMORY MANAGEMENT"> | ||
457 | <title>MEMORY MANAGEMENT</title> | ||
458 | |||
459 | <para><computeroutput>bzip2</computeroutput> compresses large | ||
460 | files in blocks. The block size affects both the compression | ||
461 | ratio achieved, and the amount of memory needed for compression | ||
462 | and decompression. The flags <computeroutput>-1</computeroutput> | ||
463 | through <computeroutput>-9</computeroutput> specify the block | ||
464 | size to be 100,000 bytes through 900,000 bytes (the default) | ||
465 | respectively. At decompression time, the block size used for | ||
466 | compression is read from the header of the compressed file, and | ||
467 | <computeroutput>bunzip2</computeroutput> then allocates itself | ||
468 | just enough memory to decompress the file. Since block sizes are | ||
469 | stored in compressed files, it follows that the flags | ||
470 | <computeroutput>-1</computeroutput> to | ||
471 | <computeroutput>-9</computeroutput> are irrelevant to and so | ||
472 | ignored during decompression.</para> | ||
473 | |||
474 | <para>Compression and decompression requirements, in bytes, can be | ||
475 | estimated as:</para> | ||
476 | <programlisting> | ||
477 | Compression: 400k + ( 8 x block size ) | ||
478 | |||
479 | Decompression: 100k + ( 4 x block size ), or | ||
480 | 100k + ( 2.5 x block size ) | ||
481 | </programlisting> | ||
482 | |||
483 | <para>Larger block sizes give rapidly diminishing marginal | ||
484 | returns. Most of the compression comes from the first two or | ||
485 | three hundred k of block size, a fact worth bearing in mind when | ||
486 | using <computeroutput>bzip2</computeroutput> on small machines. | ||
487 | It is also important to appreciate that the decompression memory | ||
488 | requirement is set at compression time by the choice of block | ||
489 | size.</para> | ||
490 | |||
491 | <para>For files compressed with the default 900k block size, | ||
492 | <computeroutput>bunzip2</computeroutput> will require about 3700 | ||
493 | kbytes to decompress. To support decompression of any file on a | ||
494 | 4 megabyte machine, <computeroutput>bunzip2</computeroutput> has | ||
495 | an option to decompress using approximately half this amount of | ||
496 | memory, about 2300 kbytes. Decompression speed is also halved, | ||
497 | so you should use this option only where necessary. The relevant | ||
498 | flag is <computeroutput>-s</computeroutput>.</para> | ||
499 | |||
500 | <para>In general, try and use the largest block size memory | ||
501 | constraints allow, since that maximises the compression achieved. | ||
502 | Compression and decompression speed are virtually unaffected by | ||
503 | block size.</para> | ||
504 | |||
505 | <para>Another significant point applies to files which fit in a | ||
506 | single block -- that means most files you'd encounter using a | ||
507 | large block size. The amount of real memory touched is | ||
508 | proportional to the size of the file, since the file is smaller | ||
509 | than a block. For example, compressing a file 20,000 bytes long | ||
510 | with the flag <computeroutput>-9</computeroutput> will cause the | ||
511 | compressor to allocate around 7600k of memory, but only touch | ||
512 | 400k + 20000 * 8 = 560 kbytes of it. Similarly, the decompressor | ||
513 | will allocate 3700k but only touch 100k + 20000 * 4 = 180 | ||
514 | kbytes.</para> | ||
515 | |||
516 | <para>Here is a table which summarises the maximum memory usage | ||
517 | for different block sizes. Also recorded is the total compressed | ||
518 | size for 14 files of the Calgary Text Compression Corpus | ||
519 | totalling 3,141,622 bytes. This column gives some feel for how | ||
520 | compression varies with block size. These figures tend to | ||
521 | understate the advantage of larger block sizes for larger files, | ||
522 | since the Corpus is dominated by smaller files.</para> | ||
523 | |||
524 | <programlisting> | ||
525 | Compress Decompress Decompress Corpus | ||
526 | Flag usage usage -s usage Size | ||
527 | |||
528 | -1 1200k 500k 350k 914704 | ||
529 | -2 2000k 900k 600k 877703 | ||
530 | -3 2800k 1300k 850k 860338 | ||
531 | -4 3600k 1700k 1100k 846899 | ||
532 | -5 4400k 2100k 1350k 845160 | ||
533 | -6 5200k 2500k 1600k 838626 | ||
534 | -7 6000k 2900k 1850k 834096 | ||
535 | -8 6800k 3300k 2100k 828642 | ||
536 | -9 7600k 3700k 2350k 828642 | ||
537 | </programlisting> | ||
538 | |||
539 | </sect1> | ||
540 | |||
541 | |||
542 | <sect1 id="recovering" xreflabel="RECOVERING DATA FROM DAMAGED FILES"> | ||
543 | <title>RECOVERING DATA FROM DAMAGED FILES</title> | ||
544 | |||
545 | <para><computeroutput>bzip2</computeroutput> compresses files in | ||
546 | blocks, usually 900kbytes long. Each block is handled | ||
547 | independently. If a media or transmission error causes a | ||
548 | multi-block <computeroutput>.bz2</computeroutput> file to become | ||
549 | damaged, it may be possible to recover data from the undamaged | ||
550 | blocks in the file.</para> | ||
551 | |||
552 | <para>The compressed representation of each block is delimited by | ||
553 | a 48-bit pattern, which makes it possible to find the block | ||
554 | boundaries with reasonable certainty. Each block also carries | ||
555 | its own 32-bit CRC, so damaged blocks can be distinguished from | ||
556 | undamaged ones.</para> | ||
557 | |||
558 | <para><computeroutput>bzip2recover</computeroutput> is a simple | ||
559 | program whose purpose is to search for blocks in | ||
560 | <computeroutput>.bz2</computeroutput> files, and write each block | ||
561 | out into its own <computeroutput>.bz2</computeroutput> file. You | ||
562 | can then use <computeroutput>bzip2 -t</computeroutput> to test | ||
563 | the integrity of the resulting files, and decompress those which | ||
564 | are undamaged.</para> | ||
565 | |||
566 | <para><computeroutput>bzip2recover</computeroutput> takes a | ||
567 | single argument, the name of the damaged file, and writes a | ||
568 | number of files <computeroutput>rec0001file.bz2</computeroutput>, | ||
569 | <computeroutput>rec0002file.bz2</computeroutput>, etc, containing | ||
570 | the extracted blocks. The output filenames are designed so that | ||
571 | the use of wildcards in subsequent processing -- for example, | ||
572 | <computeroutput>bzip2 -dc rec*file.bz2 > | ||
573 | recovered_data</computeroutput> -- lists the files in the correct | ||
574 | order.</para> | ||
575 | |||
576 | <para><computeroutput>bzip2recover</computeroutput> should be of | ||
577 | most use dealing with large <computeroutput>.bz2</computeroutput> | ||
578 | files, as these will contain many blocks. It is clearly futile | ||
579 | to use it on damaged single-block files, since a damaged block | ||
580 | cannot be recovered. If you wish to minimise any potential data | ||
581 | loss through media or transmission errors, you might consider | ||
582 | compressing with a smaller block size.</para> | ||
583 | |||
584 | </sect1> | ||
585 | |||
586 | |||
587 | <sect1 id="performance" xreflabel="PERFORMANCE NOTES"> | ||
588 | <title>PERFORMANCE NOTES</title> | ||
589 | |||
590 | <para>The sorting phase of compression gathers together similar | ||
591 | strings in the file. Because of this, files containing very long | ||
592 | runs of repeated symbols, like "aabaabaabaab ..." (repeated | ||
593 | several hundred times) may compress more slowly than normal. | ||
594 | Versions 0.9.5 and above fare much better than previous versions | ||
595 | in this respect. The ratio between worst-case and average-case | ||
596 | compression time is in the region of 10:1. For previous | ||
597 | versions, this figure was more like 100:1. You can use the | ||
598 | <computeroutput>-vvvv</computeroutput> option to monitor progress | ||
599 | in great detail, if you want.</para> | ||
600 | |||
601 | <para>Decompression speed is unaffected by these | ||
602 | phenomena.</para> | ||
603 | |||
604 | <para><computeroutput>bzip2</computeroutput> usually allocates | ||
605 | several megabytes of memory to operate in, and then charges all | ||
606 | over it in a fairly random fashion. This means that performance, | ||
607 | both for compressing and decompressing, is largely determined by | ||
608 | the speed at which your machine can service cache misses. | ||
609 | Because of this, small changes to the code to reduce the miss | ||
610 | rate have been observed to give disproportionately large | ||
611 | performance improvements. I imagine | ||
612 | <computeroutput>bzip2</computeroutput> will perform best on | ||
613 | machines with very large caches.</para> | ||
614 | |||
615 | </sect1> | ||
616 | |||
617 | |||
618 | |||
619 | <sect1 id="caveats" xreflabel="CAVEATS"> | ||
620 | <title>CAVEATS</title> | ||
621 | |||
622 | <para>I/O error messages are not as helpful as they could be. | ||
623 | <computeroutput>bzip2</computeroutput> tries hard to detect I/O | ||
624 | errors and exit cleanly, but the details of what the problem is | ||
625 | sometimes seem rather misleading.</para> | ||
626 | |||
627 | <para>This manual page pertains to version &bz-version; of | ||
628 | <computeroutput>bzip2</computeroutput>. Compressed data created | ||
629 | by this version is entirely forwards and backwards compatible | ||
630 | with the previous public releases, versions 0.1pl2, 0.9.0, | ||
631 | 0.9.5, 1.0.0, 1.0.1 and 1.0.2, but with the following exception: 0.9.0 | ||
632 | and above can correctly decompress multiple concatenated | ||
633 | compressed files. 0.1pl2 cannot do this; it will stop after | ||
634 | decompressing just the first file in the stream.</para> | ||
635 | |||
636 | <para><computeroutput>bzip2recover</computeroutput> versions | ||
637 | prior to 1.0.2 used 32-bit integers to represent bit positions in | ||
638 | compressed files, so it could not handle compressed files more | ||
639 | than 512 megabytes long. Versions 1.0.2 and above use 64-bit ints | ||
640 | on some platforms which support them (GNU supported targets, and | ||
641 | Windows). To establish whether or not | ||
642 | <computeroutput>bzip2recover</computeroutput> was built with such | ||
643 | a limitation, run it without arguments. In any event you can | ||
644 | build yourself an unlimited version if you can recompile it with | ||
645 | <computeroutput>MaybeUInt64</computeroutput> set to be an | ||
646 | unsigned 64-bit integer.</para> | ||
647 | |||
648 | </sect1> | ||
649 | |||
650 | |||
651 | |||
652 | <sect1 id="author" xreflabel="AUTHOR"> | ||
653 | <title>AUTHOR</title> | ||
654 | |||
655 | <para>Julian Seward, | ||
656 | <computeroutput>&bz-email;</computeroutput></para> | ||
657 | |||
658 | <para>The ideas embodied in | ||
659 | <computeroutput>bzip2</computeroutput> are due to (at least) the | ||
660 | following people: Michael Burrows and David Wheeler (for the | ||
661 | block sorting transformation), David Wheeler (again, for the | ||
662 | Huffman coder), Peter Fenwick (for the structured coding model in | ||
663 | the original <computeroutput>bzip</computeroutput>, and many | ||
664 | refinements), and Alistair Moffat, Radford Neal and Ian Witten | ||
665 | (for the arithmetic coder in the original | ||
666 | <computeroutput>bzip</computeroutput>). I am much indebted for | ||
667 | their help, support and advice. See the manual in the source | ||
668 | distribution for pointers to sources of documentation. Christian | ||
669 | von Roques encouraged me to look for faster sorting algorithms, | ||
670 | so as to speed up compression. Bela Lubkin encouraged me to | ||
671 | improve the worst-case compression performance. | ||
672 | Donna Robinson XMLised the documentation. | ||
673 | Many people sent | ||
674 | patches, helped with portability problems, lent machines, gave | ||
675 | advice and were generally helpful.</para> | ||
676 | |||
677 | </sect1> | ||
678 | |||
679 | </chapter> | ||
680 | |||
681 | |||
682 | |||
683 | <chapter id="libprog" xreflabel="Programming with libbzip2"> | ||
684 | <title> | ||
685 | Programming with <computeroutput>libbzip2</computeroutput> | ||
686 | </title> | ||
687 | |||
688 | <para>This chapter describes the programming interface to | ||
689 | <computeroutput>libbzip2</computeroutput>.</para> | ||
690 | |||
691 | <para>For general background information, particularly about | ||
692 | memory use and performance aspects, you'd be well advised to read | ||
693 | <xref linkend="using"/> as well.</para> | ||
694 | |||
695 | |||
696 | <sect1 id="top-level" xreflabel="Top-level structure"> | ||
697 | <title>Top-level structure</title> | ||
698 | |||
699 | <para><computeroutput>libbzip2</computeroutput> is a flexible | ||
700 | library for compressing and decompressing data in the | ||
701 | <computeroutput>bzip2</computeroutput> data format. Although | ||
702 | packaged as a single entity, it helps to regard the library as | ||
703 | three separate parts: the low level interface, the high level | ||
704 | interface, and some utility functions.</para> | ||
705 | |||
706 | <para>The structure of | ||
707 | <computeroutput>libbzip2</computeroutput>'s interfaces is similar | ||
708 | to that of Jean-loup Gailly's and Mark Adler's excellent | ||
709 | <computeroutput>zlib</computeroutput> library.</para> | ||
710 | |||
711 | <para>All externally visible symbols have names beginning | ||
712 | <computeroutput>BZ2_</computeroutput>. This is new in version | ||
713 | 1.0. The intention is to minimise pollution of the namespaces of | ||
714 | library clients.</para> | ||
715 | |||
716 | <para>To use any part of the library, you need to | ||
717 | <computeroutput>#include &lt;bzlib.h&gt;</computeroutput> | ||
718 | into your sources.</para> | ||
719 | |||
720 | |||
721 | |||
722 | <sect2 id="ll-summary" xreflabel="Low-level summary"> | ||
723 | <title>Low-level summary</title> | ||
724 | |||
725 | <para>This interface provides services for compressing and | ||
726 | decompressing data in memory. There's no provision for dealing | ||
727 | with files, streams or any other I/O mechanisms, just straight | ||
728 | memory-to-memory work. In fact, this part of the library can be | ||
729 | compiled without inclusion of | ||
730 | <computeroutput>stdio.h</computeroutput>, which may be helpful | ||
731 | for embedded applications.</para> | ||
732 | |||
733 | <para>The low-level part of the library has no global variables | ||
734 | and is therefore thread-safe.</para> | ||
735 | |||
736 | <para>Six routines make up the low level interface: | ||
737 | <computeroutput>BZ2_bzCompressInit</computeroutput>, | ||
738 | <computeroutput>BZ2_bzCompress</computeroutput>, and | ||
739 | <computeroutput>BZ2_bzCompressEnd</computeroutput> for | ||
740 | compression, and a corresponding trio | ||
741 | <computeroutput>BZ2_bzDecompressInit</computeroutput>, | ||
742 | <computeroutput>BZ2_bzDecompress</computeroutput> and | ||
743 | <computeroutput>BZ2_bzDecompressEnd</computeroutput> for | ||
744 | decompression. The <computeroutput>*Init</computeroutput> | ||
745 | functions allocate memory for compression/decompression and do | ||
746 | other initialisations, whilst the | ||
747 | <computeroutput>*End</computeroutput> functions close down | ||
748 | operations and release memory.</para> | ||
749 | |||
750 | <para>The real work is done by | ||
751 | <computeroutput>BZ2_bzCompress</computeroutput> and | ||
752 | <computeroutput>BZ2_bzDecompress</computeroutput>. These | ||
753 | compress and decompress data from a user-supplied input buffer to | ||
754 | a user-supplied output buffer. These buffers can be any size; | ||
755 | arbitrary quantities of data are handled by making repeated calls | ||
756 | to these functions. This is a flexible mechanism allowing a | ||
757 | consumer-pull style of activity, or producer-push, or a mixture | ||
758 | of both.</para> | ||
759 | |||
760 | </sect2> | ||
761 | |||
762 | |||
763 | <sect2 id="hl-summary" xreflabel="High-level summary"> | ||
764 | <title>High-level summary</title> | ||
765 | |||
766 | <para>This interface provides some handy wrappers around the | ||
767 | low-level interface to facilitate reading and writing | ||
768 | <computeroutput>bzip2</computeroutput> format files | ||
769 | (<computeroutput>.bz2</computeroutput> files). The routines | ||
770 | provide hooks to facilitate reading files in which the | ||
771 | <computeroutput>bzip2</computeroutput> data stream is embedded | ||
772 | within some larger-scale file structure, or where there are | ||
773 | multiple <computeroutput>bzip2</computeroutput> data streams | ||
774 | concatenated end-to-end.</para> | ||
775 | |||
776 | <para>For reading files, | ||
777 | <computeroutput>BZ2_bzReadOpen</computeroutput>, | ||
778 | <computeroutput>BZ2_bzRead</computeroutput>, | ||
779 | <computeroutput>BZ2_bzReadClose</computeroutput> and | ||
780 | <computeroutput>BZ2_bzReadGetUnused</computeroutput> are | ||
781 | supplied. For writing files, | ||
782 | <computeroutput>BZ2_bzWriteOpen</computeroutput>, | ||
783 | <computeroutput>BZ2_bzWrite</computeroutput> and | ||
784 | <computeroutput>BZ2_bzWriteFinish</computeroutput> are | ||
785 | available.</para> | ||
786 | |||
787 | <para>As with the low-level library, no global variables are used | ||
788 | so the library is per se thread-safe. However, if I/O errors | ||
789 | occur whilst reading or writing the underlying compressed files, | ||
790 | you may have to consult <computeroutput>errno</computeroutput> to | ||
791 | determine the cause of the error. In that case, you'd need a C | ||
792 | library which correctly supports | ||
793 | <computeroutput>errno</computeroutput> in a multithreaded | ||
794 | environment.</para> | ||
795 | |||
796 | <para>To make the library a little simpler and more portable, | ||
797 | <computeroutput>BZ2_bzReadOpen</computeroutput> and | ||
798 | <computeroutput>BZ2_bzWriteOpen</computeroutput> require you to | ||
799 | pass them file handles (<computeroutput>FILE*</computeroutput>s) | ||
800 | which have previously been opened for reading or writing | ||
801 | respectively. That avoids portability problems associated with | ||
802 | file operations and file attributes, whilst not being much of an | ||
803 | imposition on the programmer.</para> | ||
804 | |||
805 | </sect2> | ||
806 | |||
807 | |||
808 | <sect2 id="util-fns-summary" xreflabel="Utility functions summary"> | ||
809 | <title>Utility functions summary</title> | ||
810 | |||
811 | <para>For very simple needs, | ||
812 | <computeroutput>BZ2_bzBuffToBuffCompress</computeroutput> and | ||
813 | <computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput> are | ||
814 | provided. These compress data in memory from one buffer to | ||
815 | another buffer in a single function call. You should assess | ||
816 | whether these functions fulfill your memory-to-memory | ||
817 | compression/decompression requirements before investing effort in | ||
818 | understanding the more general but more complex low-level | ||
819 | interface.</para> | ||
820 | |||
821 | <para>Yoshioka Tsuneo | ||
822 | (<computeroutput>QWF00133@niftyserve.or.jp</computeroutput> / | ||
823 | <computeroutput>tsuneo-y@is.aist-nara.ac.jp</computeroutput>) has | ||
824 | contributed some functions to give better | ||
825 | <computeroutput>zlib</computeroutput> compatibility. These | ||
826 | functions are <computeroutput>BZ2_bzopen</computeroutput>, | ||
827 | <computeroutput>BZ2_bzread</computeroutput>, | ||
828 | <computeroutput>BZ2_bzwrite</computeroutput>, | ||
829 | <computeroutput>BZ2_bzflush</computeroutput>, | ||
830 | <computeroutput>BZ2_bzclose</computeroutput>, | ||
831 | <computeroutput>BZ2_bzerror</computeroutput> and | ||
832 | <computeroutput>BZ2_bzlibVersion</computeroutput>. You may find | ||
833 | these functions more convenient for simple file reading and | ||
834 | writing, than those in the high-level interface. These functions | ||
835 | are not (yet) officially part of the library, and are minimally | ||
836 | documented here. If they break, you get to keep all the pieces. | ||
837 | I hope to document them properly when time permits.</para> | ||
838 | |||
839 | <para>Yoshioka also contributed modifications to allow the | ||
840 | library to be built as a Windows DLL.</para> | ||
841 | |||
842 | </sect2> | ||
843 | |||
844 | </sect1> | ||
845 | |||
846 | |||
847 | <sect1 id="err-handling" xreflabel="Error handling"> | ||
848 | <title>Error handling</title> | ||
849 | |||
850 | <para>The library is designed to recover cleanly in all | ||
851 | situations, including the worst-case situation of decompressing | ||
852 | random data. I'm not 100% sure that it can always do this, so | ||
853 | you might want to add a signal handler to catch segmentation | ||
854 | violations during decompression if you are feeling especially | ||
855 | paranoid. I would be interested in hearing more about the | ||
856 | robustness of the library to corrupted compressed data.</para> | ||
857 | |||
858 | <para>Version 1.0.3 is more robust in this respect than any | ||
859 | previous version. Investigations with Valgrind (a tool for detecting | ||
860 | problems with memory management) indicate | ||
861 | that, at least for the few files I tested, all single-bit errors | ||
862 | in the compressed data are caught properly, with no | ||
863 | segmentation faults, no uses of uninitialised data, no out of | ||
864 | range reads or writes, and no infinite looping in the decompressor. | ||
865 | So it's certainly pretty robust, although | ||
866 | I wouldn't claim it to be totally bombproof.</para> | ||
867 | |||
868 | <para>The file <computeroutput>bzlib.h</computeroutput> contains | ||
869 | all definitions needed to use the library. In particular, you | ||
870 | should definitely not include | ||
871 | <computeroutput>bzlib_private.h</computeroutput>.</para> | ||
872 | |||
873 | <para>In <computeroutput>bzlib.h</computeroutput>, the various | ||
874 | return values are defined. The following list is not intended as | ||
875 | an exhaustive description of the circumstances in which a given | ||
876 | value may be returned -- those descriptions are given later. | ||
877 | Rather, it is intended to convey the rough meaning of each return | ||
878 | value. The first five values are normal and not intended to | ||
879 | denote an error situation.</para> | ||
880 | |||
881 | <variablelist> | ||
882 | |||
883 | <varlistentry> | ||
884 | <term><computeroutput>BZ_OK</computeroutput></term> | ||
885 | <listitem><para>The requested action was completed | ||
886 | successfully.</para></listitem> | ||
887 | </varlistentry> | ||
888 | |||
889 | <varlistentry> | ||
890 | <term><computeroutput>BZ_RUN_OK, BZ_FLUSH_OK, | ||
891 | BZ_FINISH_OK</computeroutput></term> | ||
892 | <listitem><para>In | ||
893 | <computeroutput>BZ2_bzCompress</computeroutput>, the requested | ||
894 | flush/finish/nothing-special action was completed | ||
895 | successfully.</para></listitem> | ||
896 | </varlistentry> | ||
897 | |||
898 | <varlistentry> | ||
899 | <term><computeroutput>BZ_STREAM_END</computeroutput></term> | ||
900 | <listitem><para>Compression of data was completed, or the | ||
901 | logical stream end was detected during | ||
902 | decompression.</para></listitem> | ||
903 | </varlistentry> | ||
904 | |||
905 | </variablelist> | ||
906 | |||
907 | <para>The following return values indicate an error of some | ||
908 | kind.</para> | ||
909 | |||
910 | <variablelist> | ||
911 | |||
912 | <varlistentry> | ||
913 | <term><computeroutput>BZ_CONFIG_ERROR</computeroutput></term> | ||
914 | <listitem><para>Indicates that the library has been improperly | ||
915 | compiled on your platform -- a major configuration error. | ||
916 | Specifically, it means that | ||
917 | <computeroutput>sizeof(char)</computeroutput>, | ||
918 | <computeroutput>sizeof(short)</computeroutput> and | ||
919 | <computeroutput>sizeof(int)</computeroutput> are not 1, 2 and | ||
920 | 4 respectively, as they should be. Note that the library | ||
921 | should still work properly on 64-bit platforms which follow | ||
922 | the LP64 programming model -- that is, where | ||
923 | <computeroutput>sizeof(long)</computeroutput> and | ||
924 | <computeroutput>sizeof(void*)</computeroutput> are 8. Under | ||
925 | LP64, <computeroutput>sizeof(int)</computeroutput> is still 4, | ||
926 | so <computeroutput>libbzip2</computeroutput>, which doesn't | ||
927 | use the <computeroutput>long</computeroutput> type, is | ||
928 | OK.</para></listitem> | ||
929 | </varlistentry> | ||
930 | |||
931 | <varlistentry> | ||
932 | <term><computeroutput>BZ_SEQUENCE_ERROR</computeroutput></term> | ||
933 | <listitem><para>When using the library, it is important to call | ||
934 | the functions in the correct sequence and with data structures | ||
935 | (buffers etc) in the correct states. | ||
936 | <computeroutput>libbzip2</computeroutput> checks as much as it | ||
937 | can to ensure this is happening, and returns | ||
938 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput> if not. | ||
939 | Code which complies precisely with the function semantics, as | ||
940 | detailed below, should never receive this value; such an event | ||
941 | denotes buggy code which you should | ||
942 | investigate.</para></listitem> | ||
943 | </varlistentry> | ||
944 | |||
945 | <varlistentry> | ||
946 | <term><computeroutput>BZ_PARAM_ERROR</computeroutput></term> | ||
947 | <listitem><para>Returned when a parameter to a function call is | ||
948 | out of range or otherwise manifestly incorrect. As with | ||
949 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput>, this | ||
950 | denotes a bug in the client code. The distinction between | ||
951 | <computeroutput>BZ_PARAM_ERROR</computeroutput> and | ||
952 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput> is a bit | ||
953 | hazy, but still worth making.</para></listitem> | ||
954 | </varlistentry> | ||
955 | |||
956 | <varlistentry> | ||
957 | <term><computeroutput>BZ_MEM_ERROR</computeroutput></term> | ||
958 | <listitem><para>Returned when a request to allocate memory | ||
959 | failed. Note that the quantity of memory needed to decompress | ||
960 | a stream cannot be determined until the stream's header has | ||
961 | been read. So | ||
962 | <computeroutput>BZ2_bzDecompress</computeroutput> and | ||
963 | <computeroutput>BZ2_bzRead</computeroutput> may return | ||
964 | <computeroutput>BZ_MEM_ERROR</computeroutput> even though some | ||
965 | of the compressed data has been read. The same is not true | ||
966 | for compression; once | ||
967 | <computeroutput>BZ2_bzCompressInit</computeroutput> or | ||
968 | <computeroutput>BZ2_bzWriteOpen</computeroutput> have | ||
969 | successfully completed, | ||
970 | <computeroutput>BZ_MEM_ERROR</computeroutput> cannot | ||
971 | occur.</para></listitem> | ||
972 | </varlistentry> | ||
973 | |||
974 | <varlistentry> | ||
975 | <term><computeroutput>BZ_DATA_ERROR</computeroutput></term> | ||
976 | <listitem><para>Returned when a data integrity error is | ||
977 | detected during decompression. Most importantly, this means | ||
978 | when stored and computed CRCs for the data do not match. This | ||
979 | value is also returned upon detection of any other anomaly in | ||
980 | the compressed data.</para></listitem> | ||
981 | </varlistentry> | ||
982 | |||
983 | <varlistentry> | ||
984 | <term><computeroutput>BZ_DATA_ERROR_MAGIC</computeroutput></term> | ||
985 | <listitem><para>As a special case of | ||
986 | <computeroutput>BZ_DATA_ERROR</computeroutput>, it is | ||
987 | sometimes useful to know when the compressed stream does not | ||
988 | start with the correct magic bytes (<computeroutput>'B' 'Z' | ||
989 | 'h'</computeroutput>).</para></listitem> | ||
990 | </varlistentry> | ||
991 | |||
992 | <varlistentry> | ||
993 | <term><computeroutput>BZ_IO_ERROR</computeroutput></term> | ||
994 | <listitem><para>Returned by | ||
995 | <computeroutput>BZ2_bzRead</computeroutput> and | ||
996 | <computeroutput>BZ2_bzWrite</computeroutput> when there is an | ||
997 | error reading or writing in the compressed file, and by | ||
998 | <computeroutput>BZ2_bzReadOpen</computeroutput> and | ||
999 | <computeroutput>BZ2_bzWriteOpen</computeroutput> for attempts | ||
1000 | to use a file for which the error indicator (viz, | ||
1001 | <computeroutput>ferror(f)</computeroutput>) is set. On | ||
1002 | receipt of <computeroutput>BZ_IO_ERROR</computeroutput>, the | ||
1003 | caller should consult <computeroutput>errno</computeroutput> | ||
1004 | and/or <computeroutput>perror</computeroutput> to acquire | ||
1005 | operating-system specific information about the | ||
1006 | problem.</para></listitem> | ||
1007 | </varlistentry> | ||
1008 | |||
1009 | <varlistentry> | ||
1010 | <term><computeroutput>BZ_UNEXPECTED_EOF</computeroutput></term> | ||
1011 | <listitem><para>Returned by | ||
1012 | <computeroutput>BZ2_bzRead</computeroutput> when the | ||
1013 | compressed file finishes before the logical end of stream is | ||
1014 | detected.</para></listitem> | ||
1015 | </varlistentry> | ||
1016 | |||
1017 | <varlistentry> | ||
1018 | <term><computeroutput>BZ_OUTBUFF_FULL</computeroutput></term> | ||
1019 | <listitem><para>Returned by | ||
1020 | <computeroutput>BZ2_bzBuffToBuffCompress</computeroutput> and | ||
1021 | <computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput> to | ||
1022 | indicate that the output data will not fit into the output | ||
1023 | buffer provided.</para></listitem> | ||
1024 | </varlistentry> | ||
1025 | |||
1026 | </variablelist> | ||
1027 | |||
1028 | </sect1> | ||
1029 | |||
1030 | |||
1031 | |||
1032 | <sect1 id="low-level" xreflabel="Low-level interface"> | ||
1033 | <title>Low-level interface</title> | ||
1034 | |||
1035 | |||
1036 | <sect2 id="bzcompress-init" xreflabel="BZ2_bzCompressInit"> | ||
1037 | <title><computeroutput>BZ2_bzCompressInit</computeroutput></title> | ||
1038 | |||
1039 | <programlisting> | ||
1040 | typedef struct { | ||
1041 | char *next_in; | ||
1042 | unsigned int avail_in; | ||
1043 | unsigned int total_in_lo32; | ||
1044 | unsigned int total_in_hi32; | ||
1045 | |||
1046 | char *next_out; | ||
1047 | unsigned int avail_out; | ||
1048 | unsigned int total_out_lo32; | ||
1049 | unsigned int total_out_hi32; | ||
1050 | |||
1051 | void *state; | ||
1052 | |||
1053 | void *(*bzalloc)(void *,int,int); | ||
1054 | void (*bzfree)(void *,void *); | ||
1055 | void *opaque; | ||
1056 | } bz_stream; | ||
1057 | |||
1058 | int BZ2_bzCompressInit ( bz_stream *strm, | ||
1059 | int blockSize100k, | ||
1060 | int verbosity, | ||
1061 | int workFactor ); | ||
1062 | </programlisting> | ||
1063 | |||
1064 | <para>Prepares for compression. The | ||
1065 | <computeroutput>bz_stream</computeroutput> structure holds all | ||
1066 | data pertaining to the compression activity. A | ||
1067 | <computeroutput>bz_stream</computeroutput> structure should be | ||
1068 | allocated and initialised prior to the call. The fields of | ||
1069 | <computeroutput>bz_stream</computeroutput> comprise the entirety | ||
1070 | of the user-visible data. <computeroutput>state</computeroutput> | ||
1071 | is a pointer to the private data structures required for | ||
1072 | compression.</para> | ||
1073 | |||
1074 | <para>Custom memory allocators are supported, via fields | ||
1075 | <computeroutput>bzalloc</computeroutput>, | ||
1076 | <computeroutput>bzfree</computeroutput>, and | ||
1077 | <computeroutput>opaque</computeroutput>. The value | ||
1078 | <computeroutput>opaque</computeroutput> is passed as the first | ||
1079 | argument to all calls to <computeroutput>bzalloc</computeroutput> | ||
1080 | and <computeroutput>bzfree</computeroutput>, but is otherwise | ||
1081 | ignored by the library. The call <computeroutput>bzalloc ( | ||
1082 | opaque, n, m )</computeroutput> is expected to return a pointer | ||
1083 | <computeroutput>p</computeroutput> to <computeroutput>n * | ||
1084 | m</computeroutput> bytes of memory, and <computeroutput>bzfree ( | ||
1085 | opaque, p )</computeroutput> should free that memory.</para> | ||
1086 | |||
1087 | <para>If you don't want to use a custom memory allocator, set | ||
1088 | <computeroutput>bzalloc</computeroutput>, | ||
1089 | <computeroutput>bzfree</computeroutput> and | ||
1090 | <computeroutput>opaque</computeroutput> to | ||
1091 | <computeroutput>NULL</computeroutput>, and the library will then | ||
1092 | use the standard <computeroutput>malloc</computeroutput> / | ||
1093 | <computeroutput>free</computeroutput> routines.</para> | ||
1094 | |||
1095 | <para>Before calling | ||
1096 | <computeroutput>BZ2_bzCompressInit</computeroutput>, fields | ||
1097 | <computeroutput>bzalloc</computeroutput>, | ||
1098 | <computeroutput>bzfree</computeroutput> and | ||
1099 | <computeroutput>opaque</computeroutput> should be filled | ||
1100 | appropriately, as just described. Upon return, the internal | ||
1101 | state will have been allocated and initialised, and | ||
1102 | <computeroutput>total_in_lo32</computeroutput>, | ||
1103 | <computeroutput>total_in_hi32</computeroutput>, | ||
1104 | <computeroutput>total_out_lo32</computeroutput> and | ||
1105 | <computeroutput>total_out_hi32</computeroutput> will have been | ||
1106 | set to zero. These four fields are used by the library to inform | ||
1107 | the caller of the total amount of data passed into and out of the | ||
1108 | library, respectively. You should not try to change them. As of | ||
1109 | version 1.0, 64-bit counts are maintained, even on 32-bit | ||
1110 | platforms, using the <computeroutput>_hi32</computeroutput> | ||
1111 | fields to store the upper 32 bits of the count. So, for example, | ||
1112 | the total amount of data in is <computeroutput>(total_in_hi32 | ||
1113 | << 32) + total_in_lo32</computeroutput>.</para> | ||
1114 | |||
1115 | <para>Parameter <computeroutput>blockSize100k</computeroutput> | ||
1116 | specifies the block size to be used for compression. It should | ||
1117 | be a value between 1 and 9 inclusive, and the actual block size | ||
1118 | used is 100000 x this figure. 9 gives the best compression but | ||
1119 | takes most memory.</para> | ||
1120 | |||
1121 | <para>Parameter <computeroutput>verbosity</computeroutput> should | ||
1122 | be set to a number between 0 and 4 inclusive. 0 is silent, and | ||
1123 | greater numbers give increasingly verbose monitoring/debugging | ||
1124 | output. If the library has been compiled with | ||
1125 | <computeroutput>-DBZ_NO_STDIO</computeroutput>, no such output | ||
1126 | will appear for any verbosity setting.</para> | ||
1127 | |||
1128 | <para>Parameter <computeroutput>workFactor</computeroutput> | ||
1129 | controls how the compression phase behaves when presented with | ||
1130 | worst case, highly repetitive, input data. If compression runs | ||
1131 | into difficulties caused by repetitive data, the library switches | ||
1132 | from the standard sorting algorithm to a fallback algorithm. The | ||
1133 | fallback is slower than the standard algorithm by perhaps a | ||
1134 | factor of three, but always behaves reasonably, no matter how bad | ||
1135 | the input.</para> | ||
1136 | |||
1137 | <para>Lower values of <computeroutput>workFactor</computeroutput> | ||
1138 | reduce the amount of effort the standard algorithm will expend | ||
1139 | before resorting to the fallback. You should set this parameter | ||
1140 | carefully; too low, and many inputs will be handled by the | ||
1141 | fallback algorithm and so compress rather slowly, too high, and | ||
1142 | your average-to-worst case compression times can become very | ||
1143 | large. The default value of 30 gives reasonable behaviour over a | ||
1144 | wide range of circumstances.</para> | ||
1145 | |||
1146 | <para>Allowable values range from 0 to 250 inclusive. 0 is a | ||
1147 | special case, equivalent to using the default value of 30.</para> | ||
1148 | |||
1149 | <para>Note that the compressed output generated is the same | ||
1150 | regardless of whether or not the fallback algorithm is | ||
1151 | used.</para> | ||
1152 | |||
1153 | <para>Be aware also that this parameter may disappear entirely in | ||
1154 | future versions of the library. In principle it should be | ||
1155 | possible to devise a good way to automatically choose which | ||
1156 | algorithm to use. Such a mechanism would render the parameter | ||
1157 | obsolete.</para> | ||
1158 | |||
1159 | <para>Possible return values:</para> | ||
1160 | |||
1161 | <programlisting> | ||
1162 | BZ_CONFIG_ERROR | ||
1163 | if the library has been mis-compiled | ||
1164 | BZ_PARAM_ERROR | ||
1165 | if strm is NULL | ||
1166 | or blockSize100k < 1 or blockSize100k > 9 | ||
1167 | or verbosity < 0 or verbosity > 4 | ||
1168 | or workFactor < 0 or workFactor > 250 | ||
1169 | BZ_MEM_ERROR | ||
1170 | if not enough memory is available | ||
1171 | BZ_OK | ||
1172 | otherwise | ||
1173 | </programlisting> | ||
1174 | |||
1175 | <para>Allowable next actions:</para> | ||
1176 | |||
1177 | <programlisting> | ||
1178 | BZ2_bzCompress | ||
1179 | if BZ_OK is returned | ||
1180 | no specific action needed in case of error | ||
1181 | </programlisting> | ||
1182 | |||
1183 | </sect2> | ||
1184 | |||
1185 | |||
1186 | <sect2 id="bzCompress" xreflabel="BZ2_bzCompress"> | ||
1187 | <title><computeroutput>BZ2_bzCompress</computeroutput></title> | ||
1188 | |||
1189 | <programlisting> | ||
1190 | int BZ2_bzCompress ( bz_stream *strm, int action ); | ||
1191 | </programlisting> | ||
1192 | |||
1193 | <para>Provides more input and/or output buffer space for the | ||
1194 | library. The caller maintains input and output buffers, and | ||
1195 | calls <computeroutput>BZ2_bzCompress</computeroutput> to transfer | ||
1196 | data between them.</para> | ||
1197 | |||
1198 | <para>Before each call to | ||
1199 | <computeroutput>BZ2_bzCompress</computeroutput>, | ||
1200 | <computeroutput>next_in</computeroutput> should point at the data | ||
1201 | to be compressed, and <computeroutput>avail_in</computeroutput> | ||
1202 | should indicate how many bytes the library may read. | ||
1203 | <computeroutput>BZ2_bzCompress</computeroutput> updates | ||
1204 | <computeroutput>next_in</computeroutput>, | ||
1205 | <computeroutput>avail_in</computeroutput> and | ||
1206 | <computeroutput>total_in_lo32</computeroutput>/<computeroutput>total_in_hi32</computeroutput> to reflect the number | ||
1207 | of bytes it has read.</para> | ||
1208 | |||
1209 | <para>Similarly, <computeroutput>next_out</computeroutput> should | ||
1210 | point to a buffer in which the compressed data is to be placed, | ||
1211 | with <computeroutput>avail_out</computeroutput> indicating how | ||
1212 | much output space is available. | ||
1213 | <computeroutput>BZ2_bzCompress</computeroutput> updates | ||
1214 | <computeroutput>next_out</computeroutput>, | ||
1215 | <computeroutput>avail_out</computeroutput> and | ||
1216 | <computeroutput>total_out_lo32</computeroutput>/<computeroutput>total_out_hi32</computeroutput> to reflect the number | ||
1217 | of bytes output.</para> | ||
1218 | |||
1219 | <para>You may provide and remove as little or as much data as you | ||
1220 | like on each call of | ||
1221 | <computeroutput>BZ2_bzCompress</computeroutput>. In the limit, | ||
1222 | it is acceptable to supply and remove data one byte at a time, | ||
1223 | although this would be terribly inefficient. You should always | ||
1224 | ensure that at least one byte of output space is available at | ||
1225 | each call.</para> | ||
1226 | |||
1227 | <para>A second purpose of | ||
1228 | <computeroutput>BZ2_bzCompress</computeroutput> is to request a | ||
1229 | change of mode of the compressed stream.</para> | ||
1230 | |||
1231 | <para>Conceptually, a compressed stream can be in one of four | ||
1232 | states: IDLE, RUNNING, FLUSHING and FINISHING. Before | ||
1233 | initialisation | ||
1234 | (<computeroutput>BZ2_bzCompressInit</computeroutput>) and after | ||
1235 | termination (<computeroutput>BZ2_bzCompressEnd</computeroutput>), | ||
1236 | a stream is regarded as IDLE.</para> | ||
1237 | |||
1238 | <para>Upon initialisation | ||
1239 | (<computeroutput>BZ2_bzCompressInit</computeroutput>), the stream | ||
1240 | is placed in the RUNNING state. Subsequent calls to | ||
1241 | <computeroutput>BZ2_bzCompress</computeroutput> should pass | ||
1242 | <computeroutput>BZ_RUN</computeroutput> as the requested action; | ||
1243 | other actions are illegal and will result in | ||
1244 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput>.</para> | ||
1245 | |||
1246 | <para>At some point, the calling program will have provided all | ||
1247 | the input data it wants to. It will then want to finish up -- in | ||
1248 | effect, asking the library to process any data it might have | ||
1249 | buffered internally. In this state, | ||
1250 | <computeroutput>BZ2_bzCompress</computeroutput> will no longer | ||
1251 | attempt to read data from | ||
1252 | <computeroutput>next_in</computeroutput>, but it will want to | ||
1253 | write data to <computeroutput>next_out</computeroutput>. Because | ||
1254 | the output buffer supplied by the user can be arbitrarily small, | ||
1255 | the finishing-up operation cannot necessarily be done with a | ||
1256 | single call of | ||
1257 | <computeroutput>BZ2_bzCompress</computeroutput>.</para> | ||
1258 | |||
1259 | <para>Instead, the calling program passes | ||
1260 | <computeroutput>BZ_FINISH</computeroutput> as an action to | ||
1261 | <computeroutput>BZ2_bzCompress</computeroutput>. This changes | ||
1262 | the stream's state to FINISHING. Any remaining input (ie, | ||
1263 | <computeroutput>next_in[0 .. avail_in-1]</computeroutput>) is | ||
1264 | compressed and transferred to the output buffer. To do this, | ||
1265 | <computeroutput>BZ2_bzCompress</computeroutput> must be called | ||
1266 | repeatedly until all the output has been consumed. At that | ||
1267 | point, <computeroutput>BZ2_bzCompress</computeroutput> returns | ||
1268 | <computeroutput>BZ_STREAM_END</computeroutput>, and the stream's | ||
1269 | state is set back to IDLE. | ||
1270 | <computeroutput>BZ2_bzCompressEnd</computeroutput> should then be | ||
1271 | called.</para> | ||
1272 | |||
1273 | <para>Just to make sure the calling program does not cheat, the | ||
1274 | library makes a note of <computeroutput>avail_in</computeroutput> | ||
1275 | at the time of the first call to | ||
1276 | <computeroutput>BZ2_bzCompress</computeroutput> which has | ||
1277 | <computeroutput>BZ_FINISH</computeroutput> as an action (ie, at | ||
1278 | the time the program has announced its intention to not supply | ||
1279 | any more input). By comparing this value with that of | ||
1280 | <computeroutput>avail_in</computeroutput> over subsequent calls | ||
1281 | to <computeroutput>BZ2_bzCompress</computeroutput>, the library | ||
1282 | can detect any attempts to slip in more data to compress. Any | ||
1283 | calls for which this is detected will return | ||
1284 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput>. This | ||
1285 | indicates a programming mistake which should be corrected.</para> | ||
1286 | |||
1287 | <para>Instead of asking to finish, the calling program may ask | ||
1288 | <computeroutput>BZ2_bzCompress</computeroutput> to take all the | ||
1289 | remaining input, compress it and terminate the current | ||
1290 | (Burrows-Wheeler) compression block. This could be useful for | ||
1291 | error control purposes. The mechanism is analogous to that for | ||
1292 | finishing: call <computeroutput>BZ2_bzCompress</computeroutput> | ||
1293 | with an action of <computeroutput>BZ_FLUSH</computeroutput>, | ||
1294 | remove output data, and persist with the | ||
1295 | <computeroutput>BZ_FLUSH</computeroutput> action until the value | ||
1296 | <computeroutput>BZ_RUN_OK</computeroutput> is returned. As with | ||
1297 | finishing, <computeroutput>BZ2_bzCompress</computeroutput> | ||
1298 | detects any attempt to provide more input data once the flush has | ||
1299 | begun.</para> | ||
1300 | |||
1301 | <para>Once the flush is complete, the stream returns to the | ||
1302 | normal RUNNING state.</para> | ||
1303 | |||
1304 | <para>This all sounds pretty complex, but isn't really. Here's a | ||
1305 | table which shows which actions are allowable in each state, what | ||
1306 | action will be taken, what the next state is, and what the | ||
1307 | non-error return values are. Note that you can't explicitly ask | ||
1308 | what state the stream is in, but nor do you need to -- it can be | ||
1309 | inferred from the values returned by | ||
1310 | <computeroutput>BZ2_bzCompress</computeroutput>.</para> | ||
1311 | |||
1312 | <programlisting> | ||
1313 | IDLE/any | ||
1314 | Illegal. IDLE state only exists after BZ2_bzCompressEnd or | ||
1315 | before BZ2_bzCompressInit. | ||
1316 | Return value = BZ_SEQUENCE_ERROR | ||
1317 | |||
1318 | RUNNING/BZ_RUN | ||
1319 | Compress from next_in to next_out as much as possible. | ||
1320 | Next state = RUNNING | ||
1321 | Return value = BZ_RUN_OK | ||
1322 | |||
1323 | RUNNING/BZ_FLUSH | ||
1324 | Remember current value of avail_in. Compress from next_in | ||
1325 | to next_out as much as possible, but do not accept any more input. | ||
1326 | Next state = FLUSHING | ||
1327 | Return value = BZ_FLUSH_OK | ||
1328 | |||
1329 | RUNNING/BZ_FINISH | ||
1330 | Remember current value of avail_in. Compress from next_in | ||
1331 | to next_out as much as possible, but do not accept any more input. | ||
1332 | Next state = FINISHING | ||
1333 | Return value = BZ_FINISH_OK | ||
1334 | |||
1335 | FLUSHING/BZ_FLUSH | ||
1336 | Compress from next_in to next_out as much as possible, | ||
1337 | but do not accept any more input. | ||
1338 | If all the existing input has been used up and all compressed | ||
1339 | output has been removed | ||
1340 | Next state = RUNNING; Return value = BZ_RUN_OK | ||
1341 | else | ||
1342 | Next state = FLUSHING; Return value = BZ_FLUSH_OK | ||
1343 | |||
1344 | FLUSHING/other | ||
1345 | Illegal. | ||
1346 | Return value = BZ_SEQUENCE_ERROR | ||
1347 | |||
1348 | FINISHING/BZ_FINISH | ||
1349 | Compress from next_in to next_out as much as possible, | ||
1350 | but do not accept any more input. | ||
1351 | If all the existing input has been used up and all compressed | ||
1352 | output has been removed | ||
1353 | Next state = IDLE; Return value = BZ_STREAM_END | ||
1354 | else | ||
1355 | Next state = FINISHING; Return value = BZ_FINISH_OK | ||
1356 | |||
1357 | FINISHING/other | ||
1358 | Illegal. | ||
1359 | Return value = BZ_SEQUENCE_ERROR | ||
1360 | </programlisting> | ||
1361 | |||
1362 | |||
1363 | <para>That still looks complicated? Well, fair enough. The | ||
1364 | usual sequence of calls for compressing a load of data is:</para> | ||
1365 | |||
1366 | <orderedlist> | ||
1367 | |||
1368 | <listitem><para>Get started with | ||
1369 | <computeroutput>BZ2_bzCompressInit</computeroutput>.</para></listitem> | ||
1370 | |||
1371 | <listitem><para>Shovel data in and shlurp out its compressed form | ||
1372 | using zero or more calls of | ||
1373 | <computeroutput>BZ2_bzCompress</computeroutput> with action = | ||
1374 | <computeroutput>BZ_RUN</computeroutput>.</para></listitem> | ||
1375 | |||
1376 | <listitem><para>Finish up. Repeatedly call | ||
1377 | <computeroutput>BZ2_bzCompress</computeroutput> with action = | ||
1378 | <computeroutput>BZ_FINISH</computeroutput>, copying out the | ||
1379 | compressed output, until | ||
1380 | <computeroutput>BZ_STREAM_END</computeroutput> is | ||
1381 | returned.</para></listitem> <listitem><para>Close up and go home. Call | ||
1382 | <computeroutput>BZ2_bzCompressEnd</computeroutput>.</para></listitem> | ||
1383 | |||
1384 | </orderedlist> | ||
1385 | |||
1386 | <para>If the data you want to compress fits into your input | ||
1387 | buffer all at once, you can skip the calls of | ||
1388 | <computeroutput>BZ2_bzCompress ( ..., BZ_RUN )</computeroutput> | ||
1389 | and just do the <computeroutput>BZ2_bzCompress ( ..., BZ_FINISH | ||
1390 | )</computeroutput> calls.</para> | ||
1391 | |||
1392 | <para>All required memory is allocated by | ||
1393 | <computeroutput>BZ2_bzCompressInit</computeroutput>. The | ||
1394 | compression library can accept any data at all (obviously). So | ||
1395 | you shouldn't get any error return values from the | ||
1396 | <computeroutput>BZ2_bzCompress</computeroutput> calls. If you | ||
1397 | do, they will be | ||
1398 | <computeroutput>BZ_SEQUENCE_ERROR</computeroutput>, and indicate | ||
1399 | a bug in your programming.</para> | ||
1400 | |||
1401 | <para>Trivial other possible return values:</para> | ||
1402 | |||
1403 | <programlisting> | ||
1404 | BZ_PARAM_ERROR | ||
1405 | if strm is NULL, or strm->s is NULL | ||
1406 | </programlisting> | ||
1407 | |||
1408 | </sect2> | ||
1409 | |||
1410 | |||
1411 | <sect2 id="bzCompress-end" xreflabel="BZ2_bzCompressEnd"> | ||
1412 | <title><computeroutput>BZ2_bzCompressEnd</computeroutput></title> | ||
1413 | |||
1414 | <programlisting> | ||
1415 | int BZ2_bzCompressEnd ( bz_stream *strm ); | ||
1416 | </programlisting> | ||
1417 | |||
1418 | <para>Releases all memory associated with a compression | ||
1419 | stream.</para> | ||
1420 | |||
1421 | <para>Possible return values:</para> | ||
1422 | |||
1423 | <programlisting> | ||
1424 | BZ_PARAM_ERROR if strm is NULL or strm->s is NULL | ||
1425 | BZ_OK otherwise | ||
1426 | </programlisting> | ||
1427 | |||
1428 | </sect2> | ||
1429 | |||
1430 | |||
1431 | <sect2 id="bzDecompress-init" xreflabel="BZ2_bzDecompressInit"> | ||
1432 | <title><computeroutput>BZ2_bzDecompressInit</computeroutput></title> | ||
1433 | |||
1434 | <programlisting> | ||
1435 | int BZ2_bzDecompressInit ( bz_stream *strm, int verbosity, int small ); | ||
1436 | </programlisting> | ||
1437 | |||
1438 | <para>Prepares for decompression. As with | ||
1439 | <computeroutput>BZ2_bzCompressInit</computeroutput>, a | ||
1440 | <computeroutput>bz_stream</computeroutput> record should be | ||
1441 | allocated and initialised before the call. Fields | ||
1442 | <computeroutput>bzalloc</computeroutput>, | ||
1443 | <computeroutput>bzfree</computeroutput> and | ||
1444 | <computeroutput>opaque</computeroutput> should be set if a custom | ||
1445 | memory allocator is required, or made | ||
1446 | <computeroutput>NULL</computeroutput> for the normal | ||
1447 | <computeroutput>malloc</computeroutput> / | ||
1448 | <computeroutput>free</computeroutput> routines. Upon return, the | ||
1449 | internal state will have been initialised, and | ||
1450 | <computeroutput>total_in</computeroutput> and | ||
1451 | <computeroutput>total_out</computeroutput> will be zero.</para> | ||
1452 | |||
1453 | <para>For the meaning of parameter | ||
1454 | <computeroutput>verbosity</computeroutput>, see | ||
1455 | <computeroutput>BZ2_bzCompressInit</computeroutput>.</para> | ||
1456 | |||
1457 | <para>If <computeroutput>small</computeroutput> is nonzero, the | ||
1458 | library will use an alternative decompression algorithm which | ||
1459 | uses less memory but at the cost of decompressing more slowly | ||
1460 | (roughly speaking, half the speed, but the maximum memory | ||
1461 | requirement drops to around 2300k). See <xref linkend="using"/> | ||
1462 | for more information on memory management.</para> | ||
1463 | |||
1464 | <para>Note that the amount of memory needed to decompress a | ||
1465 | stream cannot be determined until the stream's header has been | ||
1466 | read, so even if | ||
1467 | <computeroutput>BZ2_bzDecompressInit</computeroutput> succeeds, a | ||
1468 | subsequent <computeroutput>BZ2_bzDecompress</computeroutput> | ||
1469 | could fail with | ||
1470 | <computeroutput>BZ_MEM_ERROR</computeroutput>.</para> | ||
1471 | |||
1472 | <para>Possible return values:</para> | ||
1473 | |||
1474 | <programlisting> | ||
1475 | BZ_CONFIG_ERROR | ||
1476 | if the library has been mis-compiled | ||
1477 | BZ_PARAM_ERROR | ||
1478 | if ( small != 0 && small != 1 ) | ||
1479 | or (verbosity < 0 || verbosity > 4) | ||
1480 | BZ_MEM_ERROR | ||
1481 | if insufficient memory is available | ||
1482 | </programlisting> | ||
1483 | |||
1484 | <para>Allowable next actions:</para> | ||
1485 | |||
1486 | <programlisting> | ||
1487 | BZ2_bzDecompress | ||
1488 | if BZ_OK was returned | ||
1489 | no specific action required in case of error | ||
1490 | </programlisting> | ||
1491 | |||
1492 | </sect2> | ||
1493 | |||
1494 | |||
1495 | <sect2 id="bzDecompress" xreflabel="BZ2_bzDecompress"> | ||
1496 | <title><computeroutput>BZ2_bzDecompress</computeroutput></title> | ||
1497 | |||
1498 | <programlisting> | ||
1499 | int BZ2_bzDecompress ( bz_stream *strm ); | ||
1500 | </programlisting> | ||
1501 | |||
1502 | <para>Provides more input and/or output buffer space for the | ||
1503 | library. The caller maintains input and output buffers, and uses | ||
1504 | <computeroutput>BZ2_bzDecompress</computeroutput> to transfer | ||
1505 | data between them.</para> | ||
1506 | |||
1507 | <para>Before each call to | ||
1508 | <computeroutput>BZ2_bzDecompress</computeroutput>, | ||
1509 | <computeroutput>next_in</computeroutput> should point at the | ||
1510 | compressed data, and <computeroutput>avail_in</computeroutput> | ||
1511 | should indicate how many bytes the library may read. | ||
1512 | <computeroutput>BZ2_bzDecompress</computeroutput> updates | ||
1513 | <computeroutput>next_in</computeroutput>, | ||
1514 | <computeroutput>avail_in</computeroutput> and | ||
1515 | <computeroutput>total_in</computeroutput> to reflect the number | ||
1516 | of bytes it has read.</para> | ||
1517 | |||
1518 | <para>Similarly, <computeroutput>next_out</computeroutput> should | ||
1519 | point to a buffer in which the uncompressed output is to be | ||
1520 | placed, with <computeroutput>avail_out</computeroutput> | ||
1521 | indicating how much output space is available. | ||
1522 | <computeroutput>BZ2_bzDecompress</computeroutput> updates | ||
1523 | <computeroutput>next_out</computeroutput>, | ||
1524 | <computeroutput>avail_out</computeroutput> and | ||
1525 | <computeroutput>total_out</computeroutput> to reflect the number | ||
1526 | of bytes output.</para> | ||
1527 | |||
1528 | <para>You may provide and remove as little or as much data as you | ||
1529 | like on each call of | ||
1530 | <computeroutput>BZ2_bzDecompress</computeroutput>. In the limit, | ||
1531 | it is acceptable to supply and remove data one byte at a time, | ||
1532 | although this would be terribly inefficient. You should always | ||
1533 | ensure that at least one byte of output space is available at | ||
1534 | each call.</para> | ||
1535 | |||
1536 | <para>Use of <computeroutput>BZ2_bzDecompress</computeroutput> is | ||
1537 | simpler than | ||
1538 | <computeroutput>BZ2_bzCompress</computeroutput>.</para> | ||
1539 | |||
1540 | <para>You should provide input and remove output as described | ||
1541 | above, and repeatedly call | ||
1542 | <computeroutput>BZ2_bzDecompress</computeroutput> until | ||
1543 | <computeroutput>BZ_STREAM_END</computeroutput> is returned. | ||
1544 | Appearance of <computeroutput>BZ_STREAM_END</computeroutput> | ||
1545 | denotes that <computeroutput>BZ2_bzDecompress</computeroutput> | ||
1546 | has detected the logical end of the compressed stream. | ||
1547 | <computeroutput>BZ2_bzDecompress</computeroutput> will not | ||
1548 | produce <computeroutput>BZ_STREAM_END</computeroutput> until all | ||
1549 | output data has been placed into the output buffer, so once | ||
1550 | <computeroutput>BZ_STREAM_END</computeroutput> appears, you are | ||
1551 | guaranteed to have available all the decompressed output, and | ||
1552 | <computeroutput>BZ2_bzDecompressEnd</computeroutput> can safely | ||
1553 | be called.</para> | ||
1554 | |||
1555 | <para>In case of an error return value, you should call | ||
1556 | <computeroutput>BZ2_bzDecompressEnd</computeroutput> to clean up | ||
1557 | and release memory.</para> | ||
1558 | |||
1559 | <para>Possible return values:</para> | ||
1560 | |||
1561 | <programlisting> | ||
1562 | BZ_PARAM_ERROR | ||
1563 | if strm is NULL or strm->s is NULL | ||
1564 | or strm->avail_out < 1 | ||
1565 | BZ_DATA_ERROR | ||
1566 | if a data integrity error is detected in the compressed stream | ||
1567 | BZ_DATA_ERROR_MAGIC | ||
1568 | if the compressed stream doesn't begin with the right magic bytes | ||
1569 | BZ_MEM_ERROR | ||
1570 | if there wasn't enough memory available | ||
1571 | BZ_STREAM_END | ||
1572 | if the logical end of the data stream was detected and all | ||
1573 | output has been consumed, eg strm->avail_out > 0 | ||
1574 | BZ_OK | ||
1575 | otherwise | ||
1576 | </programlisting> | ||
1577 | |||
1578 | <para>Allowable next actions:</para> | ||
1579 | |||
1580 | <programlisting> | ||
1581 | BZ2_bzDecompress | ||
1582 | if BZ_OK was returned | ||
1583 | BZ2_bzDecompressEnd | ||
1584 | otherwise | ||
1585 | </programlisting> | ||
1586 | |||
1587 | </sect2> | ||
1588 | |||
1589 | |||
1590 | <sect2 id="bzDecompress-end" xreflabel="BZ2_bzDecompressEnd"> | ||
1591 | <title><computeroutput>BZ2_bzDecompressEnd</computeroutput></title> | ||
1592 | |||
1593 | <programlisting> | ||
1594 | int BZ2_bzDecompressEnd ( bz_stream *strm ); | ||
1595 | </programlisting> | ||
1596 | |||
1597 | <para>Releases all memory associated with a decompression | ||
1598 | stream.</para> | ||
1599 | |||
1600 | <para>Possible return values:</para> | ||
1601 | |||
1602 | <programlisting> | ||
1603 | BZ_PARAM_ERROR | ||
1604 | if strm is NULL or strm->s is NULL | ||
1605 | BZ_OK | ||
1606 | otherwise | ||
1607 | </programlisting> | ||
1608 | |||
1609 | <para>Allowable next actions:</para> | ||
1610 | |||
1611 | <programlisting> | ||
1612 | None. | ||
1613 | </programlisting> | ||
1614 | |||
1615 | </sect2> | ||
1616 | |||
1617 | </sect1> | ||
1618 | |||
1619 | |||
1620 | <sect1 id="hl-interface" xreflabel="High-level interface"> | ||
1621 | <title>High-level interface</title> | ||
1622 | |||
1623 | <para>This interface provides functions for reading and writing | ||
1624 | <computeroutput>bzip2</computeroutput> format files. First, some | ||
1625 | general points.</para> | ||
1626 | |||
1627 | <itemizedlist mark='bullet'> | ||
1628 | |||
1629 | <listitem><para>All of the functions take an | ||
1630 | <computeroutput>int*</computeroutput> first argument, | ||
1631 | <computeroutput>bzerror</computeroutput>. After each call, | ||
1632 | <computeroutput>bzerror</computeroutput> should be consulted | ||
1633 | first to determine the outcome of the call. If | ||
1634 | <computeroutput>bzerror</computeroutput> is | ||
1635 | <computeroutput>BZ_OK</computeroutput>, the call completed | ||
1636 | successfully, and only then should the return value of the | ||
1637 | function (if any) be consulted. If | ||
1638 | <computeroutput>bzerror</computeroutput> is | ||
1639 | <computeroutput>BZ_IO_ERROR</computeroutput>, there was an | ||
1640 | error reading/writing the underlying compressed file, and you | ||
1641 | should then consult <computeroutput>errno</computeroutput> / | ||
1642 | <computeroutput>perror</computeroutput> to determine the cause | ||
1643 | of the difficulty. <computeroutput>bzerror</computeroutput> | ||
1644 | may also be set to various other values; precise details are | ||
1645 | given on a per-function basis below.</para></listitem> | ||
1646 | |||
1647 | <listitem><para>If <computeroutput>bzerror</computeroutput> indicates | ||
1648 | an error (ie, anything except | ||
1649 | <computeroutput>BZ_OK</computeroutput> and | ||
1650 | <computeroutput>BZ_STREAM_END</computeroutput>), you should | ||
1651 | immediately call | ||
1652 | <computeroutput>BZ2_bzReadClose</computeroutput> (or | ||
1653 | <computeroutput>BZ2_bzWriteClose</computeroutput>, depending on | ||
1654 | whether you are attempting to read or to write) to free up all | ||
1655 | resources associated with the stream. Once an error has been | ||
1656 | indicated, behaviour of all calls except | ||
1657 | <computeroutput>BZ2_bzReadClose</computeroutput> | ||
1658 | (<computeroutput>BZ2_bzWriteClose</computeroutput>) is | ||
1659 | undefined. The implication is that (1) | ||
1660 | <computeroutput>bzerror</computeroutput> should be checked | ||
1661 | after each call, and (2) if | ||
1662 | <computeroutput>bzerror</computeroutput> indicates an error, | ||
1663 | <computeroutput>BZ2_bzReadClose</computeroutput> | ||
1664 | (<computeroutput>BZ2_bzWriteClose</computeroutput>) should then | ||
1665 | be called to clean up.</para></listitem> | ||
1666 | |||
1667 | <listitem><para>The <computeroutput>FILE*</computeroutput> arguments | ||
1668 | passed to <computeroutput>BZ2_bzReadOpen</computeroutput> / | ||
1669 | <computeroutput>BZ2_bzWriteOpen</computeroutput> should be set | ||
1670 | to binary mode. Most Unix systems will do this by default, but | ||
1671 | other platforms, including Windows and Mac, will not. If you | ||
1672 | omit this, you may encounter problems when moving code to new | ||
1673 | platforms.</para></listitem> | ||
1674 | |||
1675 | <listitem><para>Memory allocation requests are handled by | ||
1676 | <computeroutput>malloc</computeroutput> / | ||
1677 | <computeroutput>free</computeroutput>. At present there is no | ||
1678 | facility for user-defined memory allocators in the file I/O | ||
1679 | functions (could easily be added, though).</para></listitem> | ||
1680 | |||
1681 | </itemizedlist> | ||
1682 | |||
1683 | |||
1684 | |||
1685 | <sect2 id="bzreadopen" xreflabel="BZ2_bzReadOpen"> | ||
1686 | <title><computeroutput>BZ2_bzReadOpen</computeroutput></title> | ||
1687 | |||
1688 | <programlisting> | ||
1689 | typedef void BZFILE; | ||
1690 | |||
1691 | BZFILE *BZ2_bzReadOpen( int *bzerror, FILE *f, | ||
1692 | int verbosity, int small, | ||
1693 | void *unused, int nUnused ); | ||
1694 | </programlisting> | ||
1695 | |||
1696 | <para>Prepare to read compressed data from file handle | ||
1697 | <computeroutput>f</computeroutput>. | ||
1698 | <computeroutput>f</computeroutput> should refer to a file which | ||
1699 | has been opened for reading, and for which the error indicator | ||
1700 | (<computeroutput>ferror(f)</computeroutput>) is not set. If | ||
1701 | <computeroutput>small</computeroutput> is 1, the library will try | ||
1702 | to decompress using less memory, at the expense of speed.</para> | ||
1703 | |||
1704 | <para>For reasons explained below, | ||
1705 | <computeroutput>BZ2_bzRead</computeroutput> will decompress the | ||
1706 | <computeroutput>nUnused</computeroutput> bytes starting at | ||
1707 | <computeroutput>unused</computeroutput>, before starting to read | ||
1708 | from the file <computeroutput>f</computeroutput>. At most | ||
1709 | <computeroutput>BZ_MAX_UNUSED</computeroutput> bytes may be | ||
1710 | supplied like this. If this facility is not required, you should | ||
1711 | pass <computeroutput>NULL</computeroutput> and | ||
1712 | <computeroutput>0</computeroutput> for | ||
1713 | <computeroutput>unused</computeroutput> and | ||
1714 | <computeroutput>nUnused</computeroutput> respectively.</para> | ||
1715 | |||
1716 | <para>For the meaning of parameters | ||
1717 | <computeroutput>small</computeroutput> and | ||
1718 | <computeroutput>verbosity</computeroutput>, see | ||
1719 | <computeroutput>BZ2_bzDecompressInit</computeroutput>.</para> | ||
1720 | |||
1721 | <para>The amount of memory needed to decompress a file cannot be | ||
1722 | determined until the file's header has been read. So it is | ||
1723 | possible that <computeroutput>BZ2_bzReadOpen</computeroutput> | ||
1724 | returns <computeroutput>BZ_OK</computeroutput> but a subsequent | ||
1725 | call of <computeroutput>BZ2_bzRead</computeroutput> will return | ||
1726 | <computeroutput>BZ_MEM_ERROR</computeroutput>.</para> | ||
1727 | |||
1728 | <para>Possible assignments to | ||
1729 | <computeroutput>bzerror</computeroutput>:</para> | ||
1730 | |||
1731 | <programlisting> | ||
1732 | BZ_CONFIG_ERROR | ||
1733 | if the library has been mis-compiled | ||
1734 | BZ_PARAM_ERROR | ||
1735 | if f is NULL | ||
1736 | or small is neither 0 nor 1 | ||
1737 | or ( unused == NULL && nUnused != 0 ) | ||
1738 | or ( unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED) ) | ||
1739 | BZ_IO_ERROR | ||
1740 | if ferror(f) is nonzero | ||
1741 | BZ_MEM_ERROR | ||
1742 | if insufficient memory is available | ||
1743 | BZ_OK | ||
1744 | otherwise. | ||
1745 | </programlisting> | ||
1746 | |||
1747 | <para>Possible return values:</para> | ||
1748 | |||
1749 | <programlisting> | ||
1750 | Pointer to an abstract BZFILE | ||
1751 | if bzerror is BZ_OK | ||
1752 | NULL | ||
1753 | otherwise | ||
1754 | </programlisting> | ||
1755 | |||
1756 | <para>Allowable next actions:</para> | ||
1757 | |||
1758 | <programlisting> | ||
1759 | BZ2_bzRead | ||
1760 | if bzerror is BZ_OK | ||
1761 | BZ2_bzReadClose | ||
1762 | otherwise | ||
1763 | </programlisting> | ||
1764 | |||
1765 | </sect2> | ||
1766 | |||
1767 | |||
1768 | <sect2 id="bzread" xreflabel="BZ2_bzRead"> | ||
1769 | <title><computeroutput>BZ2_bzRead</computeroutput></title> | ||
1770 | |||
1771 | <programlisting> | ||
1772 | int BZ2_bzRead ( int *bzerror, BZFILE *b, void *buf, int len ); | ||
1773 | </programlisting> | ||
1774 | |||
1775 | <para>Reads up to <computeroutput>len</computeroutput> | ||
1776 | (uncompressed) bytes from the compressed file | ||
1777 | <computeroutput>b</computeroutput> into the buffer | ||
1778 | <computeroutput>buf</computeroutput>. If the read was | ||
1779 | successful, <computeroutput>bzerror</computeroutput> is set to | ||
1780 | <computeroutput>BZ_OK</computeroutput> and the number of bytes | ||
1781 | read is returned. If the logical end-of-stream was detected, | ||
1782 | <computeroutput>bzerror</computeroutput> will be set to | ||
1783 | <computeroutput>BZ_STREAM_END</computeroutput>, and the number of | ||
1784 | bytes read is returned. All other | ||
1785 | <computeroutput>bzerror</computeroutput> values denote an | ||
1786 | error.</para> | ||
1787 | |||
1788 | <para><computeroutput>BZ2_bzRead</computeroutput> will supply | ||
1789 | <computeroutput>len</computeroutput> bytes, unless the logical | ||
1790 | stream end is detected or an error occurs. Because of this, it | ||
1791 | is possible to detect the stream end by observing when the number | ||
1792 | of bytes returned is less than the number requested. | ||
1793 | Nevertheless, this is regarded as inadvisable; you should instead | ||
1794 | check <computeroutput>bzerror</computeroutput> after every call | ||
1795 | and watch out for | ||
1796 | <computeroutput>BZ_STREAM_END</computeroutput>.</para> | ||
1797 | |||
1798 | <para>Internally, <computeroutput>BZ2_bzRead</computeroutput> | ||
1799 | copies data from the compressed file in chunks of size | ||
1800 | <computeroutput>BZ_MAX_UNUSED</computeroutput> bytes before | ||
1801 | decompressing it. If the file contains more bytes than strictly | ||
1802 | needed to reach the logical end-of-stream, | ||
1803 | <computeroutput>BZ2_bzRead</computeroutput> will almost certainly | ||
1804 | read some of the trailing data before signalling | ||
1805 | <computeroutput>BZ_STREAM_END</computeroutput>. To collect the | ||
1806 | read but unused data once | ||
1807 | <computeroutput>BZ_STREAM_END</computeroutput> has appeared, | ||
1808 | call <computeroutput>BZ2_bzReadGetUnused</computeroutput> | ||
1809 | immediately before | ||
1810 | <computeroutput>BZ2_bzReadClose</computeroutput>.</para> | ||
1811 | |||
1812 | <para>Possible assignments to | ||
1813 | <computeroutput>bzerror</computeroutput>:</para> | ||
1814 | |||
1815 | <programlisting> | ||
1816 | BZ_PARAM_ERROR | ||
1817 | if b is NULL or buf is NULL or len < 0 | ||
1818 | BZ_SEQUENCE_ERROR | ||
1819 | if b was opened with BZ2_bzWriteOpen | ||
1820 | BZ_IO_ERROR | ||
1821 | if there is an error reading from the compressed file | ||
1822 | BZ_UNEXPECTED_EOF | ||
1823 | if the compressed file ended before | ||
1824 | the logical end-of-stream was detected | ||
1825 | BZ_DATA_ERROR | ||
1826 | if a data integrity error was detected in the compressed stream | ||
1827 | BZ_DATA_ERROR_MAGIC | ||
1828 | if the stream does not begin with the requisite header bytes | ||
1829 | (ie, is not a bzip2 data file). This is really | ||
1830 | a special case of BZ_DATA_ERROR. | ||
1831 | BZ_MEM_ERROR | ||
1832 | if insufficient memory was available | ||
1833 | BZ_STREAM_END | ||
1834 | if the logical end of stream was detected. | ||
1835 | BZ_OK | ||
1836 | otherwise. | ||
1837 | </programlisting> | ||
1838 | |||
1839 | <para>Possible return values:</para> | ||
1840 | |||
1841 | <programlisting> | ||
1842 | number of bytes read | ||
1843 | if bzerror is BZ_OK or BZ_STREAM_END | ||
1844 | undefined | ||
1845 | otherwise | ||
1846 | </programlisting> | ||
1847 | |||
1848 | <para>Allowable next actions:</para> | ||
1849 | |||
1850 | <programlisting> | ||
1851 | collect data from buf, then BZ2_bzRead or BZ2_bzReadClose | ||
1852 | if bzerror is BZ_OK | ||
1853 | collect data from buf, then BZ2_bzReadClose or BZ2_bzReadGetUnused | ||
1854 | if bzerror is BZ_STREAM_END | ||
1855 | BZ2_bzReadClose | ||
1856 | otherwise | ||
1857 | </programlisting> | ||
1858 | |||
1859 | </sect2> | ||
1860 | |||
1861 | |||
1862 | <sect2 id="bzreadgetunused" xreflabel="BZ2_bzReadGetUnused"> | ||
1863 | <title><computeroutput>BZ2_bzReadGetUnused</computeroutput></title> | ||
1864 | |||
1865 | <programlisting> | ||
1866 | void BZ2_bzReadGetUnused( int* bzerror, BZFILE *b, | ||
1867 | void** unused, int* nUnused ); | ||
1868 | </programlisting> | ||
1869 | |||
1870 | <para>Returns data which was read from the compressed file but | ||
1871 | was not needed to get to the logical end-of-stream. | ||
1872 | <computeroutput>*unused</computeroutput> is set to the address of | ||
1873 | the data, and <computeroutput>*nUnused</computeroutput> to the | ||
1874 | number of bytes. <computeroutput>*nUnused</computeroutput> will | ||
1875 | be set to a value between <computeroutput>0</computeroutput> and | ||
1876 | <computeroutput>BZ_MAX_UNUSED</computeroutput> inclusive.</para> | ||
1877 | |||
1878 | <para>This function may only be called once | ||
1879 | <computeroutput>BZ2_bzRead</computeroutput> has signalled | ||
1880 | <computeroutput>BZ_STREAM_END</computeroutput> but before | ||
1881 | <computeroutput>BZ2_bzReadClose</computeroutput>.</para> | ||
1882 | |||
1883 | <para>Possible assignments to | ||
1884 | <computeroutput>bzerror</computeroutput>:</para> | ||
1885 | |||
1886 | <programlisting> | ||
1887 | BZ_PARAM_ERROR | ||
1888 | if b is NULL | ||
1889 | or unused is NULL or nUnused is NULL | ||
1890 | BZ_SEQUENCE_ERROR | ||
1891 | if BZ_STREAM_END has not been signalled | ||
1892 | or if b was opened with BZ2_bzWriteOpen | ||
1893 | BZ_OK | ||
1894 | otherwise | ||
1895 | </programlisting> | ||
1896 | |||
1897 | <para>Allowable next actions:</para> | ||
1898 | |||
1899 | <programlisting> | ||
1900 | BZ2_bzReadClose | ||
1901 | </programlisting> | ||
1902 | |||
1903 | </sect2> | ||
1904 | |||
1905 | |||
1906 | <sect2 id="bzreadclose" xreflabel="BZ2_bzReadClose"> | ||
1907 | <title><computeroutput>BZ2_bzReadClose</computeroutput></title> | ||
1908 | |||
1909 | <programlisting> | ||
1910 | void BZ2_bzReadClose ( int *bzerror, BZFILE *b ); | ||
1911 | </programlisting> | ||
1912 | |||
1913 | <para>Releases all memory pertaining to the compressed file | ||
1914 | <computeroutput>b</computeroutput>. | ||
1915 | <computeroutput>BZ2_bzReadClose</computeroutput> does not call | ||
1916 | <computeroutput>fclose</computeroutput> on the underlying file | ||
1917 | handle, so you should do that yourself if appropriate. | ||
1918 | <computeroutput>BZ2_bzReadClose</computeroutput> should be called | ||
1919 | to clean up after all error situations.</para> | ||
1920 | |||
1921 | <para>Possible assignments to | ||
1922 | <computeroutput>bzerror</computeroutput>:</para> | ||
1923 | |||
1924 | <programlisting> | ||
1925 | BZ_SEQUENCE_ERROR | ||
1926 | if b was opened with BZ2_bzWriteOpen | ||
1927 | BZ_OK | ||
1928 | otherwise | ||
1929 | </programlisting> | ||
1930 | |||
1931 | <para>Allowable next actions:</para> | ||
1932 | |||
1933 | <programlisting> | ||
1934 | none | ||
1935 | </programlisting> | ||
1936 | |||
1937 | </sect2> | ||
1938 | |||
1939 | |||
1940 | <sect2 id="bzwriteopen" xreflabel="BZ2_bzWriteOpen"> | ||
1941 | <title><computeroutput>BZ2_bzWriteOpen</computeroutput></title> | ||
1942 | |||
1943 | <programlisting> | ||
1944 | BZFILE *BZ2_bzWriteOpen( int *bzerror, FILE *f, | ||
1945 | int blockSize100k, int verbosity, | ||
1946 | int workFactor ); | ||
1947 | </programlisting> | ||
1948 | |||
1949 | <para>Prepare to write compressed data to file handle | ||
1950 | <computeroutput>f</computeroutput>. | ||
1951 | <computeroutput>f</computeroutput> should refer to a file which | ||
1952 | has been opened for writing, and for which the error indicator | ||
1953 | (<computeroutput>ferror(f)</computeroutput>) is not set.</para> | ||
1954 | |||
1955 | <para>For the meaning of parameters | ||
1956 | <computeroutput>blockSize100k</computeroutput>, | ||
1957 | <computeroutput>verbosity</computeroutput> and | ||
1958 | <computeroutput>workFactor</computeroutput>, see | ||
1959 | <computeroutput>BZ2_bzCompressInit</computeroutput>.</para> | ||
1960 | |||
1961 | <para>All required memory is allocated at this stage, so if the | ||
1962 | call completes successfully, | ||
1963 | <computeroutput>BZ_MEM_ERROR</computeroutput> cannot be signalled | ||
1964 | by a subsequent call to | ||
1965 | <computeroutput>BZ2_bzWrite</computeroutput>.</para> | ||
1966 | |||
1967 | <para>Possible assignments to | ||
1968 | <computeroutput>bzerror</computeroutput>:</para> | ||
1969 | |||
1970 | <programlisting> | ||
1971 | BZ_CONFIG_ERROR | ||
1972 | if the library has been mis-compiled | ||
1973 | BZ_PARAM_ERROR | ||
1974 | if f is NULL | ||
1975 | or blockSize100k < 1 or blockSize100k > 9 | ||
1976 | BZ_IO_ERROR | ||
1977 | if ferror(f) is nonzero | ||
1978 | BZ_MEM_ERROR | ||
1979 | if insufficient memory is available | ||
1980 | BZ_OK | ||
1981 | otherwise | ||
1982 | </programlisting> | ||
1983 | |||
1984 | <para>Possible return values:</para> | ||
1985 | |||
1986 | <programlisting> | ||
1987 | Pointer to an abstract BZFILE | ||
1988 | if bzerror is BZ_OK | ||
1989 | NULL | ||
1990 | otherwise | ||
1991 | </programlisting> | ||
1992 | |||
1993 | <para>Allowable next actions:</para> | ||
1994 | |||
1995 | <programlisting> | ||
1996 | BZ2_bzWrite | ||
1997 | if bzerror is BZ_OK | ||
1998 | (you could go directly to BZ2_bzWriteClose, but this would be pretty pointless) | ||
1999 | BZ2_bzWriteClose | ||
2000 | otherwise | ||
2001 | </programlisting> | ||
2002 | |||
2003 | </sect2> | ||
2004 | |||
2005 | |||
2006 | <sect2 id="bzwrite" xreflabel="BZ2_bzWrite"> | ||
2007 | <title><computeroutput>BZ2_bzWrite</computeroutput></title> | ||
2008 | |||
2009 | <programlisting> | ||
2010 | void BZ2_bzWrite ( int *bzerror, BZFILE *b, void *buf, int len ); | ||
2011 | </programlisting> | ||
2012 | |||
2013 | <para>Absorbs <computeroutput>len</computeroutput> bytes from the | ||
2014 | buffer <computeroutput>buf</computeroutput>, eventually to be | ||
2015 | compressed and written to the file.</para> | ||
2016 | |||
2017 | <para>Possible assignments to | ||
2018 | <computeroutput>bzerror</computeroutput>:</para> | ||
2019 | |||
2020 | <programlisting> | ||
2021 | BZ_PARAM_ERROR | ||
2022 | if b is NULL or buf is NULL or len < 0 | ||
2023 | BZ_SEQUENCE_ERROR | ||
2024 | if b was opened with BZ2_bzReadOpen | ||
2025 | BZ_IO_ERROR | ||
2026 | if there is an error writing the compressed file. | ||
2027 | BZ_OK | ||
2028 | otherwise | ||
2029 | </programlisting> | ||
2030 | |||
2031 | </sect2> | ||
2032 | |||
2033 | |||
2034 | <sect2 id="bzwriteclose" xreflabel="BZ2_bzWriteClose"> | ||
2035 | <title><computeroutput>BZ2_bzWriteClose</computeroutput></title> | ||
2036 | |||
2037 | <programlisting> | ||
2038 | void BZ2_bzWriteClose( int *bzerror, BZFILE* b, | ||
2039 | int abandon, | ||
2040 | unsigned int* nbytes_in, | ||
2041 | unsigned int* nbytes_out ); | ||
2042 | |||
2043 | void BZ2_bzWriteClose64( int *bzerror, BZFILE* b, | ||
2044 | int abandon, | ||
2045 | unsigned int* nbytes_in_lo32, | ||
2046 | unsigned int* nbytes_in_hi32, | ||
2047 | unsigned int* nbytes_out_lo32, | ||
2048 | unsigned int* nbytes_out_hi32 ); | ||
2049 | </programlisting> | ||
2050 | |||
2051 | <para>Compresses and flushes to the compressed file all data so | ||
2052 | far supplied by <computeroutput>BZ2_bzWrite</computeroutput>. | ||
2053 | The logical end-of-stream markers are also written, so subsequent | ||
2054 | calls to <computeroutput>BZ2_bzWrite</computeroutput> are | ||
2055 | illegal. All memory associated with the compressed file | ||
2056 | <computeroutput>b</computeroutput> is released. | ||
2057 | <computeroutput>fflush</computeroutput> is called on the | ||
2058 | compressed file, but it is not | ||
2059 | <computeroutput>fclose</computeroutput>'d.</para> | ||
2060 | |||
2061 | <para>If <computeroutput>BZ2_bzWriteClose</computeroutput> is | ||
2062 | called to clean up after an error, the only action is to release | ||
2063 | the memory. The library records the error codes issued by | ||
2064 | previous calls, so this situation will be detected automatically. | ||
2065 | There is no attempt to complete the compression operation, nor to | ||
2066 | <computeroutput>fflush</computeroutput> the compressed file. You | ||
2067 | can force this behaviour to happen even in the case of no error, | ||
2068 | by passing a nonzero value to | ||
2069 | <computeroutput>abandon</computeroutput>.</para> | ||
2070 | |||
2071 | <para>If <computeroutput>nbytes_in</computeroutput> is non-null, | ||
2072 | <computeroutput>*nbytes_in</computeroutput> will be set to be the | ||
2073 | total volume of uncompressed data handled. Similarly, | ||
2074 | <computeroutput>nbytes_out</computeroutput> will be set to the | ||
2075 | total volume of compressed data written. For compatibility with | ||
2076 | older versions of the library, | ||
2077 | <computeroutput>BZ2_bzWriteClose</computeroutput> only yields the | ||
2078 | lower 32 bits of these counts. Use | ||
2079 | <computeroutput>BZ2_bzWriteClose64</computeroutput> if you want | ||
2080 | the full 64 bit counts. These two functions are otherwise | ||
2081 | absolutely identical.</para> | ||
2082 | |||
2083 | <para>Possible assignments to | ||
2084 | <computeroutput>bzerror</computeroutput>:</para> | ||
2085 | |||
2086 | <programlisting> | ||
2087 | BZ_SEQUENCE_ERROR | ||
2088 | if b was opened with BZ2_bzReadOpen | ||
2089 | BZ_IO_ERROR | ||
2090 | if there is an error writing the compressed file | ||
2091 | BZ_OK | ||
2092 | otherwise | ||
2093 | </programlisting> | ||
2094 | |||
2095 | </sect2> | ||
2096 | |||
2097 | |||
2098 | <sect2 id="embed" xreflabel="Handling embedded compressed data streams"> | ||
2099 | <title>Handling embedded compressed data streams</title> | ||
2100 | |||
2101 | <para>The high-level library facilitates use of | ||
2102 | <computeroutput>bzip2</computeroutput> data streams which form | ||
2103 | some part of a surrounding, larger data stream.</para> | ||
2104 | |||
2105 | <itemizedlist mark='bullet'> | ||
2106 | |||
2107 | <listitem><para>For writing, the library takes an open file handle, | ||
2108 | writes compressed data to it, | ||
2109 | <computeroutput>fflush</computeroutput>es it but does not | ||
2110 | <computeroutput>fclose</computeroutput> it. The calling | ||
2111 | application can write its own data before and after the | ||
2112 | compressed data stream, using that same file handle.</para></listitem> | ||
2113 | |||
2114 | <listitem><para>Reading is more complex, and the facilities are not as | ||
2115 | general as they could be since generality is hard to reconcile | ||
2116 | with efficiency. <computeroutput>BZ2_bzRead</computeroutput> | ||
2117 | reads from the compressed file in blocks of size | ||
2118 | <computeroutput>BZ_MAX_UNUSED</computeroutput> bytes, and in | ||
2119 | doing so probably will overshoot the logical end of the compressed | ||
2120 | stream. To recover this data once decompression has ended, | ||
2121 | call <computeroutput>BZ2_bzReadGetUnused</computeroutput> after | ||
2122 | the last call of <computeroutput>BZ2_bzRead</computeroutput> | ||
2123 | (the one returning | ||
2124 | <computeroutput>BZ_STREAM_END</computeroutput>) but before | ||
2125 | calling | ||
2126 | <computeroutput>BZ2_bzReadClose</computeroutput>.</para></listitem> | ||
2127 | |||
2128 | </itemizedlist> | ||
2129 | |||
2130 | <para>This mechanism makes it easy to decompress multiple | ||
2131 | <computeroutput>bzip2</computeroutput> streams placed end-to-end. | ||
2132 | At the end of one stream, when | ||
2133 | <computeroutput>BZ2_bzRead</computeroutput> returns | ||
2134 | <computeroutput>BZ_STREAM_END</computeroutput>, call | ||
2135 | <computeroutput>BZ2_bzReadGetUnused</computeroutput> to collect | ||
2136 | the unused data (copy it into your own buffer somewhere). That | ||
2137 | data forms the start of the next compressed stream. To start | ||
2138 | uncompressing that next stream, call | ||
2139 | <computeroutput>BZ2_bzReadOpen</computeroutput> again, feeding in | ||
2140 | the unused data via the <computeroutput>unused</computeroutput> / | ||
2141 | <computeroutput>nUnused</computeroutput> parameters. Keep doing | ||
2142 | this until <computeroutput>BZ_STREAM_END</computeroutput> return | ||
2143 | coincides with the physical end of file | ||
2144 | (<computeroutput>feof(f)</computeroutput>). In this situation | ||
2145 | <computeroutput>BZ2_bzReadGetUnused</computeroutput> will of | ||
2146 | course return no data.</para> | ||
2147 | |||
2148 | <para>This should give some feel for how the high-level interface | ||
2149 | can be used. If you require extra flexibility, you'll have to | ||
2150 | bite the bullet and get to grips with the low-level | ||
2151 | interface.</para> | ||
2152 | |||
2153 | </sect2> | ||
2154 | |||
2155 | |||
2156 | <sect2 id="std-rdwr" xreflabel="Standard file-reading/writing code"> | ||
2157 | <title>Standard file-reading/writing code</title> | ||
2158 | |||
2159 | <para>Here's how you'd write data to a compressed file:</para> | ||
2160 | |||
2161 | <programlisting> | ||
2162 | FILE* f; | ||
2163 | BZFILE* b; | ||
2164 | int nBuf; | ||
2165 | char buf[ /* whatever size you like */ ]; | ||
2166 | int bzerror; | ||
2167 | int nWritten; | ||
2168 | |||
2169 | f = fopen ( "myfile.bz2", "w" ); | ||
2170 | if ( !f ) { | ||
2171 | /* handle error */ | ||
2172 | } | ||
2173 | b = BZ2_bzWriteOpen( &bzerror, f, 9 ); | ||
2174 | if (bzerror != BZ_OK) { | ||
2175 | BZ2_bzWriteClose ( &bzerror, b ); | ||
2176 | /* handle error */ | ||
2177 | } | ||
2178 | |||
2179 | while ( /* condition */ ) { | ||
2180 | /* get data to write into buf, and set nBuf appropriately */ | ||
2181 | BZ2_bzWrite ( &bzerror, b, buf, nBuf ); | ||
2182 | if (bzerror == BZ_IO_ERROR) { | ||
2183 | BZ2_bzWriteClose ( &bzerror, b ); | ||
2184 | /* handle error */ | ||
2185 | } | ||
2186 | } | ||
2187 | |||
2188 | BZ2_bzWriteClose( &bzerror, b ); | ||
2189 | if (bzerror == BZ_IO_ERROR) { | ||
2190 | /* handle error */ | ||
2191 | } | ||
2192 | </programlisting> | ||
2193 | |||
2194 | <para>And to read from a compressed file:</para> | ||
2195 | |||
2196 | <programlisting> | ||
2197 | FILE* f; | ||
2198 | BZFILE* b; | ||
2199 | int nBuf; | ||
2200 | char buf[ /* whatever size you like */ ]; | ||
2201 | int bzerror; | ||
2202 | int nWritten; | ||
2203 | |||
2204 | f = fopen ( "myfile.bz2", "r" ); | ||
2205 | if ( !f ) { | ||
2206 | /* handle error */ | ||
2207 | } | ||
2208 | b = BZ2_bzReadOpen ( &bzerror, f, 0, NULL, 0 ); | ||
2209 | if ( bzerror != BZ_OK ) { | ||
2210 | BZ2_bzReadClose ( &bzerror, b ); | ||
2211 | /* handle error */ | ||
2212 | } | ||
2213 | |||
2214 | bzerror = BZ_OK; | ||
2215 | while ( bzerror == BZ_OK && /* arbitrary other conditions */) { | ||
2216 | nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); | ||
2217 | if ( bzerror == BZ_OK ) { | ||
2218 | /* do something with buf[0 .. nBuf-1] */ | ||
2219 | } | ||
2220 | } | ||
2221 | if ( bzerror != BZ_STREAM_END ) { | ||
2222 | BZ2_bzReadClose ( &bzerror, b ); | ||
2223 | /* handle error */ | ||
2224 | } else { | ||
2225 | BZ2_bzReadClose ( &bzerror, b ); | ||
2226 | } | ||
2227 | </programlisting> | ||
2228 | |||
2229 | </sect2> | ||
2230 | |||
2231 | </sect1> | ||
2232 | |||
2233 | |||
2234 | <sect1 id="util-fns" xreflabel="Utility functions"> | ||
2235 | <title>Utility functions</title> | ||
2236 | |||
2237 | |||
2238 | <sect2 id="bzbufftobuffcompress" xreflabel="BZ2_bzBuffToBuffCompress"> | ||
2239 | <title><computeroutput>BZ2_bzBuffToBuffCompress</computeroutput></title> | ||
2240 | |||
2241 | <programlisting> | ||
2242 | int BZ2_bzBuffToBuffCompress( char* dest, | ||
2243 | unsigned int* destLen, | ||
2244 | char* source, | ||
2245 | unsigned int sourceLen, | ||
2246 | int blockSize100k, | ||
2247 | int verbosity, | ||
2248 | int workFactor ); | ||
2249 | </programlisting> | ||
2250 | |||
2251 | <para>Attempts to compress the data in <computeroutput>source[0 | ||
2252 | .. sourceLen-1]</computeroutput> into the destination buffer, | ||
2253 | <computeroutput>dest[0 .. *destLen-1]</computeroutput>. If the | ||
2254 | destination buffer is big enough, | ||
2255 | <computeroutput>*destLen</computeroutput> is set to the size of | ||
2256 | the compressed data, and <computeroutput>BZ_OK</computeroutput> | ||
2257 | is returned. If the compressed data won't fit, | ||
2258 | <computeroutput>*destLen</computeroutput> is unchanged, and | ||
2259 | <computeroutput>BZ_OUTBUFF_FULL</computeroutput> is | ||
2260 | returned.</para> | ||
2261 | |||
2262 | <para>Compression in this manner is a one-shot event, done with a | ||
2263 | single call to this function. The resulting compressed data is a | ||
2264 | complete <computeroutput>bzip2</computeroutput> format data | ||
2265 | stream. There is no mechanism for making additional calls to | ||
2266 | provide extra input data. If you want that kind of mechanism, | ||
2267 | use the low-level interface.</para> | ||
2268 | |||
2269 | <para>For the meaning of parameters | ||
2270 | <computeroutput>blockSize100k</computeroutput>, | ||
2271 | <computeroutput>verbosity</computeroutput> and | ||
2272 | <computeroutput>workFactor</computeroutput>, see | ||
2273 | <computeroutput>BZ2_bzCompressInit</computeroutput>.</para> | ||
2274 | |||
2275 | <para>To guarantee that the compressed data will fit in its | ||
2276 | buffer, allocate an output buffer of size 1% larger than the | ||
2277 | uncompressed data, plus six hundred extra bytes.</para> | ||
2278 | |||
2279 | <para><computeroutput>BZ2_bzBuffToBuffCompress</computeroutput> | ||
2280 | will not write data at or beyond | ||
2281 | <computeroutput>dest[*destLen]</computeroutput>, even in case of | ||
2282 | buffer overflow.</para> | ||
2283 | |||
2284 | <para>Possible return values:</para> | ||
2285 | |||
2286 | <programlisting> | ||
2287 | BZ_CONFIG_ERROR | ||
2288 | if the library has been mis-compiled | ||
2289 | BZ_PARAM_ERROR | ||
2290 | if dest is NULL or destLen is NULL | ||
2291 | or blockSize100k < 1 or blockSize100k > 9 | ||
2292 | or verbosity < 0 or verbosity > 4 | ||
2293 | or workFactor < 0 or workFactor > 250 | ||
2294 | BZ_MEM_ERROR | ||
2295 | if insufficient memory is available | ||
2296 | BZ_OUTBUFF_FULL | ||
2297 | if the size of the compressed data exceeds *destLen | ||
2298 | BZ_OK | ||
2299 | otherwise | ||
2300 | </programlisting> | ||
2301 | |||
2302 | </sect2> | ||
2303 | |||
2304 | |||
2305 | <sect2 id="bzbufftobuffdecompress" xreflabel="BZ2_bzBuffToBuffDecompress"> | ||
2306 | <title><computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput></title> | ||
2307 | |||
2308 | <programlisting> | ||
2309 | int BZ2_bzBuffToBuffDecompress( char* dest, | ||
2310 | unsigned int* destLen, | ||
2311 | char* source, | ||
2312 | unsigned int sourceLen, | ||
2313 | int small, | ||
2314 | int verbosity ); | ||
2315 | </programlisting> | ||
2316 | |||
2317 | <para>Attempts to decompress the data in <computeroutput>source[0 | ||
2318 | .. sourceLen-1]</computeroutput> into the destination buffer, | ||
2319 | <computeroutput>dest[0 .. *destLen-1]</computeroutput>. If the | ||
2320 | destination buffer is big enough, | ||
2321 | <computeroutput>*destLen</computeroutput> is set to the size of | ||
2322 | the uncompressed data, and <computeroutput>BZ_OK</computeroutput> | ||
2323 | is returned. If the uncompressed data won't fit, | ||
2324 | <computeroutput>*destLen</computeroutput> is unchanged, and | ||
2325 | <computeroutput>BZ_OUTBUFF_FULL</computeroutput> is | ||
2326 | returned.</para> | ||
2327 | |||
2328 | <para><computeroutput>source</computeroutput> is assumed to hold | ||
2329 | a complete <computeroutput>bzip2</computeroutput> format data | ||
2330 | stream. | ||
2331 | <computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput> tries | ||
2332 | to decompress the entirety of the stream into the output | ||
2333 | buffer.</para> | ||
2334 | |||
2335 | <para>For the meaning of parameters | ||
2336 | <computeroutput>small</computeroutput> and | ||
2337 | <computeroutput>verbosity</computeroutput>, see | ||
2338 | <computeroutput>BZ2_bzDecompressInit</computeroutput>.</para> | ||
2339 | |||
2340 | <para>Because the compression ratio of the compressed data cannot | ||
2341 | be known in advance, there is no easy way to guarantee that the | ||
2342 | output buffer will be big enough. You may of course make | ||
2343 | arrangements in your code to record the size of the uncompressed | ||
2344 | data, but such a mechanism is beyond the scope of this | ||
2345 | library.</para> | ||
2346 | |||
2347 | <para><computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput> | ||
2348 | will not write data at or beyond | ||
2349 | <computeroutput>dest[*destLen]</computeroutput>, even in case of | ||
2350 | buffer overflow.</para> | ||
2351 | |||
2352 | <para>Possible return values:</para> | ||
2353 | |||
2354 | <programlisting> | ||
2355 | BZ_CONFIG_ERROR | ||
2356 | if the library has been mis-compiled | ||
2357 | BZ_PARAM_ERROR | ||
2358 | if dest is NULL or destLen is NULL | ||
2359 | or small != 0 && small != 1 | ||
2360 | or verbosity < 0 or verbosity > 4 | ||
2361 | BZ_MEM_ERROR | ||
2362 | if insufficient memory is available | ||
2363 | BZ_OUTBUFF_FULL | ||
2364 | if the size of the uncompressed data exceeds *destLen | ||
2365 | BZ_DATA_ERROR | ||
2366 | if a data integrity error was detected in the compressed data | ||
2367 | BZ_DATA_ERROR_MAGIC | ||
2368 | if the compressed data doesn't begin with the right magic bytes | ||
2369 | BZ_UNEXPECTED_EOF | ||
2370 | if the compressed data ends unexpectedly | ||
2371 | BZ_OK | ||
2372 | otherwise | ||
2373 | </programlisting> | ||
2374 | |||
2375 | </sect2> | ||
2376 | |||
2377 | </sect1> | ||
2378 | |||
2379 | |||
2380 | <sect1 id="zlib-compat" xreflabel="zlib compatibility functions"> | ||
2381 | <title><computeroutput>zlib</computeroutput> compatibility functions</title> | ||
2382 | |||
2383 | <para>Yoshioka Tsuneo has contributed some functions to give | ||
2384 | better <computeroutput>zlib</computeroutput> compatibility. | ||
2385 | These functions are <computeroutput>BZ2_bzopen</computeroutput>, | ||
2386 | <computeroutput>BZ2_bzread</computeroutput>, | ||
2387 | <computeroutput>BZ2_bzwrite</computeroutput>, | ||
2388 | <computeroutput>BZ2_bzflush</computeroutput>, | ||
2389 | <computeroutput>BZ2_bzclose</computeroutput>, | ||
2390 | <computeroutput>BZ2_bzerror</computeroutput> and | ||
2391 | <computeroutput>BZ2_bzlibVersion</computeroutput>. These | ||
2392 | functions are not (yet) officially part of the library. If they | ||
2393 | break, you get to keep all the pieces. Nevertheless, I think | ||
2394 | they work ok.</para> | ||
2395 | |||
2396 | <programlisting> | ||
2397 | typedef void BZFILE; | ||
2398 | |||
2399 | const char * BZ2_bzlibVersion ( void ); | ||
2400 | </programlisting> | ||
2401 | |||
2402 | <para>Returns a string indicating the library version.</para> | ||
2403 | |||
2404 | <programlisting> | ||
2405 | BZFILE * BZ2_bzopen ( const char *path, const char *mode ); | ||
2406 | BZFILE * BZ2_bzdopen ( int fd, const char *mode ); | ||
2407 | </programlisting> | ||
2408 | |||
2409 | <para>Opens a <computeroutput>.bz2</computeroutput> file for | ||
2410 | reading or writing, using either its name or a pre-existing file | ||
2411 | descriptor. Analogous to <computeroutput>fopen</computeroutput> | ||
2412 | and <computeroutput>fdopen</computeroutput>.</para> | ||
2413 | |||
2414 | <programlisting> | ||
2415 | int BZ2_bzread ( BZFILE* b, void* buf, int len ); | ||
2416 | int BZ2_bzwrite ( BZFILE* b, void* buf, int len ); | ||
2417 | </programlisting> | ||
2418 | |||
2419 | <para>Reads/writes data from/to a previously opened | ||
2420 | <computeroutput>BZFILE</computeroutput>. Analogous to | ||
2421 | <computeroutput>fread</computeroutput> and | ||
2422 | <computeroutput>fwrite</computeroutput>.</para> | ||
2423 | |||
2424 | <programlisting> | ||
2425 | int BZ2_bzflush ( BZFILE* b ); | ||
2426 | void BZ2_bzclose ( BZFILE* b ); | ||
2427 | </programlisting> | ||
2428 | |||
2429 | <para>Flushes/closes a <computeroutput>BZFILE</computeroutput>. | ||
2430 | <computeroutput>BZ2_bzflush</computeroutput> doesn't actually do | ||
2431 | anything. Analogous to <computeroutput>fflush</computeroutput> | ||
2432 | and <computeroutput>fclose</computeroutput>.</para> | ||
2433 | |||
2434 | <programlisting> | ||
2435 | const char * BZ2_bzerror ( BZFILE *b, int *errnum ); | ||
2436 | </programlisting> | ||
2437 | |||
2438 | <para>Returns a string describing the most recent error status of | ||
2439 | <computeroutput>b</computeroutput>, and also sets | ||
2440 | <computeroutput>*errnum</computeroutput> to its numerical | ||
2441 | value.</para> | ||
2442 | |||
2443 | </sect1> | ||
2444 | |||
2445 | |||
2446 | <sect1 id="stdio-free" | ||
2447 | xreflabel="Using the library in a stdio-free environment"> | ||
2448 | <title>Using the library in a <computeroutput>stdio</computeroutput>-free environment</title> | ||
2449 | |||
2450 | |||
2451 | <sect2 id="stdio-bye" xreflabel="Getting rid of stdio"> | ||
2452 | <title>Getting rid of <computeroutput>stdio</computeroutput></title> | ||
2453 | |||
2454 | <para>In a deeply embedded application, you might want to use | ||
2455 | just the memory-to-memory functions. You can do this | ||
2456 | conveniently by compiling the library with preprocessor symbol | ||
2457 | <computeroutput>BZ_NO_STDIO</computeroutput> defined. Doing this | ||
2458 | gives you a library containing only the following eight | ||
2459 | functions:</para> | ||
2460 | |||
2461 | <para><computeroutput>BZ2_bzCompressInit</computeroutput>, | ||
2462 | <computeroutput>BZ2_bzCompress</computeroutput>, | ||
2463 | <computeroutput>BZ2_bzCompressEnd</computeroutput>, | ||
2464 | <computeroutput>BZ2_bzDecompressInit</computeroutput>, | ||
2465 | <computeroutput>BZ2_bzDecompress</computeroutput>, | ||
2466 | <computeroutput>BZ2_bzDecompressEnd</computeroutput>, | ||
2467 | <computeroutput>BZ2_bzBuffToBuffCompress</computeroutput>, | ||
2468 | <computeroutput>BZ2_bzBuffToBuffDecompress</computeroutput></para> | ||
2469 | |||
2470 | <para>When compiled like this, all functions will ignore | ||
2471 | <computeroutput>verbosity</computeroutput> settings.</para> | ||
2472 | |||
2473 | </sect2> | ||
2474 | |||
2475 | |||
2476 | <sect2 id="critical-error" xreflabel="Critical error handling"> | ||
2477 | <title>Critical error handling</title> | ||
2478 | |||
2479 | <para><computeroutput>libbzip2</computeroutput> contains a number | ||
2480 | of internal assertion checks which should, needless to say, never | ||
2481 | be activated. Nevertheless, if an assertion should fail, | ||
2482 | behaviour depends on whether or not the library was compiled with | ||
2483 | <computeroutput>BZ_NO_STDIO</computeroutput> set.</para> | ||
2484 | |||
2485 | <para>For a normal compile, an assertion failure yields the | ||
2486 | message:</para> | ||
2487 | |||
2488 | <blockquote> | ||
2489 | <para>bzip2/libbzip2: internal error number N.</para> | ||
2490 | <para>This is a bug in bzip2/libbzip2, &bz-version; of &bz-date;. | ||
2491 | Please report it to me at: &bz-email;. If this happened | ||
2492 | when you were using some program which uses libbzip2 as a | ||
2493 | component, you should also report this bug to the author(s) | ||
2494 | of that program. Please make an effort to report this bug; | ||
2495 | timely and accurate bug reports eventually lead to higher | ||
2496 | quality software. Thanks. Julian Seward, &bz-date;. | ||
2497 | </para></blockquote> | ||
2498 | |||
2499 | <para>where <computeroutput>N</computeroutput> is some error code | ||
2500 | number. If <computeroutput>N == 1007</computeroutput>, it also | ||
2501 | prints some extra text advising the reader that unreliable memory | ||
2502 | is often associated with internal error 1007. (This is a | ||
2503 | frequently observed phenomenon with versions 1.0.0/1.0.1).</para> | ||
2504 | |||
2505 | <para><computeroutput>exit(3)</computeroutput> is then | ||
2506 | called.</para> | ||
2507 | |||
2508 | <para>For a <computeroutput>stdio</computeroutput>-free library, | ||
2509 | assertion failures result in a call to a function declared | ||
2510 | as:</para> | ||
2511 | |||
2512 | <programlisting> | ||
2513 | extern void bz_internal_error ( int errcode ); | ||
2514 | </programlisting> | ||
2515 | |||
2516 | <para>The relevant code is passed as a parameter. You should | ||
2517 | supply such a function.</para> | ||
2518 | |||
2519 | <para>In either case, once an assertion failure has occurred, any | ||
2520 | <computeroutput>bz_stream</computeroutput> records involved can | ||
2521 | be regarded as invalid. You should not attempt to resume normal | ||
2522 | operation with them.</para> | ||
2523 | |||
2524 | <para>You may, of course, change critical error handling to suit | ||
2525 | your needs. As I said above, critical errors indicate bugs in | ||
2526 | the library and should not occur. All "normal" error situations | ||
2527 | are indicated via error return codes from functions, and can be | ||
2528 | recovered from.</para> | ||
2529 | |||
2530 | </sect2> | ||
2531 | |||
2532 | </sect1> | ||
2533 | |||
2534 | |||
2535 | <sect1 id="win-dll" xreflabel="Making a Windows DLL"> | ||
2536 | <title>Making a Windows DLL</title> | ||
2537 | |||
2538 | <para>Everything related to Windows has been contributed by | ||
2539 | Yoshioka Tsuneo | ||
2540 | (<computeroutput>QWF00133@niftyserve.or.jp</computeroutput> / | ||
2541 | <computeroutput>tsuneo-y@is.aist-nara.ac.jp</computeroutput>), so | ||
2542 | you should send your queries to him (but perhaps Cc: me, | ||
2543 | <computeroutput>&bz-email;</computeroutput>).</para> | ||
2544 | |||
2545 | <para>My vague understanding of what to do is: using Visual C++ | ||
2546 | 5.0, open the project file | ||
2547 | <computeroutput>libbz2.dsp</computeroutput>, and build. That's | ||
2548 | all.</para> | ||
2549 | |||
2550 | <para>If you can't open the project file for some reason, make a | ||
2551 | new one, naming these files: | ||
2552 | <computeroutput>blocksort.c</computeroutput>, | ||
2553 | <computeroutput>bzlib.c</computeroutput>, | ||
2554 | <computeroutput>compress.c</computeroutput>, | ||
2555 | <computeroutput>crctable.c</computeroutput>, | ||
2556 | <computeroutput>decompress.c</computeroutput>, | ||
2557 | <computeroutput>huffman.c</computeroutput>, | ||
2558 | <computeroutput>randtable.c</computeroutput> and | ||
2559 | <computeroutput>libbz2.def</computeroutput>. You will also need | ||
2560 | to name the header files <computeroutput>bzlib.h</computeroutput> | ||
2561 | and <computeroutput>bzlib_private.h</computeroutput>.</para> | ||
2562 | |||
2563 | <para>If you don't use VC++, you may need to define the | ||
2564 | preprocessor symbol | ||
2565 | <computeroutput>_WIN32</computeroutput>.</para> | ||
2566 | |||
2567 | <para>Finally, <computeroutput>dlltest.c</computeroutput> is a | ||
2568 | sample program using the DLL. It has a project file, | ||
2569 | <computeroutput>dlltest.dsp</computeroutput>.</para> | ||
2570 | |||
2571 | <para>If you just want a makefile for Visual C, have a look at | ||
2572 | <computeroutput>makefile.msc</computeroutput>.</para> | ||
2573 | |||
2574 | <para>Be aware that if you compile | ||
2575 | <computeroutput>bzip2</computeroutput> itself on Win32, you must | ||
2576 | set <computeroutput>BZ_UNIX</computeroutput> to 0 and | ||
2577 | <computeroutput>BZ_LCCWIN32</computeroutput> to 1, in the file | ||
2578 | <computeroutput>bzip2.c</computeroutput>, before compiling. | ||
2579 | Otherwise the resulting binary won't work correctly.</para> | ||
2580 | |||
2581 | <para>I haven't tried any of this stuff myself, but it all looks | ||
2582 | plausible.</para> | ||
2583 | |||
2584 | </sect1> | ||
2585 | |||
2586 | </chapter> | ||
2587 | |||
2588 | |||
2589 | |||
2590 | <chapter id="misc" xreflabel="Miscellanea"> | ||
2591 | <title>Miscellanea</title> | ||
2592 | |||
2593 | <para>These are just some random thoughts of mine. Your mileage | ||
2594 | may vary.</para> | ||
2595 | |||
2596 | |||
2597 | <sect1 id="limits" xreflabel="Limitations of the compressed file format"> | ||
2598 | <title>Limitations of the compressed file format</title> | ||
2599 | |||
2600 | <para><computeroutput>bzip2-1.0.X</computeroutput>, | ||
2601 | <computeroutput>0.9.5</computeroutput> and | ||
2602 | <computeroutput>0.9.0</computeroutput> use exactly the same file | ||
2603 | format as the original version, | ||
2604 | <computeroutput>bzip2-0.1</computeroutput>. This decision was | ||
2605 | made in the interests of stability. Creating yet another | ||
2606 | incompatible compressed file format would create further | ||
2607 | confusion and disruption for users.</para> | ||
2608 | |||
2609 | <para>Nevertheless, this is not a painless decision. Development | ||
2610 | work since the release of | ||
2611 | <computeroutput>bzip2-0.1</computeroutput> in August 1997 has | ||
2612 | shown complexities in the file format which slow down | ||
2613 | decompression and, in retrospect, are unnecessary. These | ||
2614 | are:</para> | ||
2615 | |||
2616 | <itemizedlist mark='bullet'> | ||
2617 | |||
2618 | <listitem><para>The run-length encoder, which is the first of the | ||
2619 | compression transformations, is entirely irrelevant. The | ||
2620 | original purpose was to protect the sorting algorithm from the | ||
2621 | very worst case input: a string of repeated symbols. But | ||
2622 | algorithm steps Q6a and Q6b in the original Burrows-Wheeler | ||
2623 | technical report (SRC-124) show how repeats can be handled | ||
2624 | without difficulty in block sorting.</para></listitem> | ||
2625 | |||
2626 | <listitem><para>The randomisation mechanism doesn't really need to be | ||
2627 | there. Udi Manber and Gene Myers published a suffix array | ||
2628 | construction algorithm a few years back, which can be employed | ||
2629 | to sort any block, no matter how repetitive, in O(N log N) | ||
2630 | time. Subsequent work by Kunihiko Sadakane has produced a | ||
2631 | derivative O(N (log N)^2) algorithm which usually outperforms | ||
2632 | the Manber-Myers algorithm.</para> | ||
2633 | |||
2634 | <para>I could have changed to Sadakane's algorithm, but I find | ||
2635 | it to be slower than <computeroutput>bzip2</computeroutput>'s | ||
2636 | existing algorithm for most inputs, and the randomisation | ||
2637 | mechanism protects adequately against bad cases. I didn't | ||
2638 | think it was a good tradeoff to make. Partly this is due to | ||
2639 | the fact that I was not flooded with email complaints about | ||
2640 | <computeroutput>bzip2-0.1</computeroutput>'s performance on | ||
2641 | repetitive data, so perhaps it isn't a problem for real | ||
2642 | inputs.</para> | ||
2643 | |||
2644 | <para>Probably the best long-term solution, and the one I have | ||
2645 | incorporated into 0.9.5 and above, is to use the existing | ||
2646 | sorting algorithm initially, and fall back to a O(N (log N)^2) | ||
2647 | algorithm if the standard algorithm gets into | ||
2648 | difficulties.</para></listitem> | ||
2649 | |||
2650 | <listitem><para>The compressed file format was never designed to be | ||
2651 | handled by a library, and I have had to jump through some hoops | ||
2652 | to produce an efficient implementation of decompression. It's | ||
2653 | a bit hairy. Try passing | ||
2654 | <computeroutput>decompress.c</computeroutput> through the C | ||
2655 | preprocessor and you'll see what I mean. Much of this | ||
2656 | complexity could have been avoided if the compressed size of | ||
2657 | each block of data was recorded in the data stream.</para></listitem> | ||
2658 | |||
2659 | <listitem><para>An Adler-32 checksum, rather than a CRC32 checksum, | ||
2660 | would be faster to compute.</para></listitem> | ||
2661 | |||
2662 | </itemizedlist> | ||
2663 | |||
2664 | <para>It would be fair to say that the | ||
2665 | <computeroutput>bzip2</computeroutput> format was frozen before I | ||
2666 | properly and fully understood the performance consequences of | ||
2667 | doing so.</para> | ||
2668 | |||
2669 | <para>Improvements which I was able to incorporate into 0.9.0, | ||
2670 | despite using the same file format, are:</para> | ||
2671 | |||
2672 | <itemizedlist mark='bullet'> | ||
2673 | |||
2674 | <listitem><para>Single array implementation of the inverse BWT. This | ||
2675 | significantly speeds up decompression, presumably because it | ||
2676 | reduces the number of cache misses.</para></listitem> | ||
2677 | |||
2678 | <listitem><para>Faster inverse MTF transform for large MTF values. | ||
2679 | The new implementation is based on the notion of sliding blocks | ||
2680 | of values.</para></listitem> | ||
2681 | |||
2682 | <listitem><para><computeroutput>bzip2-0.9.0</computeroutput> now reads | ||
2683 | and writes files with <computeroutput>fread</computeroutput> | ||
2684 | and <computeroutput>fwrite</computeroutput>; version 0.1 used | ||
2685 | <computeroutput>putc</computeroutput> and | ||
2686 | <computeroutput>getc</computeroutput>. Duh! Well, you live | ||
2687 | and learn.</para></listitem> | ||
2688 | |||
2689 | </itemizedlist> | ||
2690 | |||
2691 | <para>Further ahead, it would be nice to be able to do random | ||
2692 | access into files. This will require some careful design of | ||
2693 | compressed file formats.</para> | ||
2694 | |||
2695 | </sect1> | ||
2696 | |||
2697 | |||
2698 | <sect1 id="port-issues" xreflabel="Portability issues"> | ||
2699 | <title>Portability issues</title> | ||
2700 | |||
2701 | <para>After some consideration, I have decided not to use GNU | ||
2702 | <computeroutput>autoconf</computeroutput> to configure 0.9.5 or | ||
2703 | 1.0.</para> | ||
2704 | |||
2705 | <para><computeroutput>autoconf</computeroutput>, admirable and | ||
2706 | wonderful though it is, mainly assists with portability problems | ||
2707 | between Unix-like platforms. But | ||
2708 | <computeroutput>bzip2</computeroutput> doesn't have much in the | ||
2709 | way of portability problems on Unix; most of the difficulties | ||
2710 | appear when porting to the Mac, or to Microsoft's operating | ||
2711 | systems. <computeroutput>autoconf</computeroutput> doesn't help | ||
2712 | in those cases, and brings in a whole load of new | ||
2713 | complexity.</para> | ||
2714 | |||
2715 | <para>Most people should be able to compile the library and | ||
2716 | program under Unix straight out-of-the-box, so to speak, | ||
2717 | especially if you have a version of GNU C available.</para> | ||
2718 | |||
2719 | <para>There are a couple of | ||
2720 | <computeroutput>__inline__</computeroutput> directives in the | ||
2721 | code. GNU C (<computeroutput>gcc</computeroutput>) should be | ||
2722 | able to handle them. If you're not using GNU C, your C compiler | ||
2723 | shouldn't see them at all. If your compiler does, for some | ||
2724 | reason, see them and doesn't like them, just | ||
2725 | <computeroutput>#define</computeroutput> | ||
2726 | <computeroutput>__inline__</computeroutput> to be | ||
2727 | <computeroutput>/* */</computeroutput>. One easy way to do this | ||
2728 | is to compile with the flag | ||
2729 | <computeroutput>-D__inline__=</computeroutput>, which should be | ||
2730 | understood by most Unix compilers.</para> | ||
2731 | |||
2732 | <para>If you still have difficulties, try compiling with the | ||
2733 | macro <computeroutput>BZ_STRICT_ANSI</computeroutput> defined. | ||
2734 | This should enable you to build the library in a strictly ANSI | ||
2735 | compliant environment. Building the program itself like this is | ||
2736 | dangerous and not supported, since you remove | ||
2737 | <computeroutput>bzip2</computeroutput>'s checks against | ||
2738 | compressing directories, symbolic links, devices, and other | ||
2739 | not-really-a-file entities. This could cause filesystem | ||
2740 | corruption!</para> | ||
2741 | |||
2742 | <para>One other thing: if you create a | ||
2743 | <computeroutput>bzip2</computeroutput> binary for public distribution, | ||
2744 | please consider linking it statically (<computeroutput>gcc | ||
2745 | -static</computeroutput>). This avoids all sorts of library-version | ||
2746 | issues that others may encounter later on.</para> | ||
2747 | |||
2748 | <para>If you build <computeroutput>bzip2</computeroutput> on | ||
2749 | Win32, you must set <computeroutput>BZ_UNIX</computeroutput> to 0 | ||
2750 | and <computeroutput>BZ_LCCWIN32</computeroutput> to 1, in the | ||
2751 | file <computeroutput>bzip2.c</computeroutput>, before compiling. | ||
2752 | Otherwise the resulting binary won't work correctly.</para> | ||
2753 | |||
2754 | </sect1> | ||
2755 | |||
2756 | |||
2757 | <sect1 id="bugs" xreflabel="Reporting bugs"> | ||
2758 | <title>Reporting bugs</title> | ||
2759 | |||
2760 | <para>I tried pretty hard to make sure | ||
2761 | <computeroutput>bzip2</computeroutput> is bug free, both by | ||
2762 | design and by testing. Hopefully you'll never need to read this | ||
2763 | section for real.</para> | ||
2764 | |||
2765 | <para>Nevertheless, if <computeroutput>bzip2</computeroutput> dies | ||
2766 | with a segmentation fault, a bus error or an internal assertion | ||
2767 | failure, it will ask you to email me a bug report. Experience from | ||
2768 | years of feedback from bzip2 users indicates that almost all these | ||
2769 | problems can be traced to either compiler bugs or hardware | ||
2770 | problems.</para> | ||
2771 | |||
2772 | <itemizedlist mark='bullet'> | ||
2773 | |||
2774 | <listitem><para>Recompile the program with no optimisation, and | ||
2775 | see if it works. And/or try a different compiler. I heard all | ||
2776 | sorts of stories about various flavours of GNU C (and other | ||
2777 | compilers) generating bad code for | ||
2778 | <computeroutput>bzip2</computeroutput>, and I've run across two | ||
2779 | such examples myself.</para> | ||
2780 | |||
2781 | <para>2.7.X versions of GNU C are known to generate bad code | ||
2782 | from time to time, at high optimisation levels. If you get | ||
2783 | problems, try using the flags | ||
2784 | <computeroutput>-O2</computeroutput> | ||
2785 | <computeroutput>-fomit-frame-pointer</computeroutput> | ||
2786 | <computeroutput>-fno-strength-reduce</computeroutput>. You | ||
2787 | should specifically <emphasis>not</emphasis> use | ||
2788 | <computeroutput>-funroll-loops</computeroutput>.</para> | ||
2789 | |||
2790 | <para>You may notice that the Makefile runs six tests as part | ||
2791 | of the build process. If the program passes all of these, it's | ||
2792 | a pretty good (but not 100%) indication that the compiler has | ||
2793 | done its job correctly.</para></listitem> | ||
2794 | |||
2795 | <listitem><para>If <computeroutput>bzip2</computeroutput> | ||
2796 | crashes randomly, and the crashes are not repeatable, you may | ||
2797 | have a flaky memory subsystem. | ||
2798 | <computeroutput>bzip2</computeroutput> really hammers your | ||
2799 | memory hierarchy, and if it's a bit marginal, you may get these | ||
2800 | problems. Ditto if your disk or I/O subsystem is slowly | ||
2801 | failing. Yup, this really does happen.</para> | ||
2802 | |||
2803 | <para>Try using a different machine of the same type, and see | ||
2804 | if you can repeat the problem.</para></listitem> | ||
2805 | |||
2806 | <listitem><para>This isn't really a bug, but ... If | ||
2807 | <computeroutput>bzip2</computeroutput> tells you your file is | ||
2808 | corrupted on decompression, and you obtained the file via FTP, | ||
2809 | there is a possibility that you forgot to tell FTP to do a | ||
2810 | binary mode transfer. That absolutely will cause the file to | ||
2811 | be non-decompressible. You'll have to transfer it | ||
2812 | again.</para></listitem> | ||
2813 | |||
2814 | </itemizedlist> | ||
2815 | |||
2816 | <para>If you've incorporated | ||
2817 | <computeroutput>libbzip2</computeroutput> into your own program | ||
2818 | and are getting problems, please, please, please, check that the | ||
2819 | parameters you are passing in calls to the library are correct, | ||
2820 | and in accordance with what the documentation says is allowable. | ||
2821 | I have tried to make the library robust against such problems, | ||
2822 | but I'm sure I haven't succeeded.</para> | ||
2823 | |||
2824 | <para>Finally, if the above comments don't help, you'll have to | ||
2825 | send me a bug report. Now, it's just amazing how many people | ||
2826 | will send me a bug report saying something like:</para> | ||
2827 | |||
2828 | <programlisting> | ||
2829 | bzip2 crashed with segmentation fault on my machine | ||
2830 | </programlisting> | ||
2831 | |||
2832 | <para>and absolutely nothing else. Needless to say, such a | ||
2833 | report is <emphasis>totally, utterly, completely and | ||
2834 | comprehensively 100% useless; a waste of your time, my time, and | ||
2835 | net bandwidth</emphasis>. With no details at all, there's no way | ||
2836 | I can possibly begin to figure out what the problem is.</para> | ||
2837 | |||
2838 | <para>The rules of the game are: facts, facts, facts. Don't omit | ||
2839 | them because "oh, they won't be relevant". At the bare | ||
2840 | minimum:</para> | ||
2841 | |||
2842 | <programlisting> | ||
2843 | Machine type. Operating system version. | ||
2844 | Exact version of bzip2 (do bzip2 -V). | ||
2845 | Exact version of the compiler used. | ||
2846 | Flags passed to the compiler. | ||
2847 | </programlisting> | ||
2848 | |||
2849 | <para>However, the most important single thing that will help me | ||
2850 | is the file that you were trying to compress or decompress at the | ||
2851 | time the problem happened. Without that, my ability to do | ||
2852 | anything more than speculate about the cause is limited.</para> | ||
2853 | |||
2854 | </sect1> | ||
2855 | |||
2856 | |||
2857 | <sect1 id="package" xreflabel="Did you get the right package?"> | ||
2858 | <title>Did you get the right package?</title> | ||
2859 | |||
2860 | <para><computeroutput>bzip2</computeroutput> is a resource hog. | ||
2861 | It soaks up large amounts of CPU cycles and memory. Also, it | ||
2862 | gives very large latencies. In the worst case, you can feed many | ||
2863 | megabytes of uncompressed data into the library before getting | ||
2864 | any compressed output, so this probably rules out applications | ||
2865 | requiring interactive behaviour.</para> | ||
2866 | |||
2867 | <para>These aren't faults of my implementation, I hope, but more | ||
2868 | an intrinsic property of the Burrows-Wheeler transform | ||
2869 | (unfortunately). Maybe this isn't what you want.</para> | ||
2870 | |||
2871 | <para>If you want a compressor and/or library which is faster, | ||
2872 | uses less memory but gets pretty good compression, and has | ||
2873 | minimal latency, consider Jean-loup Gailly's and Mark Adler's | ||
2874 | work, <computeroutput>zlib-1.2.1</computeroutput> and | ||
2875 | <computeroutput>gzip-1.2.4</computeroutput>. Look for them at | ||
2876 | <ulink url="http://www.zlib.org">http://www.zlib.org</ulink> and | ||
2877 | <ulink url="http://www.gzip.org">http://www.gzip.org</ulink> | ||
2878 | respectively.</para> | ||
2879 | |||
2880 | <para>For something faster and lighter still, you might try Markus F | ||
2881 | X J Oberhumer's <computeroutput>LZO</computeroutput> real-time | ||
2882 | compression/decompression library, at | ||
2883 | <ulink url="http://www.oberhumer.com/opensource">http://www.oberhumer.com/opensource</ulink>.</para> | ||
2884 | |||
2885 | </sect1> | ||
2886 | |||
2887 | |||
2888 | |||
2889 | <sect1 id="reading" xreflabel="Further Reading"> | ||
2890 | <title>Further Reading</title> | ||
2891 | |||
2892 | <para><computeroutput>bzip2</computeroutput> is not research | ||
2893 | work, in the sense that it doesn't present any new ideas. | ||
2894 | Rather, it's an engineering exercise based on existing | ||
2895 | ideas.</para> | ||
2896 | |||
2897 | <para>Four documents describe essentially all the ideas behind | ||
2898 | <computeroutput>bzip2</computeroutput>:</para> | ||
2899 | |||
2900 | <literallayout>Michael Burrows and D. J. Wheeler: | ||
2901 | "A block-sorting lossless data compression algorithm" | ||
2902 | 10th May 1994. | ||
2903 | Digital SRC Research Report 124. | ||
2904 | ftp://ftp.digital.com/pub/DEC/SRC/research-reports/SRC-124.ps.gz | ||
2905 | If you have trouble finding it, try searching at the | ||
2906 | New Zealand Digital Library, http://www.nzdl.org. | ||
2907 | |||
2908 | Daniel S. Hirschberg and Debra A. Lelewer | ||
2909 | "Efficient Decoding of Prefix Codes" | ||
2910 | Communications of the ACM, April 1990, Vol 33, Number 4. | ||
2911 | You might be able to get an electronic copy of this | ||
2912 | from the ACM Digital Library. | ||
2913 | |||
2914 | David J. Wheeler | ||
2915 | Program bred3.c and accompanying document bred3.ps. | ||
2916 | This contains the idea behind the multi-table Huffman coding scheme. | ||
2917 | ftp://ftp.cl.cam.ac.uk/users/djw3/ | ||
2918 | |||
2919 | Jon L. Bentley and Robert Sedgewick | ||
2920 | "Fast Algorithms for Sorting and Searching Strings" | ||
2921 | Available from Sedgewick's web page, | ||
2922 | www.cs.princeton.edu/~rs | ||
2923 | </literallayout> | ||
2924 | |||
2925 | <para>The following paper gives valuable additional insights into | ||
2926 | the algorithm, but is not immediately the basis of any code used | ||
2927 | in bzip2.</para> | ||
2928 | |||
2929 | <literallayout>Peter Fenwick: | ||
2930 | Block Sorting Text Compression | ||
2931 | Proceedings of the 19th Australasian Computer Science Conference, | ||
2932 | Melbourne, Australia. Jan 31 - Feb 2, 1996. | ||
2933 | ftp://ftp.cs.auckland.ac.nz/pub/peter-f/ACSC96paper.ps</literallayout> | ||
2934 | |||
2935 | <para>Kunihiko Sadakane's sorting algorithm, mentioned above, is | ||
2936 | available from:</para> | ||
2937 | |||
2938 | <literallayout>http://naomi.is.s.u-tokyo.ac.jp/~sada/papers/Sada98b.ps.gz | ||
2939 | </literallayout> | ||
2940 | |||
2941 | <para>The Manber-Myers suffix array construction algorithm is | ||
2942 | described in a paper available from:</para> | ||
2943 | |||
2944 | <literallayout>http://www.cs.arizona.edu/people/gene/PAPERS/suffix.ps | ||
2945 | </literallayout> | ||
2946 | |||
2947 | <para>Finally, the following papers document some | ||
2948 | investigations I made into the performance of sorting | ||
2949 | and decompression algorithms:</para> | ||
2950 | |||
2951 | <literallayout>Julian Seward | ||
2952 | On the Performance of BWT Sorting Algorithms | ||
2953 | Proceedings of the IEEE Data Compression Conference 2000 | ||
2954 | Snowbird, Utah. 28-30 March 2000. | ||
2955 | |||
2956 | Julian Seward | ||
2957 | Space-time Tradeoffs in the Inverse B-W Transform | ||
2958 | Proceedings of the IEEE Data Compression Conference 2001 | ||
2959 | Snowbird, Utah. 27-29 March 2001. | ||
2960 | </literallayout> | ||
2961 | |||
2962 | </sect1> | ||
2963 | |||
2964 | </chapter> | ||
2965 | |||
2966 | </book> | ||
diff --git a/randtable.c b/randtable.c index 5c922e9..940462d 100644 --- a/randtable.c +++ b/randtable.c | |||
@@ -8,7 +8,7 @@ | |||
8 | This file is a part of bzip2 and/or libbzip2, a program and | 8 | This file is a part of bzip2 and/or libbzip2, a program and |
9 | library for lossless, block-sorting data compression. | 9 | library for lossless, block-sorting data compression. |
10 | 10 | ||
11 | Copyright (C) 1996-2002 Julian R Seward. All rights reserved. | 11 | Copyright (C) 1996-2005 Julian R Seward. All rights reserved. |
12 | 12 | ||
13 | Redistribution and use in source and binary forms, with or without | 13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions | 14 | modification, are permitted provided that the following conditions |
@@ -42,7 +42,7 @@ | |||
42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 42 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 | 43 | ||
44 | Julian Seward, Cambridge, UK. | 44 | Julian Seward, Cambridge, UK. |
45 | jseward@acm.org | 45 | jseward@bzip.org |
46 | bzip2/libbzip2 version 1.0 of 21 March 2000 | 46 | bzip2/libbzip2 version 1.0 of 21 March 2000 |
47 | 47 | ||
48 | This program is based on (at least) the work of: | 48 | This program is based on (at least) the work of: |
diff --git a/xmlproc.sh b/xmlproc.sh new file mode 100755 index 0000000..6fe4d57 --- /dev/null +++ b/xmlproc.sh | |||
@@ -0,0 +1,99 @@ | |||
1 | #!/bin/bash | ||
2 | # see README.XML.STUFF in this directory for usage etc. | ||
3 | |||
4 | usage() { | ||
5 | echo ''; | ||
6 | echo 'Usage: xmlproc.sh -[option] <filename.xml>'; | ||
7 | echo 'Specify a target from:'; | ||
8 | echo '-v verify xml file conforms to dtd'; | ||
9 | echo '-html output in html format (single file)'; | ||
10 | echo '-ps output in postscript format'; | ||
11 | echo '-pdf output in pdf format'; | ||
12 | exit; | ||
13 | } | ||
14 | |||
15 | if test $# -ne 2; then | ||
16 | usage | ||
17 | fi | ||
18 | # assign the variable for the output type | ||
19 | action=$1; shift | ||
20 | # assign the input xml filename | ||
21 | xmlfile=$1; shift | ||
22 | # and check user input is correct | ||
23 | if !(test -f $xmlfile); then | ||
24 | echo "No such file: $xmlfile"; | ||
25 | exit; | ||
26 | fi | ||
27 | # some other stuff we will use | ||
28 | OUT=output | ||
29 | xsl_fo=bz-fo.xsl | ||
30 | xsl_html=bz-html.xsl | ||
31 | |||
32 | basename=$xmlfile | ||
33 | basename=${basename//'.xml'/''} | ||
34 | |||
35 | fofile="${basename}.fo" | ||
36 | htmlfile="${basename}.html" | ||
37 | pdffile="${basename}.pdf" | ||
38 | psfile="${basename}.ps" | ||
39 | xmlfmtfile="${basename}.fmt" | ||
40 | |||
41 | # first process the xmlfile with CDATA tags | ||
42 | ./format.pl $xmlfile $xmlfmtfile | ||
43 | # so the shell knows where the catalogs live | ||
44 | export XML_CATALOG_FILES=/etc/xml/catalog | ||
45 | |||
46 | # post-processing tidy up | ||
47 | cleanup() { | ||
48 | echo "Cleaning up: # $@" | ||
49 | while [ $# != 0 ] | ||
50 | do | ||
51 | arg=$1; shift; | ||
52 | echo " deleting $arg"; | ||
53 | rm $arg | ||
54 | done | ||
55 | } | ||
56 | |||
57 | case $action in | ||
58 | -v) | ||
59 | flags='--noout --xinclude --noblanks --postvalid' | ||
60 | dtd='--dtdvalid http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd' | ||
61 | xmllint $flags $dtd $xmlfmtfile 2> $OUT | ||
62 | egrep 'error' $OUT | ||
63 | rm $OUT | ||
64 | ;; | ||
65 | |||
66 | -html) | ||
67 | echo "Creating $htmlfile ..." | ||
68 | xsltproc --nonet --xinclude -o $htmlfile $xsl_html $xmlfmtfile | ||
69 | cleanup $xmlfmtfile | ||
70 | ;; | ||
71 | |||
72 | -pdf) | ||
73 | echo "Creating $pdffile ..." | ||
74 | xsltproc --nonet --xinclude -o $fofile $xsl_fo $xmlfmtfile | ||
75 | pdfxmltex $fofile >$OUT </dev/null | ||
76 | pdfxmltex $fofile >$OUT </dev/null | ||
77 | pdfxmltex $fofile >$OUT </dev/null | ||
78 | cleanup $OUT $xmlfmtfile *.aux *.fo *.log *.out | ||
79 | ;; | ||
80 | |||
81 | -ps) | ||
82 | echo "Creating $psfile ..." | ||
83 | xsltproc --nonet --xinclude -o $fofile $xsl_fo $xmlfmtfile | ||
84 | pdfxmltex $fofile >$OUT </dev/null | ||
85 | pdfxmltex $fofile >$OUT </dev/null | ||
86 | pdfxmltex $fofile >$OUT </dev/null | ||
87 | pdftops $pdffile $psfile | ||
88 | cleanup $OUT $xmlfmtfile $pdffile *.aux *.fo *.log *.out | ||
89 | # passivetex is broken, so we can't go this route yet. | ||
90 | # xmltex $fofile >$OUT </dev/null | ||
91 | # xmltex $fofile >$OUT </dev/null | ||
92 | # xmltex $fofile >$OUT </dev/null | ||
93 | # dvips -R -q -o bzip-manual.ps *.dvi | ||
94 | ;; | ||
95 | |||
96 | *) | ||
97 | usage | ||
98 | ;; | ||
99 | esac | ||