summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2016-11-05 08:43:29 -0700
committerMark Adler <madler@alumni.caltech.edu>2016-12-04 07:48:47 -0800
commit9dc5a8585f429109ef1948ab71b6b71bfa7181e2 (patch)
tree334ecf8e4d86004a967bef2b6f9901792d64934a
parent7161ad76e2d0ac7de2a6235fcad3b9dfc99e9140 (diff)
downloadzlib-9dc5a8585f429109ef1948ab71b6b71bfa7181e2.tar.gz
zlib-9dc5a8585f429109ef1948ab71b6b71bfa7181e2.tar.bz2
zlib-9dc5a8585f429109ef1948ab71b6b71bfa7181e2.zip
Speed up deflation for level 0 (storing).
The previous code slid the window and the hash table and copied every input byte three times in order to just write the data as stored blocks with no compression. This commit minimizes sliding and copying, especially for large input and output buffers. Level 0 compression is now more than 20 times faster than before the commit. Most of the speedup is due to deferring hash table slides until deflateParams() is called to change the compression level away from 0. More speedup is due to copying directly from next_in to next_out when the amounts of available input data and output space permit it, avoiding the intermediate pending buffer. Additionally, only the last 32K of the used input data is copied back to the sliding window when large input buffers are provided.
-rw-r--r--deflate.c293
1 files changed, 215 insertions, 78 deletions
diff --git a/deflate.c b/deflate.c
index 0a78b5a..47d55af 100644
--- a/deflate.c
+++ b/deflate.c
@@ -1,5 +1,5 @@
1/* deflate.c -- compress data using the deflation algorithm 1/* deflate.c -- compress data using the deflation algorithm
2 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler 2 * Copyright (C) 1995-2016 Jean-loup Gailly and Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h 3 * For conditions of distribution and use, see copyright notice in zlib.h
4 */ 4 */
5 5
@@ -52,7 +52,7 @@
52#include "deflate.h" 52#include "deflate.h"
53 53
54const char deflate_copyright[] = 54const char deflate_copyright[] =
55 " deflate 1.2.8.1 Copyright 1995-2013 Jean-loup Gailly and Mark Adler "; 55 " deflate 1.2.8.1 Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
56/* 56/*
57 If you use the zlib library in a product, an acknowledgment is welcome 57 If you use the zlib library in a product, an acknowledgment is welcome
58 in the documentation of your product. If for some reason you cannot 58 in the documentation of your product. If for some reason you cannot
@@ -74,6 +74,7 @@ typedef block_state (*compress_func) OF((deflate_state *s, int flush));
74/* Compression function. Returns the block state after the call. */ 74/* Compression function. Returns the block state after the call. */
75 75
76local int deflateStateCheck OF((z_streamp strm)); 76local int deflateStateCheck OF((z_streamp strm));
77local void slide_hash OF((deflate_state *s));
77local void fill_window OF((deflate_state *s)); 78local void fill_window OF((deflate_state *s));
78local block_state deflate_stored OF((deflate_state *s, int flush)); 79local block_state deflate_stored OF((deflate_state *s, int flush));
79local block_state deflate_fast OF((deflate_state *s, int flush)); 80local block_state deflate_fast OF((deflate_state *s, int flush));
@@ -191,6 +192,37 @@ local const config configuration_table[10] = {
191 s->head[s->hash_size-1] = NIL; \ 192 s->head[s->hash_size-1] = NIL; \
192 zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); 193 zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
193 194
195/* ===========================================================================
196 * Slide the hash table when sliding the window down (could be avoided with 32
197 * bit values at the expense of memory usage). We slide even when level == 0 to
198 * keep the hash table consistent if we switch back to level > 0 later.
199 */
200local void slide_hash(s)
201 deflate_state *s;
202{
203 unsigned n, m;
204 Posf *p;
205 uInt wsize = s->w_size;
206
207 n = s->hash_size;
208 p = &s->head[n];
209 do {
210 m = *--p;
211 *p = (Pos)(m >= wsize ? m - wsize : NIL);
212 } while (--n);
213 n = wsize;
214#ifndef FASTEST
215 p = &s->prev[n];
216 do {
217 m = *--p;
218 *p = (Pos)(m >= wsize ? m - wsize : NIL);
219 /* If n is not on any hash chain, prev[n] is garbage but
220 * its value will never be used.
221 */
222 } while (--n);
223#endif
224}
225
194/* ========================================================================= */ 226/* ========================================================================= */
195int ZEXPORT deflateInit_(strm, level, version, stream_size) 227int ZEXPORT deflateInit_(strm, level, version, stream_size)
196 z_streamp strm; 228 z_streamp strm;
@@ -540,6 +572,13 @@ int ZEXPORT deflateParams(strm, level, strategy)
540 return Z_BUF_ERROR; 572 return Z_BUF_ERROR;
541 } 573 }
542 if (s->level != level) { 574 if (s->level != level) {
575 if (s->level == 0 && s->matches != 0) {
576 if (s->matches == 1)
577 slide_hash(s);
578 else
579 CLEAR_HASH(s);
580 s->matches = 0;
581 }
543 s->level = level; 582 s->level = level;
544 s->max_lazy_match = configuration_table[level].max_lazy; 583 s->max_lazy_match = configuration_table[level].max_lazy;
545 s->good_match = configuration_table[level].good_length; 584 s->good_match = configuration_table[level].good_length;
@@ -659,10 +698,10 @@ local void putShortMSB (s, b)
659} 698}
660 699
661/* ========================================================================= 700/* =========================================================================
662 * Flush as much pending output as possible. All deflate() output goes 701 * Flush as much pending output as possible. All deflate() output, except for
663 * through this function so some applications may wish to modify it 702 * some deflate_stored() output, goes through this function so some
664 * to avoid allocating a large strm->next_out buffer and copying into it. 703 * applications may wish to modify it to avoid allocating a large
665 * (See also read_buf()). 704 * strm->next_out buffer and copying into it. (See also read_buf()).
666 */ 705 */
667local void flush_pending(strm) 706local void flush_pending(strm)
668 z_streamp strm; 707 z_streamp strm;
@@ -1420,8 +1459,7 @@ local void check_match(s, start, match, length)
1420local void fill_window(s) 1459local void fill_window(s)
1421 deflate_state *s; 1460 deflate_state *s;
1422{ 1461{
1423 register unsigned n, m; 1462 unsigned n;
1424 register Posf *p;
1425 unsigned more; /* Amount of free space at the end of the window. */ 1463 unsigned more; /* Amount of free space at the end of the window. */
1426 uInt wsize = s->w_size; 1464 uInt wsize = s->w_size;
1427 1465
@@ -1448,35 +1486,11 @@ local void fill_window(s)
1448 */ 1486 */
1449 if (s->strstart >= wsize+MAX_DIST(s)) { 1487 if (s->strstart >= wsize+MAX_DIST(s)) {
1450 1488
1451 zmemcpy(s->window, s->window+wsize, (unsigned)wsize); 1489 zmemcpy(s->window, s->window+wsize, (unsigned)wsize - more);
1452 s->match_start -= wsize; 1490 s->match_start -= wsize;
1453 s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ 1491 s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
1454 s->block_start -= (long) wsize; 1492 s->block_start -= (long) wsize;
1455 1493 slide_hash(s);
1456 /* Slide the hash table (could be avoided with 32 bit values
1457 at the expense of memory usage). We slide even when level == 0
1458 to keep the hash table consistent if we switch back to level > 0
1459 later. (Using level 0 permanently is not an optimal usage of
1460 zlib, so we don't care about this pathological case.)
1461 */
1462 n = s->hash_size;
1463 p = &s->head[n];
1464 do {
1465 m = *--p;
1466 *p = (Pos)(m >= wsize ? m-wsize : NIL);
1467 } while (--n);
1468
1469 n = wsize;
1470#ifndef FASTEST
1471 p = &s->prev[n];
1472 do {
1473 m = *--p;
1474 *p = (Pos)(m >= wsize ? m-wsize : NIL);
1475 /* If n is not on any hash chain, prev[n] is garbage but
1476 * its value will never be used.
1477 */
1478 } while (--n);
1479#endif
1480 more += wsize; 1494 more += wsize;
1481 } 1495 }
1482 if (s->strm->avail_in == 0) break; 1496 if (s->strm->avail_in == 0) break;
@@ -1582,70 +1596,193 @@ local void fill_window(s)
1582 if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \ 1596 if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
1583} 1597}
1584 1598
1599/* Maximum stored block length in deflate format (not including header). */
1600#define MAX_STORED 65535
1601
1602/* Minimum of a and b. */
1603#define MIN(a, b) ((a) > (b) ? (b) : (a))
1604
1585/* =========================================================================== 1605/* ===========================================================================
1586 * Copy without compression as much as possible from the input stream, return 1606 * Copy without compression as much as possible from the input stream, return
1587 * the current block state. 1607 * the current block state.
1588 * This function does not insert new strings in the dictionary since 1608 *
1589 * uncompressible data is probably not useful. This function is used 1609 * In case deflateParams() is used to later switch to a non-zero compression
1590 * only for the level=0 compression option. 1610 * level, s->matches (otherwise unused when storing) keeps track of the number
1591 * NOTE: this function should be optimized to avoid extra copying from 1611 * of hash table slides to perform. If s->matches is 1, then one hash table
1592 * window to pending_buf. 1612 * slide will be done when switching. If s->matches is 2, the maximum value
1613 * allowed here, then the hash table will be cleared, since two or more slides
1614 * is the same as a clear.
1615 *
1616 * deflate_stored() is written to minimize the number of times an input byte is
1617 * copied. It is most efficient with large input and output buffers, which
1618 * maximizes the opportunites to have a single copy from next_in to next_out.
1593 */ 1619 */
1594local block_state deflate_stored(s, flush) 1620local block_state deflate_stored(s, flush)
1595 deflate_state *s; 1621 deflate_state *s;
1596 int flush; 1622 int flush;
1597{ 1623{
1598 /* Stored blocks are limited to 0xffff bytes, pending_buf is limited 1624 /* Smallest worthy block size when not flushing or finishing. By default
1599 * to pending_buf_size, and each stored block has a 5 byte header: 1625 * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
1626 * large input and output buffers, the stored block size will be larger.
1600 */ 1627 */
1601 ulg max_block_size = 0xffff; 1628 unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);
1602 ulg max_start;
1603 1629
1604 if (max_block_size > s->pending_buf_size - 5) { 1630 /* Copy as many min_block or larger stored blocks directly to next_out as
1605 max_block_size = s->pending_buf_size - 5; 1631 * possible. If flushing, copy the remaining available input to next_out as
1606 } 1632 * stored blocks, if there is enough space.
1607 1633 */
1608 /* Copy as much as possible from input to output: */ 1634 unsigned len, left, have, last;
1635 unsigned used = s->strm->avail_in;
1609 for (;;) { 1636 for (;;) {
1610 /* Fill the window as much as possible: */ 1637 /* Set len to the maximum size block that we can copy directly with the
1611 if (s->lookahead <= 1) { 1638 * available input data and output space. Set left to how much of that
1639 * would be copied from what's left in the window.
1640 */
1641 len = MAX_STORED; /* maximum deflate stored block length */
1642 have = (s->bi_valid + 42) >> 3; /* number of header bytes */
1643 /* maximum stored block length that will fit in avail_out: */
1644 have = s->strm->avail_out > have ? s->strm->avail_out - have : 0;
1645 left = s->strstart - s->block_start; /* bytes left in window */
1646 if (len > (ulg)left + s->strm->avail_in)
1647 len = left + s->strm->avail_in; /* limit len to the input */
1648 if (len > have)
1649 len = have; /* limit len to the output */
1650 if (left > len)
1651 left = len; /* limit window pull to len */
1652
1653 /* If the stored block would be less than min_block in length, or if
1654 * unable to copy all of the available input when flushing, then try
1655 * copying to the window and the pending buffer instead. Also don't
1656 * write an empty block when flushing -- deflate() does that.
1657 */
1658 if (len < min_block && (len == 0 || flush == Z_NO_FLUSH ||
1659 len - left != s->strm->avail_in))
1660 break;
1612 1661
1613 Assert(s->strstart < s->w_size+MAX_DIST(s) || 1662 /* Make a dummy stored block in pending to get the header bytes,
1614 s->block_start >= (long)s->w_size, "slide too late"); 1663 * including any pending bits. This also updates the debugging counts.
1664 */
1665 last = flush == Z_FINISH && len - left == s->strm->avail_in ? 1 : 0;
1666 _tr_stored_block(s, (char *)0, 0L, last);
1615 1667
1616 fill_window(s); 1668 /* Replace the lengths in the dummy stored block with len. */
1617 if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; 1669 s->pending_buf[s->pending - 4] = len;
1670 s->pending_buf[s->pending - 3] = len >> 8;
1671 s->pending_buf[s->pending - 2] = ~len;
1672 s->pending_buf[s->pending - 1] = ~len >> 8;
1618 1673
1619 if (s->lookahead == 0) break; /* flush the current block */ 1674 /* Write the stored block header bytes. */
1675 flush_pending(s->strm);
1676
1677 /* Update debugging counts for the data about to be copied. */
1678#ifdef ZLIB_DEBUG
1679 s->compressed_len += len << 3;
1680 s->bits_sent += len << 3;
1681#endif
1682
1683 /* Copy uncompressed bytes from the window to next_out. */
1684 if (left) {
1685 zmemcpy(s->strm->next_out, s->window + s->block_start, left);
1686 s->strm->next_out += left;
1687 s->strm->avail_out -= left;
1688 s->strm->total_out += left;
1689 s->block_start += left;
1690 len -= left;
1620 } 1691 }
1621 Assert(s->block_start >= 0L, "block gone"); 1692
1622 1693 /* Copy uncompressed bytes directly from next_in to next_out, updating
1623 s->strstart += s->lookahead; 1694 * the check value.
1624 s->lookahead = 0; 1695 */
1625 1696 if (len) {
1626 /* Emit a stored block if pending_buf will be full: */ 1697 read_buf(s->strm, s->strm->next_out, len);
1627 max_start = max_block_size + (ulg)s->block_start; 1698 s->strm->next_out += len;
1628 if (s->strstart == 0 || (ulg)s->strstart >= max_start) { 1699 s->strm->avail_out -= len;
1629 /* strstart == 0 is possible when wraparound on 16-bit machine */ 1700 s->strm->total_out += len;
1630 s->lookahead = (uInt)(s->strstart - max_start);
1631 s->strstart = (uInt)max_start;
1632 FLUSH_BLOCK(s, 0);
1633 } 1701 }
1634 /* Flush if we may have to slide, otherwise block_start may become 1702 }
1635 * negative and the data will be gone: 1703
1704 /* Update the sliding window with the last s->w_size bytes of the copied
1705 * data, or append all of the copied data to the existing window if less
1706 * than s->w_size bytes were copied. Also update the number of bytes to
1707 * insert in the hash tables, in the event that deflateParams() switches to
1708 * a non-zero compression level.
1709 */
1710 used -= s->strm->avail_in; /* number of input bytes directly copied */
1711 if (used) {
1712 /* If any input was used, then no unused input remains in the window,
1713 * therefore s->block_start == s->strstart.
1636 */ 1714 */
1637 if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { 1715 if (used >= s->w_size) { /* supplant the previous history */
1638 FLUSH_BLOCK(s, 0); 1716 s->matches = 2; /* clear hash */
1717 zmemcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
1718 s->strstart = s->w_size;
1639 } 1719 }
1720 else {
1721 if (s->window_size - s->strstart <= used) {
1722 /* Slide the window down. */
1723 s->strstart -= s->w_size;
1724 zmemcpy(s->window, s->window + s->w_size, s->strstart);
1725 if (s->matches < 2)
1726 s->matches++; /* add a pending slide_hash() */
1727 }
1728 zmemcpy(s->window + s->strstart, s->strm->next_in - used, used);
1729 s->strstart += used;
1730 }
1731 s->block_start = s->strstart;
1732 s->insert += MIN(used, s->w_size - s->insert);
1640 } 1733 }
1641 s->insert = 0; 1734
1642 if (flush == Z_FINISH) { 1735 /* If flushing or finishing and all input has been consumed, then done. If
1643 FLUSH_BLOCK(s, 1); 1736 * the code above couldn't write a complete block to next_out, then the
1644 return finish_done; 1737 * code following this won't be able to either.
1738 */
1739 if (flush != Z_NO_FLUSH && s->strm->avail_in == 0 &&
1740 s->strstart == s->block_start)
1741 return flush == Z_FINISH ? finish_done : block_done;
1742
1743 /* Fill the window with any remaining input. */
1744 have = s->window_size - s->strstart - 1;
1745 if (s->strm->avail_in > have && s->block_start >= s->w_size) {
1746 /* Slide the window down. */
1747 s->block_start -= s->w_size;
1748 s->strstart -= s->w_size;
1749 zmemcpy(s->window, s->window + s->w_size, s->strstart);
1750 if (s->matches < 2)
1751 s->matches++; /* add a pending slide_hash() */
1752 have += s->w_size; /* more space now */
1645 } 1753 }
1646 if ((long)s->strstart > s->block_start) 1754 if (have > s->strm->avail_in)
1647 FLUSH_BLOCK(s, 0); 1755 have = s->strm->avail_in;
1648 return block_done; 1756 if (have) {
1757 read_buf(s->strm, s->window + s->strstart, have);
1758 s->strstart += have;
1759 }
1760
1761 /* There was not enough avail_out to write a complete worthy or flushed
1762 * stored block to next_out. Write a stored block to pending instead, if we
1763 * have enough input for a worthy block, or if flushing and there is enough
1764 * room for the remaining input as a stored block in the pending buffer.
1765 */
1766 have = (s->bi_valid + 42) >> 3; /* number of header bytes */
1767 /* maximum stored block length that will fit in pending: */
1768 have = MIN(s->pending_buf_size - have, MAX_STORED);
1769 min_block = MIN(have, s->w_size);
1770 left = s->strstart - s->block_start;
1771 if (left >= min_block ||
1772 (left && flush != Z_NO_FLUSH && s->strm->avail_in == 0 &&
1773 left <= have)) {
1774 len = MIN(left, have);
1775 last = flush == Z_FINISH && s->strm->avail_in == 0 &&
1776 len == left ? 1 : 0;
1777 _tr_stored_block(s, (charf *)s->window + s->block_start, len, last);
1778 s->block_start += len;
1779 flush_pending(s->strm);
1780 if (last)
1781 return finish_started;
1782 }
1783
1784 /* We've done all we can with the available input and output. */
1785 return need_more;
1649} 1786}
1650 1787
1651/* =========================================================================== 1788/* ===========================================================================