Add LIT_MEM define to use more memory for a small deflate speedup.

A bug fix in zlib 1.2.12 resulted in a slight slowdown (1-2%) of
deflate. This commit provides the option to #define LIT_MEM, which
uses more memory to reverse most of that slowdown. The memory for
the pending buffer and symbol buffers is increased by 25%, which
increases the total memory usage with the default parameters by
about 6%.
This commit is contained in:
Mark Adler 2023-09-18 21:17:00 -07:00
parent bd9c329c10
commit ac8f12c97d
3 changed files with 67 additions and 3 deletions

View File

@ -493,7 +493,11 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
* symbols from which it is being constructed.
*/
#ifdef LIT_MEM
s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 5);
#else
s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 4);
#endif
s->pending_buf_size = (ulg)s->lit_bufsize * 4;
if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
@ -503,8 +507,14 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
deflateEnd (strm);
return Z_MEM_ERROR;
}
#ifdef LIT_MEM
s->d_buf = (ushf *)(s->pending_buf + (s->lit_bufsize << 1));
s->l_buf = s->pending_buf + (s->lit_bufsize << 2);
s->sym_end = s->lit_bufsize - 1;
#else
s->sym_buf = s->pending_buf + s->lit_bufsize;
s->sym_end = (s->lit_bufsize - 1) * 3;
#endif
/* We avoid equality with lit_bufsize*3 because of wraparound at 64K
* on 16 bit machines and because stored blocks are restricted to
* 64K-1 bytes.
@ -720,9 +730,15 @@ int ZEXPORT deflatePrime(z_streamp strm, int bits, int value) {
if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
s = strm->state;
#ifdef LIT_MEM
if (bits < 0 || bits > 16 ||
(uchf *)s->d_buf < s->pending_out + ((Buf_size + 7) >> 3))
return Z_BUF_ERROR;
#else
if (bits < 0 || bits > 16 ||
s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3))
return Z_BUF_ERROR;
#endif
do {
put = Buf_size - s->bi_valid;
if (put > bits)
@ -1308,7 +1324,12 @@ int ZEXPORT deflateCopy(z_streamp dest, z_streamp source) {
zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
#ifdef LIT_MEM
ds->d_buf = (ushf *)(ds->pending_buf + (ds->lit_bufsize << 1));
ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2);
#else
ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
#endif
ds->l_desc.dyn_tree = ds->dyn_ltree;
ds->d_desc.dyn_tree = ds->dyn_dtree;

View File

@ -23,6 +23,10 @@
# define GZIP
#endif
/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
the cost of a larger memory footprint */
/* #define LIT_MEM */
/* ===========================================================================
* Internal compression state.
*/
@ -217,7 +221,12 @@ typedef struct internal_state {
/* Depth of each subtree used as tie breaker for trees of equal frequency
*/
#ifdef LIT_MEM
ushf *d_buf; /* buffer for distances */
uchf *l_buf; /* buffer for literals/lengths */
#else
uchf *sym_buf; /* buffer for distances and literals/lengths */
#endif
uInt lit_bufsize;
/* Size of match buffer for literals/lengths. There are 4 reasons for
@ -239,7 +248,7 @@ typedef struct internal_state {
* - I can't count above 4
*/
uInt sym_next; /* running index in sym_buf */
uInt sym_next; /* running index in symbol buffer */
uInt sym_end; /* symbol table full when sym_next reaches this */
ulg opt_len; /* bit length of current block with optimal trees */
@ -318,6 +327,25 @@ void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf,
extern const uch ZLIB_INTERNAL _dist_code[];
#endif
#ifdef LIT_MEM
# define _tr_tally_lit(s, c, flush) \
{ uch cc = (c); \
s->d_buf[s->sym_next] = 0; \
s->l_buf[s->sym_next++] = cc; \
s->dyn_ltree[cc].Freq++; \
flush = (s->sym_next == s->sym_end); \
}
# define _tr_tally_dist(s, distance, length, flush) \
{ uch len = (uch)(length); \
ush dist = (ush)(distance); \
s->d_buf[s->sym_next] = dist; \
s->l_buf[s->sym_next++] = len; \
dist--; \
s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
s->dyn_dtree[d_code(dist)].Freq++; \
flush = (s->sym_next == s->sym_end); \
}
#else
# define _tr_tally_lit(s, c, flush) \
{ uch cc = (c); \
s->sym_buf[s->sym_next++] = 0; \
@ -337,6 +365,7 @@ void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf,
s->dyn_dtree[d_code(dist)].Freq++; \
flush = (s->sym_next == s->sym_end); \
}
#endif
#else
# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
# define _tr_tally_dist(s, distance, length, flush) \

18
trees.c
View File

@ -899,14 +899,19 @@ local void compress_block(deflate_state *s, const ct_data *ltree,
const ct_data *dtree) {
unsigned dist; /* distance of matched string */
int lc; /* match length or unmatched char (if dist == 0) */
unsigned sx = 0; /* running index in sym_buf */
unsigned sx = 0; /* running index in symbol buffers */
unsigned code; /* the code to send */
int extra; /* number of extra bits to send */
if (s->sym_next != 0) do {
#ifdef LIT_MEM
dist = s->d_buf[sx];
lc = s->l_buf[sx++];
#else
dist = s->sym_buf[sx++] & 0xff;
dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8;
lc = s->sym_buf[sx++];
#endif
if (dist == 0) {
send_code(s, lc, ltree); /* send a literal byte */
Tracecv(isgraph(lc), (stderr," '%c' ", lc));
@ -931,8 +936,12 @@ local void compress_block(deflate_state *s, const ct_data *ltree,
}
} /* literal or match pair ? */
/* Check that the overlay between pending_buf and sym_buf is ok: */
/* Check for no overlay of pending_buf on needed symbols */
#ifdef LIT_MEM
Assert(s->pending < (s->lit_bufsize << 1) + sx, "pendingBuf overflow");
#else
Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow");
#endif
} while (sx < s->sym_next);
@ -1082,9 +1091,14 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, charf *buf,
* the current block must be flushed.
*/
int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc) {
#ifdef LIT_MEM
s->d_buf[s->sym_next] = (ush)dist;
s->l_buf[s->sym_next++] = (uch)lc;
#else
s->sym_buf[s->sym_next++] = (uch)dist;
s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
s->sym_buf[s->sym_next++] = (uch)lc;
#endif
if (dist == 0) {
/* lc is the unmatched char */
s->dyn_ltree[lc].Freq++;