rulimine/decompressor/gzip/tinflate.c

567 lines
15 KiB
C

/*
* tinflate - tiny inflate
*
* Copyright (c) 2003-2019 Joergen Ibsen
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgment in the product
* documentation would be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*/
#include "tinf.h"
#include <limits.h>
#if defined(UINT_MAX) && (UINT_MAX) < 0xFFFFFFFFUL
# error "tinf requires unsigned int to be at least 32-bit"
#endif
/* -- Internal data structures -- */
struct tinf_tree {
unsigned short counts[16]; /* Number of codes with a given length */
unsigned short symbols[288]; /* Symbols sorted by code */
int max_sym;
};
struct tinf_data {
const unsigned char *source;
const unsigned char *source_end;
unsigned int tag;
int bitcount;
int overflow;
unsigned char *dest_start;
unsigned char *dest;
struct tinf_tree ltree; /* Literal/length tree */
struct tinf_tree dtree; /* Distance tree */
};
/* Given an array of code lengths, build a tree */
static int tinf_build_tree(struct tinf_tree *t, const unsigned char *lengths,
unsigned int num) {
unsigned short offs[16];
unsigned int i, num_codes, available;
for (i = 0; i < 16; ++i) {
t->counts[i] = 0;
}
t->max_sym = -1;
/* Count number of codes for each non-zero length */
for (i = 0; i < num; ++i) {
if (lengths[i]) {
t->max_sym = i;
t->counts[lengths[i]]++;
}
}
/* Compute offset table for distribution sort */
for (available = 1, num_codes = 0, i = 0; i < 16; ++i) {
unsigned int used = t->counts[i];
/* Check length contains no more codes than available */
if (used > available) {
return TINF_DATA_ERROR;
}
available = 2 * (available - used);
offs[i] = num_codes;
num_codes += used;
}
/*
* Check all codes were used, or for the special case of only one
* code that it has length 1
*/
if ((num_codes > 1 && available > 0)
|| (num_codes == 1 && t->counts[1] != 1)) {
return TINF_DATA_ERROR;
}
/* Fill in symbols sorted by code */
for (i = 0; i < num; ++i) {
if (lengths[i]) {
t->symbols[offs[lengths[i]]++] = i;
}
}
/*
* For the special case of only one code (which will be 0) add a
* code 1 which results in a symbol that is too large
*/
if (num_codes == 1) {
t->counts[1] = 2;
t->symbols[1] = t->max_sym + 1;
}
return TINF_OK;
}
/* -- Decode functions -- */
static void tinf_refill(struct tinf_data *d, int num) {
/* Read bytes until at least num bits available */
while (d->bitcount < num) {
if (d->source != d->source_end) {
d->tag |= (unsigned int) *d->source++ << d->bitcount;
}
else {
d->overflow = 1;
}
d->bitcount += 8;
}
}
static unsigned int tinf_getbits_no_refill(struct tinf_data *d, int num) {
unsigned int bits;
/* Get bits from tag */
bits = d->tag & ((1UL << num) - 1);
/* Remove bits from tag */
d->tag >>= num;
d->bitcount -= num;
return bits;
}
/* Get num bits from source stream */
static unsigned int tinf_getbits(struct tinf_data *d, int num) {
tinf_refill(d, num);
return tinf_getbits_no_refill(d, num);
}
/* Read a num bit value from stream and add base */
static unsigned int tinf_getbits_base(struct tinf_data *d, int num, int base) {
return base + (num ? tinf_getbits(d, num) : 0);
}
/* Given a data stream and a tree, decode a symbol */
static int tinf_decode_symbol(struct tinf_data *d, const struct tinf_tree *t) {
int base = 0, offs = 0;
int len;
/*
* Get more bits while code index is above number of codes
*
* Rather than the actual code, we are computing the position of the
* code in the sorted order of codes, which is the index of the
* corresponding symbol.
*
* Conceptually, for each code length (level in the tree), there are
* counts[len] leaves on the left and internal nodes on the right.
* The index we have decoded so far is base + offs, and if that
* falls within the leaves we are done. Otherwise we adjust the range
* of offs and add one more bit to it.
*/
for (len = 1; ; ++len) {
offs = 2 * offs + tinf_getbits(d, 1);
if (offs < t->counts[len]) {
break;
}
base += t->counts[len];
offs -= t->counts[len];
}
return t->symbols[base + offs];
}
/* Given a data stream, decode dynamic trees from it */
static int tinf_decode_trees(struct tinf_data *d, struct tinf_tree *lt,
struct tinf_tree *dt) {
unsigned char lengths[288 + 32];
/* Special ordering of code length codes */
static const unsigned char clcidx[19] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5,
11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned int hlit, hdist, hclen;
unsigned int i, num, length;
int res;
/* Get 5 bits HLIT (257-286) */
hlit = tinf_getbits_base(d, 5, 257);
/* Get 5 bits HDIST (1-32) */
hdist = tinf_getbits_base(d, 5, 1);
/* Get 4 bits HCLEN (4-19) */
hclen = tinf_getbits_base(d, 4, 4);
/*
* The RFC limits the range of HLIT to 286, but lists HDIST as range
* 1-32, even though distance codes 30 and 31 have no meaning. While
* we could allow the full range of HLIT and HDIST to make it possible
* to decode the fixed trees with this function, we consider it an
* error here.
*
* See also: https://github.com/madler/zlib/issues/82
*/
if (hlit > 286 || hdist > 30) {
return TINF_DATA_ERROR;
}
for (i = 0; i < 19; ++i) {
lengths[i] = 0;
}
/* Read code lengths for code length alphabet */
for (i = 0; i < hclen; ++i) {
/* Get 3 bits code length (0-7) */
unsigned int clen = tinf_getbits(d, 3);
lengths[clcidx[i]] = clen;
}
/* Build code length tree (in literal/length tree to save space) */
res = tinf_build_tree(lt, lengths, 19);
if (res != TINF_OK) {
return res;
}
/* Check code length tree is not empty */
if (lt->max_sym == -1) {
return TINF_DATA_ERROR;
}
/* Decode code lengths for the dynamic trees */
for (num = 0; num < hlit + hdist; ) {
int sym = tinf_decode_symbol(d, lt);
if (sym > lt->max_sym) {
return TINF_DATA_ERROR;
}
switch (sym) {
case 16:
/* Copy previous code length 3-6 times (read 2 bits) */
if (num == 0) {
return TINF_DATA_ERROR;
}
sym = lengths[num - 1];
length = tinf_getbits_base(d, 2, 3);
break;
case 17:
/* Repeat code length 0 for 3-10 times (read 3 bits) */
sym = 0;
length = tinf_getbits_base(d, 3, 3);
break;
case 18:
/* Repeat code length 0 for 11-138 times (read 7 bits) */
sym = 0;
length = tinf_getbits_base(d, 7, 11);
break;
default:
/* Values 0-15 represent the actual code lengths */
length = 1;
break;
}
if (length > hlit + hdist - num) {
return TINF_DATA_ERROR;
}
while (length--) {
lengths[num++] = sym;
}
}
/* Check EOB symbol is present */
if (lengths[256] == 0) {
return TINF_DATA_ERROR;
}
/* Build dynamic trees */
res = tinf_build_tree(lt, lengths, hlit);
if (res != TINF_OK) {
return res;
}
res = tinf_build_tree(dt, lengths + hlit, hdist);
if (res != TINF_OK) {
return res;
}
return TINF_OK;
}
/* -- Block inflate functions -- */
/* Given a stream and two trees, inflate a block of data */
static int tinf_inflate_block_data(struct tinf_data *d, struct tinf_tree *lt,
struct tinf_tree *dt) {
/* Extra bits and base tables for length codes */
static const unsigned char length_bits[30] = {
0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
4, 4, 4, 4, 5, 5, 5, 5, 0, 127
};
static const unsigned short length_base[30] = {
3, 4, 5, 6, 7, 8, 9, 10, 11, 13,
15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
67, 83, 99, 115, 131, 163, 195, 227, 258, 0
};
/* Extra bits and base tables for distance codes */
static const unsigned char dist_bits[30] = {
0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};
static const unsigned short dist_base[30] = {
1, 2, 3, 4, 5, 7, 9, 13, 17, 25,
33, 49, 65, 97, 129, 193, 257, 385, 513, 769,
1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577
};
for (;;) {
int sym = tinf_decode_symbol(d, lt);
/* Check for overflow in bit reader */
if (d->overflow) {
return TINF_DATA_ERROR;
}
if (sym < 256) {
*d->dest++ = sym;
}
else {
int length, dist, offs;
int i;
/* Check for end of block */
if (sym == 256) {
return TINF_OK;
}
/* Check sym is within range and distance tree is not empty */
if (sym > lt->max_sym || sym - 257 > 28 || dt->max_sym == -1) {
return TINF_DATA_ERROR;
}
sym -= 257;
/* Possibly get more bits from length code */
length = tinf_getbits_base(d, length_bits[sym],
length_base[sym]);
dist = tinf_decode_symbol(d, dt);
/* Check dist is within range */
if (dist > dt->max_sym || dist > 29) {
return TINF_DATA_ERROR;
}
/* Possibly get more bits from distance code */
offs = tinf_getbits_base(d, dist_bits[dist],
dist_base[dist]);
if (offs > d->dest - d->dest_start) {
return TINF_DATA_ERROR;
}
/* Copy match */
for (i = 0; i < length; ++i) {
d->dest[i] = d->dest[i - offs];
}
d->dest += length;
}
}
}
/* Inflate an uncompressed block of data */
static int tinf_inflate_uncompressed_block(struct tinf_data *d) {
unsigned int length, invlength;
if (d->source_end - d->source < 4) {
return TINF_DATA_ERROR;
}
/* Get length */
length = *(d->source);
/* Get one's complement of length */
invlength = *(d->source + 2);
/* Check length */
if (length != (~invlength & 0x0000FFFF)) {
return TINF_DATA_ERROR;
}
d->source += 4;
if ((unsigned int)((d->source_end - d->source)) < length) {
return TINF_DATA_ERROR;
}
/* Copy block */
while (length--) {
*d->dest++ = *d->source++;
}
/* Make sure we start next block on a byte boundary */
d->tag = 0;
d->bitcount = 0;
return TINF_OK;
}
/* Build fixed Huffman trees */
static void tinf_build_fixed_trees(struct tinf_tree *lt, struct tinf_tree *dt) {
int i;
/* Build fixed literal/length tree */
for (i = 0; i < 16; ++i) {
lt->counts[i] = 0;
}
lt->counts[7] = 24;
lt->counts[8] = 152;
lt->counts[9] = 112;
for (i = 0; i < 24; ++i) {
lt->symbols[i] = 256 + i;
}
for (i = 0; i < 144; ++i) {
lt->symbols[24 + i] = i;
}
for (i = 0; i < 8; ++i) {
lt->symbols[24 + 144 + i] = 280 + i;
}
for (i = 0; i < 112; ++i) {
lt->symbols[24 + 144 + 8 + i] = 144 + i;
}
lt->max_sym = 285;
/* Build fixed distance tree */
for (i = 0; i < 16; ++i) {
dt->counts[i] = 0;
}
dt->counts[5] = 32;
for (i = 0; i < 32; ++i) {
dt->symbols[i] = i;
}
dt->max_sym = 29;
}
/* Inflate a block of data compressed with fixed Huffman trees */
static int tinf_inflate_fixed_block(struct tinf_data *d) {
/* Build fixed Huffman trees */
tinf_build_fixed_trees(&d->ltree, &d->dtree);
/* Decode block using fixed trees */
return tinf_inflate_block_data(d, &d->ltree, &d->dtree);
}
/* Inflate a block of data compressed with dynamic Huffman trees */
static int tinf_inflate_dynamic_block(struct tinf_data *d) {
/* Decode trees from stream */
int res = tinf_decode_trees(d, &d->ltree, &d->dtree);
if (res != TINF_OK) {
return res;
}
/* Decode block using decoded trees */
return tinf_inflate_block_data(d, &d->ltree, &d->dtree);
}
/* Inflate stream from source to dest */
int tinf_uncompress(void *dest,
const void *source, unsigned int sourceLen) {
struct tinf_data d;
int bfinal;
/* Initialise data */
d.source = (const unsigned char *) source;
d.source_end = d.source + sourceLen;
d.tag = 0;
d.bitcount = 0;
d.overflow = 0;
d.dest = (unsigned char *) dest;
d.dest_start = d.dest;
do {
unsigned int btype;
int res;
/* Read final block flag */
bfinal = tinf_getbits(&d, 1);
/* Read block type (2 bits) */
btype = tinf_getbits(&d, 2);
/* Decompress block */
switch (btype) {
case 0:
/* Decompress uncompressed block */
res = tinf_inflate_uncompressed_block(&d);
break;
case 1:
/* Decompress block with fixed Huffman trees */
res = tinf_inflate_fixed_block(&d);
break;
case 2:
/* Decompress block with dynamic Huffman trees */
res = tinf_inflate_dynamic_block(&d);
break;
default:
res = TINF_DATA_ERROR;
break;
}
if (res != TINF_OK) {
return res;
}
} while (!bfinal);
/* Check for overflow in bit reader */
if (d.overflow) {
return TINF_DATA_ERROR;
}
return TINF_OK;
}