From 4a5e3e7d255f3f8eba9ecdb8bd8080db43bf0aeb Mon Sep 17 00:00:00 2001
From: Mark Adler
Date: Sat, 9 Mar 2024 23:40:12 -0800
Subject: [PATCH] Add zipAlreadyThere() to minizip zip.c to help avoid
 duplicates.

---
 contrib/minizip/skipset.h | 359 ++++++++++++++++++++++++++++++++++++++
 contrib/minizip/zip.c     | 239 ++++++++++++++++++++++++-
 contrib/minizip/zip.h     |  24 ++-
 3 files changed, 612 insertions(+), 10 deletions(-)
 create mode 100644 contrib/minizip/skipset.h

diff --git a/contrib/minizip/skipset.h b/contrib/minizip/skipset.h
new file mode 100644
index 0000000..f829b18
--- /dev/null
+++ b/contrib/minizip/skipset.h
@@ -0,0 +1,359 @@
+// skipset.h -- set operations using a skiplist
+// Copyright (C) 2024 Mark Adler
+// See MiniZip_info.txt for the license.
+
+// This implements a skiplist set, i.e. just keys, no data, with ~O(log n) time
+// insert and search operations. The application defines the type of a key, and
+// provides a function to compare two keys.
+
+// This header is not definitions of functions found in another source file --
+// it creates the set functions, with the application's key type, right where
+// the #include is. Before this header is #included, these must be defined:
+//
+// 1. A macro or typedef for set_key_t, the type of a key.
+// 2. A macro or function set_cmp(a, b) to compare two keys. The return values
+//    are < 0 for a < b, 0 for a == b, and > 0 for a > b.
+// 3. A macro or function set_drop(s, k) to release the key k's resources, if
+//    any, when doing a set_end() or set_clear(). s is a pointer to the set
+//    that key is in, for use with set_free() if desired.
+//
+// Example usage:
+//
+//      typedef int set_key_t;
+//      #define set_cmp(a, b) ((a) < (b) ? -1 : (a) == (b) ? 0 : 1)
+//      #define set_drop(s, k)
+//      #include "skipset.h"
+//
+//      int test(void) {        // return 0: good, 1: bad, -1: out of memory
+//          set_t set;
+//          if (setjmp(set.env))
+//              return -1;
+//          set_start(&set);
+//          set_insert(&set, 2);
+//          set_insert(&set, 1);
+//          set_insert(&set, 7);
+//          int bad = !set_found(&set, 2);
+//          bad = bad || set_found(&set, 5);
+//          set_end(&set);
+//          return bad;
+//      }
+//
+// Interface summary (see more details below):
+// - set_t is the type of the set being operated on (a set_t pointer is passed)
+// - set_start() initializes a new, empty set (initialize set.env first)
+// - set_insert() inserts a new key into the set, or not if it's already there
+// - set_found() determines whether or not a key is in the set
+// - set_end() ends the use of the set, freeing all memory
+// - set_clear() empties the set, equivalent to set_end() and then set_start()
+// - set_ok() checks if set appears to be usable, i.e. started and not ended
+//
+// Auxiliary functions available to the application:
+// - set_alloc() allocates memory with optional tracking (#define SET_TRACK)
+// - set_free() deallocates memory allocated by set_alloc()
+// - set_rand() returns 32 random bits (seeded by set_start())
+
+#ifndef SKIPSET_H
+#define SKIPSET_H
+
+#include <stdlib.h>     // realloc(), free(), NULL, size_t
+#include <setjmp.h>     // jmp_buf, longjmp()
+#include <errno.h>      // ENOMEM
+#include <stdint.h>     // int16_t, uint32_t, uint64_t
+#include <time.h>       // time(), clock()
+#include <assert.h>     // assert()
+
+// Structures and functions below noted as "--private--" should not be used by
+// the application. set_t is partially private and partially public -- see the
+// comments there.
+
+// There is no POSIX random() in MSVC, and rand() is awful. For portability, we
+// cannot rely on a library function for random numbers. Instead we use the
+// fast and effective algorithm below, invented by Melissa O'Neill.
+
+// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / www.pcg-random.org
+// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website)
+// --private-- Random number generator state.
+typedef struct {
+    uint64_t state;     // 64-bit generator state
+    uint64_t inc;       // 63-bit sequence id
+} set_rand_t;
+// --private-- Initialize the state *gen using seed and seq. seed seeds the
+// advancing 64-bit state. seq is a sequence selection constant.
+void set_seed(set_rand_t *gen, uint64_t seed, uint64_t seq) {
+    gen->inc = (seq << 1) | 1;
+    gen->state = (seed + gen->inc) * 6364136223846793005ULL + gen->inc;
+}
+// Return 32 random bits, advancing the state *gen.
+uint32_t set_rand(set_rand_t *gen) {
+    uint64_t state = gen->state;
+    gen->state = state * 6364136223846793005ULL + gen->inc;
+    uint32_t mix = (uint32_t)(((state >> 18) ^ state) >> 27);
+    int rot = state >> 59;
+    return (mix >> rot) | (mix << ((-rot) & 31));
+}
+// End of PCG32 code.
+
+// --private-- Linked-list node.
+typedef struct set_node_s set_node_t;
+struct set_node_s {
+    set_key_t key;          // the key (not used for head or path)
+    int16_t size;           // number of allocated pointers in right[]
+    int16_t fill;           // number of pointers in right[] filled in
+    set_node_t **right;     // pointer for each level, each to the right
+};
+
+// A set. The application sets env, may use gen with set_rand(), and may read
+// allocs and memory. The remaining variables are --private-- .
+typedef struct set_s {
+    set_node_t *head;       // skiplist head -- no key, just links
+    set_node_t *path;       // right[] is path to key from set_found()
+    set_node_t *node;       // node under construction, in case of longjmp()
+    int16_t depth;          // maximum depth of the skiplist
+    uint64_t ran;           // a precious trove of random bits
+    set_rand_t gen;         // random number generator state
+    jmp_buf env;            // setjmp() environment for allocation errors
+#ifdef SET_TRACK
+    size_t allocs;          // number of allocations
+    size_t memory;          // total amount of allocated memory (>= requests)
+#endif
+} set_t;
+
+// Memory allocation and deallocation. set_alloc(set, ptr, size) returns a
+// pointer to an allocation of size bytes if ptr is NULL, or the previous
+// allocation ptr resized to size bytes. set_alloc() will never return NULL.
+// set_free(set, ptr) frees an allocation created by set_alloc(). These may be
+// used by the application, e.g. if allocation tracking is desired.
+#ifdef SET_TRACK
+// Track the number of allocations and the total backing memory size.
+#  if defined(_WIN32)
+#    include <malloc.h>
+#    define SET_ALLOC_SIZE(ptr) _msize(ptr)
+#  elif defined(__MACH__)
+#    include <malloc/malloc.h>
+#    define SET_ALLOC_SIZE(ptr) malloc_size(ptr)
+#  elif defined(__linux__)
+#    include <malloc.h>
+#    define SET_ALLOC_SIZE(ptr) malloc_usable_size(ptr)
+#  elif defined(__FreeBSD__)
+#    include <malloc_np.h>
+#    define SET_ALLOC_SIZE(ptr) malloc_usable_size(ptr)
+#  elif defined(__NetBSD__)
+#    include <jemalloc/jemalloc.h>
+#    define SET_ALLOC_SIZE(ptr) malloc_usable_size(ptr)
+#  else     // e.g. OpenBSD
+#    define SET_ALLOC_SIZE(ptr) 0
+#  endif
+// With tracking.
+void *set_alloc(set_t *set, void *ptr, size_t size) {
+    size_t had = ptr == NULL ? 0 : SET_ALLOC_SIZE(ptr);
+    void *mem = realloc(ptr, size);
+    if (mem == NULL)
+        longjmp(set->env, ENOMEM);
+    set->allocs += ptr == NULL;
+    set->memory += SET_ALLOC_SIZE(mem) - had;
+    return mem;
+}
+void set_free(set_t *set, void *ptr) {
+    if (ptr != NULL) {
+        set->allocs--;
+        set->memory -= SET_ALLOC_SIZE(ptr);
+        free(ptr);
+    }
+}
+#else
+// Without tracking.
+void *set_alloc(set_t *set, void *ptr, size_t size) {
+    void *mem = realloc(ptr, size);
+    if (mem == NULL)
+        longjmp(set->env, ENOMEM);
+    return mem;
+}
+void set_free(set_t *set, void *ptr) {
+    (void)set;
+    free(ptr);
+}
+#endif
+
+// --private-- Grow node's array right[] as needed to be able to hold at least
+// want links. If fill is true, assure that the first want links are filled in,
+// setting them to set->head if not previously filled in. Otherwise it is
+// assumed that the first want links are about to be filled in.
+void set_grow(set_t *set, set_node_t *node, int want, int fill) {
+    if (node->size < want) {
+        int more = node->size ? node->size : 1;
+        while (more < want)
+            more <<= 1;
+        node->right = set_alloc(set, node->right, more * sizeof(set_node_t *));
+        node->size = (int16_t)more;
+    }
+    int i;
+    if (fill)
+        for (i = node->fill; i < want; i++)
+            node->right[i] = set->head;
+    node->fill = (int16_t)want;
+}
+
+// --private-- Return a new node. key is left uninitialized.
+set_node_t *set_node(set_t *set) {
+    set_node_t *node = set_alloc(set, NULL, sizeof(set_node_t));
+    node->size = 0;
+    node->fill = 0;
+    node->right = NULL;
+    return node;
+}
+
+// --private-- Free the list linked from head, along with the keys.
+void set_sweep(set_t *set) {
+    set_node_t *step = set->head->right[0];
+    while (step != set->head) {
+        set_node_t *next = step->right[0];      // save link to next node
+        set_drop(set, step->key);
+        set_free(set, step->right);
+        set_free(set, step);
+        step = next;
+    }
+}
+
+// Initialize a new set. set->env must be initialized using setjmp() before
+// set_start() is called. A longjmp(set->env, ENOMEM) will be used to handle a
+// memory allocation failure during any of the operations. (See setjmp.h and
+// errno.h.) The set can still be used if this happens, assuming that it didn't
+// happen during set_start(). Whether set_start() completed or not, set_end()
+// can be used to free the set's memory after a longjmp().
+void set_start(set_t *set) {
+#ifdef SET_TRACK
+    set->allocs = 0;
+    set->memory = 0;
+#endif
+    set->head = set->path = set->node = NULL;   // in case set_node() fails
+    set->path = set_node(set);
+    set->head = set_node(set);
+    set_grow(set, set->head, 1, 1);     // one link back to head for an empty set
+    *(unsigned char *)&set->head->key = 137;    // set id
+    set->depth = 0;
+    set_seed(&set->gen, (uint64_t)time(NULL) * (uint64_t)clock(), 0);
+    set->ran = 1;
+}
+
+// Return true if *set appears to be in a usable state. If *set has been zeroed
+// out, then set_ok(set) will be false and set_end(set) will be safe.
+int set_ok(set_t *set) {
+    return set->head != NULL &&
+           set->head->right != NULL &&
+           *(unsigned char *)&set->head->key == 137;
+}
+
+// Empty the set. This frees the memory used for the previous set contents.
+// After set_clear(), *set is ready for use, as if after a set_start().
+void set_clear(set_t *set) {
+    assert(set_ok(set) && "improper use");
+
+    // Free all the keys and their nodes.
+    set_sweep(set);
+
+    // Leave the head and path allocations as is. Clear their contents, with
+    // head pointing to itself and setting depth to zero, for an empty set.
+    set->head->right[0] = set->head;
+    set->head->fill = 1;
+    set->path->fill = 0;
+    set->depth = 0;
+}
+
+// Done using the set -- free all allocations. The only operation on *set
+// permitted after this is set_start(). Though another set_end() would do no
+// harm. This can be done at any time after a set_start(), or after a longjmp()
+// on any allocation failure, including during a set_start().
+void set_end(set_t *set) {
+    if (set->head != NULL) {
+        // Empty the set and free the head node.
+        if (set->head->right != NULL) {
+            set_sweep(set);
+            set_free(set, set->head->right);
+        }
+        set_free(set, set->head);
+        set->head = NULL;
+    }
+    if (set->path != NULL) {
+        // Free the path work area.
+        set_free(set, set->path->right);
+        set_free(set, set->path);
+        set->path = NULL;
+    }
+    if (set->node != NULL) {
+        // Free the node that was under construction when longjmp() hit.
+        set_drop(set, set->node->key);
+        set_free(set, set->node->right);
+        set_free(set, set->node);
+        set->node = NULL;
+    }
+}
+
+// Look for key. Return 1 if found or 0 if not. This also puts the path to get
+// there in set->path, for use by set_insert().
+int set_found(set_t *set, set_key_t key) {
+    assert(set_ok(set) && "improper use");
+
+    // Start at depth and work down and right as determined by key comparisons.
+    set_node_t *head = set->head, *here = head;
+    int i = set->depth;
+    set_grow(set, set->path, i + 1, 0);
+    do {
+        while (here->right[i] != head &&
+               set_cmp(here->right[i]->key, key) < 0)
+            here = here->right[i];
+        set->path->right[i] = here;
+    } while (i--);
+
+    // See if the key matches.
+    here = here->right[0];
+    return here != head && set_cmp(here->key, key) == 0;
+}
+
+// Insert the key key. Return 0 on success, or 1 if key is already in the set.
+int set_insert(set_t *set, set_key_t key) {
+    assert(set_ok(set) && "improper use");
+
+    if (set_found(set, key))
+        // That key is already in the set.
+        return 1;
+
+    // Randomly generate a new level -- level 0 with probability 1/2, 1 with
+    // probability 1/4, 2 with probability 1/8, etc.
+    int level = 0;
+    for (;;) {
+        if (set->ran == 1)
+            // Ran out. Get another 32 random bits.
+            set->ran = set_rand(&set->gen) | (1ULL << 32);
+        int bit = set->ran & 1;
+        set->ran >>= 1;
+        if (bit)
+            break;
+        assert(level < 32767 &&
+               "Overhead, without any fuss, the stars were going out.");
+        level++;
+    }
+    if (level > set->depth) {
+        // The maximum depth is now deeper. Update the structures.
+        set_grow(set, set->path, level + 1, 1);
+        set_grow(set, set->head, level + 1, 1);
+        set->depth = (int16_t)level;
+    }
+
+    // Make a new node for the provided key, and insert it in the lists up to
+    // and including level.
+    set->node = set_node(set);
+    set->node->key = key;
+    set_grow(set, set->node, level + 1, 0);
+    int i;
+    for (i = 0; i <= level; i++) {
+        set->node->right[i] = set->path->right[i]->right[i];
+        set->path->right[i]->right[i] = set->node;
+    }
+    set->node = NULL;
+    return 0;
+}
+
+#else
+#error ** another skiplist set already created here
+// Would need to implement a prefix in order to support multiple sets.
+#endif
diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c
index a6329ae..d16e9ae 100644
--- a/contrib/minizip/zip.c
+++ b/contrib/minizip/zip.c
@@ -123,6 +123,19 @@ typedef struct linkedlist_data_s
 } linkedlist_data;
 
+// zipAlreadyThere() set functions for a set of zero-terminated strings, and
+// a block_t type for reading the central directory datablocks.
+typedef char const *set_key_t;
+#define set_cmp(a, b) strcmp(a, b)
+#define set_drop(s, k) set_free(s, (void *)(intptr_t)(k))
+#include "skipset.h"
+typedef struct {
+    unsigned char *next;    // next byte in datablock data
+    size_t left;            // number of bytes left in data (at least)
+    linkedlist_datablock_internal *node;    // current datablock
+} block_t;
+
+
 typedef struct
 {
     z_stream stream;            /* zLib stream structure for inflate */
@@ -174,6 +187,10 @@ typedef struct
     char *globalcomment;
 #endif
 
+    // Support for zipAlreadyThere().
+    set_t set;                  // set for detecting name collisions
+    block_t block;              // block for reading the central directory
+
 } zip64_internal;
@@ -264,6 +281,223 @@ local int add_data_in_datablock(linkedlist_data* ll, const void* buf, uLong len)
     return ZIP_OK;
 }
 
+// zipAlreadyThere() operations. "set" in the zip internal structure keeps the
+// set of names that are in the under-construction central directory so far. A
+// skipset provides ~O(log n) time insertion and searching. Central directory
+// records, stored in a linked list of allocated memory datablocks, are read
+// through "block" in the zip internal structure.
+
+// The block_*() functions support extracting the central directory file names
+// from the datablocks. They are designed to support a growing directory by
+// automatically continuing once more data has been appended to the linked
+// datablocks.
+
+// Initialize *block to the head of list. This should only be called once the
+// list has at least some data in it, i.e. list->first_block is not NULL.
+local void block_init(block_t *block, linkedlist_data *list) {
+    block->node = list->first_block;
+    block->next = block->node->data;
+    block->left = block->node->filled_in_this_block;
+}
+
+// Mark *block as bad, with all subsequent reads returning end, even if more
+// data is added to the datablocks. This is invoked if the central directory is
+// invalid, so there is no longer any point in attempting to interpret it.
+local void block_stop(block_t *block) {
+    block->left = 0;
+    block->next = NULL;
+}
+
+// Return true if *block has reached the end of the data in the datablocks.
+local int block_end(block_t *block) {
+    linkedlist_datablock_internal *node = block->node;
+    if (node == NULL)
+        // This block was previously terminated with extreme prejudice.
+        return 1;
+    if (block->next < node->data + node->filled_in_this_block)
+        // There are more bytes to read in the current datablock.
+        return 0;
+    while (node->next_datablock != NULL) {
+        if (node->filled_in_this_block != 0)
+            // There are some bytes in a later datablock.
+            return 0;
+        node = node->next_datablock;
+    }
+    // Reached the end of the list of datablocks. There's nothing.
+    return 1;
+}
+
+// Return one byte from *block, or -1 if the end is reached.
+local int block_get(block_t *block) {
+    while (block->left == 0) {
+        if (block->node == NULL)
+            // We've been marked bad. Return end.
+            return -1;
+        // Update left in case more was filled in since we were last here.
+        block->left = block->node->filled_in_this_block -
+                      (block->next - block->node->data);
+        if (block->left != 0)
+            // There was indeed more data appended in the current datablock.
+            break;
+        if (block->node->next_datablock == NULL)
+            // No more data here, and there is no next datablock. At the end.
+            return -1;
+        // Try the next datablock for more data.
+        block->node = block->node->next_datablock;
+        block->next = block->node->data;
+        block->left = block->node->filled_in_this_block;
+    }
+    // We have a byte to return.
+    block->left--;
+    return *block->next++;
+}
+
+// Return a 16-bit unsigned little-endian value from block, or a negative value
+// if the end is reached.
+local long block_get2(block_t *block) {
+    long got = block_get(block);
+    return got | ((long)block_get(block) << 8);
+}
+
+// Read up to len bytes from block into buf. Return the number of bytes read.
+local size_t block_read(block_t *block, unsigned char *buf, size_t len) {
+    size_t need = len;
+    while (need) {
+        if (block->left == 0) {
+            // Get a byte to update and step through the linked list as needed.
+            int got = block_get(block);
+            if (got == -1)
+                // Reached the end.
+                break;
+            *buf++ = (unsigned char)got;
+            need--;
+            continue;
+        }
+        size_t take = need > block->left ? block->left : need;
+        memcpy(buf, block->next, take);
+        block->next += take;
+        block->left -= take;
+        buf += take;
+        need -= take;
+    }
+    return len - need;      // return the number of bytes copied
+}
+
+// Skip n bytes in block. Return 0 on success or -1 if there are less than n
+// bytes to the end.
+local int block_skip(block_t *block, size_t n) {
+    while (n > block->left) {
+        n -= block->left;
+        block->next += block->left;
+        block->left = 0;
+        if (block_get(block) == -1)
+            return -1;
+        n--;
+    }
+    block->next += n;
+    block->left -= n;
+    return 0;
+}
+
+// Process the next central directory record at *block. Return the allocated,
+// zero-terminated file name, or NULL for end of input or invalid data. If
+// invalid, *block is marked bad. This uses *set for the allocation of memory.
+local char *block_central_name(block_t *block, set_t *set) {
+    char *name = NULL;
+    for (;;) {
+        if (block_end(block))
+            // At the end of the central directory (so far).
+            return NULL;
+
+        // Check for a central directory record signature.
+        if (block_get2(block) != (CENTRALHEADERMAGIC & 0xffff) ||
+            block_get2(block) != (CENTRALHEADERMAGIC >> 16))
+            // Incorrect signature.
+            break;
+
+        // Go through the remaining fixed-length portion of the record,
+        // extracting the lengths of the three variable-length fields.
+        block_skip(block, 24);
+        unsigned flen = block_get2(block);      // file name length
+        unsigned xlen = block_get2(block);      // extra field length
+        unsigned clen = block_get2(block);      // comment field length
+        if (block_skip(block, 12) == -1)
+            // Premature end of the record.
+            break;
+
+        // Extract the name and skip over the extra and comment fields.
+        name = set_alloc(set, NULL, flen + 1);
+        if (block_read(block, (unsigned char *)name, flen) < flen ||
+            block_skip(block, xlen + clen) == -1)
+            // Premature end of the record.
+            break;
+
+        // Check for embedded nuls in the name.
+        if (memchr(name, 0, flen) != NULL) {
+            // This name can never match the zero-terminated name provided to
+            // zipAlreadyThere(), so we discard it and go back to get another
+            // name. (Who the heck is putting nuls inside their zip file entry
+            // names anyway?)
+            set_free(set, name);
+            continue;
+        }
+
+        // All good. Return the zero-terminated file name.
+        name[flen] = 0;
+        return name;
+    }
+
+    // Invalid signature or premature end of the central directory record.
+    // Abandon trying to process the central directory.
+    set_free(set, name);
+    block_stop(block);
+    return NULL;
+}
+
+// Return 0 if name is not in the central directory so far, 1 if it is, -1 if
+// the central directory is invalid, -2 if out of memory, or ZIP_PARAMERROR if
+// file is NULL.
+extern int ZEXPORT zipAlreadyThere(zipFile file, char const *name) {
+    zip64_internal *zip = file;
+    if (zip == NULL)
+        return ZIP_PARAMERROR;
+    if (zip->central_dir.first_block == NULL)
+        // No central directory yet, so no, name isn't there.
+        return 0;
+    if (setjmp(zip->set.env)) {
+        // Memory allocation failure.
+        set_end(&zip->set);
+        return -2;
+    }
+    if (!set_ok(&zip->set)) {
+        // This is the first time here with some central directory content. We
+        // construct this set of names only on demand. Prepare set and block.
+        set_start(&zip->set);
+        block_init(&zip->block, &zip->central_dir);
+    }
+
+    // Update the set of names from the current central directory contents.
+    // This reads any new central directory records since the last time we were
+    // here.
+    for (;;) {
+        char *there = block_central_name(&zip->block, &zip->set);
+        if (there == NULL) {
+            if (zip->block.next == NULL)
+                // The central directory is invalid.
+                return -1;
+            break;
+        }
+
+        // Add there to the set.
+        if (set_insert(&zip->set, there))
+            // There's already a duplicate in the central directory! We'll just
+            // let this be and carry on.
+            set_free(&zip->set, there);
+    }
+
+    // Return true if name is in the central directory.
+    return set_found(&zip->set, name);
+}
 
 /****************************************************************************/
@@ -551,7 +785,7 @@ local ZPOS64_T zip64local_SearchCentralDir64(const zlib_filefunc64_32_def* pzlib
   for (i=(int)uReadSize-3; (i--)>0;)
   {
-    // Signature "0x07064b50" Zip64 end of central directory locater
+    // Signature "0x07064b50" Zip64 end of central directory locator
     if (((*(buf+i))==0x50) && ((*(buf+i+1))==0x4b) && ((*(buf+i+2))==0x06) && ((*(buf+i+3))==0x07))
     {
       uPosFound = uReadPos+(unsigned)i;
@@ -843,6 +1077,7 @@ extern zipFile ZEXPORT zipOpen3(const void *pathname, int append, zipcharpc* glo
     ziinit.number_entry = 0;
     ziinit.add_position_when_writing_offset = 0;
     init_linkedlist(&(ziinit.central_dir));
+    memset(&ziinit.set, 0, sizeof(set_t));      // make sure set appears dormant
@@ -1870,6 +2105,8 @@ extern int ZEXPORT zipClose(zipFile file, const char* global_comment) {
     }
     free_linkedlist(&(zi->central_dir));
+    set_end(&zi->set);      // set was zeroed, so this is safe
+
     pos = centraldir_pos_inzip - zi->add_position_when_writing_offset;
     if(pos >= 0xffffffff || zi->number_entry >= 0xFFFF)
     {
diff --git a/contrib/minizip/zip.h b/contrib/minizip/zip.h
index 3e230d3..1f7f0b2 100644
--- a/contrib/minizip/zip.h
+++ b/contrib/minizip/zip.h
@@ -35,7 +35,7 @@
   See header of zip.h
 
-*/
+ */
 
 #ifndef _zip12_H
 #define _zip12_H
@@ -127,12 +127,12 @@ extern zipFile ZEXPORT zipOpen64(const void *pathname, int append);
    If the zipfile cannot be opened, the return value is NULL.
    Else, the return value is a zipFile Handle, usable with other function of
    this zip package.
-*/
+ */
 
 /* Note : there is no delete function into a zipfile.
    If you want delete file into a zipfile, you must open a zipfile, and create another
   Of course, you can use RAW reading and writing to copy the file you did not want delete
-*/
+ */
 
 extern zipFile ZEXPORT zipOpen2(const char *pathname,
                                 int append,
@@ -186,7 +186,7 @@ extern int ZEXPORT zipOpenNewFileInZip64(zipFile file,
          zip64 is set to 1 if a zip64 extended information block should be added to the
          local file header. this MUST be '1' if the uncompressed size is >= 0xffffffff.
-*/
+ */
 
 extern int ZEXPORT zipOpenNewFileInZip2(zipFile file,
@@ -311,12 +311,12 @@ extern int ZEXPORT zipWriteInFileInZip(zipFile file,
                                        unsigned len);
 /*
   Write data in the zipfile
-*/
+ */
 
 extern int ZEXPORT zipCloseFileInZip(zipFile file);
 /*
   Close the current file in the zipfile
-*/
+ */
 
 extern int ZEXPORT zipCloseFileInZipRaw(zipFile file,
                                         uLong uncompressed_size,
@@ -326,17 +326,23 @@ extern int ZEXPORT zipCloseFileInZipRaw64(zipFile file,
                                           ZPOS64_T uncompressed_size,
                                           uLong crc32);
 
+extern int ZEXPORT zipAlreadyThere(zipFile file,
+                                   char const* name);
+/*
+  See if name is already in file's central directory.
+ */
+
 /*
   Close the current file in the zipfile, for file opened with
     parameter raw=1 in zipOpenNewFileInZip2
   uncompressed_size and crc32 are value for the uncompressed size
-*/
+ */
 
 extern int ZEXPORT zipClose(zipFile file,
                             const char* global_comment);
 /*
   Close the zipfile
-*/
+ */
 
 extern int ZEXPORT zipRemoveExtraInfoBlock(char* pData, int* dataLen, short sHeader);
@@ -355,7 +361,7 @@ extern int ZEXPORT zipRemoveExtraInfoBlock(char* pData, int* dataLen, short sHea
   Remove ZIP64 Extra information from a Local File Header extra field data
     zipRemoveExtraInfoBlock(pLocalHeaderExtraFieldData, &nLocalHeaderExtraFieldDataLen, 0x0001);
-*/
+ */
 
 #ifdef __cplusplus
 }
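
Usage sketch (not part of the patch): the loop below shows how an application
could consult the new zipAlreadyThere() before adding each entry, skipping
names that are already in the central directory. The add_unique() helper, its
name list, and the zipOpenNewFileInZip() arguments are illustrative
assumptions, not code from this change; only the zipAlreadyThere() return
values (0 not there, 1 there, negative on error) come from the patch.

    // Sketch: add names to a zip file, skipping any name already present.
    // Assumes zlib's contrib/minizip zip.h; names/count are hypothetical inputs.
    #include <stdio.h>
    #include "zip.h"

    int add_unique(const char *zipname, const char **names, int count) {
        zipFile zf = zipOpen64(zipname, APPEND_STATUS_CREATE);
        if (zf == NULL)
            return -1;
        for (int i = 0; i < count; i++) {
            int there = zipAlreadyThere(zf, names[i]);
            if (there < 0) {            // -1 invalid directory, -2 out of memory
                zipClose(zf, NULL);
                return -1;
            }
            if (there) {                // 1: name already in the central directory
                fprintf(stderr, "skipping duplicate %s\n", names[i]);
                continue;
            }
            if (zipOpenNewFileInZip(zf, names[i], NULL, NULL, 0, NULL, 0, NULL,
                                    Z_DEFLATED, Z_DEFAULT_COMPRESSION) != ZIP_OK ||
                zipWriteInFileInZip(zf, "example\n", 8) != ZIP_OK ||    // placeholder data
                zipCloseFileInZip(zf) != ZIP_OK) {
                zipClose(zf, NULL);
                return -1;
            }
        }
        return zipClose(zf, NULL) == ZIP_OK ? 0 : -1;
    }

Because zipAlreadyThere() only scans central directory records appended since
its previous call, calling it once per entry keeps each lookup at roughly the
skiplist's ~O(log n) cost.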
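Also illustrative only: a minimal standalone program, assuming skipset.h is
used outside of zip.c, exercising the optional SET_TRACK accounting described
in the header. The key type and comparison macro mirror the example in the
skipset.h header comment; main() and the insert loop are hypothetical.

    // Sketch: standalone skipset.h use with allocation tracking enabled.
    #define SET_TRACK               // must be defined before including skipset.h
    #include <stdio.h>
    #include <setjmp.h>

    typedef int set_key_t;
    #define set_cmp(a, b) ((a) < (b) ? -1 : (a) == (b) ? 0 : 1)
    #define set_drop(s, k)
    #include "skipset.h"

    int main(void) {
        set_t set;
        if (setjmp(set.env)) {      // longjmp() target for out-of-memory
            set_end(&set);
            return 1;
        }
        set_start(&set);
        for (int i = 0; i < 1000; i++)
            set_insert(&set, i * 7 % 1000);
        printf("%zu allocations, %zu bytes\n", set.allocs, set.memory);
        set_end(&set);
        return 0;
    }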