stb_ds: major string hash fix, minor other changes

- arena and strdup string hashes were badly broken due to not setting up default slot correctly
- tweak use of seed in 4-byte and 8-byte hash functions to hopefully be slightly stronger
- a few internal #ifdefs for performance tuning
This commit is contained in:
Sean Barrett 2019-03-03 21:36:15 -08:00
parent 72990efc3e
commit b8960f32b8

View File

@ -274,10 +274,10 @@ NOTES - HASH MAP
* For compilers other than GCC and clang (e.g. Visual Studio), for hmput/hmget/hmdel * For compilers other than GCC and clang (e.g. Visual Studio), for hmput/hmget/hmdel
and variants, the key must be an lvalue (so the macro can take the address of it). and variants, the key must be an lvalue (so the macro can take the address of it).
For GCC and clang, extensions are used that eliminate this requirement if you're Extensions are used that eliminate this requirement if you're using C99 and later
using C99 and later or using C++. in GCC or clang, or if you're using C++ in GCC.
* To test for presence of a key in a hashmap, just do 'hmget(foo,key) >= 0'. * To test for presence of a key in a hashmap, just do 'hmgeti(foo,key) >= 0'.
* The iteration order of your data in the hashmap is determined solely by the * The iteration order of your data in the hashmap is determined solely by the
order of insertions and deletions. In particular, if you never delete, new order of insertions and deletions. In particular, if you never delete, new
@ -417,7 +417,7 @@ extern void * stbds_shmode_func(size_t elemsize, int mode);
#if __clang__ #if __clang__
#define STBDS_ADDRESSOF(typevar, value) ((__typeof__(typevar)[1]){value}) // literal array decays to pointer to value #define STBDS_ADDRESSOF(typevar, value) ((__typeof__(typevar)[1]){value}) // literal array decays to pointer to value
#else #else
#define STBDS_ADDRESSOF(typevar, value) ((typeof(typevar)[]){value}) // literal array decays to pointer to value #define STBDS_ADDRESSOF(typevar, value) ((typeof(typevar)[1]){value}) // literal array decays to pointer to value
#endif #endif
#else #else
#define STBDS_ADDRESSOF(typevar, value) &(value) #define STBDS_ADDRESSOF(typevar, value) &(value)
@ -648,10 +648,15 @@ void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap)
// stbds_hm hash table implementation // stbds_hm hash table implementation
// //
#define STBDS_CACHE_LINE_SIZE 64 #ifdef STBDS_INTERNAL_SMALL_BUCKET
#define STBDS_BUCKET_LENGTH 4
#else
#define STBDS_BUCKET_LENGTH 8 #define STBDS_BUCKET_LENGTH 8
#define STBDS_BUCKET_SHIFT 3 #endif
#define STBDS_BUCKET_SHIFT (STBDS_BUCKET_LENGTH == 8 ? 3 : 2)
#define STBDS_BUCKET_MASK (STBDS_BUCKET_LENGTH-1) #define STBDS_BUCKET_MASK (STBDS_BUCKET_LENGTH-1)
#define STBDS_CACHE_LINE_SIZE 64
#define STBDS_ALIGN_FWD(n,a) (((n) + (a) - 1) & ~((a)-1)) #define STBDS_ALIGN_FWD(n,a) (((n) + (a) - 1) & ~((a)-1))
@ -698,13 +703,12 @@ void stbds_rand_seed(size_t seed)
static size_t stbds_probe_position(size_t hash, size_t slot_count, size_t slot_log2) static size_t stbds_probe_position(size_t hash, size_t slot_count, size_t slot_log2)
{ {
#if 1 size_t pos;
size_t pos = (hash >> (STBDS_SIZE_T_BITS-slot_log2)); pos = hash & (slot_count-1);
STBDS_ASSERT(pos < slot_count); #ifdef STBDS_INTERNAL_BUCKET_START
return pos; pos &= ~STBDS_BUCKET_MASK;
#else
return hash & (slot_count-1);
#endif #endif
return pos;
} }
static size_t stbds_log2(size_t slot_count) static size_t stbds_log2(size_t slot_count)
@ -812,7 +816,6 @@ static stbds_hash_index *stbds_make_hash_index(size_t slot_count, stbds_hash_ind
for (;;) { for (;;) {
size_t limit,z; size_t limit,z;
stbds_hash_bucket *bucket; stbds_hash_bucket *bucket;
pos &= (t->slot_count-1);
bucket = &t->storage[pos >> STBDS_BUCKET_SHIFT]; bucket = &t->storage[pos >> STBDS_BUCKET_SHIFT];
STBDS_STATS(++stbds_rehash_probes); STBDS_STATS(++stbds_rehash_probes);
@ -835,6 +838,7 @@ static stbds_hash_index *stbds_make_hash_index(size_t slot_count, stbds_hash_ind
pos += step; // quadratic probing pos += step; // quadratic probing
step += STBDS_BUCKET_LENGTH; step += STBDS_BUCKET_LENGTH;
pos &= (t->slot_count-1);
} }
} }
done: done:
@ -939,7 +943,7 @@ static size_t stbds_siphash_bytes(void *p, size_t len, size_t seed)
#ifdef STBDS_SIPHASH_2_4 #ifdef STBDS_SIPHASH_2_4
return v0^v1^v2^v3; return v0^v1^v2^v3;
#else #else
return v1^v2^v3; // slightly stronger since v0^v3 in above cancels out final round operation return v1^v2^v3; // slightly stronger since v0^v3 in above cancels out final round operation? I tweeted at the authors of SipHash about this but they didn't reply
#endif #endif
} }
@ -954,10 +958,11 @@ size_t stbds_hash_bytes(void *p, size_t len, size_t seed)
unsigned int hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24); unsigned int hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24);
#if 0 #if 0
// HASH32-A Bob Jenkin's hash function w/o large constants // HASH32-A Bob Jenkin's hash function w/o large constants
hash ^= seed ^ len; hash ^= seed;
hash -= (hash<<6); hash -= (hash<<6);
hash ^= (hash>>17); hash ^= (hash>>17);
hash -= (hash<<9); hash -= (hash<<9);
hash ^= seed;
hash ^= (hash<<4); hash ^= (hash<<4);
hash -= (hash<<3); hash -= (hash<<3);
hash ^= (hash<<10); hash ^= (hash<<10);
@ -966,22 +971,24 @@ size_t stbds_hash_bytes(void *p, size_t len, size_t seed)
// HASH32-BB Bob Jenkin's presumably-accidental version of Thomas Wang hash with rotates turned into shifts. // HASH32-BB Bob Jenkin's presumably-accidental version of Thomas Wang hash with rotates turned into shifts.
// Note that converting these back to rotates makes it run a lot slower, presumably due to collisions, so I'm // Note that converting these back to rotates makes it run a lot slower, presumably due to collisions, so I'm
// not really sure what's going on. // not really sure what's going on.
hash ^= seed ^ len; hash ^= seed;
hash = (hash ^ 61) ^ (hash >> 16); hash = (hash ^ 61) ^ (hash >> 16);
hash = hash + (hash << 3); hash = hash + (hash << 3);
hash = hash ^ (hash >> 4); hash = hash ^ (hash >> 4);
hash = hash * 0x27d4eb2d; hash = hash * 0x27d4eb2d;
hash ^= seed;
hash = hash ^ (hash >> 15); hash = hash ^ (hash >> 15);
#else // HASH32-C - Murmur3 #else // HASH32-C - Murmur3
hash ^= seed;
hash *= 0xcc9e2d51; hash *= 0xcc9e2d51;
hash = (hash << 17) | (hash >> 15); hash = (hash << 17) | (hash >> 15);
hash *= 0x1b873593; hash *= 0x1b873593;
hash ^= seed; hash ^= seed;
hash = (hash << 19) | (hash >> 13); hash = (hash << 19) | (hash >> 13);
hash = hash*5 + 0xe6546b64; hash = hash*5 + 0xe6546b64;
hash ^= len;
hash ^= hash >> 16; hash ^= hash >> 16;
hash *= 0x85ebca6b; hash *= 0x85ebca6b;
hash ^= seed;
hash ^= hash >> 13; hash ^= hash >> 13;
hash *= 0xc2b2ae35; hash *= 0xc2b2ae35;
hash ^= hash >> 16; hash ^= hash >> 16;
@ -1006,16 +1013,17 @@ size_t stbds_hash_bytes(void *p, size_t len, size_t seed)
} else if (len == 8 && sizeof(size_t) == 8) { } else if (len == 8 && sizeof(size_t) == 8) {
size_t hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24); size_t hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24);
hash |= (size_t) (d[4] | (d[5] << 8) | (d[6] << 16) | (d[7] << 24)) << 16 << 16; // avoid warning if size_t == 4 hash |= (size_t) (d[4] | (d[5] << 8) | (d[6] << 16) | (d[7] << 24)) << 16 << 16; // avoid warning if size_t == 4
hash ^= seed ^ len; hash ^= seed;
hash = (~hash) + (hash << 21); hash = (~hash) + (hash << 21);
hash ^= STBDS_ROTATE_RIGHT(hash,24); hash ^= STBDS_ROTATE_RIGHT(hash,24);
hash *= 265; hash *= 265;
hash ^= STBDS_ROTATE_RIGHT(hash,14); hash ^= STBDS_ROTATE_RIGHT(hash,14);
hash ^= seed;
hash *= 21; hash *= 21;
hash ^= STBDS_ROTATE_RIGHT(hash,28); hash ^= STBDS_ROTATE_RIGHT(hash,28);
hash += (hash << 31); hash += (hash << 31);
hash = (~hash) + (hash << 18); hash = (~hash) + (hash << 18);
return hash^seed; return hash;
} else { } else {
return stbds_siphash_bytes(p,len,seed); return stbds_siphash_bytes(p,len,seed);
} }
@ -1272,6 +1280,8 @@ void * stbds_shmode_func(size_t elemsize, int mode)
{ {
void *a = stbds_arrgrowf(0, elemsize, 0, 1); void *a = stbds_arrgrowf(0, elemsize, 0, 1);
stbds_hash_index *h; stbds_hash_index *h;
memset(a, 0, elemsize);
stbds_header(a)->length = 1;
stbds_header(a)->hash_table = h = (stbds_hash_index *) stbds_make_hash_index(STBDS_BUCKET_LENGTH, NULL); stbds_header(a)->hash_table = h = (stbds_hash_index *) stbds_make_hash_index(STBDS_BUCKET_LENGTH, NULL);
h->string.mode = mode; h->string.mode = mode;
return STBDS_ARR_TO_HASH(a,elemsize); return STBDS_ARR_TO_HASH(a,elemsize);