Further cosmetic review of hashfn_unstable.h

In follow-up to e97b672c8, * Flesh out comments explaining the incremental interface * Clarify detection of zero bytes when hashing aligned C strings The latter was suggested and reviewed by Jeff Davis Discussion: https://postgr.es/m/48e8f8bbe0be9c789f98776c7438244ab7a7cc63.camel%40j-davis.com
2024-02-06 13:09:03 +07:00 · 2024-02-06 13:09:03 +07:00 · b83033c3cf
commit b83033c3cf
parent 9ed3ee5001
1 changed files with 50 additions and 28 deletions
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@ -53,24 +53,40 @@
 * fasthash as implemented here has two interfaces:
 *
 * 1) Standalone functions, e.g. fasthash32() for a single value with a
- * known length.
+ * known length. These return the same hash code as the original, at
 * least on little-endian machines.
 *
 * 2) Incremental interface. This can used for incorporating multiple
- * inputs. The standalone functions use this internally, so see fasthash64()
+ * inputs. First, initialize the hash state (here with a zero seed):
 * for an an example of how this works.
 *
 * The incremental interface is especially useful if any of the inputs
 * are NUL-terminated C strings, since the length is not needed ahead
 * of time. This avoids needing to call strlen(). This case is optimized
 * in fasthash_accum_cstring() :
 *
 * fasthash_state hs;
 * fasthash_init(&hs, 0);
 * len = fasthash_accum_cstring(&hs, *str);
 * ...
 * return fasthash_final32(&hs, len);
 *
- * The length is computed on-the-fly. Experimentation has found that
+ * If the inputs are of types that can be trivially cast to uint64, it's
 * sufficient to do:
 *
 * hs.accum = value1;
 * fasthash_combine(&hs);
 * hs.accum = value2;
 * fasthash_combine(&hs);
 * ...
 *
 * For longer or variable-length input, fasthash_accum() is a more
 * flexible, but more verbose method. The standalone functions use this
 * internally, so see fasthash64() for an an example of this.
 *
 * After all inputs have been mixed in, finalize the hash:
 *
 * hashcode = fasthash_final32(&hs, 0);
 *
 * The incremental interface allows an optimization for NUL-terminated
 * C strings:
 *
 * len = fasthash_accum_cstring(&hs, str);
 * hashcode = fasthash_final32(&hs, len);
 *
 * By handling the terminator on-the-fly, we can avoid needing a strlen()
 * call to tell us how many bytes to hash. Experimentation has found that
 * SMHasher fails unless we incorporate the length, so it is passed to
 * the finalizer as a tweak.
 */
@ -204,26 +220,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 {
 	const char *const start = str;
 	int			remainder;
-	uint64		zero_bytes_le;
+	uint64		zero_byte_low;
 	Assert(PointerIsAligned(start, uint64));
 	/*
 	 * For every chunk of input, check for zero bytes before mixing into the
 	 * hash. The chunk with zeros must contain the NUL terminator. We arrange
 	 * so that zero_byte_low tells us not only that a zero exists, but also
 	 * where it is, so we can hash the remainder of the string.
 	 *
 	 * The haszero64 calculation will set bits corresponding to the lowest
 	 * byte where a zero exists, so that suffices for little-endian machines.
 	 * For big-endian machines, we would need bits set for the highest zero
 	 * byte in the chunk, since the trailing junk past the terminator could
 	 * contain additional zeros. haszero64 does not give us that, so we
 	 * byteswap the chunk first.
 	 */
 	for (;;)
 	{
 		uint64		chunk = *(uint64 *) str;
 		/*
 		 * With little-endian representation, we can use this calculation,
 		 * which sets bits in the first byte in the result word that
 		 * corresponds to a zero byte in the original word. The rest of the
 		 * bytes are indeterminate, so cannot be used on big-endian machines
 		 * without either swapping or a bytewise check.
 		 */
 #ifdef WORDS_BIGENDIAN
-		zero_bytes_le = haszero64(pg_bswap64(chunk));
+		zero_byte_low = haszero64(pg_bswap64(chunk));
 #else
-		zero_bytes_le = haszero64(chunk);
+		zero_byte_low = haszero64(chunk);
 #endif
-		if (zero_bytes_le)
+		if (zero_byte_low)
 			break;
 		hs->accum = chunk;
@ -232,12 +255,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	}
 	/*
-	 * For the last word, only use bytes up to the NUL for the hash. Bytes
+	 * The byte corresponding to the NUL will be 0x80, so the rightmost bit
-	 * with set bits will be 0x80, so calculate the first occurrence of a zero
+	 * position will be in the range 7, 15, ..., 63. Turn this into byte
-	 * byte within the input word by counting the number of trailing (because
+	 * position by dividing by 8.
 	 * little-endian) zeros and dividing the result by 8.
 	 */
-	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+	remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;
 	fasthash_accum(hs, str, remainder);
 	str += remainder;