From 0aba2554409ee3251d7558567edd114d8ed36dcc Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Tue, 16 Jan 2024 16:32:48 +0700
Subject: [PATCH] Add optimized C string hashing

Given an already-initialized hash state and a NUL-terminated string,
accumulate the hash of the string into the hash state and return the
length for the caller to (optionally) save for the finalizer. This
avoids a strlen call.

If the string pointer is aligned, we can use a word-at-a-time
algorithm for NUL lookahead. The aligned case is only used on 64-bit
platforms, since it's not worth the extra complexity for 32-bit.

Handling the tail of the string after finishing the word-wise loop
was inspired by NetBSD's strlen(), but no code was taken since that
is written in assembly language.

As demonstration, use this in the search path cache. This brings the
general case performance closer to the special case optimization done
in commit a86c61c9ee. There are other places that could benefit, but
that is left for future work.

Jeff Davis and John Naylor
Reviewed by Heikki Linnakangas, Jian He, Junwang Zhao

Discussion: https://postgr.es/m/3820f030fd008ff14134b3e9ce5cc6dd623ed479.camel%40j-davis.com
Discussion: https://postgr.es/m/b40292c99e623defe5eadedab1d438cf51a4107c.camel%40j-davis.com
---
 src/backend/catalog/namespace.c      |  20 +++--
 src/include/common/hashfn_unstable.h | 130 +++++++++++++++++++++++++++
 2 files changed, 145 insertions(+), 5 deletions(-)

diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index eecc50a958..b610aa6242 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -41,7 +41,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "commands/dbcommands.h"
-#include "common/hashfn.h"
+#include "common/hashfn_unstable.h"
 #include "funcapi.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
@@ -253,11 +253,21 @@ static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
 static inline uint32
 spcachekey_hash(SearchPathCacheKey key)
 {
-	const unsigned char *bytes = (const unsigned char *) key.searchPath;
-	int			blen = strlen(key.searchPath);
+	fasthash_state hs;
+	int			sp_len;
 
-	return hash_combine(hash_bytes(bytes, blen),
-						hash_uint32(key.roleid));
+	fasthash_init(&hs, FH_UNKNOWN_LENGTH, 0);
+
+	hs.accum = key.roleid;
+	fasthash_combine(&hs);
+
+	/*
+	 * Combine search path into the hash and save the length for tweaking the
+	 * final mix.
+	 */
+	sp_len = fasthash_accum_cstring(&hs, key.searchPath);
+
+	return fasthash_final32(&hs, sp_len);
 }
 
 static inline bool
diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 8dfef9855a..b3c56db1c0 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -58,6 +58,24 @@
  * 2) Incremental interface. This can used for incorporating multiple
  * inputs. The standalone functions use this internally, so see fasthash64()
  * for an an example of how this works.
+ *
+ * The incremental interface is especially useful if any of the inputs
+ * are NUL-terminated C strings, since the length is not needed ahead
+ * of time. This avoids needing to call strlen(). This case is optimized
+ * in fasthash_accum_cstring():
+ *
+ * fasthash_state hs;
+ * fasthash_init(&hs, FH_UNKNOWN_LENGTH, 0);
+ * len = fasthash_accum_cstring(&hs, str);
+ * ...
+ * return fasthash_final32(&hs, len);
+ *
+ * Here we pass FH_UNKNOWN_LENGTH as a convention, since passing zero
+ * would zero out the internal seed as well. fasthash_accum_cstring()
+ * returns the length of the string, which is computed on-the-fly while
+ * mixing the string into the hash. Experimentation has found that
+ * SMHasher fails unless we incorporate the length, so it is passed to
+ * the finalizer as a tweak.
  */
 
 
@@ -151,6 +169,118 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 	fasthash_combine(hs);
 }
 
+/*
+ * Set high bit in lowest byte where the input is zero, from:
+ * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ */
+#define haszero64(v) \
+	(((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
+
+/*
+ * all-purpose workhorse for fasthash_accum_cstring
+ */
+static inline int
+fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
+{
+	const char *const start = str;
+
+	while (*str)
+	{
+		int			chunk_len = 0;
+
+		while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
+			chunk_len++;
+
+		fasthash_accum(hs, str, chunk_len);
+		str += chunk_len;
+	}
+
+	return str - start;
+}
+
+/*
+ * specialized workhorse for fasthash_accum_cstring
+ *
+ * With an aligned pointer, we consume the string a word at a time.
+ * Loading the word containing the NUL terminator cannot segfault since
+ * allocation boundaries are suitably aligned.
+ */
+static inline int
+fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
+{
+	const char *const start = str;
+	int			remainder;
+	uint64		zero_bytes_le;
+
+	Assert(PointerIsAligned(start, uint64));
+	for (;;)
+	{
+		uint64		chunk = *(uint64 *) str;
+
+		/*
+		 * With little-endian representation, we can use this calculation,
+		 * which sets bits in the first byte in the result word that
+		 * corresponds to a zero byte in the original word. The rest of the
+		 * bytes are indeterminate, so cannot be used on big-endian machines
+		 * without either swapping or a bytewise check.
+		 */
+#ifdef WORDS_BIGENDIAN
+		zero_bytes_le = haszero64(pg_bswap64(chunk));
+#else
+		zero_bytes_le = haszero64(chunk);
+#endif
+		if (zero_bytes_le)
+			break;
+
+		hs->accum = chunk;
+		fasthash_combine(hs);
+		str += FH_SIZEOF_ACCUM;
+	}
+
+	/*
+	 * For the last word, only use bytes up to the NUL for the hash. Bytes
+	 * with set bits will be 0x80, so calculate the first occurrence of a zero
+	 * byte within the input word by counting the number of trailing (because
+	 * little-endian) zeros and dividing the result by 8.
+	 */
+	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+	fasthash_accum(hs, str, remainder);
+	str += remainder;
+
+	return str - start;
+}
+
+/*
+ * Mix 'str' into the hash state and return the length of the string.
+ */
+static inline int
+fasthash_accum_cstring(fasthash_state *hs, const char *str)
+{
+#if SIZEOF_VOID_P >= 8
+
+	int			len;
+#ifdef USE_ASSERT_CHECKING
+	int			len_check;
+	fasthash_state hs_check;
+
+	memcpy(&hs_check, hs, sizeof(fasthash_state));
+	len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
+#endif
+	if (PointerIsAligned(str, uint64))
+	{
+		len = fasthash_accum_cstring_aligned(hs, str);
+		Assert(hs_check.hash == hs->hash && len_check == len);
+		return len;
+	}
+#endif							/* SIZEOF_VOID_P */
+
+	/*
+	 * It's not worth it to try to make the word-at-a-time optimization work
+	 * on 32-bit platforms.
+	 */
+	return fasthash_accum_cstring_unaligned(hs, str);
+}
+
 /*
  * The finalizer
  *