Attached is a rough start with Andres's earlier ideas, to get
something concrete out there.

I took a look around at other implementations a bit. Many modern hash
functions use MUM-style hashing, which typically uses 128-bit
arithmetic. Even if they already have an incremental interface and
have a compatible license, it seems a bit too much work to adopt just
for a couple string use cases. Might be useful elsewhere, though, but
that's off topic.

However, I did find a couple hash functions that are much simpler to
adapt to a bytewise interface, pass SMHasher, and are decently fast on
short inputs:

- fast-hash, MIT licensed, and apparently has some use in software [1]
- MX3, CC0 license (looking around, seems controversial for a code
license, so didn't go further). [2] Seems to be a for-fun project, but
the accompanying articles are very informative on how to develop these
things.

After wacking fast-hash around, it doesn't really resemble the
original much, and if for some reason we went as far as switching out
the mixing/final functions, it may as well be called completely
original work. I thought it best to start with something whose mixing
behavior passes SMHasher, and hopefully preserve that property.

Note that the combining and final steps share most of their arithmetic
operations. This may have been done on purpose to minimize binary
size, but I didn't check. Also, it incorporates input length into the
calculation. Since we don't know the length of C strings up front, I
threw that out for now. It'd be possible to track the length as we go
and incorporate something into the final step. The hard part is
verifying it hasn't lost any quality.

v5-0001 puts fash-hash as-is into a new header, named in a way to
convey in-memory use e.g. hash tables.

v5-0002 does the minimal to allow dynash to use this for string_hash,
inlined but still calling strlen.

v5-0003 shows one way to do a incremental interface. It might be okay
for simplehash with fixed length keys, but seems awkward for strings.

v5-0004 shows a bytewise incremental interface, with implementations
for dynahash (getting rid of strlen) and guc hash.

[1] https://code.google.com/archive/p/fast-hash/
[2] https://github.com/jonmaiga/mx3
From c2b799dd2418fb68fcfc6ccf006a50f74c9072fe Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 29 Nov 2023 16:28:58 +0700
Subject: [PATCH v5 4/4] Add bytewise interface for incrementing the hash state

This is the nicest interface for guc_name_hash, and seems
good for string_hash too. It's not clear if this is good for
the latter from a performance perspective.
---
 src/backend/utils/hash/dynahash.c    | 14 +++--------
 src/backend/utils/misc/guc.c         | 16 ++++++-------
 src/include/common/hashfn_unstable.h | 35 ++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 1c08dc8942..ab2dbefd12 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -328,19 +328,11 @@ string_hash(const void *key, Size keysize)
 
 	fasthash64_init(&hs, 0);
 
-	while (*buf)
+	while (*buf && s_len < keysize)
 	{
-		int chunk_len = 0;
+		s_len++;
 
-		for (int i = 0;
-			i < 8 && *buf++ && s_len < keysize;
-			i++)
-		{
-			chunk_len++;
-			s_len++;
-		}
-
-		fasthash64_accum(&hs, buf, chunk_len);
+		fasthash64_accum_byte(&hs, *buf);
 	}
 
 	return fasthash64_final32(&hs);
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 82d8efbc96..2428f2475c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -33,6 +33,7 @@
 #include "catalog/objectaccess.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_parameter_acl.h"
+#include "common/hashfn_unstable.h"
 #include "guc_internal.h"
 #include "libpq/pqformat.h"
 #include "parser/scansup.h"
@@ -1324,22 +1325,21 @@ guc_name_compare(const char *namea, const char *nameb)
 static uint32
 guc_name_hash(const void *key, Size keysize)
 {
-	uint32		result = 0;
 	const char *name = *(const char *const *) key;
+	fasthash64_state hs;
+
+	fasthash64_init(&hs, 0);
 
 	while (*name)
 	{
 		char		ch = *name++;
 
-		/* Case-fold in the same way as guc_name_compare */
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
+		/* quick and dirty case-folding suitable for hashing */
+		ch |= 0x20;
 
-		/* Merge into hash ... not very bright, but it needn't be */
-		result = pg_rotate_left32(result, 5);
-		result ^= (uint32) ch;
+		fasthash64_accum_byte(&hs, ch);
 	}
-	return result;
+	return fasthash64_final32(&hs);
 }
 
 /*
diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index a95942f7af..1b7db5ac07 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -26,6 +26,7 @@
 typedef struct fasthash64_state
 {
 	uint64 accum;
+	int8	accum_len;
 	uint64 hash;
 } fasthash64_state;
 
@@ -46,6 +47,31 @@ void fasthash64_init(fasthash64_state *hs, uint64_t seed)
 	// setting seed would go here
 }
 
+static inline
+void fasthash64_accum_byte(fasthash64_state *hs, const unsigned char ch)
+{
+	const uint64_t    m = 0x880355f21e6d1965ULL;
+
+	hs->accum |= ch;
+
+	// wip: is there a better way to get sizeof struct member?
+	if (hs->accum_len == sizeof(((fasthash64_state *) 0)->accum))
+	{
+		// combine into hash
+		hs->hash ^= mix(hs->accum);
+		hs->hash *= m;
+
+		// reset accum
+		hs->accum = 0;
+		hs->accum_len = 0;
+	}
+	else
+	{
+		hs->accum <<= sizeof(unsigned char);
+		hs->accum_len += sizeof(unsigned char);
+	}
+}
+
 static inline
 void fasthash64_accum(fasthash64_state *hs, const void *buf, int len)
 {
@@ -94,6 +120,15 @@ void fasthash64_accum(fasthash64_state *hs, const void *buf, int len)
 static inline
 uint64_t fasthash64_final(fasthash64_state *hs)
 {
+	const uint64_t    m = 0x880355f21e6d1965ULL;
+
+	// check for remaining bytes to combine into hash
+	if (hs->accum_len > 0)
+	{
+		hs->hash ^= mix(hs->accum);
+		hs->hash *= m;
+	}
+
 	return mix(hs->hash);
 }
 
-- 
2.42.0

From fcabd2c486b46c0d99ab7e17739ce664e1c5860f Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 28 Nov 2023 19:22:54 +0700
Subject: [PATCH v5 3/4] Add incremental interface to fasthash and use it in
 string_hash

---
 src/backend/utils/hash/dynahash.c    | 25 ++++++++++++++---
 src/include/common/hashfn_unstable.h | 40 +++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 6ca1442647..1c08dc8942 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -316,15 +316,34 @@ string_compare(const char *key1, const char *key2, Size keysize)
 static inline uint32
 string_hash(const void *key, Size keysize)
 {
+	fasthash64_state hs;
+	int s_len = 0;
+	const char *buf = (const char *) key;
+
 	/*
 	 * If the string exceeds keysize-1 bytes, we want to hash only that many,
 	 * because when it is copied into the hash table it will be truncated at
 	 * that length.
 	 */
-	size_t		s_len = strlen((const char *) key);
 
-	s_len = Min(s_len, keysize - 1);
-	return fasthash32(key, s_len, 0);
+	fasthash64_init(&hs, 0);
+
+	while (*buf)
+	{
+		int chunk_len = 0;
+
+		for (int i = 0;
+			i < 8 && *buf++ && s_len < keysize;
+			i++)
+		{
+			chunk_len++;
+			s_len++;
+		}
+
+		fasthash64_accum(&hs, buf, chunk_len);
+	}
+
+	return fasthash64_final32(&hs);
 }
 
 
diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 04a24934bc..a95942f7af 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -23,6 +23,11 @@
    SOFTWARE.
 */
 
+typedef struct fasthash64_state
+{
+	uint64 accum;
+	uint64 hash;
+} fasthash64_state;
 
 // Compression function for Merkle-Damgard construction.
 // This function is generated using the framework provided.
@@ -33,19 +38,31 @@ static inline uint64_t mix(uint64_t h) {
 	return h;
 }
 
-// security: if the system allows empty keys (len=3) the seed is exposed, the reverse of mix.
-// objsize: 0-1fd: 509
 static inline
-uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
+void fasthash64_init(fasthash64_state *hs, uint64_t seed)
+{
+	memset(hs, 0, sizeof(fasthash64_state));
+
+	// setting seed would go here
+}
+
+static inline
+void fasthash64_accum(fasthash64_state *hs, const void *buf, int len)
 {
 	const uint64_t    m = 0x880355f21e6d1965ULL;
 	const uint64_t *pos = (const uint64_t *)buf;
 	const uint64_t *end = pos + (len / 8);
 	const unsigned char *pos2;
-	uint64_t h = seed ^ (len * m);
-	uint64_t v;
+
+	// since we don't know the length for a nul-terminated string
+	// handle some other way -- maybe we can accum the length in
+	// the state and fold it in during the finalizer (cf. xxHash3)
+	//uint64_t h = seed ^ (len * m);
+	uint64_t v = hs->accum;
+	uint64 h = hs->hash;
 
 	while (pos != end) {
+		// wip: use memcpy for alignment-picky platforms
 		v  = *pos++;
 		h ^= mix(v);
 		h *= m;
@@ -71,17 +88,22 @@ uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
 		h ^= mix(v);
 		h *= m;
 	}
-
-	return mix(h);
 } 
 
+
+static inline
+uint64_t fasthash64_final(fasthash64_state *hs)
+{
+	return mix(hs->hash);
+}
+
 // objsize: 0-236: 566
 static inline
-uint32_t fasthash32(const void *buf, size_t len, uint32_t seed)
+uint32_t fasthash64_final32(fasthash64_state *hs)
 {
 	// the following trick converts the 64-bit hashcode to Fermat
 	// residue, which shall retain information from both the higher
 	// and lower parts of hashcode.
-        uint64_t h = fasthash64(buf, len, seed);
+        uint64_t h = fasthash64_final(hs);
 	return h - (h >> 32);
 }
-- 
2.42.0

From bcfa6ba22de8c85b3448de6aec14edeb27003bc8 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Mon, 27 Nov 2023 17:03:38 +0700
Subject: [PATCH v5 1/4] Vendor fasthash

MIT licensed

Using copy found at

https://github.com/rurban/smhasher/commit/375a0b20272ca928830c1f4c890d34b3919cbcb3
---
 src/include/common/hashfn_unstable.h | 80 ++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 src/include/common/hashfn_unstable.h

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
new file mode 100644
index 0000000000..76ed27c0a0
--- /dev/null
+++ b/src/include/common/hashfn_unstable.h
@@ -0,0 +1,80 @@
+/* The MIT License
+
+   Copyright (C) 2012 Zilong Tan (eric.zl...@gmail.com)
+
+   Permission is hereby granted, free of charge, to any person
+   obtaining a copy of this software and associated documentation
+   files (the "Software"), to deal in the Software without
+   restriction, including without limitation the rights to use, copy,
+   modify, merge, publish, distribute, sublicense, and/or sell copies
+   of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include "fasthash.h"
+
+// Compression function for Merkle-Damgard construction.
+// This function is generated using the framework provided.
+static inline uint64_t mix(uint64_t h) {				
+	h ^= h >> 23;		
+	h *= 0x2127599bf4325c37ULL;
+	h ^= h >> 47;
+	return h;
+}
+
+// security: if the system allows empty keys (len=3) the seed is exposed, the reverse of mix.
+// objsize: 0-1fd: 509
+uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
+{
+	const uint64_t    m = 0x880355f21e6d1965ULL;
+	const uint64_t *pos = (const uint64_t *)buf;
+	const uint64_t *end = pos + (len / 8);
+	const unsigned char *pos2;
+	uint64_t h = seed ^ (len * m);
+	uint64_t v;
+
+	while (pos != end) {
+		v  = *pos++;
+		h ^= mix(v);
+		h *= m;
+	}
+
+	pos2 = (const unsigned char*)pos;
+	v = 0;
+
+	switch (len & 7) {
+	case 7: v ^= (uint64_t)pos2[6] << 48;
+	case 6: v ^= (uint64_t)pos2[5] << 40;
+	case 5: v ^= (uint64_t)pos2[4] << 32;
+	case 4: v ^= (uint64_t)pos2[3] << 24;
+	case 3: v ^= (uint64_t)pos2[2] << 16;
+	case 2: v ^= (uint64_t)pos2[1] << 8;
+	case 1: v ^= (uint64_t)pos2[0];
+		h ^= mix(v);
+		h *= m;
+	}
+
+	return mix(h);
+} 
+
+// objsize: 0-236: 566
+uint32_t fasthash32(const void *buf, size_t len, uint32_t seed)
+{
+	// the following trick converts the 64-bit hashcode to Fermat
+	// residue, which shall retain information from both the higher
+	// and lower parts of hashcode.
+        uint64_t h = fasthash64(buf, len, seed);
+	return h - (h >> 32);
+}
-- 
2.42.0

From 0b29afbee9cf795136b722841c8fc4ee4667ee0d Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 28 Nov 2023 18:09:00 +0700
Subject: [PATCH v5 2/4] Inline string_hash and call out to fasthash instead of
 hash_bytes

---
 src/backend/utils/hash/dynahash.c    | 20 ++++++++++++++++++++
 src/common/hashfn.c                  | 19 -------------------
 src/include/common/hashfn.h          |  1 -
 src/include/common/hashfn_unstable.h |  9 ++++++++-
 4 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 012d4a0b1f..6ca1442647 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -98,6 +98,7 @@
 
 #include "access/xact.h"
 #include "common/hashfn.h"
+#include "common/hashfn_unstable.h"
 #include "port/pg_bitutils.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
@@ -307,6 +308,25 @@ string_compare(const char *key1, const char *key2, Size keysize)
 	return strncmp(key1, key2, keysize - 1);
 }
 
+/*
+ * string_hash: hash function for keys that are NUL-terminated strings.
+ *
+ * NOTE: this is the default hash function if none is specified.
+ */
+static inline uint32
+string_hash(const void *key, Size keysize)
+{
+	/*
+	 * If the string exceeds keysize-1 bytes, we want to hash only that many,
+	 * because when it is copied into the hash table it will be truncated at
+	 * that length.
+	 */
+	size_t		s_len = strlen((const char *) key);
+
+	s_len = Min(s_len, keysize - 1);
+	return fasthash32(key, s_len, 0);
+}
+
 
 /************************** CREATE ROUTINES **********************/
 
diff --git a/src/common/hashfn.c b/src/common/hashfn.c
index 2490607eea..65e4dd07ba 100644
--- a/src/common/hashfn.c
+++ b/src/common/hashfn.c
@@ -651,25 +651,6 @@ hash_bytes_uint32_extended(uint32 k, uint64 seed)
 	return ((uint64) b << 32) | c;
 }
 
-/*
- * string_hash: hash function for keys that are NUL-terminated strings.
- *
- * NOTE: this is the default hash function if none is specified.
- */
-uint32
-string_hash(const void *key, Size keysize)
-{
-	/*
-	 * If the string exceeds keysize-1 bytes, we want to hash only that many,
-	 * because when it is copied into the hash table it will be truncated at
-	 * that length.
-	 */
-	Size		s_len = strlen((const char *) key);
-
-	s_len = Min(s_len, keysize - 1);
-	return hash_bytes((const unsigned char *) key, (int) s_len);
-}
-
 /*
  * tag_hash: hash function for fixed-size tag values
  */
diff --git a/src/include/common/hashfn.h b/src/include/common/hashfn.h
index adc1dc1de8..54ab616ba4 100644
--- a/src/include/common/hashfn.h
+++ b/src/include/common/hashfn.h
@@ -52,7 +52,6 @@ hash_uint32_extended(uint32 k, uint64 seed)
 }
 #endif
 
-extern uint32 string_hash(const void *key, Size keysize);
 extern uint32 tag_hash(const void *key, Size keysize);
 extern uint32 uint32_hash(const void *key, Size keysize);
 
diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 76ed27c0a0..04a24934bc 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -23,7 +23,6 @@
    SOFTWARE.
 */
 
-#include "fasthash.h"
 
 // Compression function for Merkle-Damgard construction.
 // This function is generated using the framework provided.
@@ -36,6 +35,7 @@ static inline uint64_t mix(uint64_t h) {
 
 // security: if the system allows empty keys (len=3) the seed is exposed, the reverse of mix.
 // objsize: 0-1fd: 509
+static inline
 uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
 {
 	const uint64_t    m = 0x880355f21e6d1965ULL;
@@ -56,11 +56,17 @@ uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
 
 	switch (len & 7) {
 	case 7: v ^= (uint64_t)pos2[6] << 48;
+		/* FALLTHROUGH */
 	case 6: v ^= (uint64_t)pos2[5] << 40;
+		/* FALLTHROUGH */
 	case 5: v ^= (uint64_t)pos2[4] << 32;
+		/* FALLTHROUGH */
 	case 4: v ^= (uint64_t)pos2[3] << 24;
+		/* FALLTHROUGH */
 	case 3: v ^= (uint64_t)pos2[2] << 16;
+		/* FALLTHROUGH */
 	case 2: v ^= (uint64_t)pos2[1] << 8;
+		/* FALLTHROUGH */
 	case 1: v ^= (uint64_t)pos2[0];
 		h ^= mix(v);
 		h *= m;
@@ -70,6 +76,7 @@ uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
 } 
 
 // objsize: 0-236: 566
+static inline
 uint32_t fasthash32(const void *buf, size_t len, uint32_t seed)
 {
 	// the following trick converts the 64-bit hashcode to Fermat
-- 
2.42.0

Reply via email to