On Tue, Jan 30, 2024 at 7:51 PM Ants Aasma <ants.aa...@cybertec.at> wrote:
>
> It didn't calculate the same result because the if (mask) condition
> was incorrect. Changed it to if (chunk & 0xFF) and removed the right
> shift from the mask.

Yes, you're quite right.

> It seems to be half a nanosecond faster, but as I
> don't have a machine set up for microbenchmarking, it's quite close to
> measurement noise.

With my "throughput-ish" test, they look good:

pgbench -n -T 20 -f bench_cstr_aligned.sql -M prepared | grep latency

master:
latency average = 490.722 ms

(Ants Aasma) v17 0001:
latency average = 385.263 ms

v17 0001+0002:
latency average = 339.506 ms

> I didn't post the harness as it's currently so messy as to be nearly
> useless to others. But if you'd like to play around, I can tidy it up
> a bit and post it.

I'd be curious, thanks.
From 77d848a83930abe2badd8c0f1ade79c88c27b455 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 30 Jan 2024 16:14:57 +0700
Subject: [PATCH v17 2/2] Shorten dependency chain for computing hash mask

---
 src/include/common/hashfn_unstable.h | 30 ++++++++++++----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 8ee1b99a20..3a74e6e33b 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -176,19 +176,6 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 #define haszero64(v) \
 	(((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
 
-/*
- * Returns non-zero when first byte in memory order is not NUL
- */
-static inline int
-first_byte_nonzero(uint64 v)
-{
-#ifdef WORDS_BIGENDIAN
-		return v >> 56;
-#else
-		return v & 0xFF;
-#endif
-}
-
 /*
  * all-purpose workhorse for fasthash_accum_cstring
  */
@@ -225,6 +212,7 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	int			remainder;
 	uint64		zero_bytes_le;
 	uint64		chunk;
+	uint64		mask;
 
 	Assert(PointerIsAligned(start, uint64));
 	for (;;)
@@ -257,14 +245,22 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	 * byte within the input word by counting the number of trailing (because
 	 * little-endian) zeros and dividing the result by 8.
 	 */
-	if (first_byte_nonzero(chunk))
+	/* If the first byte is the NUL terminator, there is nothing to do */
+	if ((zero_bytes_le & 0xFF) == 0)
 	{
 		remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+		/*
+		 * Create a mask for the remaining bytes and combine them into
+		 * the hash. The mask also covers the NUL terminator, but that's
+		 * harmless. The mask could contain 0x80 in higher bytes where
+		 * the input is non-zero, but only if the input byte is 0x01, so
+		 * also harmless.
+		 */
+		mask = zero_bytes_le | (zero_bytes_le - 1);
 #ifdef WORDS_BIGENDIAN
-		hs->accum = chunk & ((~0ULL) << (64 - BITS_PER_BYTE*remainder));
-#else
-		hs->accum = chunk & ((~0ULL) >> (64 - BITS_PER_BYTE*remainder));
+		mask = pg_bswap64(mask);
 #endif
+		hs->accum = chunk & mask;
 		fasthash_combine(hs);
 
 		str += remainder;
-- 
2.43.0
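
For anyone following along, here is a minimal standalone sketch (not part
of the patch) of the zero-byte detection and mask trick that 0002 relies
on. It assumes a little-endian machine, and the buffer contents and the
main() harness are made up purely for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same zero-byte detection as haszero64 in hashfn_unstable.h:
 * sets 0x80 in each byte position of v that is zero. */
#define haszero64(v) \
	(((v) - 0x0101010101010101ULL) & ~(v) & 0x8080808080808080ULL)

int
main(void)
{
	uint64_t	chunk;

	/* an 8-byte chunk holding "abc", its NUL terminator, and garbage */
	memcpy(&chunk, "abc\0WXYZ", 8);

	/* flags byte 3 (the NUL): 0x0000000080000000 */
	uint64_t	zero_bytes_le = haszero64(chunk);

	/*
	 * Smear the lowest flag bit downward so the mask covers every byte
	 * up to and including the NUL: 0x00000000ffffffff
	 */
	uint64_t	mask = zero_bytes_le | (zero_bytes_le - 1);

	/* prints 0000000000636261 -- 'a' 'b' 'c' survive, the rest is zeroed */
	printf("%016" PRIx64 "\n", chunk & mask);
	return 0;
}

The point of "zero_bytes_le | (zero_bytes_le - 1)" is that the mask is
derived directly from the zero-byte flags, so the masking no longer waits
on pg_rightmost_one_pos64; only the str += remainder advance does, which
is the shortened dependency chain the commit subject refers to.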

From 5617ee1450316ad4f494c9378ae1c53dbb095b89 Mon Sep 17 00:00:00 2001
From: Ants Aasma <a...@cybertec.at>
Date: Mon, 29 Jan 2024 15:16:02 +0200
Subject: [PATCH v17 1/2] Speed up last iteration of aligned fasthash

We know the length of the string, so we can mask out the end of
the string with a shift. Without this, the aligned version was
slower than the unaligned one on small strings.
---
 src/include/common/hashfn_unstable.h | 31 ++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 3d927e1fb1..8ee1b99a20 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -176,6 +176,19 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 #define haszero64(v) \
 	(((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
 
+/*
+ * Returns non-zero when first byte in memory order is not NUL
+ */
+static inline int
+first_byte_nonzero(uint64 v)
+{
+#ifdef WORDS_BIGENDIAN
+		return v >> 56;
+#else
+		return v & 0xFF;
+#endif
+}
+
 /*
  * all-purpose workhorse for fasthash_accum_cstring
  */
@@ -211,11 +224,12 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	const char *const start = str;
 	int			remainder;
 	uint64		zero_bytes_le;
+	uint64		chunk;
 
 	Assert(PointerIsAligned(start, uint64));
 	for (;;)
 	{
-		uint64		chunk = *(uint64 *) str;
+		chunk = *(uint64 *) str;
 
 		/*
 		 * With little-endian representation, we can use this calculation,
@@ -243,9 +257,18 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	 * byte within the input word by counting the number of trailing (because
 	 * little-endian) zeros and dividing the result by 8.
 	 */
-	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
-	fasthash_accum(hs, str, remainder);
-	str += remainder;
+	if (first_byte_nonzero(chunk))
+	{
+		remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+#ifdef WORDS_BIGENDIAN
+		hs->accum = chunk & ((~0ULL) << (64 - BITS_PER_BYTE*remainder));
+#else
+		hs->accum = chunk & ((~0ULL) >> (64 - BITS_PER_BYTE*remainder));
+#endif
+		fasthash_combine(hs);
+
+		str += remainder;
+	}
 
 	return str - start;
 }
-- 
2.43.0
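
And to see what 0001's final-word handling computes, here is a
little-endian-only sketch; GCC/Clang's __builtin_ctzll stands in for
pg_rightmost_one_pos64, and the buffer and harness are again just
illustrative scaffolding:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_BYTE 8

int
main(void)
{
	uint64_t	chunk;

	/* two live bytes, then the terminator, then garbage */
	memcpy(&chunk, "ab\0XXXXX", 8);

	uint64_t	zero_bytes_le = (chunk - 0x0101010101010101ULL) &
		~chunk & 0x8080808080808080ULL;

	/*
	 * This guard matches first_byte_nonzero(): it guarantees
	 * remainder >= 1 below, so the shift amount stays in [8, 56]
	 * and never hits the undefined shift by 64.
	 */
	if (chunk & 0xFF)
	{
		int			remainder = __builtin_ctzll(zero_bytes_le) / BITS_PER_BYTE;
		uint64_t	masked = chunk & ((~0ULL) >> (64 - BITS_PER_BYTE * remainder));

		/* prints: remainder = 2, masked = 0000000000006261 */
		printf("remainder = %d, masked = %016" PRIx64 "\n", remainder, masked);
	}
	return 0;
}

The win over the code being replaced (visible in the removed lines above)
is that master fed the trailing bytes back through fasthash_accum(),
re-reading them byte-wise, while the patch masks the word it has already
loaded.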
