On Tue, Jan 30, 2024 at 7:51 PM Ants Aasma <ants.aa...@cybertec.at> wrote: > > It didn't calculate the same result because the if (mask) condition > was incorrect. Changed it to if (chunk & 0xFF) and removed the right > shift from the mask.
Yes, you're quite right. > It seems to be half a nanosecond faster, but as I > don't have a machine set up for microbenchmarking it's quite close to > measurement noise. With my "throughput-ish" test, they look good: pgbench -n -T 20 -f bench_cstr_aligned.sql -M prepared | grep latency master: latency average = 490.722 ms (Ants Aasma) v17 0001: latency average = 385.263 ms v17 0001+0002: latency average = 339.506 ms > I didn't post the harness as it's currently so messy as to be nearly > useless to others. But if you'd like to play around, I can tidy it up > a bit and post it. I'd be curious, thanks.
From 77d848a83930abe2badd8c0f1ade79c88c27b455 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 30 Jan 2024 16:14:57 +0700 Subject: [PATCH v17 2/2] Shorten dependency chain for computing hash mask --- src/include/common/hashfn_unstable.h | 30 ++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h index 8ee1b99a20..3a74e6e33b 100644 --- a/src/include/common/hashfn_unstable.h +++ b/src/include/common/hashfn_unstable.h @@ -176,19 +176,6 @@ fasthash_accum(fasthash_state *hs, const char *k, int len) #define haszero64(v) \ (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080) -/* - * Returns non-zero when first byte in memory order is not NUL - */ -static inline int -first_byte_nonzero(uint64 v) -{ -#ifdef WORDS_BIGENDIAN - return v >> 56; -#else - return v & 0xFF; -#endif -} - /* * all-purpose workhorse for fasthash_accum_cstring */ @@ -225,6 +212,7 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) int remainder; uint64 zero_bytes_le; uint64 chunk; + uint64 mask; Assert(PointerIsAligned(start, uint64)); for (;;) @@ -257,14 +245,22 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) * byte within the input word by counting the number of trailing (because * little-endian) zeros and dividing the result by 8. */ - if (first_byte_nonzero(chunk)) + /* If the first byte in the input is the NUL terminator, we have nothing to do */ + if ((zero_bytes_le & 0xFF) == 0) { remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE; + /* + * Create a mask for the remaining bytes and + * combine them into the hash. The mask also covers the NUL + * terminator, but that's harmless. The mask could contain 0x80 + * in higher bytes where the input is non-zero, but only if the + * input byte is 0x01, so also harmless. 
+ */ + mask = zero_bytes_le | (zero_bytes_le - 1); #ifdef WORDS_BIGENDIAN - hs->accum = chunk & ((~0ULL) << (64 - BITS_PER_BYTE*remainder)); -#else - hs->accum = chunk & ((~0ULL) >> (64 - BITS_PER_BYTE*remainder)); + mask = pg_bswap64(mask); #endif + hs->accum = chunk & mask; fasthash_combine(hs); str += remainder; -- 2.43.0
From 5617ee1450316ad4f494c9378ae1c53dbb095b89 Mon Sep 17 00:00:00 2001 From: Ants Aasma <a...@cybertec.at> Date: Mon, 29 Jan 2024 15:16:02 +0200 Subject: [PATCH v17 1/2] Speed up last iteration of aligned fasthash We know the length of the string so we can mask out end of the string with a shift. Without this the aligned version was slower than unaligned on small strings. --- src/include/common/hashfn_unstable.h | 31 ++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h index 3d927e1fb1..8ee1b99a20 100644 --- a/src/include/common/hashfn_unstable.h +++ b/src/include/common/hashfn_unstable.h @@ -176,6 +176,19 @@ fasthash_accum(fasthash_state *hs, const char *k, int len) #define haszero64(v) \ (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080) +/* + * Returns non-zero when first byte in memory order is not NUL + */ +static inline int +first_byte_nonzero(uint64 v) +{ +#ifdef WORDS_BIGENDIAN + return v >> 56; +#else + return v & 0xFF; +#endif +} + /* * all-purpose workhorse for fasthash_accum_cstring */ @@ -211,11 +224,12 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) const char *const start = str; int remainder; uint64 zero_bytes_le; + uint64 chunk; Assert(PointerIsAligned(start, uint64)); for (;;) { - uint64 chunk = *(uint64 *) str; + chunk = *(uint64 *) str; /* * With little-endian representation, we can use this calculation, @@ -243,9 +257,18 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str) * byte within the input word by counting the number of trailing (because * little-endian) zeros and dividing the result by 8. 
*/ - remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE; - fasthash_accum(hs, str, remainder); - str += remainder; + if (first_byte_nonzero(chunk)) + { + remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE; +#ifdef WORDS_BIGENDIAN + hs->accum = chunk & ((~0ULL) << (64 - BITS_PER_BYTE*remainder)); +#else + hs->accum = chunk & ((~0ULL) >> (64 - BITS_PER_BYTE*remainder)); +#endif + fasthash_combine(hs); + + str += remainder; + } return str - start; } -- 2.43.0