Hi John, Many thanks for your feedback!
> There's more we can do here. Above the stanzas changed in the patch > there is this, at least for varlena/bytea: > > hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, > Min(len, PG_CACHE_LINE_SIZE))); > > This makes no sense to me: hash_any() calls hash_bytes() and turns the > result into a Datum, and then we just get it right back out of the > Datum again. I see similar patterns in files other than bytea.c and varlena.c. Implemented as a separate patch. > if (len > PG_CACHE_LINE_SIZE) > hash ^= DatumGetUInt32(hash_uint32((uint32) len)); > > Similar here, but instead of hash_bytes_uint32(), we may as well use > mumurhash32(). Ditto. It's worth noting that timetz_hash() uses a similar pattern but I choose to keep it as is. Changing it will break backward compatibility. Also it breaks our tests. Using hash_bytes_uint32() / hash_bytes_uint32_extended() directly in timetz_hash() / timetz_hash_extended() is safe though. Proposed as a separate patch. > addHyperLogLog says "typically generated using > hash_any()", but that function takes a uint32, not a Datum, so that > comment should probably be changed. hash_bytes() is global, so we can > use it directly. Makes sense. Implemented as an independent patch. -- Best regards, Aleksander Alekseev
From f6753622e89efc632150ffdd1aaddcb58e427106 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 3 Feb 2026 17:24:19 +0300 Subject: [PATCH v2 4/5] Improve the comment for addHyperLogLog() Previously the comment suggested to use hash_any() which return value is Datum. Since the second argument of addHyperLogLog() is not a Datum but rather uint32, recommending hash_bytes() is more appropriate. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: TODO FIXME Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- src/backend/lib/hyperloglog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c index c74f11217ef..2b82ff12b2e 100644 --- a/src/backend/lib/hyperloglog.c +++ b/src/backend/lib/hyperloglog.c @@ -158,7 +158,7 @@ freeHyperLogLog(hyperLogLogState *cState) * Adds element to the estimator, from caller-supplied hash. * * It is critical that the hash value passed be an actual hash value, typically - * generated using hash_any(). The algorithm relies on a specific bit-pattern + * generated using hash_bytes(). The algorithm relies on a specific bit-pattern * observable in conjunction with stochastic averaging. There must be a * uniform distribution of bits in hash values for each distinct original value * observed. -- 2.43.0
From 18e85156bb64d980f2867d3a36da6f34b26bcc19 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 3 Feb 2026 17:47:45 +0300 Subject: [PATCH v2 5/5] Avoid unnecessary type casting in timetz_hash() / timetz_hash_extended() Simplify the code by calling hash_bytes_uint32() / hash_bytes_uint32_extended() directly. This removes unnecessary uint -> Datum -> uint casting. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: TODO FIXME Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- src/backend/utils/adt/date.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 621b9175c12..7ec08b3bb48 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -2637,7 +2637,7 @@ timetz_hash(PG_FUNCTION_ARGS) */ thash = DatumGetUInt32(DirectFunctionCall1(hashint8, Int64GetDatumFast(key->time))); - thash ^= DatumGetUInt32(hash_uint32(key->zone)); + thash ^= hash_bytes_uint32(key->zone); PG_RETURN_UINT32(thash); } @@ -2652,8 +2652,8 @@ timetz_hash_extended(PG_FUNCTION_ARGS) thash = DatumGetUInt64(DirectFunctionCall2(hashint8extended, Int64GetDatumFast(key->time), seed)); - thash ^= DatumGetUInt64(hash_uint32_extended(key->zone, - DatumGetInt64(seed))); + thash ^= hash_bytes_uint32_extended(key->zone, + DatumGetInt64(seed)); PG_RETURN_UINT64(thash); } -- 2.43.0
From 944a1909a4659452d80d1516a4a180824928e150 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 3 Feb 2026 17:13:17 +0300 Subject: [PATCH v2 3/5] Use murmurhash32() instead of hash_uint32() where appropriate Avoid unnecessary uint32 -> Datum -> uint32 casting and use a better hash function when possible. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: TODO FIXME Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- src/backend/commands/async.c | 2 +- src/backend/utils/adt/bytea.c | 2 +- src/backend/utils/adt/varlena.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 9ed65bd65ac..96987b145ca 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -651,7 +651,7 @@ globalChannelTableHash(const void *key, size_t size, void *arg) const GlobalChannelKey *k = (const GlobalChannelKey *) key; dshash_hash h; - h = DatumGetUInt32(hash_uint32(k->dboid)); + h = murmurhash32(k->dboid); h ^= hash_bytes((const unsigned char *) k->channel, strnlen(k->channel, NAMEDATALEN)); diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index 31c849bd2a1..1e255f00570 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -1110,7 +1110,7 @@ bytea_abbrev_convert(Datum original, SortSupport ssup) Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) - hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + hash ^= murmurhash32((uint32) len); addHyperLogLog(&bss->full_card, hash); diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 717f26c56fa..24cb47afe47 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2110,7 +2110,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) - hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + hash ^= murmurhash32((uint32) len); addHyperLogLog(&sss->full_card, hash); -- 2.43.0
From 99083753d802bc81f3debeb05f7073896d8e1c8b Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 3 Feb 2026 16:45:10 +0300 Subject: [PATCH v2 2/5] Avoid unnecessary type casting when using hash_any() hash_any() is merely a wrapper for hash_bytes(). Call it directly when possible in order to avoid unnecessary type casting. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: TODO FIXME Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- contrib/ltree/ltree_op.c | 2 +- src/backend/access/tablesample/bernoulli.c | 4 ++-- src/backend/access/tablesample/system.c | 4 ++-- src/backend/commands/async.c | 8 ++++---- src/backend/nodes/bitmapset.c | 4 ++-- src/backend/tsearch/ts_typanalyze.c | 4 ++-- src/backend/utils/adt/bytea.c | 4 ++-- src/backend/utils/adt/jsonb_gin.c | 2 +- src/backend/utils/adt/jsonb_util.c | 4 ++-- src/backend/utils/adt/varlena.c | 4 ++-- src/backend/utils/cache/funccache.c | 8 ++++---- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c index c1fc77fc804..a9a1bce0861 100644 --- a/contrib/ltree/ltree_op.c +++ b/contrib/ltree/ltree_op.c @@ -144,7 +144,7 @@ hash_ltree(PG_FUNCTION_ARGS) while (an > 0) { - uint32 levelHash = DatumGetUInt32(hash_any((unsigned char *) al->name, al->len)); + uint32 levelHash = hash_bytes((unsigned char *) al->name, al->len); /* * Combine hash values of successive elements by multiplying the diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c index 7d8560464c8..f768d40143e 100644 --- a/src/backend/access/tablesample/bernoulli.c +++ b/src/backend/access/tablesample/bernoulli.c @@ -214,8 +214,8 @@ bernoulli_nextsampletuple(SampleScanState *node, hashinput[1] = tupoffset; - hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, - (int) sizeof(hashinput))); + hash = hash_bytes((const unsigned char *) hashinput, + (int) sizeof(hashinput)); if (hash < sampler->cutoff) break; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index a2b9ba8eea9..de13dd8cab9 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -202,8 +202,8 @@ system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) hashinput[0] = nextblock; - hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, - (int) sizeof(hashinput))); + hash = hash_bytes((const unsigned char *) hashinput, + (int) sizeof(hashinput)); if (hash < sampler->cutoff) break; } diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 657c591618d..9ed65bd65ac 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -652,8 +652,8 @@ globalChannelTableHash(const void *key, size_t size, void *arg) dshash_hash h; h = DatumGetUInt32(hash_uint32(k->dboid)); - h ^= DatumGetUInt32(hash_any((const unsigned char *) k->channel, - strnlen(k->channel, NAMEDATALEN))); + h ^= hash_bytes((const unsigned char *) k->channel, + strnlen(k->channel, NAMEDATALEN)); return h; } @@ -3244,8 +3244,8 @@ notification_hash(const void *key, Size keysize) Assert(keysize == sizeof(Notification *)); /* We don't bother to include the payload's trailing null in the hash */ - return DatumGetUInt32(hash_any((const unsigned char *) k->data, - k->channel_len + k->payload_len + 1)); + return hash_bytes((const unsigned char *) k->data, + k->channel_len + k->payload_len + 1); } /* diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index a4765876c31..e6b6ac05e59 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -1420,8 +1420,8 @@ bms_hash_value(const Bitmapset *a) if (a == NULL) return 0; /* All empty sets hash to 0 */ - return DatumGetUInt32(hash_any((const unsigned char *) a->words, - a->nwords * sizeof(bitmapword))); + return hash_bytes((const unsigned char *) a->words, + a->nwords * sizeof(bitmapword)); } /* diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 0c513d694e7..2a19bd21990 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -496,8 +496,8 @@ lexeme_hash(const void *key, Size keysize) { const LexemeHashKey *l = (const LexemeHashKey *) key; - return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme, - l->length)); + return hash_bytes((const unsigned char *) l->lexeme, + l->length); } /* diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index d880b62688c..31c849bd2a1 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -1106,8 +1106,8 @@ bytea_abbrev_convert(Datum original, SortSupport ssup) * in order to compensate for cases where differences are past * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. */ - hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, - Min(len, PG_CACHE_LINE_SIZE))); + hash = hash_bytes((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) hash ^= DatumGetUInt32(hash_uint32((uint32) len)); diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c index d72a6441c5e..f5dbd5589d3 100644 --- a/src/backend/utils/adt/jsonb_gin.c +++ b/src/backend/utils/adt/jsonb_gin.c @@ -1333,7 +1333,7 @@ make_text_key(char flag, const char *str, int len) { uint32 hashval; - hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len)); + hashval = hash_bytes((const unsigned char *) str, len); snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval); str = hashbuf; len = 8; diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index e085042f912..d50dcab3dd3 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -1451,8 +1451,8 @@ JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash) tmp = 0x01; break; case jbvString: - tmp = DatumGetUInt32(hash_any((const unsigned char *) scalarVal->val.string.val, - scalarVal->val.string.len)); + tmp = hash_bytes((const unsigned char *) scalarVal->val.string.val, + scalarVal->val.string.len); break; case jbvNumeric: /* Must hash equal numerics to equal hash codes */ diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 7546caacf91..717f26c56fa 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2106,8 +2106,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * in order to compensate for cases where differences are past * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. */ - hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, - Min(len, PG_CACHE_LINE_SIZE))); + hash = hash_bytes((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) hash ^= DatumGetUInt32(hash_uint32((uint32) len)); diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c index 701c294b88d..68bbf0b971c 100644 --- a/src/backend/utils/cache/funccache.c +++ b/src/backend/utils/cache/funccache.c @@ -89,13 +89,13 @@ cfunc_hash(const void *key, Size keysize) Assert(keysize == sizeof(CachedFunctionHashKey)); /* Hash all the fixed fields except callResultType */ - h = DatumGetUInt32(hash_any((const unsigned char *) k, - offsetof(CachedFunctionHashKey, callResultType))); + h = hash_bytes((const unsigned char *) k, + offsetof(CachedFunctionHashKey, callResultType)); /* Incorporate input argument types */ if (k->nargs > 0) h = hash_combine(h, - DatumGetUInt32(hash_any((const unsigned char *) k->argtypes, - k->nargs * sizeof(Oid)))); + hash_bytes((const unsigned char *) k->argtypes, + k->nargs * sizeof(Oid))); /* Incorporate callResultType if present */ if (k->callResultType) h = hash_combine(h, hashRowType(k->callResultType)); -- 2.43.0
From e72d2675fd8360f57729f67d60bd630a5ea2e203 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 13 Jan 2026 14:51:21 +0300 Subject: [PATCH v2 1/5] Refactor *_abbrev_convert() functions Now when all Datums are 64-bit values we can simplify the code by using murmurhash64(). Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- src/backend/utils/adt/bytea.c | 7 +------ src/backend/utils/adt/mac.c | 11 +++-------- src/backend/utils/adt/network.c | 6 +----- src/backend/utils/adt/numeric.c | 5 +---- src/backend/utils/adt/uuid.c | 6 +----- src/backend/utils/adt/varlena.c | 7 +------ 6 files changed, 8 insertions(+), 34 deletions(-) diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index fd7662d41ee..d880b62688c 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -1115,12 +1115,7 @@ bytea_abbrev_convert(Datum original, SortSupport ssup) addHyperLogLog(&bss->full_card, hash); /* Hash abbreviated key */ - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - hash = DatumGetUInt32(hash_uint32(tmp)); - } + hash = (uint32) murmurhash64(DatumGetUInt64(res)); addHyperLogLog(&bss->abbr_card, hash); diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index f14675dea40..0658846f274 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -492,17 +492,12 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup) uss->input_count += 1; /* - * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit - * halves together to produce slightly more entropy. The two zeroed bytes - * won't have any practical impact on this operation. + * Cardinality estimation. The estimate uses uint32, so we hash the full + * 64-bit value and take the lower 32 bits of the result. */ if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } /* diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 3a2002097dd..c226af5ca80 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -739,11 +739,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) /* Hash abbreviated key */ if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } return res; diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 891ae6ba7fe..6d5e2a3644f 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2397,10 +2397,7 @@ numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) if (nss->estimating) { - uint32 tmp = ((uint32) result - ^ (uint32) ((uint64) result >> 32)); - - addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&nss->abbr_card, (uint32) murmurhash64(result)); } return NumericAbbrevGetDatum(result); diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 6ee3752ac78..888802c3012 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -396,11 +396,7 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } /* diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 552ac0c61d3..7546caacf91 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2115,12 +2115,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) addHyperLogLog(&sss->full_card, hash); /* Hash abbreviated key */ - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - hash = DatumGetUInt32(hash_uint32(tmp)); - } + hash = (uint32) murmurhash64(DatumGetUInt64(res)); addHyperLogLog(&sss->abbr_card, hash); -- 2.43.0
