Hi John, Thanks again.
> I think it makes sense to squash 0001 and 0003 together, then 0002 and > 0004 together. > [...] > 0005 doesn't buy us as much in readability since the two lines no longer > match. Makes sense. > For the first, we should probably combine in the upper half when using > a 64-bit hash, like this: We could do it if you insist but I'm convinced this is redundant. In a good hash upper 32 bits are as evenly distributed as lower ones so this combining doesn't buy us much. This may even cause more collisions, for values that didn't have them initially. > Further cleanup possible now that we have 64-bit datums: MAC addresses > are always 6 bytes, so abbreviation is no longer relevant -- datum1 is > authoritative. That's in scope for the thread subject but also a > bigger patch, but maybe someone would like to pick it up for PG20. I will pick it up and submit as a separate patch a bit later. -- Best regards, Aleksander Alekseev
From 3cd625d2d42554111b34d70d0820f68c7ec5db11 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 13 Jan 2026 14:51:21 +0300 Subject: [PATCH v3 1/2] Simplify abbreviated key hashing using murmurhash64() Now when all Datums are 64-bit values we can simplify the code by using murmurhash64() in *_abbrev_convert() functions. Also replace hash_uint32() with murmurhash32() in a few other places for consistency. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com --- src/backend/commands/async.c | 6 +++--- src/backend/utils/adt/bytea.c | 9 ++------- src/backend/utils/adt/mac.c | 11 +++-------- src/backend/utils/adt/network.c | 6 +----- src/backend/utils/adt/numeric.c | 5 +---- src/backend/utils/adt/uuid.c | 6 +----- src/backend/utils/adt/varlena.c | 9 ++------- 7 files changed, 13 insertions(+), 39 deletions(-) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 657c591618d..315270ddb11 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -651,9 +651,9 @@ globalChannelTableHash(const void *key, size_t size, void *arg) const GlobalChannelKey *k = (const GlobalChannelKey *) key; dshash_hash h; - h = DatumGetUInt32(hash_uint32(k->dboid)); - h ^= DatumGetUInt32(hash_any((const unsigned char *) k->channel, - strnlen(k->channel, NAMEDATALEN))); + h = murmurhash32(k->dboid); + h ^= hash_any((const unsigned char *) k->channel, + strnlen(k->channel, NAMEDATALEN)); return h; } diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index fd7662d41ee..f32bd23d03c 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -1110,17 +1110,12 @@ bytea_abbrev_convert(Datum original, SortSupport ssup) Min(len, PG_CACHE_LINE_SIZE))); if (len > PG_CACHE_LINE_SIZE) - hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + hash ^= murmurhash32((uint32) len); addHyperLogLog(&bss->full_card, hash); /* Hash abbreviated key */ - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - hash = DatumGetUInt32(hash_uint32(tmp)); - } + hash = (uint32) murmurhash64(DatumGetUInt64(res)); addHyperLogLog(&bss->abbr_card, hash); diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index f14675dea40..0658846f274 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -492,17 +492,12 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup) uss->input_count += 1; /* - * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit - * halves together to produce slightly more entropy. The two zeroed bytes - * won't have any practical impact on this operation. + * Cardinality estimation. The estimate uses uint32, so we hash the full + * 64-bit value and take the lower 32 bits of the result. */ if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } /* diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 3a2002097dd..c226af5ca80 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -739,11 +739,7 @@ network_abbrev_convert(Datum original, SortSupport ssup) /* Hash abbreviated key */ if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } return res; diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index d25b8ad505d..bbe2581f0b7 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2397,10 +2397,7 @@ numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) if (nss->estimating) { - uint32 tmp = ((uint32) result - ^ (uint32) ((uint64) result >> 32)); - - addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&nss->abbr_card, (uint32) murmurhash64(result)); } return NumericAbbrevGetDatum(result); diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 6ee3752ac78..888802c3012 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -396,11 +396,7 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) if (uss->estimating) { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res))); } /* diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 7caf700fd61..ea3d4dcf18b 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2155,17 +2155,12 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) Min(len, PG_CACHE_LINE_SIZE))); if (len > PG_CACHE_LINE_SIZE) - hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + hash ^= murmurhash32((uint32) len); addHyperLogLog(&sss->full_card, hash); /* Hash abbreviated key */ - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - hash = DatumGetUInt32(hash_uint32(tmp)); - } + hash = (uint32) murmurhash64(DatumGetUInt64(res)); addHyperLogLog(&sss->abbr_card, hash); -- 2.43.0
From 17d23c6203c727eb3b0a9e1f4993a4fc63f5e734 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Tue, 3 Feb 2026 16:45:10 +0300 Subject: [PATCH v3 2/2] Avoid unnecessary type casting when using hash_any() hash_any() is merely a wrapper for hash_bytes(). Call it directly when possible in order to avoid unnecessary type casting. Additionally, improve the comment for addHyperLogLog().Previously the comment suggested to use hash_any() which return value is Datum. Since the argument of addHyperLogLog() is uint32, recommending hash_bytes() is more appropriate. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com fix --- contrib/ltree/ltree_op.c | 2 +- src/backend/access/tablesample/bernoulli.c | 4 ++-- src/backend/access/tablesample/system.c | 4 ++-- src/backend/commands/async.c | 8 ++++---- src/backend/lib/hyperloglog.c | 2 +- src/backend/nodes/bitmapset.c | 4 ++-- src/backend/tsearch/ts_typanalyze.c | 4 ++-- src/backend/utils/adt/bytea.c | 4 ++-- src/backend/utils/adt/jsonb_gin.c | 2 +- src/backend/utils/adt/jsonb_util.c | 4 ++-- src/backend/utils/adt/varlena.c | 4 ++-- src/backend/utils/cache/funccache.c | 8 ++++---- 12 files changed, 25 insertions(+), 25 deletions(-) diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c index c1fc77fc804..a9a1bce0861 100644 --- a/contrib/ltree/ltree_op.c +++ b/contrib/ltree/ltree_op.c @@ -144,7 +144,7 @@ hash_ltree(PG_FUNCTION_ARGS) while (an > 0) { - uint32 levelHash = DatumGetUInt32(hash_any((unsigned char *) al->name, al->len)); + uint32 levelHash = hash_bytes((unsigned char *) al->name, al->len); /* * Combine hash values of successive elements by multiplying the diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c index 7d8560464c8..f768d40143e 100644 --- a/src/backend/access/tablesample/bernoulli.c +++ b/src/backend/access/tablesample/bernoulli.c @@ -214,8 +214,8 @@ bernoulli_nextsampletuple(SampleScanState *node, hashinput[1] = tupoffset; - hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, - (int) sizeof(hashinput))); + hash = hash_bytes((const unsigned char *) hashinput, + (int) sizeof(hashinput)); if (hash < sampler->cutoff) break; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index a2b9ba8eea9..de13dd8cab9 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -202,8 +202,8 @@ system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) hashinput[0] = nextblock; - hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, - (int) sizeof(hashinput))); + hash = hash_bytes((const unsigned char *) hashinput, + (int) sizeof(hashinput)); if (hash < sampler->cutoff) break; } diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 315270ddb11..aee845218e9 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -652,8 +652,8 @@ globalChannelTableHash(const void *key, size_t size, void *arg) dshash_hash h; h = murmurhash32(k->dboid); - h ^= hash_any((const unsigned char *) k->channel, - strnlen(k->channel, NAMEDATALEN)); + h ^= hash_bytes((const unsigned char *) k->channel, + strnlen(k->channel, NAMEDATALEN)); return h; } @@ -3244,8 +3244,8 @@ notification_hash(const void *key, Size keysize) Assert(keysize == sizeof(Notification *)); /* We don't bother to include the payload's trailing null in the hash */ - return DatumGetUInt32(hash_any((const unsigned char *) k->data, - k->channel_len + k->payload_len + 1)); + return hash_bytes((const unsigned char *) k->data, + k->channel_len + k->payload_len + 1); } /* diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c index c74f11217ef..2b82ff12b2e 100644 --- a/src/backend/lib/hyperloglog.c +++ b/src/backend/lib/hyperloglog.c @@ -158,7 +158,7 @@ freeHyperLogLog(hyperLogLogState *cState) * Adds element to the estimator, from caller-supplied hash. * * It is critical that the hash value passed be an actual hash value, typically - * generated using hash_any(). The algorithm relies on a specific bit-pattern + * generated using hash_bytes(). The algorithm relies on a specific bit-pattern * observable in conjunction with stochastic averaging. There must be a * uniform distribution of bits in hash values for each distinct original value * observed. diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 786f343b3c9..68bb936898b 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -1405,8 +1405,8 @@ bms_hash_value(const Bitmapset *a) if (a == NULL) return 0; /* All empty sets hash to 0 */ - return DatumGetUInt32(hash_any((const unsigned char *) a->words, - a->nwords * sizeof(bitmapword))); + return hash_bytes((const unsigned char *) a->words, + a->nwords * sizeof(bitmapword)); } /* diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 48ee050e37f..6ca8ead3988 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -496,8 +496,8 @@ lexeme_hash(const void *key, Size keysize) { const LexemeHashKey *l = (const LexemeHashKey *) key; - return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme, - l->length)); + return hash_bytes((const unsigned char *) l->lexeme, + l->length); } /* diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index f32bd23d03c..1e255f00570 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -1106,8 +1106,8 @@ bytea_abbrev_convert(Datum original, SortSupport ssup) * in order to compensate for cases where differences are past * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. */ - hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, - Min(len, PG_CACHE_LINE_SIZE))); + hash = hash_bytes((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) hash ^= murmurhash32((uint32) len); diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c index d72a6441c5e..f5dbd5589d3 100644 --- a/src/backend/utils/adt/jsonb_gin.c +++ b/src/backend/utils/adt/jsonb_gin.c @@ -1333,7 +1333,7 @@ make_text_key(char flag, const char *str, int len) { uint32 hashval; - hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len)); + hashval = hash_bytes((const unsigned char *) str, len); snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval); str = hashbuf; len = 8; diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 91fb9ea09bf..91bff9fda3a 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -1451,8 +1451,8 @@ JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash) tmp = 0x01; break; case jbvString: - tmp = DatumGetUInt32(hash_any((const unsigned char *) scalarVal->val.string.val, - scalarVal->val.string.len)); + tmp = hash_bytes((const unsigned char *) scalarVal->val.string.val, + scalarVal->val.string.len); break; case jbvNumeric: /* Must hash equal numerics to equal hash codes */ diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index ea3d4dcf18b..30545904101 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2151,8 +2151,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * in order to compensate for cases where differences are past * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. */ - hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, - Min(len, PG_CACHE_LINE_SIZE))); + hash = hash_bytes((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE)); if (len > PG_CACHE_LINE_SIZE) hash ^= murmurhash32((uint32) len); diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c index 701c294b88d..68bbf0b971c 100644 --- a/src/backend/utils/cache/funccache.c +++ b/src/backend/utils/cache/funccache.c @@ -89,13 +89,13 @@ cfunc_hash(const void *key, Size keysize) Assert(keysize == sizeof(CachedFunctionHashKey)); /* Hash all the fixed fields except callResultType */ - h = DatumGetUInt32(hash_any((const unsigned char *) k, - offsetof(CachedFunctionHashKey, callResultType))); + h = hash_bytes((const unsigned char *) k, + offsetof(CachedFunctionHashKey, callResultType)); /* Incorporate input argument types */ if (k->nargs > 0) h = hash_combine(h, - DatumGetUInt32(hash_any((const unsigned char *) k->argtypes, - k->nargs * sizeof(Oid)))); + hash_bytes((const unsigned char *) k->argtypes, + k->nargs * sizeof(Oid))); /* Incorporate callResultType if present */ if (k->callResultType) h = hash_combine(h, hashRowType(k->callResultType)); -- 2.43.0
