Hi, Since Datums are 64-bit values and MAC addresses have only 6 bytes, the abbreviated key contains the entire MAC address and is authoritative, as pointed out by John Naylor [1]
This fact eliminates the need for cardinality estimation using HyperLogLog since macaddr_abbrev_convert() is dirt cheap and not lossy. There are no reasons to give up on abbreviation. Potentially we could go even further and pass MAC addresses by value rather by reference [2]. This would eliminate the need of abbreviation completely since SortSupport->comparator could just compare two Datums, as we do for Timestamps. This is a more invasive change though that deserves more discussion and thus not proposed here. [1]: https://postgr.es/m/CANWCAZYWdOEnoL_88VpMge1RtRpBz-VRCjdcu-eA4q3U6LvpDw%40mail.gmail.com [2]: https://postgre.es/m/CAJ7c6TM8up%3DYih8pRLPy4wHwLzHf7w22tQ80-8ZBm__E%3D8_5HA%40mail.gmail.com -- Best regards, Aleksander Alekseev
From 558c090d4e112794fb25ca16aa83874c5d2fafb3 Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev <[email protected]> Date: Wed, 25 Feb 2026 14:40:26 +0300 Subject: [PATCH v1] Simplify SortSupport implementation for macaddr Since Datums are 64-bit values and MAC addresses have only 6 bytes, the abbreviated key contains the entire MAC address and is authoritative. Thus there is no need for cardinality estimation using HyperLogLog. Author: Aleksander Alekseev <[email protected]> Suggested-by: John Naylor <[email protected]> Reviewed-by: TODO FIXME Discussion: TODO FIXME --- src/backend/utils/adt/mac.c | 100 ++---------------------------------- 1 file changed, 5 insertions(+), 95 deletions(-) diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index f14675dea40..433dce015c7 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -14,11 +14,9 @@ #include "postgres.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" -#include "utils/guc.h" #include "utils/inet.h" #include "utils/sortsupport.h" @@ -33,15 +31,6 @@ #define lobits(addr) \ ((unsigned long)(((addr)->d<<16)|((addr)->e<<8)|((addr)->f))) -/* sortsupport for macaddr */ -typedef struct -{ - int64 input_count; /* number of non-null values seen */ - bool estimating; /* true if estimating cardinality */ - - hyperLogLogState abbr_card; /* cardinality estimator */ -} macaddr_sortsupport_state; - static int macaddr_cmp_internal(macaddr *a1, macaddr *a2); static int macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup); static bool macaddr_abbrev_abort(int memtupcount, SortSupport ssup); @@ -369,24 +358,10 @@ macaddr_sortsupport(PG_FUNCTION_ARGS) if (ssup->abbreviate) { - macaddr_sortsupport_state *uss; - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - - uss = palloc_object(macaddr_sortsupport_state); - uss->input_count = 0; - uss->estimating = true; - initHyperLogLog(&uss->abbr_card, 10); - - ssup->ssup_extra = uss; - ssup->comparator = ssup_datum_unsigned_cmp; ssup->abbrev_converter = macaddr_abbrev_convert; ssup->abbrev_abort = macaddr_abbrev_abort; ssup->abbrev_full_comparator = macaddr_fast_cmp; - - MemoryContextSwitchTo(oldcontext); } PG_RETURN_VOID(); @@ -406,61 +381,12 @@ macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup) } /* - * Callback for estimating effectiveness of abbreviated key optimization. - * - * We pay no attention to the cardinality of the non-abbreviated data, because - * there is no equality fast-path within authoritative macaddr comparator. + * Abbreviation is never aborted for macaddr because the 6-byte MAC address + * fits entirely within a 64-bit Datum, making the abbreviated key authoritative. */ static bool macaddr_abbrev_abort(int memtupcount, SortSupport ssup) { - macaddr_sortsupport_state *uss = ssup->ssup_extra; - double abbr_card; - - if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) - return false; - - abbr_card = estimateHyperLogLog(&uss->abbr_card); - - /* - * If we have >100k distinct values, then even if we were sorting many - * billion rows we'd likely still break even, and the penalty of undoing - * that many rows of abbrevs would probably not be worth it. At this point - * we stop counting because we know that we're now fully committed. - */ - if (abbr_card > 100000.0) - { - if (trace_sort) - elog(LOG, - "macaddr_abbrev: estimation ends at cardinality %f" - " after " INT64_FORMAT " values (%d rows)", - abbr_card, uss->input_count, memtupcount); - uss->estimating = false; - return false; - } - - /* - * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row - * fudge factor allows us to abort earlier on genuinely pathological data - * where we've had exactly one abbreviated value in the first 2k - * (non-null) rows. - */ - if (abbr_card < uss->input_count / 2000.0 + 0.5) - { - if (trace_sort) - elog(LOG, - "macaddr_abbrev: aborting abbreviation at cardinality %f" - " below threshold %f after " INT64_FORMAT " values (%d rows)", - abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, - memtupcount); - return true; - } - - if (trace_sort) - elog(LOG, - "macaddr_abbrev: cardinality %f after " INT64_FORMAT - " values (%d rows)", abbr_card, uss->input_count, memtupcount); - return false; } @@ -469,14 +395,13 @@ macaddr_abbrev_abort(int memtupcount, SortSupport ssup) * to abbreviated key representation. * * Packs the bytes of a 6-byte MAC address into a Datum and treats it as an - * unsigned integer for purposes of comparison. On a 64-bit machine, there - * will be two zeroed bytes of padding. The integer is converted to native - * endianness to facilitate easy comparison. + * unsigned integer for purposes of comparison. There will be two zeroed bytes + * of padding. The integer is converted to native endianness to facilitate + * easy comparison. */ static Datum macaddr_abbrev_convert(Datum original, SortSupport ssup) { - macaddr_sortsupport_state *uss = ssup->ssup_extra; macaddr *authoritative = DatumGetMacaddrP(original); Datum res; @@ -489,21 +414,6 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup) "Datum is too small for macaddr"); memset(&res, 0, sizeof(res)); memcpy(&res, authoritative, sizeof(macaddr)); - uss->input_count += 1; - - /* - * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit - * halves together to produce slightly more entropy. The two zeroed bytes - * won't have any practical impact on this operation. - */ - if (uss->estimating) - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); - } /* * Byteswap on little-endian machines. -- 2.43.0
