On Wed, Feb 25, 2026, at 13:19, Joel Jacobson wrote:
> On Tue, Feb 24, 2026, at 19:21, Joel Jacobson wrote:
>> This bug seems to sometimes cause the wrong table, the larger table, to
>> be hashed in a Hash Join, and the smaller table to be used for probing.

To help reviewers, instead of relying on benchmark results,
I realized it's much better if we can actually prove the calculation
of skew_ratio is incorrect, and that it becomes correct with the fix.

I therefore added debugging here:

        if (avgfreq > 0.0 && *mcv_freq > avgfreq)
+       {
+               elog(DEBUG1, "mcv_freq=%g, avgfreq=%g, skew_ratio=%g",
+                        *mcv_freq, avgfreq, *mcv_freq / avgfreq);
                estfract *= *mcv_freq / avgfreq;
+       }

And created this minimal schema to prove the incorrect calculation:

CREATE TABLE orders (
    order_id      bigint  NOT NULL,
    tracking_code bigint,
    region        int     NOT NULL
);

CREATE TABLE shipments (
    shipment_id   bigint  NOT NULL,
    tracking_code bigint  NOT NULL
);

-- Skewed tracking_code distribution:
--   10% of rows share a single "hot" tracking_code (value 1),
--   the rest get unique codes.
-- This ensures mcv_freq > avgfreq, triggering the skew adjustment
-- even in the fixed version of estimate_hash_bucket_stats.
INSERT INTO orders
SELECT
    g,
    CASE
        WHEN g > 150000 THEN NULL
        WHEN g <= 15000  THEN 1
        ELSE g
    END,
    (g % 1000) + 1
FROM generate_series(1, 200000) g;

INSERT INTO shipments
SELECT g, CASE WHEN g <= 15000 THEN 1 ELSE g END
FROM generate_series(1, 150000) g;

CREATE INDEX ON orders (region);

ANALYZE orders;
ANALYZE shipments;

-- Ground truth of the skew_ratio for orders WHERE region = 42:
WITH base AS (
    SELECT
        count(*)::numeric                  AS total,
        count(tracking_code)::numeric      AS nonnull,
        count(DISTINCT tracking_code)      AS ndistinct
    FROM orders
    WHERE region = 42
),
mcv AS (
    SELECT count(*)::numeric AS mcv_count
    FROM orders
    WHERE region = 42 AND tracking_code IS NOT NULL
    GROUP BY tracking_code
    ORDER BY count(*) DESC
    LIMIT 1
)
SELECT
    round(mcv_count / total, 6)                        AS mcv_freq,
    round(nonnull / total / ndistinct, 6)              AS avgfreq,
    round((mcv_count / total) / (nonnull / total / ndistinct), 6) AS skew_ratio
FROM base, mcv;

 mcv_freq | avgfreq  | skew_ratio
----------+----------+------------
 0.075000 | 0.005515 |  13.600000
(1 row)

SET client_min_messages = DEBUG1;

SELECT *
FROM orders o
JOIN shipments s ON s.tracking_code = o.tracking_code
WHERE o.region = 42;

-- HEAD (65707ed):
DEBUG:  mcv_freq=0.0748333, avgfreq=8.69725e-06, skew_ratio=8604.25

-- 0001-Fix-estimate_hash_bucket_stats-to-use-filtered-ndist.patch:
DEBUG:  mcv_freq=0.0738667, avgfreq=0.00871124, skew_ratio=8.47947

In HEAD, the skew_ratio for orders.tracking_code is wrongly estimated
to 8604.25, when the ground truth is 13.6, which the fixed estimate
8.47947, is a good approximation of.

The fix only moves the computation of avgfreq from before the
ndistinct adjustment, to after it:

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 29fec65559..d19c4b5d96 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4456,9 +4456,6 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node 
*hashkey, double nbuckets,
        else
                stanullfrac = 0.0;

-       /* Compute avg freq of all distinct data values in raw relation */
-       avgfreq = (1.0 - stanullfrac) / ndistinct;
-
        /*
         * Adjust ndistinct to account for restriction clauses.  Observe we are
         * assuming that the data distribution is affected uniformly by the
@@ -4473,6 +4470,9 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node 
*hashkey, double nbuckets,
                ndistinct = clamp_row_est(ndistinct);
        }

+       /* Compute avg freq of all distinct data values in the filtered 
relation */
+       avgfreq = (1.0 - stanullfrac) / ndistinct;
+
        /*
         * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
         * number of buckets is less than the expected number of distinct 
values;

/Joel


Reply via email to