On 12/8/21 04:26, Tomas Vondra wrote:
> On 8/11/21 2:48 AM, Peter Geoghegan wrote:
>> On Wed, Jun 23, 2021 at 7:19 AM Andrey V. Lepikhov
>
> I agree the current behavior is unfortunate, but I'm not convinced the proposed patch is fixing the right place - doesn't this mean the index costing won't match the row estimates displayed by EXPLAIN?
I rewrote the patch. It's now simpler and shorter, and may be more convenient.
It also includes a regression test to detect the problem in the future.

> I wonder if we should teach clauselist_selectivity about UNIQUE indexes, and improve the cardinality estimates directly, not just costing for index scans.
I tried to implement this in different ways, but it adds overhead and code complexity: we would have to analyze the list of indexes and match each index's clauses against the input clauses on every selectivity estimation.
I don't like that approach, so I propose a new patch (attached).
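
If you want to observe the misestimate outside the regression suite, here is a minimal sketch (my own illustration; it reuses the data distribution from the test below, and the table, index, and statistics names are arbitrary). Running EXPLAIN with costs enabled shows the row estimate that the patch bounds to one for a fully-matched unique index:

-- Same skewed distribution as in the regression test below.
CREATE TABLE demo AS (
    SELECT
        gs % 10 AS x,
        (gs % 10 + (gs/10::int4) % 10) % 10 AS y,
        (gs / 100)::int4 AS z
    FROM generate_series(1,1000) AS gs
);
INSERT INTO demo (SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs);
CREATE UNIQUE INDEX demo_unique ON demo (x,y,z);
CREATE INDEX demo_x ON demo (x);
CREATE STATISTICS demo_stat (dependencies,ndistinct) ON x,y,z FROM demo;
ANALYZE demo;
-- At most one row can match the fully-qualified unique index, so the
-- row estimate should be 1. Without the fix, the extended-statistics
-- estimate can be much higher, which makes the single-column index
-- look cheaper than the unique one.
EXPLAIN SELECT * FROM demo WHERE x = 1 AND y = 1 AND z = 1;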

--
regards,
Andrey Lepikhov
Postgres Professional
From 41ce6007cd552afd1a73983f0b9c9cac0e125d58 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepik...@postgrespro.ru>
Date: Mon, 30 Aug 2021 11:21:57 +0500
Subject: [PATCH] When estimating the number of rows fetched through a btree
 index, save the selectivity estimate in the costs structure. The
 genericcostestimate routine will use it as an upper bound when estimating
 the total number of tuples visited in the main table.

This fixes a problem with unique indexes: we know for sure that no more
than one tuple can be fetched, but clauselist_selectivity may give a much
less accurate estimate for a number of possible reasons.
A regression test is added to demonstrate the problem.
---
 src/backend/utils/adt/selfuncs.c        | 18 ++++++++++--
 src/test/regress/expected/stats_ext.out | 38 +++++++++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 34 ++++++++++++++++++++++
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c2aeb4b947..dd1cadad61 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6074,6 +6074,14 @@ genericcostestimate(PlannerInfo *root,
                 */
                numIndexTuples = rint(numIndexTuples / num_sa_scans);
        }
+       else if (costs->indexSelectivity > 0. &&
+               indexSelectivity > costs->indexSelectivity)
+               /*
+                * If the caller gave us an estimated number of fetched index
+                * tuples, it may also have supplied a selectivity estimate. In
+                * that case returned tuples can't outnumber fetched tuples.
+                */
+               indexSelectivity = costs->indexSelectivity;
 
        /*
         * We can bound the number of tuples by the index size in any case. Also,
@@ -6258,6 +6266,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
        bool            found_is_null_op;
        double          num_sa_scans;
        ListCell   *lc;
+       Selectivity btreeSelectivity;
 
        /*
         * For a btree scan, only leading '=' quals plus inequality quals for the
@@ -6362,19 +6371,23 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
        /*
         * If index is unique and we found an '=' clause for each column, we can
         * just assume numIndexTuples = 1 and skip the expensive
-        * clauselist_selectivity calculations.  However, a ScalarArrayOp or
+        * clauselist_selectivity calculations. However, a ScalarArrayOp or
         * NullTest invalidates that theory, even though it sets eqQualHere.
+        * The value of btreeSelectivity is used as an upper bound on the
+        * selectivity of returned tuples in the genericcostestimate routine.
         */
        if (index->unique &&
                indexcol == index->nkeycolumns - 1 &&
                eqQualHere &&
                !found_saop &&
                !found_is_null_op)
+       {
                numIndexTuples = 1.0;
+               btreeSelectivity = 1. / index->rel->tuples;
+       }
        else
        {
                List       *selectivityQuals;
-               Selectivity btreeSelectivity;
 
                /*
                 * If the index is partial, AND the index predicate with the
@@ -6402,6 +6415,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         */
        MemSet(&costs, 0, sizeof(costs));
        costs.numIndexTuples = numIndexTuples;
+       costs.indexSelectivity = btreeSelectivity;
 
        genericcostestimate(root, path, loop_count, &costs);
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 7524e65142..b90463821f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1602,3 +1602,41 @@ NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table tststats.priv_test_tbl
 drop cascades to view tststats.priv_test_view
 DROP USER regress_stats_user1;
+-- Reproduce the problem of picking a suboptimal index.
+SET enable_bitmapscan = 'off';
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+       SELECT
+               gs % 10 AS x,
+               (gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+               (gs / 100)::int4 AS z
+       FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+       SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+-- Optimizer should still pick the optimal index (without the fix it picks the suboptimal one)
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 906503bd0b..f274f53996 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -877,3 +877,37 @@ DROP FUNCTION op_leak(int, int);
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA tststats CASCADE;
 DROP USER regress_stats_user1;
+
+
+-- Reproduce the problem of picking a suboptimal index.
+SET enable_bitmapscan = 'off';
+
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+       SELECT
+               gs % 10 AS x,
+               (gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+               (gs / 100)::int4 AS z
+       FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+       SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+
+-- Optimizer should still pick the optimal index (without the fix it picks the suboptimal one)
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
-- 
2.33.0
