Use LIMIT instead of Unique for DISTINCT when all distinct pathkeys are redundant

David Rowley Wed, 12 Oct 2022 02:19:57 -0700

Over on [1], Klint highlights a query with a DISTINCT which is a
little sub-optimal in PostgreSQL.  ISTM that in cases where all
DISTINCT pathkeys have been marked as redundant due to constants
existing in all of the EquivalenceClasses of the DISTINCT columns,
then it looks like it should be okay not to bother using a Unique node
to remove duplicate results.


When all the distinct pathkeys are redundant then there can only be,
at most, 1 single distinct value. There may be many rows with that
value, but we can remove those extra ones with a LIMIT 1 rather than
troubling over needlessly uniquifing them.

This might not be a hugely common case, but; 1) it is very cheap to
detect and 2) the speedups are likely to be *very* good.

With the attached we get:

regression=# explain (analyze, costs off, timing off) SELECT DISTINCT
four,1,2,3 FROM tenk1 WHERE four = 0;
                   QUERY PLAN
-------------------------------------------------
 Limit (actual rows=1 loops=1)
   ->  Seq Scan on tenk1 (actual rows=1 loops=1)
         Filter: (four = 0)
 Planning Time: 0.215 ms
 Execution Time: 0.071 ms

naturally, if we removed the WHERE four = 0, we can't optimise this
plan using this method.

I see no reason why this also can't work for DISTINCT ON too.

regression=# explain (analyze, costs off, timing off) SELECT DISTINCT
ON (four,two) four,two FROM tenk1 WHERE four = 0 order by 1,2;
                        QUERY PLAN
----------------------------------------------------------
 Unique (actual rows=1 loops=1)
   ->  Sort (actual rows=2500 loops=1)
         Sort Key: two
         Sort Method: quicksort  Memory: 175kB
         ->  Seq Scan on tenk1 (actual rows=2500 loops=1)
               Filter: (four = 0)
               Rows Removed by Filter: 7500
 Planning Time: 0.123 ms
 Execution Time: 4.251 ms
(9 rows)

then, of course, if we introduce some column that the pathkey is not
redundant for then we must do the distinct operation as normal.

regression=# explain (analyze, costs off, timing off) SELECT DISTINCT
four,two FROM tenk1 WHERE four = 0 order by 1,2;
                        QUERY PLAN
----------------------------------------------------------
 Sort (actual rows=1 loops=1)
   Sort Key: two
   Sort Method: quicksort  Memory: 25kB
   ->  HashAggregate (actual rows=1 loops=1)
         Group Key: four, two
         Batches: 1  Memory Usage: 24kB
         ->  Seq Scan on tenk1 (actual rows=2500 loops=1)
               Filter: (four = 0)
               Rows Removed by Filter: 7500
 Planning Time: 0.137 ms
 Execution Time: 4.274 ms
(11 rows)

Does this seem like something we'd want to do?

Patch attached.

David

[1] 
https://postgr.es/m/meypr01mb7101cd5da0a07c9de2b74850a4...@meypr01mb7101.ausprd01.prod.outlook.com

diff --git a/src/backend/optimizer/plan/planner.c 
b/src/backend/optimizer/plan/planner.c
index 5d0fd6e072..84856721f3 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4764,8 +4764,6 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo 
*input_rel,
                 * the other.)
                 */
                List       *needed_pathkeys;
-               Path       *path;
-               ListCell   *lc;
 
                if (parse->hasDistinctOn &&
                        list_length(root->distinct_pathkeys) <
@@ -4774,44 +4772,73 @@ create_final_distinct_paths(PlannerInfo *root, 
RelOptInfo *input_rel,
                else
                        needed_pathkeys = root->distinct_pathkeys;
 
-               foreach(lc, input_rel->pathlist)
+               /*
+                * needed_pathkeys may have become empty if all of the pathkeys 
were
+                * determined to be redundant.  If all of the pathkeys are 
redundant
+                * then each DISTINCT target must only allow a single value, 
therefore
+                * all resulting tuples must be identical.  We can uniquify 
these
+                * tuples simply by just taking the first tuple.  All we do 
here is
+                * add a path to do LIMIT 1 on the cheapest input path.
+                *
+                * XXX we could end up with 2 Limit nodes if the query already 
has a
+                * LIMIT clause. Does that matter?
+                */
+               if (needed_pathkeys == NIL)
+               {
+                       Node       *limitCount = (Node *) makeConst(INT8OID, 
-1, InvalidOid,
+                                                                               
                                sizeof(int64),
+                                                                               
                                Int64GetDatum(1), false,
+                                                                               
                                FLOAT8PASSBYVAL);
+
+                       add_path(distinct_rel, (Path *)
+                                        create_limit_path(root, distinct_rel,
+                                                                          
cheapest_input_path, NULL, limitCount,
+                                                                          
LIMIT_OPTION_COUNT, 0, 1));
+               }
+               else
                {
-                       path = (Path *) lfirst(lc);
+                       Path       *path;
+                       ListCell   *lc;
 
-                       if (pathkeys_contained_in(needed_pathkeys, 
path->pathkeys))
+                       foreach(lc, input_rel->pathlist)
                        {
-                               add_path(distinct_rel, (Path *)
-                                                create_upper_unique_path(root, 
distinct_rel,
-                                                                               
                  path,
-                                                                               
                  list_length(root->distinct_pathkeys),
-                                                                               
                  numDistinctRows));
+                               path = (Path *) lfirst(lc);
+
+                               if (pathkeys_contained_in(needed_pathkeys, 
path->pathkeys))
+                               {
+                                       add_path(distinct_rel, (Path *)
+                                                        
create_upper_unique_path(root, distinct_rel,
+                                                                               
                          path,
+                                                                               
                          list_length(root->distinct_pathkeys),
+                                                                               
                          numDistinctRows));
+                               }
                        }
-               }
 
-               /* For explicit-sort case, always use the more rigorous clause 
*/
-               if (list_length(root->distinct_pathkeys) <
-                       list_length(root->sort_pathkeys))
-               {
-                       needed_pathkeys = root->sort_pathkeys;
-                       /* Assert checks that parser didn't mess up... */
-                       Assert(pathkeys_contained_in(root->distinct_pathkeys,
-                                                                               
 needed_pathkeys));
-               }
-               else
-                       needed_pathkeys = root->distinct_pathkeys;
+                       /* For explicit-sort case, always use the more rigorous 
clause */
+                       if (list_length(root->distinct_pathkeys) <
+                               list_length(root->sort_pathkeys))
+                       {
+                               needed_pathkeys = root->sort_pathkeys;
+                               /* Assert checks that parser didn't mess up... 
*/
+                               
Assert(pathkeys_contained_in(root->distinct_pathkeys,
+                                                                               
         needed_pathkeys));
+                       }
+                       else
+                               needed_pathkeys = root->distinct_pathkeys;
 
-               path = cheapest_input_path;
-               if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys))
-                       path = (Path *) create_sort_path(root, distinct_rel,
-                                                                               
         path,
-                                                                               
         needed_pathkeys,
-                                                                               
         -1.0);
+                       path = cheapest_input_path;
+                       if (!pathkeys_contained_in(needed_pathkeys, 
path->pathkeys))
+                               path = (Path *) create_sort_path(root, 
distinct_rel,
+                                                                               
                 path,
+                                                                               
                 needed_pathkeys,
+                                                                               
                 -1.0);
 
-               add_path(distinct_rel, (Path *)
-                                create_upper_unique_path(root, distinct_rel,
-                                                                               
  path,
-                                                                               
  list_length(root->distinct_pathkeys),
-                                                                               
  numDistinctRows));
+                       add_path(distinct_rel, (Path *)
+                                        create_upper_unique_path(root, 
distinct_rel,
+                                                                               
          path,
+                                                                               
          list_length(root->distinct_pathkeys),
+                                                                               
          numDistinctRows));
+               }
        }
 
        /*
diff --git a/src/test/regress/expected/select_distinct.out 
b/src/test/regress/expected/select_distinct.out
index 748419cee0..6ce889d87c 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather;
 RESET min_parallel_table_scan_size;
 RESET parallel_setup_cost;
 RESET parallel_tuple_cost;
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+         QUERY PLAN         
+----------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: (four = 0)
+(3 rows)
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+ four 
+------
+    0
+(1 row)
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+                 QUERY PLAN                  
+---------------------------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: ((two <> 0) AND (four = 0))
+(3 rows)
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+ four 
+------
+(0 rows)
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+         QUERY PLAN         
+----------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: (four = 0)
+(3 rows)
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+ four | ?column? | ?column? | ?column? 
+------+----------+----------+----------
+    0 |        1 |        2 |        3
+(1 row)
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/expected/select_distinct_on.out 
b/src/test/regress/expected/select_distinct_on.out
index 3d98990991..2e38f14adb 100644
--- a/src/test/regress/expected/select_distinct_on.out
+++ b/src/test/regress/expected/select_distinct_on.out
@@ -73,3 +73,27 @@ select distinct on (1) floor(random()) as r, f1 from 
int4_tbl order by 1,2;
  0 | -2147483647
 (1 row)
 
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 AND two = 0 ORDER BY 1,2;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Result
+   ->  Limit
+         ->  Seq Scan on tenk1
+               Filter: ((four = 0) AND (two = 0))
+(4 rows)
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 AND two = 0 ORDER BY 1,2;
+ four | two 
+------+-----
+    0 |   0
+(1 row)
+
diff --git a/src/test/regress/sql/select_distinct.sql 
b/src/test/regress/sql/select_distinct.sql
index f27ff714f8..34020adad1 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size;
 RESET parallel_setup_cost;
 RESET parallel_tuple_cost;
 
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/sql/select_distinct_on.sql 
b/src/test/regress/sql/select_distinct_on.sql
index 0920bd64b9..d5d6de1815 100644
--- a/src/test/regress/sql/select_distinct_on.sql
+++ b/src/test/regress/sql/select_distinct_on.sql
@@ -17,3 +17,17 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two
 
 -- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses
 select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
+
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 AND two = 0 ORDER BY 1,2;
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 AND two = 0 ORDER BY 1,2;

Use LIMIT instead of Unique for DISTINCT when all distinct pathkeys are redundant

Reply via email to