Hi,
While designing an improvement for the cost sort model, I discovered
that the query plan can vary if we slightly change the query text
without pushing semantic differences. See the example below:
CREATE TABLE test(x integer, y integer,z text);
INSERT INTO test (x,y) SELECT x, 1 FROM generate_series(1,1000000) AS x;
CREATE INDEX ON test(x);
CREATE INDEX ON test(y);
VACUUM ANALYZE;
SET max_parallel_workers_per_gather = 0;
First query:
EXPLAIN (ANALYZE, TIMING OFF) SELECT count(*) FROM test t1, test t2
WHERE t1.x=t2.y AND t1.y=t2.x GROUP BY t1.x,t1.y;
And the second one - just reverse the left and right sides of
expressions in the WHERE condition:
EXPLAIN (ANALYZE, TIMING OFF) SELECT count(*) FROM test t1, test t2
WHERE t2.y=t1.x AND t2.x=t1.y GROUP BY t1.x,t1.y;
You can see two different plans here:
GroupAggregate (cost=37824.89..37824.96 rows=1 width=16)
Group Key: t1.y, t1.x
-> Incremental Sort (cost=37824.89..37824.94 rows=2 width=8)
Sort Key: t1.y, t1.x
Presorted Key: t1.y
-> Merge Join (cost=0.85..37824.88 rows=1 width=8)
Merge Cond: (t1.y = t2.x)
Join Filter: (t2.y = t1.x)
-> Index Scan using test_y_idx on test t1
-> Index Scan using test_x_idx on test t2
GroupAggregate (cost=37824.89..37824.92 rows=1 width=16)
Group Key: t1.x, t1.y
-> Sort (cost=37824.89..37824.90 rows=1 width=8)
Sort Key: t1.x, t1.y
Sort Method: quicksort Memory: 25kB
-> Merge Join (cost=0.85..37824.88 rows=1 width=8)
Merge Cond: (t1.y = t2.x)
Join Filter: (t2.y = t1.x)
-> Index Scan using test_y_idx on test t1
-> Index Scan using test_x_idx on test t2
Don't mind for now that the best plan is to do IncrementalSort with
presorted key t1.x. Just pay attention to the fact that the plan has
altered without any valuable reason.
The cost_incremental_sort() routine causes such behaviour: it chooses
the expression to estimate the number of groups from the first
EquivalenceClass member that relies on the syntax.
I tried to invent a simple solution to fight this minor case. But the
most clear and straightforward way here is to save a reference to the
expression that triggered the PathKey creation inside the PathKey itself.
See the sketch of the patch in the attachment.
I'm not sure this instability is worth fixing this way, but the
dependence of the optimisation outcome on the query text looks buggy.
--
regards, Andrei Lepikhov
From 8eb9998fe3306005d5067e53fef6a032515ebed5 Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepi...@gmail.com>
Date: Wed, 26 Jun 2024 16:28:12 +0700
Subject: [PATCH] Add a reference to the expression which has triggered
creation of this pathkey.
For the sake of consistency in number of groups estimation in incremental sort
we need to have an information about expression which caused the pathkey. It
doesn't guarantee correct estimation but at least estimation doesn't rely on
a SQL string.
---
contrib/postgres_fdw/postgres_fdw.c | 6 +++--
src/backend/optimizer/path/costsize.c | 28 +++++++++++++++++++----
src/backend/optimizer/path/pathkeys.c | 33 ++++++++++++++++++++++-----
src/include/nodes/pathnodes.h | 2 ++
src/include/optimizer/paths.h | 3 ++-
5 files changed, 59 insertions(+), 13 deletions(-)
diff --git a/contrib/postgres_fdw/postgres_fdw.c
b/contrib/postgres_fdw/postgres_fdw.c
index 0bb9a5ae8f..da26d99fb1 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -976,6 +976,7 @@ get_useful_pathkeys_for_relation(PlannerInfo *root,
RelOptInfo *rel)
foreach(lc, useful_eclass_list)
{
EquivalenceClass *cur_ec = lfirst(lc);
+ EquivalenceMember *em;
PathKey *pathkey;
/* If redundant with what we did above, skip it. */
@@ -988,14 +989,15 @@ get_useful_pathkeys_for_relation(PlannerInfo *root,
RelOptInfo *rel)
continue;
/* If no pushable expression for this rel, skip it. */
- if (find_em_for_rel(root, cur_ec, rel) == NULL)
+ if ((em = find_em_for_rel(root, cur_ec, rel)) == NULL)
continue;
/* Looks like we can generate a pathkey, so let's do it. */
pathkey = make_canonical_pathkey(root, cur_ec,
linitial_oid(cur_ec->ec_opfamilies),
BTLessStrategyNumber,
-
false);
+
false,
+
em->em_expr); /* TODO */
useful_pathkeys_list = lappend(useful_pathkeys_list,
list_make1(pathkey));
}
diff --git a/src/backend/optimizer/path/costsize.c
b/src/backend/optimizer/path/costsize.c
index ee23ed7835..a64f3e5797 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2038,21 +2038,41 @@ cost_incremental_sort(Path *path,
foreach(l, pathkeys)
{
PathKey *key = (PathKey *) lfirst(l);
- EquivalenceMember *member = (EquivalenceMember *)
- linitial(key->pk_eclass->ec_members);
+
+#ifdef USE_ASSERT_CHECKING
+ ListCell *lc;
+
+ /* Be paranoid, but caring about performace don't check on prod
*/
+ foreach(lc, key->pk_eclass->ec_members)
+ {
+ if (find_ec_member_matching_expr(
+ key->pk_eclass,
key->source_expr,
+
pull_varnos(root, (Node *) key->source_expr)))
+ break;
+ }
+ if (!lc)
+ /*
+ * XXX: Can we build PathKey over derived expression
and does it
+ * trigger this error?
+ */
+ elog(ERROR, "PathKey source expression must be a part
of EQ class");
+#endif
+
+ /* Reassure ourselves no one created pathkeys alternative way */
+ Assert(key->source_expr != NULL);
/*
* Check if the expression contains Var with "varno 0" so that
we
* don't call estimate_num_groups in that case.
*/
- if (bms_is_member(0, pull_varnos(root, (Node *)
member->em_expr)))
+ if (bms_is_member(0, pull_varnos(root, (Node *)
key->source_expr)))
{
unknown_varno = true;
break;
}
/* expression not containing any Vars with "varno 0" */
- presortedExprs = lappend(presortedExprs, member->em_expr);
+ presortedExprs = lappend(presortedExprs, key->source_expr);
if (foreach_current_index(l) + 1 >= presorted_keys)
break;
diff --git a/src/backend/optimizer/path/pathkeys.c
b/src/backend/optimizer/path/pathkeys.c
index 416fc4e240..d859e79208 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -54,7 +54,7 @@ static bool right_merge_direction(PlannerInfo *root, PathKey
*pathkey);
PathKey *
make_canonical_pathkey(PlannerInfo *root,
EquivalenceClass *eclass, Oid
opfamily,
- int strategy, bool nulls_first)
+ int strategy, bool nulls_first, Expr
*src_expr)
{
PathKey *pk;
ListCell *lc;
@@ -89,6 +89,7 @@ make_canonical_pathkey(PlannerInfo *root,
pk->pk_opfamily = opfamily;
pk->pk_strategy = strategy;
pk->pk_nulls_first = nulls_first;
+ pk->source_expr = src_expr;
root->canon_pathkeys = lappend(root->canon_pathkeys, pk);
@@ -241,7 +242,7 @@ make_pathkey_from_sortinfo(PlannerInfo *root,
/* And finally we can find or create a PathKey node */
return make_canonical_pathkey(root, eclass, opfamily,
- strategy,
nulls_first);
+ strategy,
nulls_first, expr);
}
/*
@@ -1117,7 +1118,8 @@ convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo
*rel,
outer_ec,
sub_pathkey->pk_opfamily,
sub_pathkey->pk_strategy,
-
sub_pathkey->pk_nulls_first);
+
sub_pathkey->pk_nulls_first,
+
(Expr *) outer_var);
}
}
else
@@ -1199,7 +1201,8 @@ convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo
*rel,
outer_ec,
sub_pathkey->pk_opfamily,
sub_pathkey->pk_strategy,
-
sub_pathkey->pk_nulls_first);
+
sub_pathkey->pk_nulls_first,
+
(Expr *) outer_var);
/* score = # of equivalence peers */
score =
list_length(outer_ec->ec_members) - 1;
/* +1 if it matches the proper
query_pathkeys item */
@@ -1643,6 +1646,7 @@ select_outer_pathkeys_for_merge(PlannerInfo *root,
List *pathkeys = NIL;
int nClauses = list_length(mergeclauses);
EquivalenceClass **ecs;
+ Expr **exprs;
int *scores;
int necs;
ListCell *lc;
@@ -1657,6 +1661,7 @@ select_outer_pathkeys_for_merge(PlannerInfo *root,
* duplicates) and their "popularity" scores.
*/
ecs = (EquivalenceClass **) palloc(nClauses * sizeof(EquivalenceClass
*));
+ exprs = (Expr **) palloc(nClauses * sizeof(Expr *));
scores = (int *) palloc(nClauses * sizeof(int));
necs = 0;
@@ -1666,14 +1671,21 @@ select_outer_pathkeys_for_merge(PlannerInfo *root,
EquivalenceClass *oeclass;
int score;
ListCell *lc2;
+ Expr *expr;
/* get the outer eclass */
update_mergeclause_eclasses(root, rinfo);
if (rinfo->outer_is_left)
+ {
oeclass = rinfo->left_ec;
+ expr = (Expr *) linitial(((OpExpr *)
rinfo->clause)->args);
+ }
else
+ {
oeclass = rinfo->right_ec;
+ expr = (Expr *) lsecond(((OpExpr *)
rinfo->clause)->args);
+ }
/* reject duplicates */
for (j = 0; j < necs; j++)
@@ -1697,6 +1709,8 @@ select_outer_pathkeys_for_merge(PlannerInfo *root,
}
ecs[necs] = oeclass;
+ exprs[necs] = expr;
+
scores[necs] = score;
necs++;
}
@@ -1798,7 +1812,8 @@ select_outer_pathkeys_for_merge(PlannerInfo *root,
ec,
linitial_oid(ec->ec_opfamilies),
BTLessStrategyNumber,
-
false);
+
false,
+
exprs[best_j]);
/* can't be redundant because no duplicate ECs */
Assert(!pathkey_is_redundant(pathkey, pathkeys));
pathkeys = lappend(pathkeys, pathkey);
@@ -1852,18 +1867,23 @@ make_inner_pathkeys_for_merge(PlannerInfo *root,
EquivalenceClass *oeclass;
EquivalenceClass *ieclass;
PathKey *pathkey;
+ Expr *src_expr;
update_mergeclause_eclasses(root, rinfo);
+ Assert(IsA(rinfo->clause, OpExpr) && rinfo->orclause == NULL);
+
if (rinfo->outer_is_left)
{
oeclass = rinfo->left_ec;
ieclass = rinfo->right_ec;
+ src_expr = (Expr *) lsecond(((OpExpr *)
rinfo->clause)->args);
}
else
{
oeclass = rinfo->right_ec;
ieclass = rinfo->left_ec;
+ src_expr = (Expr *) linitial(((OpExpr *)
rinfo->clause)->args);
}
/* outer eclass should match current or next pathkeys */
@@ -1891,7 +1911,8 @@ make_inner_pathkeys_for_merge(PlannerInfo *root,
ieclass,
opathkey->pk_opfamily,
opathkey->pk_strategy,
-
opathkey->pk_nulls_first);
+
opathkey->pk_nulls_first,
+
src_expr);
/*
* Don't generate redundant pathkeys (which can happen if
multiple
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 2ba297c117..7e99725e00 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1469,6 +1469,8 @@ typedef struct PathKey
Oid pk_opfamily; /* btree opfamily defining the
ordering */
int pk_strategy; /* sort direction (ASC or DESC)
*/
bool pk_nulls_first; /* do NULLs come before normal values?
*/
+
+ Expr *source_expr; /* Expression, which triggered this
creation */
} PathKey;
/*
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 5e88c0224a..4b99229ebf 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -264,7 +264,8 @@ extern bool has_useful_pathkeys(PlannerInfo *root,
RelOptInfo *rel);
extern List *append_pathkeys(List *target, List *source);
extern PathKey *make_canonical_pathkey(PlannerInfo *root,
EquivalenceClass *eclass, Oid opfamily,
- int
strategy, bool nulls_first);
+ int
strategy, bool nulls_first,
+ Expr
*src_expr);
extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
List
*live_childrels);
--
2.45.2