Hi hackers,
There is a join optimization we don't do -- removing inner join of a
table with itself on a unique column. Such joins are generated by
various ORMs, so from time to time our customers ask us to look into
this. Most recently, it was discussed on the list in relation to an
article comparing the optimizations that some DBMS make [1].
I started to explore what can be done about this. Attached is a proof of
concept patch. It works for some simple cases:
create table tt(a int primary key, b text);
explain select p.* from tt p join (select * from tt where b ~~ 'a%') q
on p.a = q.a;
QUERY PLAN
──────────────────────────────────────────────────────
Seq Scan on tt p (cost=0.00..25.88 rows=6 width=36)
Filter: (b ~~ 'a%'::text)
It also works for semi-joins like `explain select p.* from tt p where
exists (select * from tt where b ~~ 'a%' and a = p.a);`. This requires a
preparatory step of reducing unique semi joins to inner joins, and we
already do this (reduce_unique_semijoin).
What this patch tries to do is to remove these inner joins when a single
join is being planned (populate_joinrel_with_paths). The main entry
point is reduce_self_unique_join. First, it proves that both input
relations are uniquely constrained by the same index given the
particular join clauses. We already have a way to find such indexes
(relation_has_unique_index_for), so I was able to reuse this. What I'm
not sure about is how to properly remove the join after that. For now, I
just pretend that the join relation being built is the outer baserel,
add to it the restrictions from the inner relation, and then plan it as
usual. Maybe there is a less hacky way to do it? I've seen elsewhere a
suggestion to use an AppendPath for a similar purpose, but here we can't
just use the outer relation we've already planned because the
restriction list is different.
I'd be glad to hear your thoughts on this.
[1]
https://www.postgresql.org/message-id/flat/CAMjNa7cC4X9YR-vAJS-jSYCajhRDvJQnN7m2sLH1wLh-_Z2bsw%40mail.gmail.com#camjna7cc4x9yr-vajs-jsycajhrdvjqnn7m2slh1wlh-_z2...@mail.gmail.com
--
Alexander Kuzmenkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 477b9f7..334793c 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -76,13 +76,9 @@ static void set_rel_size(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
static void set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
-static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel,
- RangeTblEntry *rte);
static void create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel);
static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
RangeTblEntry *rte);
-static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
- RangeTblEntry *rte);
static void set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel,
RangeTblEntry *rte);
static void set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
@@ -366,7 +362,7 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel,
else
{
/* Plain relation */
- set_plain_rel_size(root, rel, rte);
+ set_plain_rel_size(root, rel);
}
break;
case RTE_SUBQUERY:
@@ -449,7 +445,7 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
else
{
/* Plain relation */
- set_plain_rel_pathlist(root, rel, rte);
+ set_plain_rel_pathlist(root, rel);
}
break;
case RTE_SUBQUERY:
@@ -516,8 +512,8 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
* set_plain_rel_size
* Set size estimates for a plain relation (no subquery, no inheritance)
*/
-static void
-set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+void
+set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel)
{
/*
* Test any partial indexes of rel for applicability. We must do this
@@ -691,8 +687,8 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
* set_plain_rel_pathlist
* Build access paths for a plain relation (no subquery, no inheritance)
*/
-static void
-set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+void
+set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel)
{
Relids required_outer;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index f295558..d79b3c7 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -2961,7 +2961,7 @@ ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel,
}
/*
- * relation_has_unique_index_for
+ * relation_get_unique_index_for
* Determine whether the relation provably has at most one row satisfying
* a set of equality conditions, because the conditions constrain all
* columns of some unique index.
@@ -2982,8 +2982,8 @@ ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel,
* this routine automatically adds in any usable baserestrictinfo clauses.
* (Note that the passed-in restrictlist will be destructively modified!)
*/
-bool
-relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+IndexOptInfo *
+relation_get_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
List *restrictlist,
List *exprlist, List *oprlist)
{
@@ -2993,7 +2993,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
/* Short-circuit if no indexes... */
if (rel->indexlist == NIL)
- return false;
+ return NULL;
/*
* Examine the rel's restriction clauses for usable var = const clauses
@@ -3034,7 +3034,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
/* Short-circuit the easy case */
if (restrictlist == NIL && exprlist == NIL)
- return false;
+ return NULL;
/* Examine each index of the relation ... */
foreach(ic, rel->indexlist)
@@ -3131,10 +3131,10 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
/* Matched all columns of this index? */
if (c == ind->ncolumns)
- return true;
+ return ind;
}
- return false;
+ return NULL;
}
/*
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index 7008e13..d57ac82 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -15,11 +15,15 @@
#include "postgres.h"
#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/joininfo.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
+#include "optimizer/plancat.h"
+#include "optimizer/predtest.h"
#include "optimizer/prep.h"
+#include "optimizer/restrictinfo.h"
#include "partitioning/partbounds.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
@@ -747,6 +751,240 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2)
return joinrel;
}
+static bool
+has_relation_reference_walker(Node *node, void *context)
+{
+ if (node == NULL)
+ return false;
+
+ if (IsA(node, Var))
+ return ((Var *) node)->varno == (Index) (intptr_t) context;
+
+ return expression_tree_walker(node, has_relation_reference_walker, context);
+}
+
+static bool
+has_relation_reference(PathTarget *target, Index relid)
+{
+ return has_relation_reference_walker((Node *) target->exprs,
+ (void *) (intptr_t) relid);
+}
+
+static bool
+set_varno_walker(Node *node, void *context)
+{
+ if (node == NULL)
+ return false;
+
+ if (IsA(node, Var))
+ ((Var *) node)->varno = (Index) (intptr_t) context;
+
+ return expression_tree_walker(node, set_varno_walker, context);
+}
+
+static void
+set_varno(Expr *target, Index relid)
+{
+ set_varno_walker((Node *) target, (void*) (intptr_t) relid);
+}
+
+static inline bool
+clause_sides_match_join(RestrictInfo *rinfo, RelOptInfo *outerrel,
+ RelOptInfo *innerrel)
+{
+ if (bms_is_subset(rinfo->left_relids, outerrel->relids) &&
+ bms_is_subset(rinfo->right_relids, innerrel->relids))
+ {
+ /* lefthand side is outer */
+ rinfo->outer_is_left = true;
+ return true;
+ }
+ else if (bms_is_subset(rinfo->left_relids, innerrel->relids) &&
+ bms_is_subset(rinfo->right_relids, outerrel->relids))
+ {
+ /* righthand side is outer */
+ rinfo->outer_is_left = false;
+ return true;
+ }
+ return false; /* no good for these input relations */
+}
+
+static void
+init_simple_rel(PlannerInfo *root, RelOptInfo *rel, Index relid)
+{
+ RangeTblEntry *rte;
+
+ /* Fetch RTE for relation */
+ rte = root->simple_rte_array[relid];
+ Assert(rte != NULL);
+
+ rel->reloptkind = RELOPT_BASEREL;
+ rel->rtekind = rte->rtekind;
+ rel->rows = 0;
+ rel->relid = relid;
+
+ /* min_attr, max_attr, attr_needed, attr_widths are set below */
+
+ /* Check type of rtable entry */
+ switch (rte->rtekind)
+ {
+ case RTE_RELATION:
+ /* Table --- retrieve statistics from the system catalogs */
+ get_relation_info(root, rte->relid, rte->inh, rel);
+ break;
+ case RTE_SUBQUERY:
+ case RTE_FUNCTION:
+ case RTE_TABLEFUNC:
+ case RTE_VALUES:
+ case RTE_CTE:
+ case RTE_NAMEDTUPLESTORE:
+
+ /*
+ * Subquery, function, tablefunc, values list, CTE, or ENR --- set
+ * up attr range and arrays
+ *
+ * Note: 0 is included in range to support whole-row Vars
+ */
+ rel->min_attr = 0;
+ rel->max_attr = list_length(rte->eref->colnames);
+ rel->attr_needed = (Relids *)
+ palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids));
+ rel->attr_widths = (int32 *)
+ palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32));
+ break;
+ default:
+ elog(ERROR, "unrecognized RTE kind: %d",
+ (int) rte->rtekind);
+ break;
+ }
+}
+
+static IndexOptInfo *
+get_inner_unique_index(PlannerInfo *root, RelOptInfo *innerRel,
+ RelOptInfo *outerRel, List *restrictlist)
+{
+ ListCell *lc;
+ List *rl = list_copy(restrictlist);
+ foreach(lc, rl)
+ {
+ if (!clause_sides_match_join((RestrictInfo *) lfirst(lc),
+ outerRel, innerRel))
+ return NULL;
+ }
+
+ return relation_get_unique_index_for(root, innerRel, rl, 0, 0);
+}
+
+/*
+ * This function optimizes out an inner join of a relation to itself on a
+ * unique column. When the inner relation is not referenced by the targetlist,
+ * we can transfer the filters from the inner relation to the outer one, and
+ * scan it instead of joining.
+ */
+static bool
+reduce_self_unique_join(PlannerInfo *root, RelOptInfo *outerrel,
+ RelOptInfo *innerrel, RelOptInfo *joinrel,
+ List *restrictlist)
+{
+ List *mergedRinfos;
+ ListCell *lc;
+ IndexOptInfo *outeridx, *inneridx;
+ Bitmapset *oldRelids = joinrel->relids;
+
+ outeridx = get_inner_unique_index(root, outerrel, innerrel, restrictlist);
+ if (!outeridx)
+ return false;
+
+ /*
+ * Join can be reduced only if the unique constraint is immediate.
+ * Otherwise, the constraint does not always hold inside a transaction.
+ */
+ if (!outeridx->immediate)
+ return false;
+
+ inneridx = get_inner_unique_index(root, innerrel, outerrel, restrictlist);
+ if (!inneridx)
+ return false;
+
+ /* We must have the same unique index for both relations. */
+ if (outeridx->indexoid != inneridx->indexoid)
+ return false;
+
+ /* A sanity check: this is the same index on the same relation. */
+ Assert(root->simple_rte_array[outerrel->relid]->relid
+ == root->simple_rte_array[innerrel->relid]->relid);
+
+ /*
+ * If some columns of the inner relation are in the target list,
+ * we have to keep it.
+ */
+ if (has_relation_reference(joinrel->reltarget, innerrel->relid))
+ return false;
+
+ /*
+ * All the necessary conditions hold and we can change the join
+ * into a scan on the outer relation. First, append the filters
+ * from inner relation to the outer.
+ */
+ mergedRinfos = list_copy(outerrel->baserestrictinfo);
+ foreach(lc, innerrel->baserestrictinfo)
+ {
+ RestrictInfo *oldRinfo = lfirst_node(RestrictInfo, lc);
+ RestrictInfo *newRinfo;
+ Expr *newClause;
+
+ if (is_redundant_derived_clause(oldRinfo, mergedRinfos))
+ continue; /* derived from same EquivalenceClass */
+
+ newClause = copyObject(oldRinfo->clause);
+ set_varno(newClause, outerrel->relid);
+
+ /*
+ * !!!FIXME This check doesn't work, because predicate_implied_by
+ * uses equal() to compare Var nodes. For the Vars transfered from
+ * another relation, some parameters such as varoattno and location
+ * are different. They are not meaningful for the purposes of
+ * Maybe just don't compare them?
+ */
+ if (!contain_mutable_functions((Node *) newClause)
+ && predicate_implied_by(list_make1(newClause), mergedRinfos,
+ /* weak = */ false ))
+ continue; /* provably implied by r1 */
+
+ /*
+ * Make a new rinfo instead of copying, because it has quite a few
+ * internal fields that depend on relid.
+ */
+ newRinfo = make_restrictinfo(newClause,
+ oldRinfo->is_pushed_down,
+ oldRinfo->outerjoin_delayed,
+ oldRinfo->pseudoconstant,
+ oldRinfo->security_level,
+ bms_make_singleton(outerrel->relid) /* required relids */,
+ NULL /* outer relids */,
+ oldRinfo->nullable_relids);
+ mergedRinfos = lappend(mergedRinfos, newRinfo);
+ }
+
+ /*
+ * Pretend that the join relation is a base relation, and
+ * plan it as usual.
+ */
+ init_simple_rel(root, joinrel, outerrel->relid);
+
+ joinrel->baserestrictinfo = mergedRinfos;
+ joinrel->relids = bms_make_singleton(joinrel->relid);
+
+ set_plain_rel_size(root, joinrel);
+ set_plain_rel_pathlist(root, joinrel);
+
+ joinrel->relids = oldRelids;
+
+ elog(LOG, "Removed self join.");
+
+ return true;
+}
+
/*
* populate_joinrel_with_paths
* Add paths to the given joinrel for given pair of joining relations. The
@@ -786,6 +1024,10 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1,
mark_dummy_rel(joinrel);
break;
}
+
+ if (reduce_self_unique_join(root, rel1, rel2, joinrel, restrictlist))
+ break;
+
add_paths_to_joinrel(root, joinrel, rel1, rel2,
JOIN_INNER, sjinfo,
restrictlist);
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 0e73f9c..128af10 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -601,7 +601,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
* reference to unique indexes. Make sure there's at least one
* suitable unique index. It must be immediately enforced, and if
* it's a partial index, it must match the query. (Keep these
- * conditions in sync with relation_has_unique_index_for!)
+ * conditions in sync with relation_get_unique_index_for!)
*/
ListCell *lc;
@@ -658,10 +658,10 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list)
{
/*
* Examine the indexes to see if we have a matching unique index.
- * relation_has_unique_index_for automatically adds any usable
+ * relation_get_unique_index_for automatically adds any usable
* restriction clauses for the rel, so we needn't do that here.
*/
- if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL))
+ if (relation_get_unique_index_for(root, rel, clause_list, NIL, NIL))
return true;
}
else if (rel->rtekind == RTE_SUBQUERY)
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 0317763..bc9741a 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -4593,9 +4593,17 @@ fix_indexqual_references(PlannerInfo *root, IndexPath *index_path)
* Check to see if the indexkey is on the right; if so, commute
* the clause. The indexkey should be the side that refers to
* (only) the base relation.
+ *
+ * index->rel->relids can be a multi-bit set, so the check
+ * cannot be simplified to a plain comparison. The multi-bit
+ * relids are produced by the unique inner join removal, see
+ * reduce_self_unique_join().
*/
- if (!bms_equal(rinfo->left_relids, index->rel->relids))
+ if (!bms_is_empty(rinfo->right_relids)
+ && bms_is_subset(rinfo->right_relids, index->rel->relids))
+ {
CommuteOpExpr(op);
+ }
/*
* Now replace the indexkey expression with an index Var.
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e190ad4..c65474f 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1583,7 +1583,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
* clauses for the rel, as well.
*/
if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree &&
- relation_has_unique_index_for(root, rel, NIL,
+ relation_get_unique_index_for(root, rel, NIL,
sjinfo->semi_rhs_exprs,
sjinfo->semi_operators))
{
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index cafde30..d2f8bcb 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -50,6 +50,8 @@ extern PGDLLIMPORT join_search_hook_type join_search_hook;
extern RelOptInfo *make_one_rel(PlannerInfo *root, List *joinlist);
extern void set_dummy_rel_pathlist(RelOptInfo *rel);
+extern void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel);
+extern void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel);
extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed,
List *initial_rels);
@@ -71,7 +73,7 @@ extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel);
* routines to generate index paths
*/
extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel);
-extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+extern IndexOptInfo *relation_get_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
List *restrictlist,
List *exprlist, List *oprlist);
extern bool indexcol_is_bool_constant_for_query(IndexOptInfo *index,
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index cbc882d..7f3691f 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -2918,6 +2918,65 @@ where nt3.id = 1 and ss2.b3;
(1 row)
--
+-- test removal of semi or inner joins on unique columns
+-- !!!FIXME check coverage
+--
+analyze nt1;
+explain (costs off)
+select a.* from nt1 a
+join (select * from nt1 where a1) b on a.id = b.id;
+ QUERY PLAN
+-------------------
+ Seq Scan on nt1 a
+ Filter: a1
+(2 rows)
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id) and id < 2;
+ QUERY PLAN
+--------------------
+ Seq Scan on nt1 a
+ Filter: (id < 2)
+(2 rows)
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and id < 2);
+ QUERY PLAN
+--------------------
+ Seq Scan on nt1 a
+ Filter: (id < 2)
+(2 rows)
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id) and a1;
+ QUERY PLAN
+-------------------
+ Seq Scan on nt1 a
+ Filter: a1
+(2 rows)
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and a1);
+ QUERY PLAN
+-------------------
+ Seq Scan on nt1 a
+ Filter: a1
+(2 rows)
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and a1) and id < 2;
+ QUERY PLAN
+-----------------------------
+ Seq Scan on nt1 a
+ Filter: (a1 AND (id < 2))
+(2 rows)
+
+--
-- test case where a PlaceHolderVar is propagated into a subquery
--
explain (costs off)
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 86c6d5b..1fc6a02 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -851,6 +851,36 @@ from nt3 as nt3
where nt3.id = 1 and ss2.b3;
--
+-- test removal of semi or inner joins on unique columns
+-- !!!FIXME check coverage
+--
+analyze nt1;
+
+explain (costs off)
+select a.* from nt1 a
+join (select * from nt1 where a1) b on a.id = b.id;
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id) and id < 2;
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and id < 2);
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id) and a1;
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and a1);
+
+explain (costs off)
+select a.* from nt1 a
+where exists (select * from nt1 where id = a.id and a1) and id < 2;
+
+--
-- test case where a PlaceHolderVar is propagated into a subquery
--