From f72c0846a1be1d544f904bde7bae1863436f1556 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Fri, 29 Nov 2024 20:05:07 +1300
Subject: [PATCH v10 2/2] Remove redundant GROUP BY columns using UNIQUE
 indexes

d4c3a156c added support that when the GROUP BY contained all of the
columns belonging to a relations PRIMARY KEY, that all other columns
belonging to that relation could be removed from the GROUP BY clause.
That's possible because all other columns are functionally dependent on
the PRIMARY KEY.

Here we expand on that optimization and allow it to work for any UNIQUE
indexes on the table rather than just the PRIMARY KEY.  This normally
requires that all columns in the index are defined with NOT NULL,
however, we can relax that requirement when the index is defined with
NULLS NOT DISTINCT.

Previously remove_useless_groupby_columns() was being called from
grouping_planner() long before RelOptInfos were built for each
RangeTblEntry.  Here we delay calling remove_useless_groupby_columns()
until just after add_base_rels_to_query(), where RelOptInfos are created
in query_planner().  Doing this allows us to much more easily implement
this as all the relation metadata is then available in RelOptInfo form
and we no longer have to query the catalog tables.

Patch originally by Zhang Mingli and later worked on by jian he, but after
I (David) worked on it, there was very little of the original left.

Author: David Rowley, Zhang Mingli, jian he
Discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0%40Spark
---
 src/backend/optimizer/plan/initsplan.c   | 117 +++++++++++++++++------
 src/backend/optimizer/plan/planmain.c    |   3 +
 src/backend/optimizer/plan/planner.c     |   2 -
 src/backend/optimizer/util/plancat.c     |   1 +
 src/include/nodes/pathnodes.h            |   2 +
 src/test/regress/expected/aggregates.out |  56 +++++++++++
 src/test/regress/sql/aggregates.sql      |  27 ++++++
 7 files changed, 179 insertions(+), 29 deletions(-)

diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index c1c4f9f8b9..ac36b70746 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -400,17 +400,13 @@ add_vars_to_attr_needed(PlannerInfo *root, List *vars,
  *
  * Since some other DBMSes do not allow references to ungrouped columns, it's
  * not unusual to find all columns listed in GROUP BY even though listing the
- * primary-key columns would be sufficient.  Deleting such excess columns
- * avoids redundant sorting work, so it's worth doing.
+ * primary-key columns, or columns of a unique constraint would be sufficient.
+ * Deleting such excess columns avoids redundant sorting or hashing work, so
+ * it's worth doing.
  *
  * Relcache invalidations will ensure that cached plans become invalidated
- * when the underlying index of the pkey constraint is dropped.
- *
- * Currently, we only make use of pkey constraints for this, however, we may
- * wish to take this further in the future and also use unique constraints
- * which have NOT NULL columns.  In that case, plan invalidation will still
- * work since relations will receive a relcache invalidation when a NOT NULL
- * constraint is dropped.
+ * when the underlying supporting indexes are dropped or if a column's NOT
+ * NULL attribute is removed.
  */
 void
 remove_useless_groupby_columns(PlannerInfo *root)
@@ -472,9 +468,10 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	foreach(lc, parse->rtable)
 	{
 		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
+		RelOptInfo *rel;
 		Bitmapset  *relattnos;
-		Bitmapset  *pkattnos;
-		Oid			constraintOid;
+		Bitmapset  *best_keycolumns = NULL;
+		int32		best_nkeycolumns = PG_INT32_MAX;
 
 		relid++;
 
@@ -495,30 +492,96 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		if (bms_membership(relattnos) != BMS_MULTIPLE)
 			continue;
 
-		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
-		 */
-		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
-		if (pkattnos == NULL)
-			continue;
+		rel = root->simple_rel_array[relid];
 
 		/*
-		 * If the primary key is a proper subset of relattnos then we have
-		 * some items in the GROUP BY that can be removed.
+		 * Now search each index to check if there are any indexes where the
+		 * indexed columns are a subset of the GROUP BY columns for this
+		 * relation.
 		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		foreach_node(IndexOptInfo, index, rel->indexlist)
+		{
+			Bitmapset  *ind_attnos;
+			bool		nulls_check_ok;
+
+			/*
+			 * Skip any non-unique and deferrable indexes.  Predicate indexes
+			 * have not been checked yet, so we must skip those too as the
+			 * predOK check might fail.
+			 */
+			if (!index->unique || !index->immediate || index->indpred != NIL)
+				continue;
+
+			/* For simplicity we currently don't support expression indexes */
+			if (index->indexprs != NIL)
+				continue;
+
+			ind_attnos = NULL;
+			nulls_check_ok = true;
+			for (int i = 0; i < index->nkeycolumns; i++)
+			{
+				/*
+				 * We must insist that the index columns are all defined NOT
+				 * NULL otherwise duplicate NULLs could exists.  However, we
+				 * can relax this check when the index is defined with NULLS
+				 * NOT DISTINCT as there can only be 1 NULL row, therefore
+				 * functional dependency on the unique columns is maintained,
+				 * despite the NULL.
+				 */
+				if (!index->nullsnotdistinct &&
+					!bms_is_member(index->indexkeys[i],
+								   rel->notnullattnums))
+				{
+					nulls_check_ok = false;
+					break;
+				}
+
+				ind_attnos =
+					bms_add_member(ind_attnos,
+								   index->indexkeys[i] -
+								   FirstLowInvalidHeapAttributeNumber);
+			}
+
+			if (!nulls_check_ok)
+				continue;
+
+			/*
+			 * Skip any indexes where the indexed columns aren't a subset of
+			 * the GROUP BY.
+			 */
+			if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1)
+				continue;
+
+			/*
+			 * Record the attribute numbers from the index with the fewest
+			 * columns.  This allows the largest number of columns to be
+			 * removed from the GROUP BY clause.  In the future, we may wish
+			 * to consider using the narrowest set of columns and looking at
+			 * pg_statistic.stawidth as it might be better to use an index
+			 * with, say two INT4s, rather than, say, one long varlena column.
+			 */
+			if (index->nkeycolumns < best_nkeycolumns)
+			{
+				best_keycolumns = ind_attnos;
+				best_nkeycolumns = index->nkeycolumns;
+			}
+		}
+
+		/* Did we find a suitable index? */
+		if (best_keycolumns != NULL)
 		{
 			/*
 			 * To easily remember whether we've found anything to do, we don't
 			 * allocate the surplusvars[] array until we find something.
 			 */
 			if (surplusvars == NULL)
-				surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
-													 (list_length(parse->rtable) + 1));
+				surplusvars =
+					(Bitmapset **) palloc0(sizeof(Bitmapset *) *
+										   (list_length(parse->rtable) +
+											1));
 
 			/* Remember the attnos of the removable columns */
-			surplusvars[relid] = bms_difference(relattnos, pkattnos);
+			surplusvars[relid] = bms_difference(relattnos, best_keycolumns);
 		}
 	}
 
@@ -541,9 +604,9 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			 * New list must include non-Vars, outer Vars, and anything not
 			 * marked as surplus.
 			 */
-			if (!IsA(var, Var) ||
-				var->varlevelsup > 0 ||
-				!bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber,
+			if (!IsA(var, Var) || var->varlevelsup > 0 ||
+				!bms_is_member(var->varattno -
+							   FirstLowInvalidHeapAttributeNumber,
 							   surplusvars[var->varno]))
 				new_groupby = lappend(new_groupby, sgc);
 		}
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index e17d31a5c3..735560e8ca 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -169,6 +169,9 @@ query_planner(PlannerInfo *root,
 	 */
 	add_base_rels_to_query(root, (Node *) parse->jointree);
 
+	/* Remove any redundant GROUP BY columns */
+	remove_useless_groupby_columns(root);
+
 	/*
 	 * Examine the targetlist and join tree, adding entries to baserel
 	 * targetlists for all referenced Vars, and generating PlaceHolderInfo
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 84d9e99b0d..a0a2de7ee4 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1485,8 +1485,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 		{
 			/* Preprocess regular GROUP BY clause, if any */
 			root->processed_groupClause = preprocess_groupclause(root, NIL);
-			/* Remove any redundant GROUP BY columns */
-			remove_useless_groupby_columns(root);
 		}
 
 		/*
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 37b0ca2e43..153390f2dc 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -457,6 +457,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
 			info->predOK = false;	/* set later, in indxpath.c */
 			info->unique = index->indisunique;
+			info->nullsnotdistinct = index->indnullsnotdistinct;
 			info->immediate = index->indimmediate;
 			info->hypothetical = false;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index add0f9e45f..0759e00e96 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1187,6 +1187,8 @@ struct IndexOptInfo
 	bool		predOK;
 	/* true if a unique index */
 	bool		unique;
+	/* true if the index was defined with NULLS NOT DISTINCT */
+	bool		nullsnotdistinct;
 	/* is uniqueness enforced immediately? */
 	bool		immediate;
 	/* true if index doesn't really exist */
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..27ea09732f 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1448,6 +1448,62 @@ explain (costs off) select * from p_t1 group by a,b,c,d;
          ->  Seq Scan on p_t1_2
 (5 rows)
 
+create unique index t3_c_uidx on t3(c);
+-- Ensure we don't remove any columns from the GROUP BY for a unique
+-- index on a NULLable column.
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: b, c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Make the column NOT NULL and ensure we remove the redundant column
+alter table t3 alter column c set not null;
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- When there are multiple supporting unique indexes and the GROUP BY contains
+-- columns to cover all of those, ensure we pick the index with the least
+-- number of columns so that we can remove more columns from the GROUP BY.
+explain (costs off) select a,b,c from t3 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- As above but try ordering the columns differently to ensure we get the
+-- same result.
+explain (costs off) select a,b,c from t3 group by c,a,b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT
+-- NULL constraint on the indexed columns.  Ensure the redundant columns are
+-- removed from the GROUP BY for such a table.
+drop index t3_c_uidx;
+alter table t3 alter column c drop not null;
+create unique index t3_c_uidx on t3(c) nulls not distinct;
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
 drop table t1 cascade;
 NOTICE:  drop cascades to table t1c
 drop table t2;
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..4e23fb4f38 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -507,6 +507,33 @@ create temp table p_t1_2 partition of p_t1 for values in(2);
 -- Ensure we can remove non-PK columns for partitioned tables.
 explain (costs off) select * from p_t1 group by a,b,c,d;
 
+create unique index t3_c_uidx on t3(c);
+
+-- Ensure we don't remove any columns from the GROUP BY for a unique
+-- index on a NULLable column.
+explain (costs off) select b,c from t3 group by b,c;
+
+-- Make the column NOT NULL and ensure we remove the redundant column
+alter table t3 alter column c set not null;
+explain (costs off) select b,c from t3 group by b,c;
+
+-- When there are multiple supporting unique indexes and the GROUP BY contains
+-- columns to cover all of those, ensure we pick the index with the least
+-- number of columns so that we can remove more columns from the GROUP BY.
+explain (costs off) select a,b,c from t3 group by a,b,c;
+
+-- As above but try ordering the columns differently to ensure we get the
+-- same result.
+explain (costs off) select a,b,c from t3 group by c,a,b;
+
+-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT
+-- NULL constraint on the indexed columns.  Ensure the redundant columns are
+-- removed from the GROUP BY for such a table.
+drop index t3_c_uidx;
+alter table t3 alter column c drop not null;
+create unique index t3_c_uidx on t3(c) nulls not distinct;
+explain (costs off) select b,c from t3 group by b,c;
+
 drop table t1 cascade;
 drop table t2;
 drop table t3;
-- 
2.34.1