From 99d9072b2fb16e526a1478bee575943045bab786 Mon Sep 17 00:00:00 2001
From: jian he <jian.universality@gmail.com>
Date: Thu, 28 Nov 2024 12:37:22 +0800
Subject: [PATCH v5] Remove useless GROUP BY columns via unique-not-null indexes

In the `remove_useless_groupby_columns` function, use unique indexes whose
columns are all covered by not-null constraints to eliminate unnecessary
GROUP BY columns, instead of relying only on the primary key.  A primary
key is simply treated as one such "unique and not-null" index.  If there
are multiple candidates, use the unique index with the fewest columns, so
that as many GROUP BY columns as possible can be removed.
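
For example (this mirrors the t2 case in the regression tests below), given

    create temp table t2 (a int, b int, c int not null,
                          primary key (a, b), unique(c));

the query

    explain (costs off) select * from t2 group by a,b,c;

is now grouped on "c" alone, since the single-column unique index on the
not-null column "c" guarantees that "a" and "b" are functionally dependent
on it.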

discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar
---
 src/backend/catalog/index.c              | 103 ++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  55 +++++++++---
 src/include/catalog/index.h              |   1 +
 src/test/regress/expected/aggregates.out | 106 +++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql      |  51 +++++++++++
 5 files changed, 305 insertions(+), 11 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..80a1e44167 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,106 @@ RestoreReindexState(const void *reindexstate)
 	/* Note the worker has its own transaction nesting level */
 	reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
+
+/*
+ * get_unique_not_null_attnos
+ *		Returns a List of Bitmapsets with the attribute numbers (offset by
+ *		FirstLowInvalidHeapAttributeNumber) of all valid, non-deferrable,
+ *		non-partial, non-expression unique indexes whose columns are all NOT NULL.
+ */
+List *
+get_unique_not_null_attnos(Oid relid)
+{
+	Bitmapset	*not_null_attnos = NULL;
+	Relation	pg_index;
+	HeapTuple	indexTuple;
+	SysScanDesc scan;
+	ScanKeyData skey;
+	List 		*not_null_cs;
+	List	   *indlist = NIL;
+
+	/* Use CookedConstraints to fetch the rel's not-null constraints. */
+	not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+	if (not_null_cs == NIL)
+		return NIL;
+
+	foreach_ptr(CookedConstraint, cc, not_null_cs)
+		not_null_attnos = bms_add_member(not_null_attnos, cc->attnum - FirstLowInvalidHeapAttributeNumber);
+
+	/* Scan pg_index for unique indexes of the target rel */
+	pg_index = table_open(IndexRelationId, AccessShareLock);
+
+	ScanKeyInit(&skey,
+				Anum_pg_index_indrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+							  NULL, 1, &skey);
+
+	while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+	{
+		Bitmapset  	*unique_notnull_attnos = NULL;
+		Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Datum		adatum;
+		bool		isNull;
+		bool		all_notnull = true;
+		int			numkeys;
+		Datum	   *keys;
+		int			nelems;
+
+		/* we're only interested in unique indexes */
+		if (!index->indisunique)
+			continue;
+
+		/* Skip invalid or deferrable indexes */
+		if (!index->indisvalid || !index->indimmediate)
+			continue;
+
+		/* Skip expression indexes and partial indexes */
+		if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+			!heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+			continue;
+
+		/* Extract the pg_index->indkey int2vector */
+		adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+							  RelationGetDescr(pg_index), &isNull);
+		if (isNull)
+			elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+		deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+								  &keys,
+								  NULL,
+								  &nelems);
+
+		if (nelems != index->indnatts)
+			elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+
+		Assert(nelems >= index->indnkeyatts);
+		numkeys = index->indnkeyatts;
+
+		for (int i = 0; i < numkeys; i++)
+		{
+			int	attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber;
+
+			/* ignore this index if any columns are NULLable */
+			if (!bms_is_member(attno, not_null_attnos))
+			{
+				all_notnull = false;
+				break;
+			}
+
+			unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno);
+		}
+
+		/* if the index has no NULLable columns, add it to the list */
+		if (all_notnull)
+			indlist = lappend(indlist, unique_notnull_attnos);
+	}
+
+	systable_endscan(scan);
+
+	/* TODO sort the indlist by index OID */
+
+	table_close(pg_index, AccessShareLock);
+	return indlist;
+}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762e..1803e5556a 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "catalog/index.h"
 #include "executor/executor.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -2805,8 +2806,10 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	{
 		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
 		Bitmapset  *relattnos;
-		Bitmapset  *pkattnos;
-		Oid			constraintOid;
+		Bitmapset  *best_attnos = NULL;
+		List	   *unique_notnull_attnos;
+		int32		best_num_attnos = PG_INT32_MAX;
+		ListCell   *lc2;
 
 		relid++;
 
@@ -2828,18 +2831,48 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			continue;
 
 		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
+		 * Fetch details about all suitable unique indexes with NOT NULL
+		 * constraints on all columns.
 		 */
-		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
-		if (pkattnos == NULL)
-			continue;
+		unique_notnull_attnos = get_unique_not_null_attnos(rte->relid);
 
 		/*
-		 * If the primary key is a proper subset of relattnos then we have
-		 * some items in the GROUP BY that can be removed.
+		 * Now search each index to check if there are any indexes where the
+		 * indexed columns are a subset of the GROUP BY columns for this
+		 * relation.
 		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		foreach(lc2, unique_notnull_attnos)
+		{
+			Bitmapset  *ind_attnos = lfirst_node(Bitmapset, lc2);
+			int			members;
+
+			/*
+			 * Skip any indexes where the indexed columns aren't a subset of
+			 * the GROUP BY.
+			 */
+			if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1)
+				continue;
+
+			/*
+			 * Record the attribute numbers from the index with the fewest
+			 * columns.  This allows the largest number of columns to be
+			 * removed from the GROUP BY clause.  In the future, we may wish
+			 * to consider using the narrowest set of columns and looking at
+			 * pg_statistic.stawidth as it might be better to use an index
+			 * with, say, two INT4s, rather than, say, one long varlena column.
+			 */
+			members = bms_num_members(ind_attnos);
+
+			if (members < best_num_attnos)
+			{
+				best_attnos = ind_attnos;
+				best_num_attnos = members;
+			}
+		}
+
+		list_free(unique_notnull_attnos);
+
+		if (best_attnos != NULL)
 		{
 			/*
 			 * To easily remember whether we've found anything to do, we don't
@@ -2850,7 +2883,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
 													 (list_length(parse->rtable) + 1));
 
 			/* Remember the attnos of the removable columns */
-			surplusvars[relid] = bms_difference(relattnos, pkattnos);
+			surplusvars[relid] = bms_difference(relattnos, best_attnos);
 		}
 	}
 
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..dacc0f6ebd 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
 
 extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
 
+extern List *get_unique_not_null_attnos(Oid relid);
 
 /*
  * itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..c60615cfbd 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,112 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 --
+-- Test removal of redundant GROUP BY columns using unique not null indexes.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table  t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table  t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+-- Test that a unique index without a NOT NULL constraint is not used.
+explain (costs off) select * from t1 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t1
+(3 rows)
+
+-- use the unique index on c to remove useless GROUP BY columns,
+-- since it has fewer columns than the primary key.
+explain (costs off) select * from t2 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+-- can remove column b, since c has a unique index and is NOT NULL
+explain (costs off) select count(*) from t2 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+-- more than one candidate can be used to remove the other columns;
+-- the optimizer may use either the primary key or the unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indexes with overlapping columns.
+explain (costs off) select * from t4 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t4
+(3 rows)
+
+-- can remove column d from the GROUP BY clause, since we have unique index (b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c, b
+   ->  Seq Scan on t4
+(3 rows)
+
+-- partitioned table case: can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.a, t5.b
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+-- can reduce to a,b; GROUP BY order and duplicate columns do not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.b, t5.a
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+-- only some of the unique index columns are NOT NULL, so it cannot be used to remove columns.
+explain (costs off) select count(*) from t6 group by b,c,a,d;
+       QUERY PLAN        
+-------------------------
+ HashAggregate
+   Group Key: b, c, a, d
+   ->  Seq Scan on t6
+(3 rows)
+
+drop table t1, t2, t3, t4, t5, t6;
+--
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
 create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..b4b16b29a8 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,57 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 
+--
+-- Test removal of redundant GROUP BY columns using unique not null indexes.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table  t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table  t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+
+-- Test that a unique index without a NOT NULL constraint is not used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- use the unique index on c to remove useless GROUP BY columns,
+-- since it has fewer columns than the primary key.
+explain (costs off) select * from t2 group by a,b,c;
+
+-- can remove column b, since c has a unique index and is NOT NULL
+explain (costs off) select count(*) from t2 group by b,c;
+
+-- more than one candidate can be used to remove the other columns;
+-- the optimizer may use either the primary key or the unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indexes with overlapping columns.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+-- can remove column d from the GROUP BY clause, since we have unique index (b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+-- partitioned table case: can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+-- can reduce to a,b; GROUP BY order and duplicate columns do not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+
+-- only some of the unique index columns are NOT NULL, so it cannot be used to remove columns.
+explain (costs off) select count(*) from t6 group by b,c,a,d;
+
+drop table t1, t2, t3, t4, t5, t6;
 --
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
-- 
2.34.1

