From a591924bcd0eabb4559be07c6f5a2eb3930f222f Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Fri, 29 Mar 2019 15:20:22 +0100
Subject: [PATCH v11 3/3] Reorder by values distribution

---
 src/backend/optimizer/path/pathkeys.c    | 145 +++++++++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  33 ++++++-
 src/backend/utils/misc/guc.c             |  21 ++++-
 src/include/optimizer/paths.h            |  10 +++
 src/test/regress/expected/aggregates.out |  16 ++--
 src/test/regress/expected/stats_ext.out  |  20 ++---
 6 files changed, 225 insertions(+), 20 deletions(-)

diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 350b14b8c7..cee55a3feb 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -327,6 +327,11 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/************************<DEBUG PART>*************************************/
+bool debug_group_by_reorder_by_pathkeys = true;
+bool debug_cheapest_group_by = true;
+/************************</DEBUG PART>************************************/
+
 /*
  * Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function
  * returns new lists,  original GROUP BY lists stay untouched.
@@ -340,6 +345,9 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	ListCell	*key;
 	int			n;
 
+	if (debug_group_by_reorder_by_pathkeys == false)
+		return 0;
+
 	if (pathkeys == NIL || *group_pathkeys == NIL)
 		return 0;
 
@@ -379,6 +387,143 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	return n;
 }
 
+/*
+ * Greedily reorder the tail of the group pathkeys so that keys yielding more
+ * estimated groups come first (meant to speed up sorting). New lists replace
+ * *group_pathkeys and *group_clauses; the first n_preordered keys stay in place.
+ */
+void
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List *target_list,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	struct
+	{
+		PathKey			*pathkey;
+		SortGroupClause	*sgc;
+		Node			*pathkeyExpr;
+	}
+				   *keys, tmp;
+	int				nkeys = list_length(*group_pathkeys) - n_preordered;
+	List		   *pathkeyExprList = NIL,
+				   *new_group_pathkeys = NIL,
+				   *new_group_clauses = NIL;
+	ListCell	   *cell;
+	int				i = 0, n_keys_to_est;
+
+	if (!debug_cheapest_group_by)
+		return;
+
+	if (nkeys < 2)
+		return; /* fewer than two reorderable keys: nothing to do */
+
+	/*
+	 * Nothing to do here, since reordering of group clauses to match ORDER BY
+	 * was already performed in preprocess_groupclause
+	 */
+	if (n_preordered == 0 && root->sort_pathkeys)
+		return;
+
+	keys = palloc(nkeys * sizeof(*keys));
+
+	/*
+	 * Collect pathkey, clause and expression for every key in the list tail
+	 */
+	for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered))
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(cell);
+
+		keys[i].pathkey = pathkey;
+		keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+											  *group_clauses);
+		keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc,
+													   target_list);
+		i++;
+	}
+
+	/*
+	 * Find the cheapest-to-sort order of columns. First pick the column with
+	 * the largest number of groups, then the best pair (whose first column was
+	 * fixed in the previous step), then the best triple, and so on.
+	 */
+	for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++)
+	{
+		ListCell   *tail_cell;
+		int			best_i = 0;
+		double		best_est_num_groups = -1;
+
+		/* expand the list of columns and remember its last cell */
+		pathkeyExprList = lappend(pathkeyExprList, NULL);
+		tail_cell = list_tail(pathkeyExprList);
+
+		/*
+		 * Find the best last column, i.e. the one giving the largest number of
+		 * groups; the preceding columns have already been chosen
+		 */
+		for(i = n_keys_to_est - 1; i < nkeys; i++)
+		{
+			double  est_num_groups;
+
+			lfirst(tail_cell) = keys[i].pathkeyExpr;
+			est_num_groups = estimate_num_groups(root, pathkeyExprList,
+												 nrows, NULL);
+
+			if (est_num_groups > best_est_num_groups)
+			{
+				best_est_num_groups = est_num_groups;
+				best_i = i;
+			}
+		}
+
+		/* Save the best choice: swap it into slot n_keys_to_est - 1 */
+		lfirst(tail_cell) = keys[best_i].pathkeyExpr;
+		if (best_i != n_keys_to_est - 1)
+		{
+			tmp = keys[n_keys_to_est - 1];
+			keys[n_keys_to_est - 1] = keys[best_i];
+			keys[best_i] = tmp;
+		}
+	}
+	list_free(pathkeyExprList);
+
+	/*
+	 * Construct the result lists; the keys array is now ordered to give the
+	 * cheapest sort
+	 */
+	i = 0;
+	foreach(cell, *group_pathkeys)
+	{
+		PathKey	   *pathkey;
+		SortGroupClause *sgc;
+
+		if (i < n_preordered)
+		{
+			pathkey = (PathKey *) lfirst(cell);
+			sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+										  *group_clauses);
+		}
+		else
+		{
+			pathkey = keys[i - n_preordered].pathkey;
+			sgc = keys[i - n_preordered].sgc;
+		}
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+
+		i++;
+	}
+
+	pfree(keys);
+
+	/* Just append the remaining GROUP BY clauses */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 4a27b0dc47..dafa217cb3 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6384,7 +6384,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			bool		is_sorted;
 			List	   *group_pathkeys = root->group_pathkeys,
 					   *group_clauses = parse->groupClause;
-			int			n_preordered_groups;
+			int			n_preordered_groups = 0;
 
 			if (parse->groupingSets)
 			{
@@ -6408,11 +6408,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			{
 				/* Sort the cheapest-total path if it isn't already sorted */
 				if (!is_sorted)
+				{
+					if (!parse->groupingSets)
+						get_cheapest_group_keys_order(root,
+													  path->rows,
+													  extra->targetList,
+													  &group_pathkeys,
+													  &group_clauses,
+													  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				/* Now decide what to stick atop it */
 				if (parse->groupingSets)
@@ -6486,6 +6495,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				{
 					if (path != partially_grouped_rel->cheapest_total_path)
 						continue;
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
@@ -6760,11 +6775,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 			{
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_path(partially_grouped_rel, (Path *)
@@ -6811,11 +6834,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1766e46037..30e95e8cbb 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1949,7 +1949,26 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
-
+/************************<DEBUG OPT GROUP BY>*********************************/
+	{
+		{"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable reorder GROUP BY by pathkeys"),
+			NULL
+		},
+		&debug_group_by_reorder_by_pathkeys,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("find a cheapest order of columns in GROUP BY."),
+			NULL
+		},
+		&debug_cheapest_group_by,
+		true,
+		NULL, NULL, NULL
+	},
+/************************</DEBUG OPT GROUP BY>********************************/
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index de0d076a1a..dd7ad7f330 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -186,6 +186,16 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2);
 extern int group_keys_reorder_by_pathkeys(List *pathkeys,
 										  List **group_pathkeys,
 										  List **group_clauses);
+/************************<DEBUG OPT GROUP BY>*********************************/
+extern bool debug_group_by_reorder_by_pathkeys;
+extern bool debug_cheapest_group_by;
+/************************</DEBUG OPT GROUP BY>********************************/
+extern void get_cheapest_group_keys_order(PlannerInfo *root,
+										  double nrows,
+										  List *target_list,
+										  List **group_pathkeys,
+										  List **group_clauses,
+										  int	n_preordered);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 							   Relids required_outer,
 							   CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 265c996d5e..b285b6921e 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2228,9 +2228,9 @@ SELECT count(*) FROM btg GROUP BY v, p;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p
+   Group Key: p, v
    ->  Sort
-         Sort Key: v, p
+         Sort Key: p, v
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2239,9 +2239,9 @@ SELECT count(*) FROM btg GROUP BY v, p, c;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: v, p, c
+   Group Key: p, v, c
    ->  Sort
-         Sort Key: v, p, c
+         Sort Key: p, v, c
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2261,9 +2261,9 @@ SELECT count(*) FROM btg GROUP BY v, p, d, c;
           QUERY PLAN          
 ------------------------------
  GroupAggregate
-   Group Key: v, p, d, c
+   Group Key: p, v, d, c
    ->  Sort
-         Sort Key: v, p, d, c
+         Sort Key: p, v, d, c
          ->  Seq Scan on btg
 (5 rows)
 
@@ -2318,9 +2318,9 @@ SELECT count(*) FROM btg GROUP BY p, d, e;
          QUERY PLAN          
 -----------------------------
  GroupAggregate
-   Group Key: p, d, e
+   Group Key: p, e, d
    ->  Sort
-         Sort Key: p, d, e
+         Sort Key: p, e, d
          ->  Seq Scan on btg
 (5 rows)
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index b32663459d..c59a0ddda5 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -249,9 +249,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c, d
+   Group Key: a, d, c, b
    ->  Sort
-         Sort Key: a, b, c, d
+         Sort Key: a, d, c, b
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -260,9 +260,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: b, c, d
+   Group Key: b, d, c
    ->  Sort
-         Sort Key: b, c, d
+         Sort Key: b, d, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -286,9 +286,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b
+   Group Key: b, a
    ->  Sort
-         Sort Key: a, b
+         Sort Key: b, a
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -297,9 +297,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c
+   Group Key: b, a, c
    ->  Sort
-         Sort Key: a, b, c
+         Sort Key: b, a, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -308,9 +308,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c, d
+   Group Key: d, b, a, c
    ->  Sort
-         Sort Key: a, b, c, d
+         Sort Key: d, b, a, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
-- 
2.16.4

