From 682a1fa6f591fd687cb80c78a0968631e4e0987b Mon Sep 17 00:00:00 2001
From: "Paul A. Jungwirth" <pj@illuminatedcomputing.com>
Date: Sat, 25 Nov 2023 09:12:52 -0800
Subject: [PATCH v1] Use statitics for estimating UNNEST(column) rows

Extends estimate_array_length to consult DECHIST statistics when called
with a Var. This improves planning with array_unnest_support (added in
a391ff3c3d41) and should give more accurate results for other callers
too. DECHIST has the average *distinct* element count which isn't
exactly what we want, but it should still be an improvement over a fixed
10.
---
 src/backend/optimizer/path/costsize.c    |  8 ++---
 src/backend/utils/adt/array_typanalyze.c |  1 +
 src/backend/utils/adt/arrayfuncs.c       |  2 +-
 src/backend/utils/adt/selfuncs.c         | 39 +++++++++++++++++++-----
 src/include/utils/selfuncs.h             |  2 +-
 src/test/regress/expected/arrays.out     | 10 ++++++
 src/test/regress/sql/arrays.sql          |  4 +++
 7 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index d6ceafd51c5..8bcdcc7fd84 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1254,7 +1254,7 @@ cost_tidscan(Path *path, PlannerInfo *root,
 			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
 			Node	   *arraynode = (Node *) lsecond(saop->args);
 
-			ntuples += estimate_array_length(arraynode);
+			ntuples += estimate_array_length(root, arraynode);
 		}
 		else if (IsA(qual, CurrentOfExpr))
 		{
@@ -4741,7 +4741,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 		Node	   *arraynode = (Node *) lsecond(saop->args);
 		QualCost	sacosts;
 		QualCost	hcosts;
-		int			estarraylen = estimate_array_length(arraynode);
+		int			estarraylen = estimate_array_length(context->root, arraynode);
 
 		set_sa_opfuncid(saop);
 		sacosts.startup = sacosts.per_tuple = 0;
@@ -4779,7 +4779,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 			 */
 			context->total.startup += sacosts.startup;
 			context->total.per_tuple += sacosts.per_tuple *
-				estimate_array_length(arraynode) * 0.5;
+				estimate_array_length(context->root, arraynode) * 0.5;
 		}
 	}
 	else if (IsA(node, Aggref) ||
@@ -4830,7 +4830,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 		context->total.startup += perelemcost.startup;
 		if (perelemcost.per_tuple > 0)
 			context->total.per_tuple += perelemcost.per_tuple *
-				estimate_array_length((Node *) acoerce->arg);
+				estimate_array_length(context->root, (Node *) acoerce->arg);
 	}
 	else if (IsA(node, RowCompareExpr))
 	{
diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c
index 04b3764b686..a60d5fb5d24 100644
--- a/src/backend/utils/adt/array_typanalyze.c
+++ b/src/backend/utils/adt/array_typanalyze.c
@@ -604,6 +604,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
 			/*
 			 * Prepare to fill stanumbers with the histogram, followed by the
 			 * average count.  This array must be stored in anl_context.
+			 * We use the average count in estimate_array_length.
 			 */
 			hist = (float4 *)
 				MemoryContextAlloc(stats->anl_context,
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index 631012a0f28..f902deff025 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -6337,7 +6337,7 @@ array_unnest_support(PG_FUNCTION_ARGS)
 			/* We can use estimated argument values here */
 			arg1 = estimate_expression_value(req->root, linitial(args));
 
-			req->rows = estimate_array_length(arg1);
+			req->rows = estimate_array_length(req->root, arg1);
 			ret = (Node *) req;
 		}
 	}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 35c9e3c86fe..807c4e5ba80 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2131,7 +2131,7 @@ scalararraysel(PlannerInfo *root,
  * It's important that this agree with scalararraysel.
  */
 int
-estimate_array_length(Node *arrayexpr)
+estimate_array_length(PlannerInfo *root, Node *arrayexpr)
 {
 	/* look through any binary-compatible relabeling of arrayexpr */
 	arrayexpr = strip_array_coercion(arrayexpr);
@@ -2152,11 +2152,36 @@ estimate_array_length(Node *arrayexpr)
 	{
 		return list_length(((ArrayExpr *) arrayexpr)->elements);
 	}
-	else
+	else if (arrayexpr && IsA(arrayexpr, Var))
 	{
-		/* default guess --- see also scalararraysel */
-		return 10;
+		/*
+		 * It's a column, so use its average elem count.
+		 * This is stored in the last stanumbers element of the DECHIST statistics.
+		 * Actually that is the count of *distinct* elements,
+		 * but this is still better than 10.
+		 */
+		VariableStatData vardata;
+		AttStatsSlot sslot;
+		int nelem = -1;
+
+		examine_variable(root, arrayexpr, 0, &vardata);
+
+		if (HeapTupleIsValid(vardata.statsTuple))
+		{
+			if (get_attstatsslot(&sslot, vardata.statsTuple, STATISTIC_KIND_DECHIST, InvalidOid, ATTSTATSSLOT_NUMBERS))
+			{
+				nelem = sslot.numbers[sslot.nnumbers - 1];
+				free_attstatsslot(&sslot);
+			}
+		}
+		ReleaseVariableStats(vardata);
+
+		if (nelem >= 0)
+			return nelem;
 	}
+
+	/* default guess --- see also scalararraysel */
+	return 10;
 }
 
 /*
@@ -6538,7 +6563,7 @@ genericcostestimate(PlannerInfo *root,
 		if (IsA(rinfo->clause, ScalarArrayOpExpr))
 		{
 			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
-			int			alength = estimate_array_length(lsecond(saop->args));
+			int			alength = estimate_array_length(root, lsecond(saop->args));
 
 			if (alength > 1)
 				num_sa_scans *= alength;
@@ -6818,7 +6843,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 			{
 				ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
 				Node	   *other_operand = (Node *) lsecond(saop->args);
-				int			alength = estimate_array_length(other_operand);
+				int			alength = estimate_array_length(root, other_operand);
 
 				clause_op = saop->opno;
 				found_saop = true;
@@ -7412,7 +7437,7 @@ gincost_scalararrayopexpr(PlannerInfo *root,
 	{
 		counts->exactEntries++;
 		counts->searchEntries++;
-		counts->arrayScans *= estimate_array_length(rightop);
+		counts->arrayScans *= estimate_array_length(root, rightop);
 		return true;
 	}
 
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 2f76c473db4..bf9678157f7 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -200,7 +200,7 @@ extern Selectivity scalararraysel(PlannerInfo *root,
 								  ScalarArrayOpExpr *clause,
 								  bool is_join_clause,
 								  int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
-extern int	estimate_array_length(Node *arrayexpr);
+extern int	estimate_array_length(PlannerInfo *root, Node *arrayexpr);
 extern Selectivity rowcomparesel(PlannerInfo *root,
 								 RowCompareExpr *clause,
 								 int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index 23404982f71..96a0c4a1e34 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2699,3 +2699,13 @@ SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
 ERROR:  sample size must be between 0 and 6
 SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
 ERROR:  sample size must be between 0 and 6
+-- test rowcount estimate of unnest(column)
+analyze arrtest;
+explain select elem from arrtest, unnest(a) x(elem);
+                            QUERY PLAN                             
+-------------------------------------------------------------------
+ Nested Loop  (cost=0.00..1.15 rows=6 width=2)
+   ->  Seq Scan on arrtest  (cost=0.00..1.03 rows=3 width=29)
+   ->  Function Scan on unnest x  (cost=0.00..0.02 rows=2 width=2)
+(3 rows)
+
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index 50aa539fdc1..21507cf8129 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -825,3 +825,7 @@ SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]
 SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
 SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
 SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
+
+-- test rowcount estimate of unnest(column)
+analyze arrtest;
+explain select elem from arrtest, unnest(a) x(elem);
-- 
2.32.0 (Apple Git-132)

