A user complained on pgsql-performance that SELECT col FROM table
GROUP BY col LIMIT 2; performs a full table scan. ISTM that it's safe
to return tuples from a hashed aggregate as they are found when no
aggregate functions are in use. Attached is a first shot at that. The
planner is modified so that, when the optimization applies, the hash
table size check only counts the groups needed to satisfy the limit
and the startup cost is taken from the input path. The executor is
modified so that, when the optimization applies and the hash table is
not yet filled, tuples for new groups are returned immediately.
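
A rough way to reproduce the behavior (table and data here are made
up for illustration, not taken from the original report):

    CREATE TABLE t (col int, payload text);
    INSERT INTO t
        SELECT i % 1000, repeat('x', 100)
        FROM generate_series(1, 1000000) i;
    ANALYZE t;

    -- Unpatched, the HashAggregate has to consume its whole input
    -- before the Limit sees a single row; with the patch it can
    -- return each new group as soon as it is found.
    EXPLAIN ANALYZE SELECT col FROM t GROUP BY col LIMIT 2;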

Can somebody poke holes in this? The patch definitely needs some code
cleanup in nodeAgg.c, but otherwise it passes regression tests and
seems to work as intended. It also optimizes the SELECT DISTINCT col
FROM table LIMIT 2; case, but not SELECT DISTINCT ON (col) col FROM
table LIMIT 2; because DISTINCT ON is explicitly forced to use sorted
aggregation.
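
To summarize the query shapes (again using the made-up table above):

    -- covered: hashed grouping with no aggregate functions
    SELECT col FROM t GROUP BY col LIMIT 2;

    -- also covered: plain DISTINCT is planned as hashed grouping too
    SELECT DISTINCT col FROM t LIMIT 2;

    -- not covered: DISTINCT ON is forced to use sorted aggregation,
    -- so it never reaches the new hashed path
    SELECT DISTINCT ON (col) col FROM t LIMIT 2;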

Ants Aasma
-- 
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index c9aa921..53cdd09 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -274,9 +274,10 @@ static Bitmapset *find_unaggregated_cols(AggState *aggstate);
 static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos);
 static void build_hash_table(AggState *aggstate);
 static AggHashEntry lookup_hash_entry(AggState *aggstate,
-				  TupleTableSlot *inputslot);
+				  TupleTableSlot *inputslot, bool *isnew);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_fill_hash_and_retrieve(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
 
@@ -920,12 +921,11 @@ hash_agg_entry_size(int numAggs)
  * When called, CurrentMemoryContext should be the per-query context.
  */
 static AggHashEntry
-lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
+lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot, bool *isnew)
 {
 	TupleTableSlot *hashslot = aggstate->hashslot;
 	ListCell   *l;
 	AggHashEntry entry;
-	bool		isnew;
 
 	/* if first time through, initialize hashslot by cloning input slot */
 	if (hashslot->tts_tupleDescriptor == NULL)
@@ -948,9 +948,9 @@ lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
 	/* find or create the hashtable entry using the filtered tuple */
 	entry = (AggHashEntry) LookupTupleHashEntry(aggstate->hashtable,
 												hashslot,
-												&isnew);
+												isnew);
 
-	if (isnew)
+	if (*isnew)
 	{
 		/* initialize aggregates for new tuple group */
 		initialize_aggregates(aggstate, aggstate->peragg, entry->pergroup);
@@ -1004,7 +1004,12 @@ ExecAgg(AggState *node)
 	if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
 	{
 		if (!node->table_filled)
-			agg_fill_hash_table(node);
+		{
+			if (node->numaggs)
+				agg_fill_hash_table(node);
+			else
+				return agg_fill_hash_and_retrieve(node);
+		}
 		return agg_retrieve_hash_table(node);
 	}
 	else
@@ -1222,6 +1227,7 @@ agg_fill_hash_table(AggState *aggstate)
 	ExprContext *tmpcontext;
 	AggHashEntry entry;
 	TupleTableSlot *outerslot;
+	bool 		isnew;
 
 	/*
 	 * get state info from node
@@ -1243,7 +1249,7 @@ agg_fill_hash_table(AggState *aggstate)
 		tmpcontext->ecxt_outertuple = outerslot;
 
 		/* Find or build hashtable entry for this tuple's group */
-		entry = lookup_hash_entry(aggstate, outerslot);
+		entry = lookup_hash_entry(aggstate, outerslot, &isnew);
 
 		/* Advance the aggregates */
 		advance_aggregates(aggstate, entry->pergroup);
@@ -1258,6 +1264,101 @@ agg_fill_hash_table(AggState *aggstate)
 }
 
 /*
+ * ExecAgg for hashed case: phase 1, read input and build hash table,
+ * returning tuples for new groups immediately as they are found.
+ */
+static TupleTableSlot *
+agg_fill_hash_and_retrieve(AggState *aggstate)
+{
+	PlanState  *outerPlan;
+	ExprContext *tmpcontext;
+	AggHashEntry entry;
+	TupleTableSlot *outerslot;
+	bool		isnew;
+	ExprContext *econtext;
+	TupleTableSlot *firstSlot;
+
+	/*
+	 * get state info from node
+	 */
+	outerPlan = outerPlanState(aggstate);
+	/* tmpcontext is the per-input-tuple expression context */
+	tmpcontext = aggstate->tmpcontext;
+
+	econtext = aggstate->ss.ps.ps_ExprContext;
+	firstSlot = aggstate->ss.ss_ScanTupleSlot;
+
+	Assert(aggstate->numaggs == 0);
+
+	/*
+	 * Process each outer-plan tuple, and then fetch the next one, until we
+	 * exhaust the outer plan.
+	 */
+	for (;;)
+	{
+		outerslot = ExecProcNode(outerPlan);
+		if (TupIsNull(outerslot))
+		{
+			aggstate->table_filled = true;
+			/* Initialize to walk the hash table */
+			ResetTupleHashIterator(aggstate->hashtable, &aggstate->hashiter);
+			return NULL;
+		}
+		/* set up for advance_aggregates call */
+		tmpcontext->ecxt_outertuple = outerslot;
+
+		/* Find or build hashtable entry for this tuple's group */
+		entry = lookup_hash_entry(aggstate, outerslot, &isnew);
+
+		/* Reset per-input-tuple context after each tuple */
+		ResetExprContext(tmpcontext);
+
+		if (isnew)
+		{
+			/*
+			 * Store the copied first input tuple in the tuple table slot reserved
+			 * for it, so that it can be used in ExecProject.
+			 */
+			ExecStoreMinimalTuple(entry->shared.firstTuple,
+								  firstSlot,
+								  false);
+
+			/*
+			 * Use the representative input tuple for any references to
+			 * non-aggregated input columns in the qual and tlist.
+			 */
+			econtext->ecxt_outertuple = firstSlot;
+
+			/*
+			 * Check the qual (HAVING clause); if the group does not match, ignore
+			 * it and loop back to try to process another group.
+			 */
+			if (ExecQual(aggstate->ss.ps.qual, econtext, false))
+			{
+				/* FIXME: copy and paste */
+				/*
+				 * Form and return a projection tuple using the aggregate results
+				 * and the representative input tuple.
+				 */
+				TupleTableSlot *result;
+				ExprDoneCond isDone;
+
+				result = ExecProject(aggstate->ss.ps.ps_ProjInfo, &isDone);
+
+				if (isDone != ExprEndResult)
+				{
+					aggstate->ss.ps.ps_TupFromTlist =
+						(isDone == ExprMultipleResult);
+					return result;
+				}
+			}
+			else
+				InstrCountFiltered1(aggstate, 1);
+		}
+	}
+}
+
+/*
  * ExecAgg for hashed case: phase 2, retrieving groups from hash table
  */
 static TupleTableSlot *
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 9cae27b..e393e5a 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1495,9 +1495,13 @@ cost_agg(Path *path, PlannerInfo *root,
 		total_cost = startup_cost + cpu_tuple_cost;
 		output_tuples = 1;
 	}
-	else if (aggstrategy == AGG_SORTED)
+	else if (aggstrategy == AGG_SORTED || aggcosts->numAggs == 0)
 	{
-		/* Here we are able to deliver output on-the-fly */
+		/*
+		 * Here we are able to deliver output on-the-fly.
+		 * If there are no aggregates, the executor can start outputting
+		 * distinct tuples as it finds them even for hashed aggregates.
+		 */
 		startup_cost = input_startup_cost;
 		total_cost = input_total_cost;
 		/* calcs phrased this way to match HASHED case, see note above */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index dcf32c0..8bdaf59 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -2321,6 +2321,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	List	   *current_pathkeys;
 	Path		hashed_p;
 	Path		sorted_p;
+	double		returnedGroups = dNumGroups;
 
 	/*
 	 * Executor doesn't support hashed aggregation with DISTINCT or ORDER BY
@@ -2362,7 +2363,14 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* plus the per-hash-entry overhead */
 	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);
 
-	if (hashentrysize * dNumGroups > work_mem * 1024L)
+	/* We don't need the whole hashtable if we can return rows on the fly */
+	if (tuple_fraction >= 1.0)
+		tuple_fraction /= dNumGroups;
+
+	if (!parse->hasAggs)
+		returnedGroups = tuple_fraction * dNumGroups;
+
+	if (hashentrysize * returnedGroups > work_mem * 1024L)
 		return false;
 
 	/*
@@ -2444,9 +2452,6 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * Now make the decision using the top-level tuple fraction.  First we
 	 * have to convert an absolute count (LIMIT) into fractional form.
 	 */
-	if (tuple_fraction >= 1.0)
-		tuple_fraction /= dNumGroups;
-
 	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
 									  tuple_fraction) < 0)
 	{
@@ -2528,7 +2533,10 @@ choose_hashed_distinct(PlannerInfo *root,
 	 */
 	hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData));
 
-	if (hashentrysize * dNumDistinctRows > work_mem * 1024L)
+	if (tuple_fraction >= 1.0)
+		tuple_fraction /= dNumDistinctRows;
+
+	if (hashentrysize * dNumDistinctRows * tuple_fraction > work_mem * 1024L)
 		return false;
 
 	/*
@@ -2595,9 +2603,6 @@ choose_hashed_distinct(PlannerInfo *root,
 	 * Now make the decision using the top-level tuple fraction.  First we
 	 * have to convert an absolute count (LIMIT) into fractional form.
 	 */
-	if (tuple_fraction >= 1.0)
-		tuple_fraction /= dNumDistinctRows;
-
 	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
 									  tuple_fraction) < 0)
 	{