I've been profiling simple aggregate queries, looking for any low-hanging fruit. For this query:

-- setup
create table floats as select g::float8 as a, g::float8 as b, g::float8 as c from generate_series(1, 10000000) g;
vacuum freeze floats;

-- query
select sum(a), sum(b+c) from floats;


perf report says:

# Children Self Command Shared Object Symbol # ........ ........ .......... ................. ........................................
#
    25.70%     0.00%  postmaster  [unknown]          [k] 0000000000000000
    14.23%    13.75%  postmaster  postgres           [.] ExecProject
    11.18%    10.57%  postmaster  postgres           [.] slot_deform_tuple
     9.58%     9.04%  postmaster  postgres           [.] advance_aggregates
     8.96%     0.00%  postmaster  [unknown]          [.] 0x00000000000298d4
8.77% 8.42% postmaster postgres [.] ExecMakeFunctionResultNoSets
     7.78%     0.00%  postmaster  [unknown]          [.] 0x0000000001d38260
6.63% 6.15% postmaster postgres [.] advance_transition_function
     6.61%     0.00%  postmaster  [unknown]          [.] 0x0000000001e99e40
     6.47%     0.00%  postmaster  libc-2.23.so       [.] __GI___libc_read
     6.24%     5.88%  postmaster  postgres           [.] heap_getnext
4.62% 4.62% postmaster [kernel.kallsyms] [k] copy_user_enhanced_fast_string
     3.91%     3.82%  postmaster  postgres           [.] slot_getsomeattrs
     3.29%     3.18%  postmaster  postgres           [.] slot_getattr
     3.06%     3.00%  postmaster  postgres           [.] ExecClearTuple
     2.59%     0.00%  postmaster  [unknown]          [.] 0x0000000001e9a370
     2.57%     2.45%  postmaster  postgres           [.] ExecScan
     2.56%     2.37%  postmaster  postgres           [.] float8pl
     2.54%     2.43%  postmaster  postgres           [.] heapgetpage
     2.25%     2.17%  postmaster  postgres           [.] ExecAgg
     2.10%     1.96%  postmaster  postgres           [.] ExecStoreTuple
     2.00%     1.91%  postmaster  postgres           [.] ExecProcNode

ExecProject stands out. I find that pretty surprising.

We're using ExecProject to extract the arguments from the input tuples, to pass to the aggregate transition functions. It looks like that's a pretty expensive way of doing it, for a typical aggregate that takes only one argument.

We actually used to call ExecEvalExpr() directly for each argument, but that was changed by the patch that added support for ordered set aggregates. It looks like that was a bad idea, from a performance point of view.

I propose that we go back to calling ExecEvalExpr() directly, for non-ordered aggregates, per the attached patch. That makes that example query about 10% faster on my laptop, which is in line with the fact that ExecProject() accounted for about 13% of the CPU time.

Another idea is that maybe we should add a fast-path to ExecProject(), for these trivial cases.

- Heikki
From 106c5742fde2ec83576323db74a7249d7d85101f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 31 Aug 2016 17:27:52 +0300
Subject: [PATCH] Skip ExecProject for non-ordered aggregates.

When support for ordered aggregates was added, we started using
ExecProject to compute the arguments for the transient function. However,
it turns out that the old way of just calling ExecEvalExpr() directly for
each argument is somewhat faster. At least for typical aggregates that only
take one or two arguments. So go back to using ExecEvalExpr() for non-ordered
aggregates.

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index ce2fc28..e32b140 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -229,9 +229,8 @@ typedef struct AggStatePerTransData
 	/* Oid of state value's datatype */
 	Oid			aggtranstype;
 
-	/* ExprStates of the FILTER and argument expressions. */
+	/* ExprStates of the FILTER and direct-argument expressions. */
 	ExprState  *aggfilter;		/* state of FILTER expression, if any */
-	List	   *args;			/* states of aggregated-argument expressions */
 	List	   *aggdirectargs;	/* states of direct-argument expressions */
 
 	/*
@@ -288,17 +287,21 @@ typedef struct AggStatePerTransData
 				transtypeByVal;
 
 	/*
-	 * Stuff for evaluation of inputs.  We used to just use ExecEvalExpr, but
-	 * with the addition of ORDER BY we now need at least a slot for passing
-	 * data to the sort object, which requires a tupledesc, so we might as
-	 * well go whole hog and use ExecProject too.
+	 * Stuff for evaluation of inputs.
+	 *
+	 * For non-ordered aggregates, we call ExecEvalExpr for each argument,
+	 * represented by the expression trees in transInputs. For ordered
+	 * aggregates, we need at least a slot for passing data to the sort
+	 * object, which requires a tupledesc, so we might as well go whole hog
+	 * and use ExecProject to evaluate the arguments, too.
 	 */
+	ExprState **transInputs;	/* expression trees for arguments */
 	TupleDesc	evaldesc;		/* descriptor of input tuples */
 	ProjectionInfo *evalproj;	/* projection machinery */
 
 	/*
-	 * Slots for holding the evaluated input arguments.  These are set up
-	 * during ExecInitAgg() and then used for each input row.
+	 * Slots for holding the evaluated input arguments, for ordered aggregates.
+	 * These are set up during ExecInitAgg() and then used for each input row.
 	 */
 	TupleTableSlot *evalslot;	/* current input tuple */
 	TupleTableSlot *uniqslot;	/* used for multi-column DISTINCT */
@@ -850,7 +853,6 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
 		ExprState  *filter = pertrans->aggfilter;
 		int			numTransInputs = pertrans->numTransInputs;
 		int			i;
-		TupleTableSlot *slot;
 
 		/* Skip anything FILTERed out */
 		if (filter)
@@ -864,12 +866,13 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
 				continue;
 		}
 
-		/* Evaluate the current input expressions for this aggregate */
-		slot = ExecProject(pertrans->evalproj, NULL);
-
 		if (pertrans->numSortCols > 0)
 		{
 			/* DISTINCT and/or ORDER BY case */
+			TupleTableSlot *slot;
+
+			/* Evaluate the current input expressions for this aggregate */
+			slot = ExecProject(pertrans->evalproj, NULL);
 			Assert(slot->tts_nvalid == pertrans->numInputs);
 
 			/*
@@ -906,13 +909,16 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
 			/* We can apply the transition function immediately */
 			FunctionCallInfo fcinfo = &pertrans->transfn_fcinfo;
 
-			/* Load values into fcinfo */
-			/* Start from 1, since the 0th arg will be the transition value */
-			Assert(slot->tts_nvalid >= numTransInputs);
+			/*
+			 * Evaluate the current input expressions for this aggregate.
+			 * Start from 1, since the 0th arg will be the transition value.
+			 */
 			for (i = 0; i < numTransInputs; i++)
 			{
-				fcinfo->arg[i + 1] = slot->tts_values[i];
-				fcinfo->argnull[i + 1] = slot->tts_isnull[i];
+				ExprState  *argstate = pertrans->transInputs[i];
+
+				fcinfo->arg[i + 1] = ExecEvalExpr(argstate, aggstate->tmpcontext,
+												  &fcinfo->argnull[i + 1], NULL);
 			}
 
 			for (setno = 0; setno < numGroupingSets; setno++)
@@ -2925,6 +2931,7 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans,
 	int			numDistinctCols;
 	int			naggs;
 	int			i;
+	List	   *targetlist;
 
 	/* Begin filling in the pertrans data */
 	pertrans->aggref = aggref;
@@ -3080,8 +3087,8 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans,
 									   (PlanState *) aggstate);
 	pertrans->aggdirectargs = (List *) ExecInitExpr((Expr *) aggref->aggdirectargs,
 													(PlanState *) aggstate);
-	pertrans->args = (List *) ExecInitExpr((Expr *) aggref->args,
-										   (PlanState *) aggstate);
+	targetlist = (List *) ExecInitExpr((Expr *) aggref->args,
+									   (PlanState *) aggstate);
 
 	/*
 	 * Complain if the aggregate's arguments contain any aggregates; nested
@@ -3093,12 +3100,26 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans,
 				(errcode(ERRCODE_GROUPING_ERROR),
 				 errmsg("aggregate function calls cannot be nested")));
 
-	/* Set up projection info for evaluation */
-	pertrans->evalproj = ExecBuildProjectionInfo(pertrans->args,
+	/*
+	 * Set up projection info for evaluating transition function's arguments.
+	 *
+	 * For non-ordered aggregates, we use plain ExecEvalExpr() instead of
+	 * ExecProject(), so also set up a simple array of expression trees.
+	 */
+	pertrans->evalproj = ExecBuildProjectionInfo(targetlist,
 												 aggstate->tmpcontext,
 												 pertrans->evalslot,
 												 NULL);
 
+	pertrans->transInputs = palloc(pertrans->numTransInputs * sizeof(ExprState *));
+	i = 0;
+	foreach(lc, targetlist)
+	{
+		GenericExprState *gstate = (GenericExprState *) lfirst(lc);
+
+		pertrans->transInputs[i++] = gstate->arg;
+	}
+
 	/*
 	 * If we're doing either DISTINCT or ORDER BY for a plain agg, then we
 	 * have a list of SortGroupClause nodes; fish out the data in them and
-- 
2.9.3

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to