On 05.12.2011 02:14, Peter Geoghegan wrote:
On 4 December 2011 19:17, Tom Lane <t...@sss.pgh.pa.us> wrote:
I have not done any performance testing on this patch, but it might be
interesting to check it with the same test cases Peter's been using.
I've attached a revision of exactly the same benchmark run to get the
results in results_server.ods .
You'll see very similar figures to results_server.ods for HEAD and for
my patch, as you'd expect. I think the results speak for themselves. I
maintain that we should use specialisations - that's where most of the
benefit is to be found.
I'm still not convinced the big gain is in inlining the comparison
functions. Your patch contains a few orthogonal tricks, too:
* A fastpath for the case of just one scankey. No reason we can't do
that without inlining.
* inlining the swap-functions within qsort. That's different from
inlining the comparison functions, and could be done regardless of the
data type. The qsort element size in tuplesort.c is always sizeof(SortTuple)
And there's some additional specializations we can do, not implemented
in your patch, that don't depend on inlining the comparison functions:
* A fastpath for the case of no NULLs
* separate comparetup functions for ascending and descending sorts (this
allows tail call elimination of the call to type-specific comparison
function in the ascending version)
* call CHECK_FOR_INTERRUPTS() less frequently.
To see how much difference those things make, I hacked together the
attached patch. I didn't base this on Robert's/Tom's patch, but instead
just added a quick & dirty FunctionCallInvoke-overhead-skipping version
of btint4cmp. I believe that aspect of this patch is similar in
performance characteristics, though.
In my quick tests, it gets quite close in performance to your inlining
patch, when sorting int4s and the input contains no NULLs. But please
give it a try in your test environment, to get numbers comparable with
your other tests.
Perhaps you can get even more gain by adding the no-NULLs and asc/desc
special cases to your inlining patch, too, but let's squeeze all the fat
out of the non-inlined version first. One nice aspect of many of these
non-inlining optimizations is that they help with external sorts, too.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 3505236..85bec20 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -114,6 +114,8 @@
#include "utils/rel.h"
#include "utils/tuplesort.h"
+#include "utils/builtins.h"
+
/* sort-type codes for sort__start probes */
#define HEAP_SORT 0
@@ -273,6 +275,8 @@ struct Tuplesortstate
int memtupcount; /* number of tuples currently present */
int memtupsize; /* allocated length of memtuples array */
+ bool hasnulls_initial; /* any NULLs in the first key col? */
+
/*
* While building initial runs, this is the current output run number
* (starting at 0). Afterwards, it is the number of initial runs we made.
@@ -341,6 +345,8 @@ struct Tuplesortstate
TupleDesc tupDesc;
ScanKey scanKeys; /* array of length nKeys */
+ int (*comparator) (Datum, Datum);
+
/*
* These variables are specific to the CLUSTER case; they are set by
* tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and
@@ -459,6 +465,11 @@ static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
static void markrunend(Tuplesortstate *state, int tapenum);
static int comparetup_heap(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
+static int comparetup_heap_1key_asc_nonulls(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+static int comparetup_heap_1key_asc_nonulls_btint4cmp(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+static int compare_int4(Datum first, Datum second);
static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
static void writetup_heap(Tuplesortstate *state, int tapenum,
SortTuple *stup);
@@ -551,6 +562,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
state->availMem = state->allowedMem;
state->sortcontext = sortcontext;
state->tapeset = NULL;
+ state->hasnulls_initial = false;
state->memtupcount = 0;
state->memtupsize = 1024; /* initial guess */
@@ -1247,11 +1259,41 @@ tuplesort_performsort(Tuplesortstate *state)
* amount of memory. Just qsort 'em and we're done.
*/
if (state->memtupcount > 1)
+ {
+ int (*comparetup) (const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+
+ /* If there is only one key col, the input contains no NULLs,
+ * and it's ascending, we can use a fastpath version of
+ * comparetup_heap that skips over those things.
+ */
+ if (state->comparetup == comparetup_heap &&
+ state->nKeys == 1 &&
+ !(state->scanKeys[0].sk_flags & SK_BT_DESC) &&
+ !state->hasnulls_initial)
+ {
+ /*
+ * furthermore, if the comparison function happens to be
+ * btint4cmp, we have an additional fastpath implementation
+ * for that, which skips the FunctionCallInvoke overhead
+ */
+ if (state->scanKeys[0].sk_func.fn_addr == btint4cmp)
+ {
+ state->comparator = compare_int4;
+ comparetup = comparetup_heap_1key_asc_nonulls_btint4cmp;
+ }
+ else
+ comparetup = comparetup_heap_1key_asc_nonulls;
+ }
+ else
+ comparetup = state->comparetup;
+
qsort_arg((void *) state->memtuples,
state->memtupcount,
sizeof(SortTuple),
- (qsort_arg_comparator) state->comparetup,
+ (qsort_arg_comparator) comparetup,
(void *) state);
+ }
state->current = 0;
state->eof_reached = false;
state->markpos_offset = 0;
@@ -2713,6 +2755,44 @@ ApplySortFunction(FmgrInfo *sortFunction, int sortFlags, Oid collation,
*/
static int
+compare_int4(Datum first, Datum second)
+{
+ int32 a = DatumGetInt32(first);
+ int32 b = DatumGetInt32(second);
+ if (a > b)
+ return 1;
+ else if (a == b)
+ return 0;
+ else
+ return -1;
+}
+
+static int
+comparetup_heap_1key_asc_nonulls(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+ ScanKey scanKey = state->scanKeys;
+
+ /* Allow interrupting long sorts */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Compare the sort key */
+ return DatumGetInt32(myFunctionCall2Coll(&scanKey->sk_func,
+ scanKey->sk_collation,
+ a->datum1, b->datum1));
+}
+
+
+static int
+comparetup_heap_1key_asc_nonulls_btint4cmp(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+ /* Allow interrupting long sorts */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Compare the sort key */
+ return state->comparator(a->datum1, b->datum1);
+}
+
+static int
comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
ScanKey scanKey = state->scanKeys;
@@ -2784,6 +2864,10 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
state->scanKeys[0].sk_attno,
state->tupDesc,
&stup->isnull1);
+
+ /* update hasnulls_initial flag */
+ if (stup->isnull1)
+ state->hasnulls_initial = true;
}
static void
diff --git a/src/port/qsort_arg.c b/src/port/qsort_arg.c
index 28d1894..0fa0d75 100644
--- a/src/port/qsort_arg.c
+++ b/src/port/qsort_arg.c
@@ -99,6 +99,124 @@ med3(char *a, char *b, char *c, qsort_arg_comparator cmp, void *arg)
: (cmp(b, c, arg) > 0 ? b : (cmp(a, c, arg) < 0 ? a : c));
}
+
+/*
+ * A specialized version for es == sizeof(SortTuple). We only use the
+ * SortTuple struct here to get its size and alignment.
+ */
+typedef uintptr_t Datum; /* copied from postgres.h */
+typedef struct
+{
+ void *tuple; /* the tuple proper */
+ Datum datum1; /* value of first key column */
+ bool isnull1; /* is first key column NULL? */
+ int32 tupindex; /* see notes above */
+} SortTuple;
+
+
+#define swap_SortTuple(a, b) \
+ do { \
+ SortTuple tmp; \
+ tmp = *(SortTuple *) (a); \
+ *(SortTuple *) (a) = *(SortTuple *) (b); \
+ *(SortTuple *) (b) = tmp; \
+ } while(0)
+
+static void
+qsort_arg_SortTuple(void *a, size_t n, qsort_arg_comparator cmp, void *arg)
+{
+ size_t es = sizeof(SortTuple);
+ char *pa,
+ *pb,
+ *pc,
+ *pd,
+ *pl,
+ *pm,
+ *pn;
+ int d,
+ r,
+ swaptype,
+ presorted;
+
+loop:SWAPINIT(a, es);
+ if (n < 7)
+ {
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ for (pl = pm; pl > (char *) a && cmp(pl - es, pl, arg) > 0;
+ pl -= es)
+ swap_SortTuple(pl, pl - es);
+ return;
+ }
+ presorted = 1;
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ {
+ if (cmp(pm - es, pm, arg) > 0)
+ {
+ presorted = 0;
+ break;
+ }
+ }
+ if (presorted)
+ return;
+ pm = (char *) a + (n / 2) * es;
+ if (n > 7)
+ {
+ pl = (char *) a;
+ pn = (char *) a + (n - 1) * es;
+ if (n > 40)
+ {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, cmp, arg);
+ pm = med3(pm - d, pm, pm + d, cmp, arg);
+ pn = med3(pn - 2 * d, pn - d, pn, cmp, arg);
+ }
+ pm = med3(pl, pm, pn, cmp, arg);
+ }
+ swap_SortTuple(a, pm);
+ pa = pb = (char *) a + es;
+ pc = pd = (char *) a + (n - 1) * es;
+ for (;;)
+ {
+ while (pb <= pc && (r = cmp(pb, a, arg)) <= 0)
+ {
+ if (r == 0)
+ {
+ swap_SortTuple(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (r = cmp(pc, a, arg)) >= 0)
+ {
+ if (r == 0)
+ {
+ swap_SortTuple(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap_SortTuple(pb, pc);
+ pb += es;
+ pc -= es;
+ }
+ pn = (char *) a + n * es;
+ r = Min(pa - (char *) a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = Min(pd - pc, pn - pd - es);
+ vecswap(pb, pn - r, r);
+ if ((r = pb - pa) > es)
+ qsort_arg_SortTuple(a, r / es, cmp, arg);
+ if ((r = pd - pc) > es)
+ {
+ /* Iterate rather than recurse to save stack space */
+ a = pn - r;
+ n = r / es;
+ goto loop;
+ }
+}
+
void
qsort_arg(void *a, size_t n, size_t es, qsort_arg_comparator cmp, void *arg)
{
@@ -114,6 +232,12 @@ qsort_arg(void *a, size_t n, size_t es, qsort_arg_comparator cmp, void *arg)
swaptype,
presorted;
+ if (es == sizeof(SortTuple))
+ {
+ qsort_arg_SortTuple(a, n, cmp, arg);
+ return;
+ }
+
loop:SWAPINIT(a, es);
if (n < 7)
{
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers