On 2016-11-18 08:00:40 -0500, Robert Haas wrote:
> On Tue, Nov 15, 2016 at 2:28 PM, Andres Freund <and...@anarazel.de> wrote:
> > I've a working fix for this, and for a similar issue Robert found. I'm
> > still playing around with it, but basically the fix is to make the
> > growth policy a bit more adaptive.
> 
> Any chance you can post a patch soon?

Here's my WIP series addressing this and related problems. With this
we're again noticeably faster than the dynahash implementation, in both
the case here, and the query you brought up over IM.

This definitely needs some more TLC, but the general approach seems
good. I particularly like that it apparently allows us to increase the
default fillfactor without much downside according to my measurements.

Greetings,

Andres Freund
>From efc603dd67a1c95f2d24844b86436ed6b7a28c80 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 23 Nov 2016 00:23:42 -0800
Subject: [PATCH 1/3] Resize simplehash.h table in case of long runs.

This also allows us to increase the default hashtable fillfactor.
---
 src/include/lib/simplehash.h | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index 12aedbc..9522fa5 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -152,10 +152,12 @@ SH_SCOPE void SH_STAT(SH_TYPE *tb);
 
 #include "utils/memutils.h"
 
-/* conservative fillfactor for a robin hood table, might want to adjust */
-#define SH_FILLFACTOR (0.8)
+/* normal fillfactor, unless already close to maximum */
+#define SH_FILLFACTOR (0.9)
 /* increase fillfactor if we otherwise would error out */
-#define SH_MAX_FILLFACTOR (0.95)
+#define SH_MAX_FILLFACTOR (0.98)
+/* collision chain length at which we resize */
+#define SH_GROW_MAX_DIB (20)
 /* max data array size,we allow up to PG_UINT32_MAX buckets, including 0 */
 #define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
 
@@ -550,6 +552,34 @@ SH_INSERT(SH_TYPE *tb, SH_KEY_TYPE key, bool *found)
 #endif
 			entry->status = SH_STATUS_IN_USE;
 			*found = false;
+
+			/*
+			 * To avoid negative consequences from overly imbalanced
+			 * hashtables, grow the hashtable if collisions lead to large
+			 * runs. The most likely cause of such imbalance is filling a
+			 * (currently) small table, from a currently big one, in
+			 * hash-table order.
+			 *
+			 * FIXME: compute boundaries in a more principled manner.
+			 */
+			if (unlikely(insertdist > 20 ||
+						 SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, emptyelem) > 1000))
+			{
+				/* don't grow if the table would grow overly much */
+				if (tb->members / ((double) tb->size) > 0.1)
+				{
+					/*
+					 * elog(WARNING, "clumping b, growing this thing");
+					 * SH_STAT(tb);
+					 */
+					/*
+					 * Trigger a growth cycle next round, easier than resizing
+					 * now.
+					 */
+					tb->grow_threshold = 0;
+				}
+			}
+
 			return entry;
 		}
 
-- 
2.10.2

>From 150bc7fb1e9eacaee5d46852fa464e83e7930aca Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 22 Nov 2016 20:15:37 -0800
Subject: [PATCH 2/3] Signal when a master backend is doing parallel work.

---
 src/backend/access/transam/parallel.c | 14 ++++++++------
 src/backend/executor/nodeGather.c     |  6 ++++++
 src/include/access/parallel.h         | 20 +++++++++++++++++++-
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index 59dc394..c038ea1 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -88,12 +88,14 @@ typedef struct FixedParallelState
 } FixedParallelState;
 
 /*
- * Our parallel worker number.  We initialize this to -1, meaning that we are
- * not a parallel worker.  In parallel workers, it will be set to a value >= 0
- * and < the number of workers before any user code is invoked; each parallel
- * worker will get a different parallel worker number.
+ * Our parallel worker number.  We initialize this to PARALLEL_WORKER_MASTER,
+ * meaning that we are not a parallel worker.  In parallel workers, it will be
+ * set to a value >= 0 and < the number of workers before any user code is
+ * invoked; each parallel worker will get a different parallel worker number.
+ * If the master backend performs parallel work, this will temporarily be set
+ * to PARALLEL_WORKER_MASTER_PARALLEL while doing so.
  */
-int			ParallelWorkerNumber = -1;
+int			ParallelWorkerNumber = PARALLEL_WORKER_MASTER;
 
 /* Is there a parallel message pending which we need to receive? */
 volatile bool ParallelMessagePending = false;
@@ -955,7 +957,7 @@ ParallelWorkerMain(Datum main_arg)
 	BackgroundWorkerUnblockSignals();
 
 	/* Determine and set our parallel worker number. */
-	Assert(ParallelWorkerNumber == -1);
+	Assert(ParallelWorkerNumber == PARALLEL_WORKER_MASTER);
 	memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int));
 
 	/* Set up a memory context and resource owner. */
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 880ca62..c292610 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -305,7 +305,13 @@ gather_getnext(GatherState *gatherstate)
 
 		if (gatherstate->need_to_scan_locally)
 		{
+			/*
+			 * Signal that the master backend temporarily is doing parallel
+			 * work.
+			 */
+			ParallelWorkerNumber = PARALLEL_WORKER_MASTER_PARALLEL;
 			outerTupleSlot = ExecProcNode(outerPlan);
+			ParallelWorkerNumber = PARALLEL_WORKER_MASTER;
 
 			if (!TupIsNull(outerTupleSlot))
 				return outerTupleSlot;
diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h
index 2f8f36f..ce8b6b6 100644
--- a/src/include/access/parallel.h
+++ b/src/include/access/parallel.h
@@ -50,7 +50,25 @@ extern volatile bool ParallelMessagePending;
 extern int	ParallelWorkerNumber;
 extern bool InitializingParallelWorker;
 
-#define		IsParallelWorker()		(ParallelWorkerNumber >= 0)
+/*
+ * Values for ParallelWorkerNumber, values >= 0 are worker backends.
+ */
+/* master backend */
+#define PARALLEL_WORKER_MASTER			-2
+/* master backend, performing parallel work */
+#define PARALLEL_WORKER_MASTER_PARALLEL	-1
+
+/* Is the process a parallel worker? */
+#define		IsParallelWorker() \
+	(ParallelWorkerNumber > PARALLEL_WORKER_MASTER_PARALLEL)
+
+/*
+ * Is the process performing parallelized work? This can be the case in a
+ * worker (in which case IsParallelWorker() would be true), or it can be the
+ * master backend performing parallel work itself.
+ */
+#define		PerformingParallelWork() \
+	(ParallelWorkerNumber > PARALLEL_WORKER_MASTER)
 
 extern ParallelContext *CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers);
 extern ParallelContext *CreateParallelContextForExternalFunction(char *library_name, char *function_name, int nworkers);
-- 
2.10.2

>From a7ace03974bb716e7869863e619209cbd7326a7c Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 22 Nov 2016 20:16:14 -0800
Subject: [PATCH 3/3] Use different hash IVs for tuplehash tables in parallel
 workers.

That's to avoid a higher likelihood of unbalanced hashtables when
scanning several hashtables from parallel workers and combining their
contents into a bigger one.
---
 src/backend/executor/execGrouping.c | 17 ++++++++++++++++-
 src/include/nodes/execnodes.h       |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 94cc59d..a089ccd 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -18,6 +18,8 @@
  */
 #include "postgres.h"
 
+#include "access/hash.h"
+#include "access/parallel.h"
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
@@ -314,6 +316,19 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	hashtable->in_hash_funcs = NULL;
 	hashtable->cur_eq_funcs = NULL;
 
+	/*
+	 * If parallelism is in use, even if the master backend is performing the
+	 * scan itself, we don't want to create the hashtable exactly the same way
+	 * in all workers. As hashtables are iterated over in keyspace-order,
+	 * doing so in all processes in the same way is likely to lead to
+	 * "unbalanced" hashtables when the table size initially is
+	 * underestimated.
+	 */
+	if (PerformingParallelWork())
+		hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+	else
+		hashtable->hash_iv = 0;
+
 	hashtable->hashtab = tuplehash_create(tablecxt, nbuckets);
 	hashtable->hashtab->private_data = hashtable;
 
@@ -450,7 +465,7 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 	TupleHashTable hashtable = (TupleHashTable) tb->private_data;
 	int			numCols = hashtable->numCols;
 	AttrNumber *keyColIdx = hashtable->keyColIdx;
-	uint32		hashkey = 0;
+	uint32		hashkey = hashtable->hash_iv;
 	TupleTableSlot *slot;
 	FmgrInfo   *hashfunctions;
 	int			i;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index f6f73f3..7431352 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -528,6 +528,7 @@ typedef struct TupleHashTableData
 	TupleTableSlot *inputslot;	/* current input tuple's slot */
 	FmgrInfo   *in_hash_funcs;	/* hash functions for input datatype(s) */
 	FmgrInfo   *cur_eq_funcs;	/* equality functions for input vs. table */
+	uint32		hash_iv;		/* hash-function IV */
 }	TupleHashTableData;
 
 typedef tuplehash_iterator TupleHashIterator;
-- 
2.10.2

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to