On 30.6.2014 23:12, Tomas Vondra wrote:
> Hi,
>
> attached is v5 of the patch. The main change is that scaling the number
> of buckets is done only once, after the initial hash table is built. The
> main advantage of this is lower cost. This also allowed some cleanup of
> unnecessary code.
>
> However, this new patch causes warning like this:
>
> WARNING: temporary file leak: File 231 still referenced
>
> I've been eyeballing the code for a while now, but I still fail to see
> where this comes from :-( Any ideas?
Meh, the patches are wrong - I hadn't realized the tight coupling
between buckets and batches in ExecHashGetBucketAndBatch:
*bucketno = hashvalue & (nbuckets - 1);
*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
The previous patches worked mostly by pure luck (nbuckets was set only
while processing the first batch), but once I moved the code to the end,
it started failing. And by "worked" I mean "didn't throw an error, but
probably returned bogus results with (nbatch > 1)".
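To illustrate (a minimal standalone sketch, not patch code - the helper
name and the example hash value are made up, only the formula comes from
ExecHashGetBucketAndBatch):

#include <stdint.h>
#include <stdio.h>

/* the original formula - batchno sits right above the bucket bits */
static int
batchno_old(uint32_t hashvalue, int log2_nbuckets, int nbatch)
{
    return (int) ((hashvalue >> log2_nbuckets) & (nbatch - 1));
}

int
main(void)
{
    uint32_t hashvalue = 0x00000C00;    /* bits 10 and 11 set */

    /* batch computed while the table had 1024 buckets (log2 = 10) */
    printf("batch before resize: %d\n", batchno_old(hashvalue, 10, 4));

    /* same tuple, after nbuckets doubled to 2048 (log2 = 11) */
    printf("batch after resize:  %d\n", batchno_old(hashvalue, 11, 4));

    return 0;
}

This prints batch 3 before the resize and batch 1 after it, so a tuple
spilled to the batch 3 temp file is never visited again once nbuckets
doubles - which matches the file leaks and the bogus rowcounts.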
However, ISTM this nbuckets-nbatch coupling is not really necessary -
it's only constructed this way to derive independent batch and bucket
numbers from a single hash value. So I went and changed the code like this:
*bucketno = hashvalue & (nbuckets - 1);
*batchno = (hashvalue >> (32 - hashtable->log2_nbatch));
I.e. using the top bits for batchno, low bits for bucketno (as before).
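A quick sanity check of the two properties the new formula needs (again
a standalone sketch under the same assumptions, not patch code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* the new formula - batchno = top log2(nbatch) bits of the hash value */
static int
batchno_new(uint32_t hashvalue, int log2_nbatch)
{
    return (int) (hashvalue >> (32 - log2_nbatch));
}

int
main(void)
{
    uint32_t i;

    for (i = 0; i < 100000; i++)
    {
        uint32_t hv = i * 2654435761u;      /* scatter the bits */
        int before = batchno_new(hv, 3);    /* nbatch = 8 */
        int after = batchno_new(hv, 4);     /* nbatch = 16 */

        /* doubling nbatch appends one bit at the bottom of batchno,
         * so the batch number can only stay the same or increase */
        assert(after == 2 * before || after == 2 * before + 1);
    }

    printf("batchno is stable across nbatch doubling\n");
    return 0;
}

The batch number no longer depends on log2_nbuckets at all, so the
buckets can grow freely, and doubling nbatch maps batch b to (2b) or
(2b+1), which satisfies the "remain the same or increase" requirement
from the comment above ExecHashGetBucketAndBatch.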
Hopefully I got it right this time. At least it seems to work for the
cases that failed before (no file leaks, correct rowcounts so far).
regards
Tomas
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0d9663c..db3a953 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1900,18 +1900,20 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+ ExplainPropertyLong("Original Hash Buckets",
+ hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch)
+ else if ((hashtable->nbatch_original != hashtable->nbatch) || (hashtable->nbuckets_original != hashtable->nbuckets))
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
- hashtable->nbuckets, hashtable->nbatch,
- hashtable->nbatch_original, spacePeakKb);
+ "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ hashtable->nbuckets, hashtable->nbuckets_original,
+ hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
}
else
{
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..48cca62 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -37,8 +37,10 @@
#include "utils/lsyscache.h"
#include "utils/syscache.h"
+bool enable_hashjoin_bucket = true;
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -48,6 +50,33 @@ static void ExecHashSkewTableInsert(HashJoinTable hashtable,
static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
+/*
+ * Compute appropriate size for hashtable given the estimated size of the
+ * relation to be hashed (number of rows and average row width).
+ *
+ * This is exported so that the planner's costsize.c can use it.
+ */
+
+/* Target bucket loading (tuples per bucket) */
+#define NTUP_PER_BUCKET 10
+
+/* Multiple of NTUP_PER_BUCKET that triggers an increase of nbuckets.
+ *
+ * Once the load factor (in units of NTUP_PER_BUCKET) reaches the
+ * threshold b, we double the number of buckets, halving the load to
+ * a = b/2. To oscillate around 1.0 (i.e. around NTUP_PER_BUCKET on
+ * average), these two equations should hold:
+ *
+ * b = 2a (growth)
+ * (a + b)/2 = 1 (oscillate around NTUP_PER_BUCKET)
+ *
+ * Solving them gives a = 2/3 and b = 4/3 = 1.3333. With a higher
+ * growth factor (4*nbuckets, i.e. b = 4a) the threshold would be
+ * b = 1.6, with (b = 8a) it would be 1.7777, etc.
+ *
+ * Let's start with doubling the bucket count, i.e. 1.333. */
+#define NTUP_GROW_THRESHOLD (NTUP_PER_BUCKET * NTUP_GROW_COEFFICIENT)
+
/* ----------------------------------------------------------------
* ExecHash
*
@@ -116,6 +145,7 @@ MultiExecHash(HashState *node)
/* It's a skew tuple, so put it into that hash table */
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
bucketNumber);
+ hashtable->skewTuples += 1;
}
else
{
@@ -126,6 +156,23 @@ MultiExecHash(HashState *node)
}
}
+ /* If the average number of tuples per bucket exceeds the defined
+ * threshold, increase the number of buckets to get below it. */
+ if (enable_hashjoin_bucket) {
+
+ /* consider only tuples in the non-skew buckets */
+ double nonSkewTuples = (hashtable->totalTuples - hashtable->skewTuples);
+
+ if ((nonSkewTuples / hashtable->nbatch) > (hashtable->nbuckets * NTUP_GROW_THRESHOLD)) {
+
+#ifdef HJDEBUG
+ printf("Increasing nbucket to %d (average per bucket = %.1f)\n",
+ nbuckets, (batchTuples / hashtable->nbuckets));
+#endif
+ ExecHashIncreaseNumBuckets(hashtable);
+ }
+ }
+
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -239,6 +286,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
int nbatch;
int num_skew_mcvs;
int log2_nbuckets;
+ int log2_nbatch;
int nkeys;
int i;
ListCell *ho;
@@ -263,6 +311,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
log2_nbuckets = my_log2(nbuckets);
Assert(nbuckets == (1 << log2_nbuckets));
+ /* nbatch must be a power of 2 */
+ log2_nbatch = my_log2(nbatch);
+ Assert(nbatch == (1 << log2_nbatch));
+
/*
* Initialize the hash table control block.
*
@@ -271,6 +323,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
@@ -281,10 +334,12 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->skewBucketNums = NULL;
hashtable->nbatch = nbatch;
hashtable->curbatch = 0;
+ hashtable->log2_nbatch = log2_nbatch;
hashtable->nbatch_original = nbatch;
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -375,17 +430,6 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
return hashtable;
}
-
-/*
- * Compute appropriate size for hashtable given the estimated size of the
- * relation to be hashed (number of rows and average row width).
- *
- * This is exported so that the planner's costsize.c can use it.
- */
-
-/* Target bucket loading (tuples per bucket) */
-#define NTUP_PER_BUCKET 10
-
void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
int *numbuckets,
@@ -605,6 +649,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
MemoryContextSwitchTo(oldcxt);
hashtable->nbatch = nbatch;
+ hashtable->log2_nbatch += 1;
/*
* Scan through the existing hash table entries and dump out any that are
@@ -682,6 +727,116 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
+ * ExecHashIncreaseNumBuckets
+ * increase the original number of buckets in order to reduce
+ * number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ int i;
+ int oldnbuckets = hashtable->nbuckets;
+ HashJoinTuple *oldbuckets = hashtable->buckets;
+ MemoryContext oldcxt;
+ double batchTuples = (hashtable->totalTuples / hashtable->nbatch);
+
+ /*
+ * Determine the proper number of buckets, i.e. stop once the average
+ * per bucket gets below the threshold (1.33 * NTUP_PER_BUCKET).
+ *
+ * Also, check for overflow - this can only happen with extremely large
+ * work_mem values, because (INT_MAX/2) means ~8GB only for the buckets.
+ * With tuples, the hash table would require tens of GBs of work_mem.
+ *
+ * XXX Technically there's also a limit on the buckets fitting into
+ * work_mem (with NTUP_PER_BUCKET tuples), but it can't really be
+ * exceeded, because filling work_mem adds another batch (the number
+ * of tuples per batch drops, so more buckets aren't needed anymore).
+ *
+ * That is, something like this will be enforced implicitly:
+ *
+ * work_mem * 1024L >= (nbuckets * tupsize * NTUP_GROW_THRESHOLD)
+ *
+ * So it's enough to check only the overflow here.
+ */
+
+ /* double the number of buckets until we get below the growth threshold, or
+ * until we hit the overflow protection */
+ while ((batchTuples > (hashtable->nbuckets * NTUP_GROW_THRESHOLD))
+ && (hashtable->nbuckets <= (INT_MAX/2))) {
+ hashtable->nbuckets *= 2;
+ hashtable->log2_nbuckets += 1;
+ }
+
+ /* XXX Not sure if we should update the info about used space here.
+ * The code seems to ignore the space used for 'buckets' and we're not
+ * allocating more space for tuples (just shuffling them to the new
+ * buckets). And the amount of memory used for buckets is quite small
+ * (just an array of pointers, thus ~8kB per 1k buckets on 64-bit). */
+
+ /* XXX Should we disable growth if (nbuckets * NTUP_PER_BUCKET)
+ * reaches work_mem (or something like that)? We shouldn't really
+ * get into such position (should be handled by increasing the
+ * number of batches, which is called right before this). */
+
+ /* XXX Maybe adding info into hashjoin explain output (e.g. initial
+ * nbuckets, time spent growing the table) would be appropriate. */
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets <= (INT_MAX/2));
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+ /* TODO Maybe it'd be better to resize the buckets in place (should be possible,
+ * but when I tried it I always ended up with a strange infinite loop). */
+
+ /* allocate a new bucket list (use the batch context as before) */
+ oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
+
+ hashtable->buckets = (HashJoinTuple *)
+ palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ MemoryContextSwitchTo(oldcxt);
+
+ /* walk through the old buckets, moving the tuples into the new table */
+ for (i = 0; i < oldnbuckets; i++)
+ {
+
+ HashJoinTuple tuple = oldbuckets[i];
+
+ while (tuple != NULL)
+ {
+ /* save link in case we delete */
+ HashJoinTuple nexttuple = tuple->next;
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* move it to the correct bucket (in the new bucket array) */
+ tuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = tuple;
+
+ /* process the next tuple */
+ tuple = nexttuple;
+ }
+ }
+
+ pfree(oldbuckets);
+
+#ifdef HJDEBUG
+ printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ hashtable->nbuckets, batchTuples / hashtable->nbuckets);
+#endif
+
+}
+
+
+/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
@@ -740,6 +895,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashtable->spacePeak = hashtable->spaceUsed;
if (hashtable->spaceUsed > hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
+
}
else
{
@@ -856,14 +1012,12 @@ ExecHashGetHashValue(HashJoinTable hashtable,
* chains), and must only cause the batch number to remain the same or
* increase. Our algorithm is
* bucketno = hashvalue MOD nbuckets
- * batchno = (hashvalue DIV nbuckets) MOD nbatch
+ * batchno = highest log2(nbatch) bits of hashvalue
* where nbuckets and nbatch are both expected to be powers of 2, so we can
* do the computations by shifting and masking. (This assumes that all hash
* functions are good about randomizing all their output bits, else we are
* likely to have very skewed bucket or batch occupancy.)
*
- * nbuckets doesn't change over the course of the join.
- *
* nbatch is always a power of 2; we increase it only by doubling it. This
* effectively adds one more bit to the top of the batchno.
*/
@@ -880,7 +1034,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
{
/* we can do MOD by masking, DIV by shifting */
*bucketno = hashvalue & (nbuckets - 1);
- *batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
+ *batchno = (hashvalue >> (32 - hashtable->log2_nbatch));
}
else
{
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3a31a75..c92cc26 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -788,6 +788,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
{
+ {"enable_hashjoin_bucket", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables dynamic increase of hash buckets."),
+ NULL
+ },
+ &enable_hashjoin_bucket,
+ true,
+ NULL, NULL, NULL
+ },
+ {
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
gettext_noop("This algorithm attempts to do planning without "
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 3beae40..82648ab 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
typedef struct HashJoinTableData
{
int nbuckets; /* # buckets in the in-memory hash table */
+ int nbuckets_original; /* # buckets when the hash table was created */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -122,6 +123,7 @@ typedef struct HashJoinTableData
int nbatch; /* number of batches */
int curbatch; /* current batch #; 0 during 1st pass */
+ int log2_nbatch; /* its log2 (nbatch must be a power of 2) */
int nbatch_original; /* nbatch when we started inner scan */
int nbatch_outstart; /* nbatch when we started outer scan */
@@ -129,6 +131,7 @@ typedef struct HashJoinTableData
bool growEnabled; /* flag to shut off nbatch increases */
double totalTuples; /* # tuples obtained from inner plan */
+ double skewTuples; /* # tuples inserted into skew buckets */
/*
* These arrays are allocated for the life of the hash join, but only if
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 75be5bd..15604cb 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -50,4 +50,6 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
int *num_skew_mcvs);
extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue);
+extern bool enable_hashjoin_bucket;
+
#endif /* NODEHASH_H */
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 75e2afb..60b8da8 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -62,6 +62,7 @@ extern bool enable_material;
extern bool enable_mergejoin;
extern bool enable_hashjoin;
extern int constraint_exclusion;
+extern bool enable_hashjoin_bucket;
extern double clamp_row_est(double nrows);
extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,