Hi,

Attached is v1 of one of the hashjoin improvements mentioned back in September in the lengthy thread [1].

The main objection to simply removing the MaxAllocSize check (and switching to MemoryContextAllocHuge) is that when the number of rows is overestimated, we may consume significantly more memory than necessary.
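
To illustrate, some rough arithmetic (assuming 64-bit pointers and
NTUP_PER_BUCKET=1): if the planner expects 50M rows but the inner
relation produces only 1M of them, we size the bucket array for the
estimate:

    estimated:  50M rows => nbuckets = 2^26  =>  64M * 8B = 512MB
    actual:      1M rows => nbuckets = 2^20  =>   1M * 8B =   8MB

and with huge allocations permitted, the up-front waste could grow
well beyond MaxAllocSize.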

We run into this issue because we allocate the buckets at the very beginning, based on the estimate. I've noticed we don't actually need to do that - we don't use the buckets until after the Hash node completes, and we don't even need them when increasing the number of batches (we use the dense allocation for that).

So this patch removes the early allocation - it postpones allocating the buckets until the end of MultiExecHash(), at which point we know pretty well what the optimal number of buckets is.
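
For context, this is possible because rebuilding the buckets from the
dense chunks is cheap - ExecHashBuildBuckets (see the patch below)
reuses the existing chunk-walk loop, which looks roughly like this
(simplified sketch):

    HashMemoryChunk chunk;

    for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
    {
        HashJoinTuple hashTuple = (HashJoinTuple) chunk->data;

        /* process all tuples stored in this chunk */
        while ((char *) hashTuple < (chunk->data + chunk->used))
        {
            int     bucketno;
            int     batchno;

            ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
                                      &bucketno, &batchno);

            /* link the tuple into the proper bucket */
            hashTuple->next = hashtable->buckets[bucketno];
            hashtable->buckets[bucketno] = hashTuple;

            /* advance to the next tuple in the chunk */
            hashTuple = (HashJoinTuple)
                ((char *) hashTuple +
                 MAXALIGN(HJTUPLE_OVERHEAD +
                          HJTUPLE_MINTUPLE(hashTuple)->t_len));
        }
    }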

This makes tracking nbuckets_optimal/log2_nbuckets_optimal unnecessary, as we can simply use nbuckets/log2_nbuckets for that purpose. I've also removed nbuckets_original, though maybe that's not a good idea and we should keep that information (OTOH we never actually use that number of buckets).

This patch does not change the estimation in ExecChooseHashTableSize() at all, because we still need it to get nbuckets/nbatch. Maybe this is another opportunity for improvement in case of overestimates, because then we may end up batching even when we could do without it. So we might start with nbuckets=1024 and nbatch=1, and only switch to the estimated number of batches if really needed - a rough sketch of that idea below.
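
A hypothetical sketch of what that might look like in
ExecHashTableCreate() (not part of this patch; nbatch_estimated is a
made-up field name):

    /* remember the planner estimate, but start small and unbatched */
    hashtable->nbatch_estimated = nbatch;   /* hypothetical field */
    hashtable->nbatch = 1;
    hashtable->nbuckets = 1024;
    hashtable->log2_nbuckets = 10;

    /*
     * ExecHashIncreaseNumBatches() would then kick in only when
     * spaceAllowed is actually exceeded, perhaps jumping straight to
     * the estimated number of batches instead of doubling repeatedly.
     */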

This patch also does not mess with the allocation itself, i.e. it still enforces the MaxAllocSize limit (which amounts to ~256MB for the bucket array due to the doubling, IIRC), but it should make that change easier to do.
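
(For the record, the guard in ExecHashTableInsert stops the doubling
once the next step would exceed the palloc limit, i.e. roughly when

    nbuckets * 2  >  MaxAllocSize / sizeof(HashJoinTuple)  ~  1GB / 8B

so the bucket array itself stays capped at a few hundred MB.)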

[1] http://www.postgresql.org/message-id/flat/19746.1443983...@sss.pgh.pa.us

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 12dae77..8fd9c9f 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2116,21 +2116,17 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 		if (es->format != EXPLAIN_FORMAT_TEXT)
 		{
 			ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
-			ExplainPropertyLong("Original Hash Buckets",
-								hashtable->nbuckets_original, es);
 			ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
 			ExplainPropertyLong("Original Hash Batches",
 								hashtable->nbatch_original, es);
 			ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
 		}
-		else if (hashtable->nbatch_original != hashtable->nbatch ||
-				 hashtable->nbuckets_original != hashtable->nbuckets)
+		else if (hashtable->nbatch_original != hashtable->nbatch)
 		{
 			appendStringInfoSpaces(es->str, es->indent * 2);
 			appendStringInfo(es->str,
-							 "Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
+							 "Buckets: %d  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
 							 hashtable->nbuckets,
-							 hashtable->nbuckets_original,
 							 hashtable->nbatch,
 							 hashtable->nbatch_original,
 							 spacePeakKb);
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 5e05ec3..b0cf97d 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,7 +39,7 @@
 
 
 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
-static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
+static void ExecHashBuildBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
 					  int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -129,9 +129,8 @@ MultiExecHash(HashState *node)
 		}
 	}
 
-	/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
-	if (hashtable->nbuckets != hashtable->nbuckets_optimal)
-		ExecHashIncreaseNumBuckets(hashtable);
+	/* Construct the actual hash table (using the optimal number of buckets). */
+	ExecHashBuildBuckets(hashtable);
 
 	/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
 	hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
@@ -283,10 +282,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
-	hashtable->nbuckets_original = nbuckets;
-	hashtable->nbuckets_optimal = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
-	hashtable->log2_nbuckets_optimal = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
 	hashtable->skewEnabled = false;
@@ -372,18 +368,11 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	}
 
 	/*
-	 * Prepare context for the first-scan space allocations; allocate the
-	 * hashbucket array therein, and set each bucket "empty".
-	 */
-	MemoryContextSwitchTo(hashtable->batchCxt);
-
-	hashtable->buckets = (HashJoinTuple *)
-		palloc0(nbuckets * sizeof(HashJoinTuple));
-
-	/*
 	 * Set up for skew optimization, if possible and there's a need for more
 	 * than one batch.  (In a one-batch join, there's no point in it.)
 	 */
+	MemoryContextSwitchTo(hashtable->batchCxt);
+
 	if (nbatch > 1)
 		ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
 
@@ -654,19 +643,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;
 
-	/* If know we need to resize nbuckets, we can do it while rebatching. */
-	if (hashtable->nbuckets_optimal != hashtable->nbuckets)
-	{
-		/* we never decrease the number of buckets */
-		Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
-
-		hashtable->nbuckets = hashtable->nbuckets_optimal;
-		hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
-		hashtable->buckets = repalloc(hashtable->buckets,
-								sizeof(HashJoinTuple) * hashtable->nbuckets);
-	}
-
 	/*
 	 * We will scan through the chunks directly, so that we can reset the
 	 * buckets now and not have to keep track which tuples in the buckets have
@@ -704,10 +680,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 
 				copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
 				memcpy(copyTuple, hashTuple, hashTupleSize);
-
-				/* and add it back to the appropriate bucket */
-				copyTuple->next = hashtable->buckets[bucketno];
-				hashtable->buckets[bucketno] = copyTuple;
 			}
 			else
 			{
@@ -753,27 +725,20 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 }
 
 /*
- * ExecHashIncreaseNumBuckets
- *		increase the original number of buckets in order to reduce
- *		number of tuples per bucket
+ * ExecHashBuildBuckets
+ *		complete building the hash table by allocating the optimal number of
+ *		buckets and filling them with tuples
  */
 static void
-ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+ExecHashBuildBuckets(HashJoinTable hashtable)
 {
 	HashMemoryChunk chunk;
 
-	/* do nothing if not an increase (it's called increase for a reason) */
-	if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
-		return;
-
 #ifdef HJDEBUG
-	printf("Increasing nbuckets %d => %d\n",
-		   hashtable->nbuckets, hashtable->nbuckets_optimal);
+	printf("Constructing table with nbuckets %d\n", hashtable->nbuckets);
 #endif
 
-	hashtable->nbuckets = hashtable->nbuckets_optimal;
-	hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
-
+	Assert(hashtable->buckets == NULL);
 	Assert(hashtable->nbuckets > 1);
 	Assert(hashtable->nbuckets <= (INT_MAX / 2));
 	Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
@@ -785,10 +750,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
 	 * chunks)
 	 */
 	hashtable->buckets =
-		(HashJoinTuple *) repalloc(hashtable->buckets,
-								hashtable->nbuckets * sizeof(HashJoinTuple));
-
-	memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
+		(HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
 
 	/* scan through all tuples in all chunks to rebuild the hash table */
 	for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
@@ -816,7 +778,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
 	}
 
 #ifdef HJDEBUG
-	printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+	printf("Nbuckets set to %d, average items per bucket %.1f\n",
 		   hashtable->nbuckets, batchTuples / hashtable->nbuckets);
 #endif
 }
@@ -872,9 +834,15 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 */
 		HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 
-		/* Push it onto the front of the bucket's list */
-		hashTuple->next = hashtable->buckets[bucketno];
-		hashtable->buckets[bucketno] = hashTuple;
+		/*
+		 * Insert only if the buckets exist already - during the initial
+		 * build they are only allocated at the end of MultiExecHash().
+		 */
+		if (hashtable->buckets != NULL)
+		{
+			hashTuple->next = hashtable->buckets[bucketno];
+			hashtable->buckets[bucketno] = hashTuple;
+		}
 
 		/*
 		 * Increase the (optimal) number of buckets if we just exceeded the
@@ -882,14 +850,14 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 * batch.
 		 */
 		if (hashtable->nbatch == 1 &&
-			ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+			ntuples > (hashtable->nbuckets * NTUP_PER_BUCKET))
 		{
 			/* Guard against integer overflow and alloc size overflow */
-			if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
-				hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
+			if (hashtable->nbuckets <= INT_MAX / 2 &&
+				hashtable->nbuckets * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
 			{
-				hashtable->nbuckets_optimal *= 2;
-				hashtable->log2_nbuckets_optimal += 1;
+				hashtable->nbuckets *= 2;
+				hashtable->log2_nbuckets += 1;
 			}
 		}
 
@@ -898,7 +866,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
 		if (hashtable->spaceUsed +
-			hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
+			hashtable->nbuckets * sizeof(HashJoinTuple)
 			> hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
 	}
@@ -1216,7 +1184,6 @@ void
 ExecHashTableReset(HashJoinTable hashtable)
 {
 	MemoryContext oldcxt;
-	int			nbuckets = hashtable->nbuckets;
 
 	/*
 	 * Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1226,8 +1193,8 @@ ExecHashTableReset(HashJoinTable hashtable)
 	oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
 
 	/* Reallocate and reinitialize the hash bucket headers. */
-	hashtable->buckets = (HashJoinTuple *)
-		palloc0(nbuckets * sizeof(HashJoinTuple));
+	hashtable->buckets =
+		(HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
 
 	hashtable->spaceUsed = 0;
 
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 7a51ea6..255c506 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -128,11 +128,6 @@ typedef struct HashJoinTableData
 	int			nbuckets;		/* # buckets in the in-memory hash table */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
-	int			nbuckets_original;		/* # buckets when starting the first
-										 * hash */
-	int			nbuckets_optimal;		/* optimal # buckets (per batch) */
-	int			log2_nbuckets_optimal;	/* log2(nbuckets_optimal) */
-
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
 	struct HashJoinTupleData **buckets;
 	/* buckets array is per-batch storage, as are all the tuples */