On 22.06.2013 19:19, Simon Riggs wrote:
So I think that (2) is the best route: Given that we know with much
better certainty the number of rows in the scanned-relation, we should
be able to examine our hash table after it has been built and decide
whether it would be cheaper to rebuild the hash table with the right
number of buckets, or continue processing with what we have now. Which
is roughly what Heikki proposed already, in January.

Back in January, I wrote a quick patch to experiment with rehashing when the hash table becomes too full. It was too late to make it into 9.3 so I didn't pursue it further back then, but IIRC it worked. If we have the capability to rehash, the accuracy of the initial guess becomes much less important.

- Heikki
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6a2f236..32aebb9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -223,6 +223,60 @@ ExecEndHash(HashState *node)
 	ExecEndNode(outerPlan);
 }
 
+/*
+ * Rehash
+ *		Double the number of in-memory hash buckets and redistribute the
+ *		already-inserted tuples among them.
+ *
+ * This is a linear-hashing style split: each old bucket i is divided
+ * between bucket i and bucket i + nbuckets_old, depending on the next
+ * higher bit of the tuple's hash value.  The hash table is modified in
+ * place; no tuples are copied, only relinked.
+ */
+static void
+Rehash(HashJoinTable hashtable)
+{
+	int			nbuckets_old = hashtable->nbuckets_inmem;
+	int			nbuckets_new;
+	uint32		mask;
+	int			i;
+
+	/*
+	 * Don't grow past 2^30 buckets: doubling 2^30 would overflow a signed
+	 * int (must be >=, not >, or nbuckets_old == 2^30 still overflows).
+	 */
+	if (nbuckets_old >= (1 << 30))
+		return;
+
+	nbuckets_new = nbuckets_old * 2;
+	hashtable->buckets = (HashJoinTuple *)
+		repalloc(hashtable->buckets, nbuckets_new * sizeof(HashJoinTuple));
+	/* zero only the newly-added second half of the bucket array */
+	memset(((char *) hashtable->buckets) + nbuckets_old * sizeof(HashJoinTuple),
+		   0, nbuckets_old * sizeof(HashJoinTuple));
+	hashtable->nbuckets_inmem = nbuckets_new;
+
+	/* the single hash bit that decides whether a tuple stays or moves */
+	mask = nbuckets_old;
+
+	for (i = 0; i < nbuckets_old; i++)
+	{
+		int			newbucketno = i + nbuckets_old;
+		HashJoinTuple prevtuple;
+		HashJoinTuple tuple;
+
+		prevtuple = NULL;
+		tuple = hashtable->buckets[i];
+
+		while (tuple != NULL)
+		{
+			/* save link in case we unlink this tuple from the chain */
+			HashJoinTuple nexttuple = tuple->next;
+
+			if ((tuple->hashvalue & mask) != 0)
+			{
+				/* move to the new bucket; push onto its chain head */
+				tuple->next = hashtable->buckets[newbucketno];
+				hashtable->buckets[newbucketno] = tuple;
+
+				/* unlink from the old chain; predecessor is unchanged */
+				if (prevtuple)
+					prevtuple->next = nexttuple;
+				else
+					hashtable->buckets[i] = nexttuple;
+			}
+			else
+			{
+				/* keep in this bucket; it becomes the new predecessor */
+				prevtuple = tuple;
+			}
+			tuple = nexttuple;
+		}
+	}
+}
 
 /* ----------------------------------------------------------------
  *		ExecHashTableCreate
@@ -271,6 +325,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_inmem = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
@@ -285,6 +340,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
+	hashtable->inmemTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -612,7 +668,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;
 
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		HashJoinTuple prevtuple;
 		HashJoinTuple tuple;
@@ -734,6 +790,10 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
 
+		hashtable->inmemTuples += 1;
+		if (hashtable->inmemTuples > hashtable->nbuckets_inmem * NTUP_PER_BUCKET * 2)
+			Rehash(hashtable);
+
 		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
@@ -873,18 +933,18 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  int *bucketno,
 						  int *batchno)
 {
-	uint32		nbuckets = (uint32) hashtable->nbuckets;
+	uint32		nbuckets_inmem = (uint32) hashtable->nbuckets_inmem;
 	uint32		nbatch = (uint32) hashtable->nbatch;
 
 	if (nbatch > 1)
 	{
 		/* we can do MOD by masking, DIV by shifting */
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
 	}
 	else
 	{
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = 0;
 	}
 }
@@ -995,7 +1055,7 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
 		 */
 		if (hashTuple != NULL)
 			hashTuple = hashTuple->next;
-		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets_inmem)
 		{
 			hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
 			hjstate->hj_CurBucketNo++;
@@ -1052,7 +1112,7 @@ void
 ExecHashTableReset(HashJoinTable hashtable)
 {
 	MemoryContext oldcxt;
-	int			nbuckets = hashtable->nbuckets;
+	int			nbuckets_inmem = hashtable->nbuckets_inmem;
 
 	/*
 	 * Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1063,9 +1123,10 @@ ExecHashTableReset(HashJoinTable hashtable)
 
 	/* Reallocate and reinitialize the hash bucket headers. */
 	hashtable->buckets = (HashJoinTuple *)
-		palloc0(nbuckets * sizeof(HashJoinTuple));
+		palloc0(nbuckets_inmem * sizeof(HashJoinTuple));
 
 	hashtable->spaceUsed = 0;
+	hashtable->inmemTuples = 0;
 
 	MemoryContextSwitchTo(oldcxt);
 }
@@ -1081,7 +1142,7 @@ ExecHashTableResetMatchFlags(HashJoinTable hashtable)
 	int			i;
 
 	/* Reset all flags in the main table ... */
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		for (tuple = hashtable->buckets[i]; tuple != NULL; tuple = tuple->next)
 			HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 8654066..1f10863 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
+	int			nbuckets_inmem;		/* current # of in-memory buckets (may grow by rehashing) */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -130,6 +131,8 @@ typedef struct HashJoinTableData
 
 	double		totalTuples;	/* # tuples obtained from inner plan */
 
+	uint64		inmemTuples;	/* # tuples in the current in-memory hash table */
+
 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
 	 * nbatch > 1.	A file is opened only when we first write a tuple into it
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to