On 22.06.2013 19:19, Simon Riggs wrote:
So I think that (2) is the best route: Given that we know with much better certainty the number of rows in the scanned-relation, we should be able to examine our hash table after it has been built and decide whether it would be cheaper to rebuild the hash table with the right number of buckets, or continue processing with what we have now. Which is roughly what Heikki proposed already, in January.
Back in January, I wrote a quick patch to experiment with rehashing when the hash table becomes too full. It was too late to make it into 9.3 so I didn't pursue it further back then, but IIRC it worked. If we have the capability to rehash, the accuracy of the initial guess becomes much less important.
- Heikki
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6a2f236..32aebb9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -223,6 +223,82 @@ ExecEndHash(HashState *node)
 	ExecEndNode(outerPlan);
 }
 
+/*
+ * Rehash
+ *		Double the number of in-memory hash buckets, redistributing the
+ *		tuples already inserted.
+ *
+ * nbuckets_inmem is always a power of 2, so each tuple in old bucket i
+ * either stays put or moves to bucket i + nbuckets_old, depending on the
+ * next-higher bit of its hash value.
+ *
+ * NOTE(review): log2_nbuckets is not updated here; when nbatch > 1 the
+ * extra bucket-number bits overlap the batch-number bits computed in
+ * ExecHashGetBucketAndBatch.  Within a single batch those bits are
+ * constant, so growing past the original nbuckets cannot spread tuples
+ * any further in multi-batch joins -- confirm this is acceptable.
+ */
+static void
+Rehash(HashJoinTable hashtable)
+{
+	int			nbuckets_old = hashtable->nbuckets_inmem;
+	int			nbuckets_new;
+	uint32		mask;
+	int			i;
+
+	/*
+	 * Stop doubling before nbuckets_old * 2 can overflow a signed int:
+	 * 2^30 * 2 == 2^31 does not fit, so the test must be >=, not >.
+	 * (repalloc's MaxAllocSize will in practice stop us well before this.)
+	 */
+	if (nbuckets_old >= (1 << 30))
+		return;
+
+	nbuckets_new = nbuckets_old * 2;
+	hashtable->buckets = (HashJoinTuple *)
+		repalloc(hashtable->buckets, nbuckets_new * sizeof(HashJoinTuple));
+	/* zero only the newly-added second half of the bucket array */
+	memset(((char *) hashtable->buckets) + nbuckets_old * sizeof(HashJoinTuple),
+		   0, nbuckets_old * sizeof(HashJoinTuple));
+	hashtable->nbuckets_inmem = nbuckets_new;
+
+	/* this hash bit decides whether a tuple moves to the new half */
+	mask = nbuckets_old;
+
+	for (i = 0; i < nbuckets_old; i++)
+	{
+		int			newbucketno = i + nbuckets_old;
+		HashJoinTuple prevtuple;
+		HashJoinTuple tuple;
+
+		prevtuple = NULL;
+		tuple = hashtable->buckets[i];
+
+		while (tuple != NULL)
+		{
+			/* save link in case we unlink this tuple from the chain */
+			HashJoinTuple nexttuple = tuple->next;
+
+			if ((tuple->hashvalue & mask) != 0)
+			{
+				/* move to the new bucket */
+				tuple->next = hashtable->buckets[newbucketno];
+				hashtable->buckets[newbucketno] = tuple;
+
+				if (prevtuple)
+					prevtuple->next = nexttuple;
+				else
+					hashtable->buckets[i] = nexttuple;
+			}
+			else
+			{
+				/* keep in this bucket */
+				prevtuple = tuple;
+			}
+			tuple = nexttuple;
+		}
+	}
+}
+
 /* ----------------------------------------------------------------
  *		ExecHashTableCreate
  *
@@ -271,6 +347,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_inmem = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
@@ -285,6 +361,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
+	hashtable->inmemTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -612,7 +690,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;
 
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		HashJoinTuple prevtuple;
 		HashJoinTuple tuple;
@@ -734,6 +812,12 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
 
+		hashtable->inmemTuples += 1;
+		/* widen before multiplying: the int product can overflow */
+		if (hashtable->inmemTuples >
+			(uint64) hashtable->nbuckets_inmem * NTUP_PER_BUCKET * 2)
+			Rehash(hashtable);
+
 		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
@@ -873,18 +957,18 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  int *bucketno,
 						  int *batchno)
 {
-	uint32		nbuckets = (uint32) hashtable->nbuckets;
+	uint32		nbuckets_inmem = (uint32) hashtable->nbuckets_inmem;
 	uint32		nbatch = (uint32) hashtable->nbatch;
 
 	if (nbatch > 1)
 	{
 		/* we can do MOD by masking, DIV by shifting */
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
 	}
 	else
 	{
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = 0;
 	}
 }
@@ -995,7 +1079,7 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
 		 */
 		if (hashTuple != NULL)
 			hashTuple = hashTuple->next;
-		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets_inmem)
 		{
 			hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
 			hjstate->hj_CurBucketNo++;
@@ -1052,7 +1136,7 @@ void
 ExecHashTableReset(HashJoinTable hashtable)
 {
 	MemoryContext oldcxt;
-	int			nbuckets = hashtable->nbuckets;
+	int			nbuckets_inmem = hashtable->nbuckets_inmem;
 
 	/*
 	 * Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1063,9 +1147,10 @@ ExecHashTableReset(HashJoinTable hashtable)
 
 	/* Reallocate and reinitialize the hash bucket headers. */
 	hashtable->buckets = (HashJoinTuple *)
-		palloc0(nbuckets * sizeof(HashJoinTuple));
+		palloc0(nbuckets_inmem * sizeof(HashJoinTuple));
 
 	hashtable->spaceUsed = 0;
+	hashtable->inmemTuples = 0;
 
 	MemoryContextSwitchTo(oldcxt);
 }
@@ -1081,7 +1166,7 @@ ExecHashTableResetMatchFlags(HashJoinTable hashtable)
 	int			i;
 
 	/* Reset all flags in the main table ... */
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		for (tuple = hashtable->buckets[i]; tuple != NULL; tuple = tuple->next)
 			HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 8654066..1f10863 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
+	int			nbuckets_inmem; /* current # buckets; doubled by Rehash() */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -130,6 +131,8 @@ typedef struct HashJoinTableData
 
 	double		totalTuples;	/* # tuples obtained from inner plan */
 
+	uint64		inmemTuples;	/* # tuples currently in the in-memory table */
+
 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
 	 * nbatch > 1.  A file is opened only when we first write a tuple into it
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers