On 19.7.2014 20:24, Tomas Vondra wrote: > On 13.7.2014 21:32, Tomas Vondra wrote: >> The current patch only implements this for tuples in the main >> hash table, not for skew buckets. I plan to do that, but it will >> require separate chunks for each skew bucket (so we can remove it >> without messing with all of them). The chunks for skew buckets >> should be smaller (I'm thinking about ~4kB), because there'll be >> more of them, but OTOH those buckets are for frequent values so the >> chunks should not remain empty. > > I've looked into extending the dense allocation to the skew buckets, > and I think we shouldn't do that. I got about 50% of the changes and > then just threw it out because it turned out quite pointless. > > The amount of memory for skew buckets is limited to 2% of work mem, > so even with 100% overhead it'll use ~4% of work mem. So there's > pretty much nothing to gain here. So the additional complexity > introduced by the dense allocation seems pretty pointless. > > I'm not entirely happy with the code allocating some memory densely > and some using traditional palloc, but it certainly seems cleaner > than the code I had. > > So I think the patch is mostly OK as is.
Attached is v4 of the patch, mostly with minor improvements and cleanups (removed MemoryContextStats and the code related to skew buckets). regards, Tomas
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 589b2f1..6a9d956 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -47,6 +47,7 @@ static void ExecHashSkewTableInsert(HashJoinTable hashtable, int bucketNumber); static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable); +static char * chunk_alloc(HashJoinTable hashtable, int tupleSize); /* ---------------------------------------------------------------- * ExecHash @@ -129,7 +130,7 @@ MultiExecHash(HashState *node) /* must provide our own instrumentation support */ if (node->ps.instrument) InstrStopNode(node->ps.instrument, hashtable->totalTuples); - + /* * We do not return the hash table directly because it's not a subtype of * Node, and so would violate the MultiExecProcNode API. Instead, our @@ -223,7 +224,6 @@ ExecEndHash(HashState *node) ExecEndNode(outerPlan); } - /* ---------------------------------------------------------------- * ExecHashTableCreate * @@ -294,6 +294,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) hashtable->spaceAllowedSkew = hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100; + hashtable->chunks_batch = NULL; + /* * Get info about the hash functions to be used for each hash key. Also * remember whether the join operators are strict. 
@@ -556,10 +558,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) int oldnbatch = hashtable->nbatch; int curbatch = hashtable->curbatch; int nbatch; - int i; MemoryContext oldcxt; long ninmemory; long nfreed; + HashChunk oldchunks = hashtable->chunks_batch; /* do nothing if we've decided to shut off growth */ if (!hashtable->growEnabled) @@ -612,51 +614,70 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) */ ninmemory = nfreed = 0; - for (i = 0; i < hashtable->nbuckets; i++) - { - HashJoinTuple prevtuple; - HashJoinTuple tuple; + /* we're going to scan through the chunks directly, not buckets, so we can reset the + * buckets and reinsert all the tuples there - just set them to NULL */ + memset(hashtable->buckets, 0, sizeof(void*) * hashtable->nbuckets); - prevtuple = NULL; - tuple = hashtable->buckets[i]; + /* reset the chunks (we have a copy in oldchunks) - this way we'll allocate */ + hashtable->chunks_batch = NULL; - while (tuple != NULL) - { - /* save link in case we delete */ - HashJoinTuple nexttuple = tuple->next; - int bucketno; - int batchno; + /* so, let's scan through the old chunks, and all tuples in each chunk */ + while (oldchunks != NULL) { + + /* pointer to the next memory chunk (may be NULL for the last chunk) */ + HashChunk nextchunk = oldchunks->next; + + /* position within the buffer (up to oldchunks->used) */ + size_t idx = 0; + + /* process all tuples stored in this chunk (and then free it) */ + while (idx < oldchunks->used) { + + /* get the hashjoin tuple and mintuple stored right after it */ + HashJoinTuple hashTuple = (HashJoinTuple)(oldchunks->data + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + + int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len); + + int bucketno; + int batchno; ninmemory++; - ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue, + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, &bucketno, &batchno); - Assert(bucketno == i); + if (batchno == curbatch) { - /* keep tuple */ - 
prevtuple = tuple; + /* keep tuple - this means we need to copy it into the new chunks */ + HashJoinTuple copyTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize); + memcpy(copyTuple, hashTuple, hashTupleSize); + + /* and of course add it to the new buckets - just push the copy it onto the front + * of the bucket's list */ + copyTuple->next = hashtable->buckets[bucketno]; + hashtable->buckets[bucketno] = copyTuple; } else { /* dump it out */ Assert(batchno > curbatch); - ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple), - tuple->hashvalue, + ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), + hashTuple->hashvalue, &hashtable->innerBatchFile[batchno]); - /* and remove from hash table */ - if (prevtuple) - prevtuple->next = nexttuple; - else - hashtable->buckets[i] = nexttuple; - /* prevtuple doesn't change */ - hashtable->spaceUsed -= - HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len; - pfree(tuple); + + hashtable->spaceUsed -= hashTupleSize; nfreed++; } - tuple = nexttuple; + /* bytes occupied in memory HJ tuple overhead + actual tuple length */ + idx += hashTupleSize; + } + + /* so, we're done with this chunk - free it and proceed to the next one */ + pfree(oldchunks); + oldchunks = nextchunk; + } #ifdef HJDEBUG @@ -717,8 +738,8 @@ ExecHashTableInsert(HashJoinTable hashtable, /* Create the HashJoinTuple */ hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; - hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, - hashTupleSize); + hashTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize); + hashTuple->hashvalue = hashvalue; memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); @@ -1068,6 +1089,11 @@ ExecHashTableReset(HashJoinTable hashtable) hashtable->spaceUsed = 0; MemoryContextSwitchTo(oldcxt); + + /* reset the chunks too (the memory was allocated within batchCxt, so it's + * already freed by resetting the batch context -just set it to NULL) */ + hashtable->chunks_batch = NULL; + } /* @@ -1318,6 +1344,61 @@ 
ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) return INVALID_SKEW_BUCKET_NO; } +static +char * chunk_alloc(HashJoinTable hashtable, int size) { + + /* maybe we should use MAXALIGN(size) here ... */ + + /* if tuple size is larger than of 1/8 of chunk size, allocate a separate chunk */ + if (size > (HASH_CHUNK_SIZE/8)) { + + /* allocate new chunk and put it at the beginning of the list */ + HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + size); + + newChunk->maxlen = size; + newChunk->used = 0; + newChunk->ntuples = 0; + + /* If there already is a chunk, add if after it so we can still use the space in it */ + if (hashtable->chunks_batch != NULL) { + newChunk->next = hashtable->chunks_batch->next; + hashtable->chunks_batch->next = newChunk; + } else { + newChunk->next = hashtable->chunks_batch; + hashtable->chunks_batch = newChunk; + } + + newChunk->used += size; + newChunk->ntuples += 1; + + return newChunk->data; + + } + + /* ok, it's within chunk size, let's see if we have enough space for it in the current + * chunk => if not, allocate a new chunk (chunks==NULL means this is the first chunk) */ + if ((hashtable->chunks_batch == NULL) || (hashtable->chunks_batch->maxlen - hashtable->chunks_batch->used) < size) { + + /* allocate new chunk and put it at the beginning of the list */ + HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + HASH_CHUNK_SIZE); + + newChunk->maxlen = HASH_CHUNK_SIZE; + newChunk->used = 0; + newChunk->ntuples = 0; + + newChunk->next = hashtable->chunks_batch; + hashtable->chunks_batch = newChunk; + } + + /* OK, we have enough space in the chunk, let's add the tuple */ + hashtable->chunks_batch->used += size; + hashtable->chunks_batch->ntuples += 1; + + /* allocate pointer to the start of the tuple memory */ + return hashtable->chunks_batch->data + (hashtable->chunks_batch->used - size); + +} + /* * 
ExecHashSkewTableInsert * @@ -1340,6 +1421,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable, hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, hashTupleSize); + hashTuple->hashvalue = hashvalue; memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 3beae40..c80a4d1 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -102,6 +102,22 @@ typedef struct HashSkewBucket #define SKEW_WORK_MEM_PERCENT 2 #define SKEW_MIN_OUTER_FRACTION 0.01 +#define HASH_CHUNK_SIZE (16*1024L) /* 16kB chunks by default */ +#define HASH_SKEW_CHUNK_SIZE (4*1024L) /* 4kB chunks for skew buckets */ + +typedef struct HashChunkData +{ + int ntuples; /* number of tuples stored in this chunk */ + size_t maxlen; /* length of the buffer */ + size_t used; /* number of chunk bytes already used */ + + struct HashChunkData *next; /* pointer to the next chunk (linked list) */ + + char data[1]; /* buffer allocated at the end */ +} HashChunkData; + +typedef struct HashChunkData* HashChunk; + typedef struct HashJoinTableData { @@ -157,6 +173,10 @@ typedef struct HashJoinTableData MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */ + + /* used for dense allocation of tuples (into linked chunks) */ + HashChunk chunks_batch; /* one list for the whole batch */ + } HashJoinTableData; #endif /* HASHJOIN_H */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers