On 19.7.2014 20:24, Tomas Vondra wrote:
> On 13.7.2014 21:32, Tomas Vondra wrote:
>> The current patch only implements this for tuples in the main
>> hash table, not for skew buckets. I plan to do that, but it will
>> require separate chunks for each skew bucket (so we can remove it
>> without messing with all of them). The chunks for skew buckets
>> should be smaller (I'm thinking about ~4kB), because there'll be
>> more of them, but OTOH those buckets are for frequent values so the
>> chunks should not remain empty.
>
> I've looked into extending the dense allocation to the skew buckets,
> and I think we shouldn't do that. I got about 50% of the changes and
> then just threw it out because it turned out quite pointless.
>
> The amount of memory for skew buckets is limited to 2% of work mem,
> so even with 100% overhead it'll use ~4% of work mem. There's
> pretty much nothing to gain here, so the additional complexity
> introduced by the dense allocation seems pretty pointless.
>
> I'm not entirely happy with the code allocating some memory densely
> and some using traditional palloc, but it certainly seems cleaner
> than the code I had.
>
> So I think the patch is mostly OK as is.
Attached is v4 of the patch, mostly with minor improvements and cleanups
(removed MemoryContextStats, code related to skew buckets).
regards
Tomas
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..6a9d956 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -47,6 +47,7 @@ static void ExecHashSkewTableInsert(HashJoinTable hashtable,
int bucketNumber);
static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
+static char * chunk_alloc(HashJoinTable hashtable, int tupleSize);
/* ----------------------------------------------------------------
* ExecHash
@@ -129,7 +130,7 @@ MultiExecHash(HashState *node)
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
-
+
/*
* We do not return the hash table directly because it's not a subtype of
* Node, and so would violate the MultiExecProcNode API. Instead, our
@@ -223,7 +224,6 @@ ExecEndHash(HashState *node)
ExecEndNode(outerPlan);
}
-
/* ----------------------------------------------------------------
* ExecHashTableCreate
*
@@ -294,6 +294,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->spaceAllowedSkew =
hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+ hashtable->chunks_batch = NULL;
+
/*
* Get info about the hash functions to be used for each hash key. Also
* remember whether the join operators are strict.
@@ -556,10 +558,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
int oldnbatch = hashtable->nbatch;
int curbatch = hashtable->curbatch;
int nbatch;
- int i;
MemoryContext oldcxt;
long ninmemory;
long nfreed;
+ HashChunk oldchunks = hashtable->chunks_batch;
/* do nothing if we've decided to shut off growth */
if (!hashtable->growEnabled)
@@ -612,51 +614,70 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
- for (i = 0; i < hashtable->nbuckets; i++)
- {
- HashJoinTuple prevtuple;
- HashJoinTuple tuple;
+ /* we're going to scan through the chunks directly, not buckets, so we can reset the
+ * buckets and reinsert all the tuples there - just set them to NULL */
+ memset(hashtable->buckets, 0, sizeof(void*) * hashtable->nbuckets);
- prevtuple = NULL;
- tuple = hashtable->buckets[i];
+ /* reset the chunks (we have a copy in oldchunks) - this way we'll allocate */
+ hashtable->chunks_batch = NULL;
- while (tuple != NULL)
- {
- /* save link in case we delete */
- HashJoinTuple nexttuple = tuple->next;
- int bucketno;
- int batchno;
+ /* so, let's scan through the old chunks, and all tuples in each chunk */
+ while (oldchunks != NULL) {
+
+ /* pointer to the next memory chunk (may be NULL for the last chunk) */
+ HashChunk nextchunk = oldchunks->next;
+
+ /* position within the buffer (up to oldchunks->used) */
+ size_t idx = 0;
+
+ /* process all tuples stored in this chunk (and then free it) */
+ while (idx < oldchunks->used) {
+
+ /* get the hashjoin tuple and mintuple stored right after it */
+ HashJoinTuple hashTuple = (HashJoinTuple)(oldchunks->data + idx);
+ MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+
+ int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
+
+ int bucketno;
+ int batchno;
ninmemory++;
- ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
&bucketno, &batchno);
- Assert(bucketno == i);
+
if (batchno == curbatch)
{
- /* keep tuple */
- prevtuple = tuple;
+ /* keep tuple - this means we need to copy it into the new chunks */
+ HashJoinTuple copyTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize);
+ memcpy(copyTuple, hashTuple, hashTupleSize);
+
+ /* and of course add it to the new buckets - just push the copy it onto the front
+ * of the bucket's list */
+ copyTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = copyTuple;
}
else
{
/* dump it out */
Assert(batchno > curbatch);
- ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
- tuple->hashvalue,
+ ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
+ hashTuple->hashvalue,
&hashtable->innerBatchFile[batchno]);
- /* and remove from hash table */
- if (prevtuple)
- prevtuple->next = nexttuple;
- else
- hashtable->buckets[i] = nexttuple;
- /* prevtuple doesn't change */
- hashtable->spaceUsed -=
- HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
- pfree(tuple);
+
+ hashtable->spaceUsed -= hashTupleSize;
nfreed++;
}
- tuple = nexttuple;
+ /* bytes occupied in memory HJ tuple overhead + actual tuple length */
+ idx += hashTupleSize;
+
}
+
+ /* so, we're done with this chunk - free it and proceed to the next one */
+ pfree(oldchunks);
+ oldchunks = nextchunk;
+
}
#ifdef HJDEBUG
@@ -717,8 +738,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
/* Create the HashJoinTuple */
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
- hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
- hashTupleSize);
+ hashTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize);
+
hashTuple->hashvalue = hashvalue;
memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
@@ -1068,6 +1089,11 @@ ExecHashTableReset(HashJoinTable hashtable)
hashtable->spaceUsed = 0;
MemoryContextSwitchTo(oldcxt);
+
+ /* reset the chunks too (the memory was allocated within batchCxt, so it's
+ * already freed by resetting the batch context -just set it to NULL) */
+ hashtable->chunks_batch = NULL;
+
}
/*
@@ -1318,6 +1344,61 @@ ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
return INVALID_SKEW_BUCKET_NO;
}
+static
+char * chunk_alloc(HashJoinTable hashtable, int size) {
+
+	/* Round the request up to a MAXALIGN boundary, so that the tuple placed
+	 * right after this one in the chunk starts at a properly aligned address. */
+	size = MAXALIGN(size);
+
+	/* if tuple size is larger than 1/8 of chunk size, allocate a separate chunk */
+	if (size > (HASH_CHUNK_SIZE/8)) {
+
+		/* allocate a dedicated chunk, sized to hold just this one tuple */
+		HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + size);
+
+		newChunk->maxlen = size;
+		newChunk->used = 0;
+		newChunk->ntuples = 0;
+
+		/* If there already is a chunk, add it after it so we can still use the space in it */
+		if (hashtable->chunks_batch != NULL) {
+			newChunk->next = hashtable->chunks_batch->next;
+			hashtable->chunks_batch->next = newChunk;
+		} else {
+			newChunk->next = hashtable->chunks_batch;
+			hashtable->chunks_batch = newChunk;
+		}
+
+		newChunk->used += size;
+		newChunk->ntuples += 1;
+
+		return newChunk->data;
+	}
+
+	/* ok, it's within chunk size, let's see if we have enough space for it in the current
+	 * chunk => if not, allocate a new chunk (chunks==NULL means this is the first chunk) */
+	if ((hashtable->chunks_batch == NULL) || (hashtable->chunks_batch->maxlen - hashtable->chunks_batch->used) < size) {
+
+		/* allocate new chunk and put it at the beginning of the list */
+		HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + HASH_CHUNK_SIZE);
+
+		newChunk->maxlen = HASH_CHUNK_SIZE;
+		newChunk->used = 0;
+		newChunk->ntuples = 0;
+
+		newChunk->next = hashtable->chunks_batch;
+		hashtable->chunks_batch = newChunk;
+	}
+
+	/* OK, we have enough space in the chunk, let's add the tuple */
+	hashtable->chunks_batch->used += size;
+	hashtable->chunks_batch->ntuples += 1;
+
+	/* return pointer to the start of the tuple memory */
+	return hashtable->chunks_batch->data + (hashtable->chunks_batch->used - size);
+}
+
/*
* ExecHashSkewTableInsert
*
@@ -1340,6 +1421,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable,
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
hashTupleSize);
+
hashTuple->hashvalue = hashvalue;
memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 3beae40..c80a4d1 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -102,6 +102,22 @@ typedef struct HashSkewBucket
#define SKEW_WORK_MEM_PERCENT 2
#define SKEW_MIN_OUTER_FRACTION 0.01
+#define HASH_CHUNK_SIZE (16*1024L) /* 16kB chunks by default */
+#define HASH_SKEW_CHUNK_SIZE (4*1024L) /* 4kB chunks for skew buckets */
+
+/*
+ * HashChunkData - a contiguous slab used for dense allocation of hashjoin
+ * tuples.  Tuples are appended into 'data'; chunks form a singly-linked
+ * list hanging off the hash table (see chunks_batch in HashJoinTableData).
+ */
+typedef struct HashChunkData
+{
+	int ntuples;	/* number of tuples stored in this chunk */
+	size_t maxlen;	/* length of the buffer */
+	size_t used;	/* number of chunk bytes already used */
+
+	struct HashChunkData *next;	/* pointer to the next chunk (linked list) */
+
+	char data[1];	/* buffer allocated at the end */
+} HashChunkData;
+
+typedef struct HashChunkData* HashChunk;
+
typedef struct HashJoinTableData
{
@@ -157,6 +173,10 @@ typedef struct HashJoinTableData
MemoryContext hashCxt; /* context for whole-hash-join storage */
MemoryContext batchCxt; /* context for this-batch-only storage */
+
+ /* used for dense allocation of tuples (into linked chunks) */
+ HashChunk chunks_batch; /* one list for the whole batch */
+
} HashJoinTableData;
#endif /* HASHJOIN_H */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers