On 19.7.2014 20:24, Tomas Vondra wrote: > On 13.7.2014 21:32, Tomas Vondra wrote: >> The current patch only implements this for tuples in the main >> hash table, not for skew buckets. I plan to do that, but it will >> require separate chunks for each skew bucket (so we can remove it >> without messing with all of them). The chunks for skew buckets >> should be smaller (I'm thinking about ~4kB), because there'll be >> more of them, but OTOH those buckets are for frequent values so the >> chunks should not remain empty. > > I've looked into extending the dense allocation to the skew buckets, > and I think we shouldn't do that. I got about 50% of the changes and > then just threw it out because it turned out quite pointless. > > The amount of memory for skew buckets is limited to 2% of work mem, > so even with 100% overhead it'll use ~4% of work mem. So there's > pretty much nothing to gain here. So the additional complexity > introduced by the dense allocation seems pretty pointless. > > I'm not entirely happy with the code allocating some memory densely > and some using traditional palloc, but it certainly seems cleaner > than the code I had. > > So I think the patch is mostly OK as is.
Attached is v4 of the patch, mostly with minor improvements and cleanups (removed MemoryContextStats and the code related to skew buckets). regards, Tomas
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 589b2f1..6a9d956 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -47,6 +47,7 @@ static void ExecHashSkewTableInsert(HashJoinTable hashtable, int bucketNumber); static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable); +static char * chunk_alloc(HashJoinTable hashtable, int tupleSize); /* ---------------------------------------------------------------- * ExecHash @@ -129,7 +130,7 @@ MultiExecHash(HashState *node) /* must provide our own instrumentation support */ if (node->ps.instrument) InstrStopNode(node->ps.instrument, hashtable->totalTuples); - + /* * We do not return the hash table directly because it's not a subtype of * Node, and so would violate the MultiExecProcNode API. Instead, our @@ -223,7 +224,6 @@ ExecEndHash(HashState *node) ExecEndNode(outerPlan); } - /* ---------------------------------------------------------------- * ExecHashTableCreate * @@ -294,6 +294,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) hashtable->spaceAllowedSkew = hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100; + hashtable->chunks_batch = NULL; + /* * Get info about the hash functions to be used for each hash key. Also * remember whether the join operators are strict. 
@@ -556,10 +558,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) int oldnbatch = hashtable->nbatch; int curbatch = hashtable->curbatch; int nbatch; - int i; MemoryContext oldcxt; long ninmemory; long nfreed; + HashChunk oldchunks = hashtable->chunks_batch; /* do nothing if we've decided to shut off growth */ if (!hashtable->growEnabled) @@ -612,51 +614,70 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) */ ninmemory = nfreed = 0; - for (i = 0; i < hashtable->nbuckets; i++) - { - HashJoinTuple prevtuple; - HashJoinTuple tuple; + /* we're going to scan through the chunks directly, not buckets, so we can reset the + * buckets and reinsert all the tuples there - just set them to NULL */ + memset(hashtable->buckets, 0, sizeof(void*) * hashtable->nbuckets); - prevtuple = NULL; - tuple = hashtable->buckets[i]; + /* reset the chunks (we have a copy in oldchunks) - this way we'll allocate */ + hashtable->chunks_batch = NULL; - while (tuple != NULL) - { - /* save link in case we delete */ - HashJoinTuple nexttuple = tuple->next; - int bucketno; - int batchno; + /* so, let's scan through the old chunks, and all tuples in each chunk */ + while (oldchunks != NULL) { + + /* pointer to the next memory chunk (may be NULL for the last chunk) */ + HashChunk nextchunk = oldchunks->next; + + /* position within the buffer (up to oldchunks->used) */ + size_t idx = 0; + + /* process all tuples stored in this chunk (and then free it) */ + while (idx < oldchunks->used) { + + /* get the hashjoin tuple and mintuple stored right after it */ + HashJoinTuple hashTuple = (HashJoinTuple)(oldchunks->data + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + + int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len); + + int bucketno; + int batchno; ninmemory++; - ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue, + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, &bucketno, &batchno); - Assert(bucketno == i); + if (batchno == curbatch) { - /* keep tuple */ - 
prevtuple = tuple; + /* keep tuple - this means we need to copy it into the new chunks */ + HashJoinTuple copyTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize); + memcpy(copyTuple, hashTuple, hashTupleSize); + + /* and of course add it to the new buckets - just push the copy it onto the front + * of the bucket's list */ + copyTuple->next = hashtable->buckets[bucketno]; + hashtable->buckets[bucketno] = copyTuple; } else { /* dump it out */ Assert(batchno > curbatch); - ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple), - tuple->hashvalue, + ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), + hashTuple->hashvalue, &hashtable->innerBatchFile[batchno]); - /* and remove from hash table */ - if (prevtuple) - prevtuple->next = nexttuple; - else - hashtable->buckets[i] = nexttuple; - /* prevtuple doesn't change */ - hashtable->spaceUsed -= - HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len; - pfree(tuple); + + hashtable->spaceUsed -= hashTupleSize; nfreed++; } - tuple = nexttuple; + /* bytes occupied in memory HJ tuple overhead + actual tuple length */ + idx += hashTupleSize; + } + + /* so, we're done with this chunk - free it and proceed to the next one */ + pfree(oldchunks); + oldchunks = nextchunk; + } #ifdef HJDEBUG @@ -717,8 +738,8 @@ ExecHashTableInsert(HashJoinTable hashtable, /* Create the HashJoinTuple */ hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; - hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, - hashTupleSize); + hashTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize); + hashTuple->hashvalue = hashvalue; memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); @@ -1068,6 +1089,11 @@ ExecHashTableReset(HashJoinTable hashtable) hashtable->spaceUsed = 0; MemoryContextSwitchTo(oldcxt); + + /* reset the chunks too (the memory was allocated within batchCxt, so it's + * already freed by resetting the batch context -just set it to NULL) */ + hashtable->chunks_batch = NULL; + } /* @@ -1318,6 +1344,61 @@ 
ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) return INVALID_SKEW_BUCKET_NO; } +static +char * chunk_alloc(HashJoinTable hashtable, int size) { + + /* maybe we should use MAXALIGN(size) here ... */ + + /* if tuple size is larger than of 1/8 of chunk size, allocate a separate chunk */ + if (size > (HASH_CHUNK_SIZE/8)) { + + /* allocate new chunk and put it at the beginning of the list */ + HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + size); + + newChunk->maxlen = size; + newChunk->used = 0; + newChunk->ntuples = 0; + + /* If there already is a chunk, add if after it so we can still use the space in it */ + if (hashtable->chunks_batch != NULL) { + newChunk->next = hashtable->chunks_batch->next; + hashtable->chunks_batch->next = newChunk; + } else { + newChunk->next = hashtable->chunks_batch; + hashtable->chunks_batch = newChunk; + } + + newChunk->used += size; + newChunk->ntuples += 1; + + return newChunk->data; + + } + + /* ok, it's within chunk size, let's see if we have enough space for it in the current + * chunk => if not, allocate a new chunk (chunks==NULL means this is the first chunk) */ + if ((hashtable->chunks_batch == NULL) || (hashtable->chunks_batch->maxlen - hashtable->chunks_batch->used) < size) { + + /* allocate new chunk and put it at the beginning of the list */ + HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + HASH_CHUNK_SIZE); + + newChunk->maxlen = HASH_CHUNK_SIZE; + newChunk->used = 0; + newChunk->ntuples = 0; + + newChunk->next = hashtable->chunks_batch; + hashtable->chunks_batch = newChunk; + } + + /* OK, we have enough space in the chunk, let's add the tuple */ + hashtable->chunks_batch->used += size; + hashtable->chunks_batch->ntuples += 1; + + /* allocate pointer to the start of the tuple memory */ + return hashtable->chunks_batch->data + (hashtable->chunks_batch->used - size); + +} + /* * 
ExecHashSkewTableInsert * @@ -1340,6 +1421,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable, hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, hashTupleSize); + hashTuple->hashvalue = hashvalue; memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 3beae40..c80a4d1 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -102,6 +102,22 @@ typedef struct HashSkewBucket #define SKEW_WORK_MEM_PERCENT 2 #define SKEW_MIN_OUTER_FRACTION 0.01 +#define HASH_CHUNK_SIZE (16*1024L) /* 16kB chunks by default */ +#define HASH_SKEW_CHUNK_SIZE (4*1024L) /* 4kB chunks for skew buckets */ + +typedef struct HashChunkData +{ + int ntuples; /* number of tuples stored in this chunk */ + size_t maxlen; /* length of the buffer */ + size_t used; /* number of chunk bytes already used */ + + struct HashChunkData *next; /* pointer to the next chunk (linked list) */ + + char data[1]; /* buffer allocated at the end */ +} HashChunkData; + +typedef struct HashChunkData* HashChunk; + typedef struct HashJoinTableData { @@ -157,6 +173,10 @@ typedef struct HashJoinTableData MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */ + + /* used for dense allocation of tuples (into linked chunks) */ + HashChunk chunks_batch; /* one list for the whole batch */ + } HashJoinTableData; #endif /* HASHJOIN_H */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers