On 22.06.2013 19:19, Simon Riggs wrote:
So I think that (2) is the best route: Given that we know with much better certainty the number of rows in the scanned-relation, we should be able to examine our hash table after it has been built and decide whether it would be cheaper to rebuild the hash table with the right number of buckets, or continue processing with what we have now. Which is roughly what Heikki proposed already, in January.
Back in January, I wrote a quick patch to experiment with rehashing when the hash table becomes too full. It was too late to make it into 9.3 so I didn't pursue it further back then, but IIRC it worked. If we have the capability to rehash, the accuracy of the initial guess becomes much less important.
- Heikki
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6a2f236..32aebb9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -223,6 +223,82 @@ ExecEndHash(HashState *node)
 	ExecEndNode(outerPlan);
 }
 
+/*
+ * Rehash
+ *		Double the number of in-memory hash buckets, redistributing the
+ *		tuples already inserted.
+ *
+ * nbuckets_inmem is always a power of 2, so each tuple in old bucket i
+ * either stays put or moves to bucket i + nbuckets_old, depending on the
+ * next-higher bit of its hash value.
+ *
+ * NOTE(review): log2_nbuckets is not updated here; when nbatch > 1 the
+ * extra bucket-number bits overlap the batch-number bits computed in
+ * ExecHashGetBucketAndBatch.  Within a single batch those bits are
+ * constant, so growing past the original nbuckets cannot spread tuples
+ * any further in multi-batch joins -- confirm this is acceptable.
+ */
+static void
+Rehash(HashJoinTable hashtable)
+{
+	int			nbuckets_old = hashtable->nbuckets_inmem;
+	int			nbuckets_new;
+	uint32		mask;
+	int			i;
+
+	/*
+	 * Stop doubling before nbuckets_old * 2 can overflow a signed int:
+	 * 2^30 * 2 == 2^31 does not fit, so the test must be >=, not >.
+	 * (repalloc's MaxAllocSize will in practice stop us well before this.)
+	 */
+	if (nbuckets_old >= (1 << 30))
+		return;
+
+	nbuckets_new = nbuckets_old * 2;
+	hashtable->buckets = (HashJoinTuple *)
+		repalloc(hashtable->buckets, nbuckets_new * sizeof(HashJoinTuple));
+	/* zero only the newly-added second half of the bucket array */
+	memset(((char *) hashtable->buckets) + nbuckets_old * sizeof(HashJoinTuple),
+		   0, nbuckets_old * sizeof(HashJoinTuple));
+	hashtable->nbuckets_inmem = nbuckets_new;
+
+	/* this hash bit decides whether a tuple moves to the new half */
+	mask = nbuckets_old;
+
+	for (i = 0; i < nbuckets_old; i++)
+	{
+		int			newbucketno = i + nbuckets_old;
+		HashJoinTuple prevtuple;
+		HashJoinTuple tuple;
+
+		prevtuple = NULL;
+		tuple = hashtable->buckets[i];
+
+		while (tuple != NULL)
+		{
+			/* save link in case we unlink this tuple from the chain */
+			HashJoinTuple nexttuple = tuple->next;
+
+			if ((tuple->hashvalue & mask) != 0)
+			{
+				/* move to the new bucket */
+				tuple->next = hashtable->buckets[newbucketno];
+				hashtable->buckets[newbucketno] = tuple;
+
+				if (prevtuple)
+					prevtuple->next = nexttuple;
+				else
+					hashtable->buckets[i] = nexttuple;
+			}
+			else
+			{
+				/* keep in this bucket */
+				prevtuple = tuple;
+			}
+			tuple = nexttuple;
+		}
+	}
+}
+
 /* ----------------------------------------------------------------
  *		ExecHashTableCreate
  *
@@ -271,6 +347,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_inmem = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
@@ -285,6 +361,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
+	hashtable->inmemTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -612,7 +690,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;
 
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		HashJoinTuple prevtuple;
 		HashJoinTuple tuple;
@@ -734,6 +812,12 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
 
+		hashtable->inmemTuples += 1;
+		/* widen before multiplying: the int product can overflow */
+		if (hashtable->inmemTuples >
+			(uint64) hashtable->nbuckets_inmem * NTUP_PER_BUCKET * 2)
+			Rehash(hashtable);
+
 		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
@@ -873,18 +957,18 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  int *bucketno,
 						  int *batchno)
 {
-	uint32		nbuckets = (uint32) hashtable->nbuckets;
+	uint32		nbuckets_inmem = (uint32) hashtable->nbuckets_inmem;
 	uint32		nbatch = (uint32) hashtable->nbatch;
 
 	if (nbatch > 1)
 	{
 		/* we can do MOD by masking, DIV by shifting */
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
 	}
 	else
 	{
-		*bucketno = hashvalue & (nbuckets - 1);
+		*bucketno = hashvalue & (nbuckets_inmem - 1);
 		*batchno = 0;
 	}
 }
@@ -995,7 +1079,7 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
 		 */
 		if (hashTuple != NULL)
 			hashTuple = hashTuple->next;
-		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets_inmem)
 		{
 			hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
 			hjstate->hj_CurBucketNo++;
@@ -1052,7 +1136,7 @@ void
 ExecHashTableReset(HashJoinTable hashtable)
 {
 	MemoryContext oldcxt;
-	int			nbuckets = hashtable->nbuckets;
+	int			nbuckets_inmem = hashtable->nbuckets_inmem;
 
 	/*
 	 * Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1063,9 +1147,10 @@ ExecHashTableReset(HashJoinTable hashtable)
 
 	/* Reallocate and reinitialize the hash bucket headers. */
 	hashtable->buckets = (HashJoinTuple *)
-		palloc0(nbuckets * sizeof(HashJoinTuple));
+		palloc0(nbuckets_inmem * sizeof(HashJoinTuple));
 
 	hashtable->spaceUsed = 0;
+	hashtable->inmemTuples = 0;
 
 	MemoryContextSwitchTo(oldcxt);
 }
@@ -1081,7 +1166,7 @@ ExecHashTableResetMatchFlags(HashJoinTable hashtable)
 	int			i;
 
 	/* Reset all flags in the main table ... */
-	for (i = 0; i < hashtable->nbuckets; i++)
+	for (i = 0; i < hashtable->nbuckets_inmem; i++)
 	{
 		for (tuple = hashtable->buckets[i]; tuple != NULL; tuple = tuple->next)
 			HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 8654066..1f10863 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
+	int			nbuckets_inmem; /* current # buckets; doubled by Rehash() */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -130,6 +131,8 @@ typedef struct HashJoinTableData
 
 	double		totalTuples;	/* # tuples obtained from inner plan */
 
+	uint64		inmemTuples;	/* # tuples currently in the in-memory table */
+
 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
 	 * nbatch > 1.  A file is opened only when we first write a tuple into it
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers