On 26.6.2014 20:43, Tomas Vondra wrote:
> Attached is v2 of the patch, with some cleanups / minor improvements:
>
> * there's a single FIXME, related to counting tuples in the current batch
Meh, I couldn't resist resolving this FIXME, so attached is v3 of the
patch. It adds a proper 'batch tuples' counter to the hash table,
tracking just the tuples kept in memory for the current batch, which is
what the decision to grow the buckets needs to be based on.
All comments, measurements on different queries etc. are welcome. We'll
certainly do a lot of testing on our side, because this was a big issue
for us.
regards
Tomas
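
PS: A quick note on the NTUP_GROW_THRESHOLD constant introduced by the
patch, since the derivation in the comment is a bit terse. If the fill
factor (average number of tuples per bucket, in units of
NTUP_PER_BUCKET) oscillates between the threshold 'b' right before
growing and 'b/g' right after growing the bucket count by a factor of
'g', then requiring the two extremes to average 1.0 gives b = 2g/(g+1).
A trivial standalone program (not part of the patch, just a sanity
check of the arithmetic):

#include <stdio.h>

/*
 * Sanity check of the NTUP_GROW_THRESHOLD derivation: with a bucket
 * growth factor g, the fill factor oscillates between the threshold b
 * (right before growing) and b/g (right after), and requiring the
 * average of the two to be 1.0 gives (b/g + b)/2 = 1, so b = 2g/(g+1).
 */
int
main(void)
{
    int     g;

    for (g = 2; g <= 8; g *= 2)
        printf("growth factor %d -> threshold %.4f\n",
               g, 2.0 * g / (g + 1));

    return 0;
}

This prints 1.3333 for doubling (g=2), 1.6000 for g=4 and 1.7778 for
g=8, matching the values mentioned in the comment. There's also a
made-up sample of the new EXPLAIN output at the very end of this mail.
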
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0d9663c..db3a953 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1900,18 +1900,20 @@ show_hash_info(HashState *hashstate, ExplainState *es)
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+ ExplainPropertyLong("Original Hash Buckets",
+ hashtable->nbuckets_original, es);
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
ExplainPropertyLong("Original Hash Batches",
hashtable->nbatch_original, es);
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
}
- else if (hashtable->nbatch_original != hashtable->nbatch)
+ else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+ (hashtable->nbuckets_original != hashtable->nbuckets))
{
appendStringInfoSpaces(es->str, es->indent * 2);
appendStringInfo(es->str,
- "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
- hashtable->nbuckets, hashtable->nbatch,
- hashtable->nbatch_original, spacePeakKb);
+ "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
+ hashtable->nbuckets, hashtable->nbuckets_original,
+ hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
}
else
{
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..b32fb6b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -271,6 +272,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
@@ -285,6 +287,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->batchTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -386,6 +389,22 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
/* Target bucket loading (tuples per bucket) */
#define NTUP_PER_BUCKET 10
+/*
+ * Multiple of NTUP_PER_BUCKET that triggers an increase of nbuckets.
+ *
+ * The "fill factor" here is the average number of tuples per bucket,
+ * measured in units of NTUP_PER_BUCKET. Once it reaches the threshold
+ * 'b' we double the number of buckets, which drops it to 'a = b/2',
+ * and we want it to oscillate around 1.0 (i.e. around NTUP_PER_BUCKET
+ * tuples per bucket). So these two equations should hold:
+ *
+ * b = 2a (doubling the buckets)
+ * (a + b)/2 = 1 (oscillate around NTUP_PER_BUCKET)
+ *
+ * which gives b = 4/3 = 1.3333. If we wanted a higher threshold, we
+ * could use a larger growth factor 'g' (b = g*a), which by the same
+ * reasoning gives b = 2g/(g+1): growing to (4*nbuckets) leads to
+ * b = 1.6, growing to (8*nbuckets) gives b = 1.7777, etc.
+ *
+ * Let's start with doubling the bucket count, i.e. threshold 1.333.
+ */
+#define NTUP_GROW_THRESHOLD 1.333
+
void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
int *numbuckets,
@@ -651,6 +670,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
/* prevtuple doesn't change */
hashtable->spaceUsed -=
HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
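+ /* the tuple was dumped to a batch file, so it's no longer in memory */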
+ hashtable->batchTuples -= 1;
pfree(tuple);
nfreed++;
}
@@ -682,6 +702,92 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
}
/*
+ * ExecHashIncreaseNumBuckets
+ * increase the number of buckets, in order to reduce the average
+ * number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ int i;
+ int ntuples = 0;
+ int oldnbuckets = hashtable->nbuckets;
+ HashJoinTuple *oldbuckets = hashtable->buckets;
+ MemoryContext oldcxt;
+
+ /* XXX Not sure if we should update the info about used space here.
+ * The existing code ignores the space used for 'buckets', and we're
+ * not allocating more space for tuples (just shuffling them into the
+ * new buckets). Also, the amount of memory used for buckets is quite
+ * small (just an array of pointers, i.e. ~8kB per 1k buckets on a
+ * 64-bit platform). */
+
+ /* XXX Should we disable growth once (nbuckets * NTUP_PER_BUCKET)
+ * reaches work_mem (or something like that)? We shouldn't really get
+ * into such a position, though; it should be handled by increasing
+ * the number of batches, which happens right before this. */
+
+ /* XXX Maybe adding more info to the hashjoin EXPLAIN output (e.g.
+ * the initial nbuckets, or the time spent growing the table) would
+ * be appropriate. */
+
+ /* update the hashtable info, so that we can compute buckets etc. */
+ hashtable->log2_nbuckets += 1;
+ hashtable->nbuckets *= 2;
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+ printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+ /* TODO Maybe it'd be better to resize the buckets in place (should be possible,
+ * but when I tried it I always ended up with a strange infinite loop). */
+
+ /* allocate a new bucket list (use the batch context as before) */
+ oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
+
+ hashtable->buckets = (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ MemoryContextSwitchTo(oldcxt);
+
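+ /* Since nbuckets has doubled (and is a power of 2), each tuple either
+ * stays in its bucket or moves to (old bucketno + oldnbuckets). We
+ * recompute the bucket from the hashvalue stashed in the tuple, so
+ * there's no need to rehash the actual keys. */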
+ /* walk through the old buckets, move the tuples into the new buckets */
+ for (i = 0; i < oldnbuckets; i++)
+ {
+ HashJoinTuple tuple = oldbuckets[i];
+
+ while (tuple != NULL)
+ {
+ /* save link in case we delete */
+ HashJoinTuple nexttuple = tuple->next;
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* move it to the correct bucket */
+ tuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = tuple;
+
+ /* process the next tuple */
+ tuple = nexttuple;
+
+ ntuples++;
+ }
+ }
+
+ pfree(oldbuckets);
+
+#ifdef HJDEBUG
+ printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+ hashtable->nbuckets, (float)ntuples / hashtable->nbuckets);
+#endif
+}
+
+
+/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
@@ -734,12 +840,28 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashTuple->next = hashtable->buckets[bucketno];
hashtable->buckets[bucketno] = hashTuple;
+ /* increase the number of tuples in the batch */
+ hashtable->batchTuples += 1;
+
/* Account for space used, and back off if we've used too much */
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
if (hashtable->spaceUsed > hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);
+
+ /*
+ * If the average number of tuples per bucket exceeds the growth
+ * threshold, double the number of buckets. Note this is based on
+ * batchTuples (tuples kept in memory for the current batch), not
+ * totalTuples, as only the in-memory tuples determine the bucket
+ * load.
+ */
+ if (((float) hashtable->batchTuples / hashtable->nbuckets) >
+ (NTUP_GROW_THRESHOLD * NTUP_PER_BUCKET))
+ {
+#ifdef HJDEBUG
+ printf("Increasing nbuckets from %d because average per bucket = %.1f\n",
+ hashtable->nbuckets,
+ (float) hashtable->batchTuples / hashtable->nbuckets);
+#endif
+ ExecHashIncreaseNumBuckets(hashtable);
+ }
+
}
else
{
@@ -1066,6 +1188,7 @@ ExecHashTableReset(HashJoinTable hashtable)
palloc0(nbuckets * sizeof(HashJoinTuple));
hashtable->spaceUsed = 0;
+ hashtable->batchTuples = 0;
MemoryContextSwitchTo(oldcxt);
}
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 3beae40..2858f82 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
typedef struct HashJoinTableData
{
int nbuckets; /* # buckets in the in-memory hash table */
+ int nbuckets_original; /* # buckets when the hash table was created */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -129,6 +130,7 @@ typedef struct HashJoinTableData
bool growEnabled; /* flag to shut off nbatch increases */
double totalTuples; /* # tuples obtained from inner plan */
+ int64 batchTuples; /* # in-memory tuples in the current batch */
/*
* These arrays are allocated for the life of the hash join, but only if
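
For illustration, this is what the new EXPLAIN (text format) line looks
like (the numbers are made up, simply plugged into the new format
string):

   Buckets: 8192 (originally 1024) Batches: 4 (originally 2) Memory Usage: 3520kB

The only change is the '(originally %d)' part after Buckets, plus the
line is now printed also when only the number of buckets changed at
runtime.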