On Mon, 2025-01-06 at 12:34 -0800, Jeff Davis wrote:
> If we separate out 4, we can use the Bump allocator for 2 & 3.
Attached is a POC patch, which reduces memory usage by ~15% for a simple
distinct query on an integer key. Performance is the same, or perhaps a
hair faster.
It's not many lines of code, but the surrounding code might benefit
from some refactoring that would make it a bit simpler.
Regards,
Jeff Davis
From 3ed8fcf2db77b51cf665125ff5c83acd3e87a816 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 8 Jan 2025 11:46:35 -0800
Subject: [PATCH v1] HashAgg: use Bump context for hash table entries.
---
src/backend/executor/nodeAgg.c | 32 +++++++++++++++++++++++++-------
src/include/nodes/execnodes.h | 3 ++-
2 files changed, 27 insertions(+), 8 deletions(-)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 3005b5c0e3b..695091c7c46 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1503,7 +1503,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
AggStatePerHash perhash = &aggstate->perhash[setno];
MemoryContext metacxt = aggstate->hash_metacxt;
- MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+ MemoryContext tablecxt = aggstate->hash_tablecxt;
MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
Size additionalsize;
@@ -1529,7 +1529,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
nbuckets,
additionalsize,
metacxt,
- hashcxt,
+ tablecxt,
tmpcxt,
DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
@@ -1858,15 +1858,18 @@ hash_agg_check_limits(AggState *aggstate)
uint64 ngroups = aggstate->hash_ngroups_current;
Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
true);
- Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
- true);
+ Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+ true);
+ Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+ true);
+ Size total_mem = meta_mem + entry_mem + tval_mem;
/*
* Don't spill unless there's at least one group in the hash table so we
* can be sure to make progress even in edge cases.
*/
if (aggstate->hash_ngroups_current > 0 &&
- (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+ (total_mem > aggstate->hash_mem_limit ||
ngroups > aggstate->hash_ngroups_limit))
{
hash_agg_enter_spill_mode(aggstate);
@@ -1917,6 +1920,7 @@ static void
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
{
Size meta_mem;
+ Size entry_mem;
Size hashkey_mem;
Size buffer_mem;
Size total_mem;
@@ -1928,7 +1932,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
/* memory for the hash table itself */
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
- /* memory for the group keys and transition states */
+ /* memory for hash entries */
+ entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+
+ /* memory for byref transition states */
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
/* memory for read/write tape buffers, if spilled */
@@ -1937,7 +1944,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
/* update peak mem */
- total_mem = meta_mem + hashkey_mem + buffer_mem;
+ total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
if (total_mem > aggstate->hash_mem_peak)
aggstate->hash_mem_peak = total_mem;
@@ -2622,6 +2629,7 @@ agg_refill_hash_table(AggState *aggstate)
/* free memory and reset hash tables */
ReScanExprContext(aggstate->hashcontext);
+ MemoryContextReset(aggstate->hash_tablecxt);
for (int setno = 0; setno < aggstate->num_hashes; setno++)
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
@@ -3587,6 +3595,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
"HashAgg meta context",
ALLOCSET_DEFAULT_SIZES);
+ aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg table context",
+ ALLOCSET_DEFAULT_SIZES);
aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
&TTSOpsMinimalTuple);
aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4331,6 +4342,12 @@ ExecEndAgg(AggState *node)
MemoryContextDelete(node->hash_metacxt);
node->hash_metacxt = NULL;
}
+ if (node->hash_tablecxt != NULL)
+ {
+ MemoryContextDelete(node->hash_tablecxt);
+ node->hash_tablecxt = NULL;
+ }
+
for (transno = 0; transno < node->numtrans; transno++)
{
@@ -4447,6 +4464,7 @@ ExecReScanAgg(AggState *node)
node->hash_ngroups_current = 0;
ReScanExprContext(node->hashcontext);
+ MemoryContextReset(node->hash_tablecxt);
/* Rebuild an empty hash table */
build_hash_tables(node);
node->table_filled = false;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b3f7aa299f5..434adf2300c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2532,7 +2532,8 @@ typedef struct AggState
/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
bool table_filled; /* hash table filled yet? */
int num_hashes;
- MemoryContext hash_metacxt; /* memory for hash table itself */
+ MemoryContext hash_metacxt; /* memory for hash table bucket array */
+ MemoryContext hash_tablecxt; /* memory for hash table entries */
struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */
struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set,
* exists only during first pass */
--
2.34.1