Hi David,
>> I pushed the changes to WindowAgg so as not to call tuplestore_end()
>> on every partition. Can you rebase this patch over that change?
>>
>> It would be good to do this in a way that does not add any new state
>> to WindowAggState, you can see that I had to shuffle fields around in
>> that struct because the next_parition field would have caused the
>> struct to become larger. I've not looked closely, but I expect this
>> can be done by adding more code to tuplestore_updatemax() to also
>> track the disk space used if the current storage has gone to disk. I
>> expect the maxSpace field can be used for both, but we'd need another
>> bool field to track if the max used was by disk or memory.
I have created a patch in the direction you suggested. See attached
patch (v1-0001-Enhance-tuplestore.txt). To not confuse CFbot, the
extension is "txt", not "patch".
>> I think the performance of this would also need to be tested as it
>> means doing an lseek() on every tuplestore_clear() when we've gone to
>> disk. Probably that will be dominated by all the other overheads of a
>> tuplestore going to disk (i.e. dumptuples() etc), but it would be good
>> to check this. I suggest setting work_mem = 64 and making a test case
>> that only just spills to disk. Maybe do a few thousand partitions
>> worth of that and see if you can measure any slowdown.
I copied your shell script and slightly modified it then ran pgbench
with (1 10 100 1000 5000 10000) window partitions (see attached shell
script). In the script I set work_mem to 64kB. It seems for 10000,
1000 and 100 partitions, the performance difference seems
noises. However, for 10, 2, 1 partitions. I see large performance
degradation with the patched version: patched is slower than stock
master in 1.5% (10 partitions), 41% (2 partitions) and 55.7% (1
partition). See the attached graph.
>From 4749d2018f33e883c292eb904f3253d393a47c99 Mon Sep 17 00:00:00 2001
From: Tatsuo Ishii <[email protected]>
Date: Thu, 5 Sep 2024 20:59:01 +0900
Subject: [PATCH v1] Enhance tuplestore.
Let tuplestore_updatemax() handle both memory and disk case.
---
src/backend/utils/sort/tuplestore.c | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/src/backend/utils/sort/tuplestore.c
b/src/backend/utils/sort/tuplestore.c
index 444c8e25c2..854121fc11 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -107,9 +107,10 @@ struct Tuplestorestate
bool backward; /* store extra length words in
file? */
bool interXact; /* keep open through
transactions? */
bool truncated; /* tuplestore_trim has removed
tuples? */
+ bool inMem; /* true if maxSpace is for
memory */
int64 availMem; /* remaining memory available,
in bytes */
int64 allowedMem; /* total memory allowed, in
bytes */
- int64 maxSpace; /* maximum space used in memory
*/
+ int64 maxSpace; /* maximum space used in memory
or disk */
int64 tuples; /* number of tuples added */
BufFile *myfile; /* underlying file, or NULL if
none */
MemoryContext context; /* memory context for holding tuples */
@@ -262,6 +263,7 @@ tuplestore_begin_common(int eflags, bool interXact, int
maxKBytes)
state->eflags = eflags;
state->interXact = interXact;
state->truncated = false;
+ state->inMem = true;
state->allowedMem = maxKBytes * 1024L;
state->availMem = state->allowedMem;
state->maxSpace = 0;
@@ -1497,8 +1499,17 @@ static void
tuplestore_updatemax(Tuplestorestate *state)
{
if (state->status == TSS_INMEM)
+ {
state->maxSpace = Max(state->maxSpace,
state->allowedMem -
state->availMem);
+ state->inMem = true;
+ }
+ else
+ {
+ state->maxSpace = Max(state->maxSpace,
+
BufFileSize(state->myfile));
+ state->inMem = false;
+ }
}
/*
@@ -1509,7 +1520,7 @@ tuplestore_updatemax(Tuplestorestate *state)
const char *
tuplestore_storage_type_name(Tuplestorestate *state)
{
- if (state->status == TSS_INMEM)
+ if (state->inMem)
return "Memory";
else
return "Disk";
@@ -1517,8 +1528,7 @@ tuplestore_storage_type_name(Tuplestorestate *state)
/*
* tuplestore_space_used
- * Return the maximum space used in memory unless the tuplestore
has spilled
- * to disk, in which case, return the disk space used.
+ * Return the maximum space used in memory or disk.
*/
int64
tuplestore_space_used(Tuplestorestate *state)
@@ -1526,10 +1536,7 @@ tuplestore_space_used(Tuplestorestate *state)
/* First, update the maxSpace field */
tuplestore_updatemax(state);
- if (state->status == TSS_INMEM)
- return state->maxSpace;
- else
- return BufFileSize(state->myfile);
+ return state->maxSpace;
}
/*
@@ -1601,7 +1608,9 @@ writetup_heap(Tuplestorestate *state, void *tup)
if (state->backward) /* need trailing length word? */
BufFileWrite(state->myfile, &tuplen, sizeof(tuplen));
- /* no need to call tuplestore_updatemax() when not in TSS_INMEM */
+ /* update maxSpace */
+ tuplestore_updatemax(state);
+
FREEMEM(state, GetMemoryChunkSpace(tuple));
heap_free_minimal_tuple(tuple);
}
--
2.25.1
dbname=test
rows=10000
secs=10
ntests=3
psql -c "drop table if exists part_test;" $dbname
psql -c "create table part_test (a int not null);" $dbname
psql -c "insert into part_test select g.s from generate_series(1,$rows) g(s);"
$dbname
psql -c "vacuum freeze analyze part_test;" $dbname
psql -c "alter system set work_mem = '64kB';" $dbname
psql -c "alter system set jit = 0;" $dbname
psql -c "select pg_reload_conf();" $dbname
for c in 1 10 100 1000 5000 10000
do
echo "SELECT a,count(*) OVER (PARTITION BY a / $c) FROM part_test
OFFSET $rows" > bench.sql
echo "Testing with $(($rows / $c)) partitions"
for i in $(seq 1 $ntests)
do
pgbench -n -f bench.sql -M prepared -T $secs $dbname | grep
latency
done
done