From 4375fcb4141f18d6cd927659970518553aa3fe94 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 31 Aug 2025 16:37:05 -0700
Subject: [PATCH v1 3/3] Introduce stack for tracking per-node WAL/buffer usage

---
 src/backend/commands/explain.c      |   8 +-
 src/backend/executor/execMain.c     |   7 ++
 src/backend/executor/execProcnode.c |   9 +++
 src/backend/executor/instrument.c   | 111 ++++++++++++++++++++++++----
 src/include/executor/instrument.h   |  42 ++++++++++-
 5 files changed, 155 insertions(+), 22 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 6a135e51996..584f0adbcc1 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2280,9 +2280,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 
 	/* Show buffer/WAL usage */
 	if (es->buffers && planstate->instrument)
-		show_buffer_usage(es, &planstate->instrument->bufusage);
+		show_buffer_usage(es, &planstate->instrument->stack.bufusage);
 	if (es->wal && planstate->instrument)
-		show_wal_usage(es, &planstate->instrument->walusage);
+		show_wal_usage(es, &planstate->instrument->stack.walusage);
 
 	/* Prepare per-worker buffer/WAL usage */
 	if (es->workers_state && (es->buffers || es->wal) && es->verbose)
@@ -2299,9 +2299,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 
 			ExplainOpenWorker(n, es);
 			if (es->buffers)
-				show_buffer_usage(es, &instrument->bufusage);
+				show_buffer_usage(es, &instrument->stack.bufusage);
 			if (es->wal)
-				show_wal_usage(es, &instrument->walusage);
+				show_wal_usage(es, &instrument->stack.walusage);
 			ExplainCloseWorker(n, es);
 		}
 	}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b83ced9a57a..1c2268bc608 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -312,6 +312,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	DestReceiver *dest;
 	bool		sendTuples;
 	MemoryContext oldcontext;
+	InstrStackResource *res;
 
 	/* sanity checks */
 	Assert(queryDesc != NULL);
@@ -333,6 +334,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	if (queryDesc->totaltime)
 		InstrStart(queryDesc->totaltime);
 
+	/* Start up per-query node level instrumentation */
+	res = InstrStartQuery();
+
 	/*
 	 * extract information from the query descriptor and the query feature.
 	 */
@@ -382,6 +386,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	if (sendTuples)
 		dest->rShutdown(dest);
 
+	/* Shut down per-query node level instrumentation */
+	InstrShutdownQuery(res);
+
 	if (queryDesc->totaltime)
 		InstrStop(queryDesc->totaltime, estate->es_processed);
 
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d286471254b..7436f307994 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -823,8 +823,17 @@ ExecShutdownNode_walker(PlanState *node, void *context)
 
 	/* Stop the node if we started it above, reporting 0 tuples. */
 	if (node->instrument && node->instrument->running)
+	{
 		InstrStopNode(node->instrument, 0);
 
+		/*
+		 * Propagate WAL/buffer stats to the parent node on the
+		 * instrumentation stack (which is where InstrStopNode returned us
+		 * to).
+		 */
+		InstrNodeAddToCurrent(&node->instrument->stack);
+	}
+
 	return false;
 }
 
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index c53480d8030..040d1fdecbd 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,15 +16,40 @@
 #include <unistd.h>
 
 #include "executor/instrument.h"
+#include "utils/memutils.h"
 
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
 WalUsage	pgWalUsage;
 static WalUsage save_pgWalUsage;
+InstrStack *pgInstrStack = NULL;
 
 static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
 static void WalUsageAdd(WalUsage *dst, WalUsage *add);
 
+/*
+ * Node-specific instrumentation handling uses ResourceOwner mechanism to
+ * reset pgInstrStack on abort.
+ */
+static void ResOwnerReleaseInstrStack(Datum res);
+static const ResourceOwnerDesc instr_stack_resowner_desc =
+{
+	.name = "instrumentation stack scope",
+	.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
+	.release_priority = RELEASE_PRIO_FIRST,
+	.ReleaseResource = ResOwnerReleaseInstrStack,
+	.DebugPrint = NULL,			/* default message is fine */
+};
+static inline void
+ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStackResource * scope)
+{
+	ResourceOwnerRemember(owner, PointerGetDatum(scope), &instr_stack_resowner_desc);
+}
+static inline void
+ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStackResource * scope)
+{
+	ResourceOwnerForget(owner, PointerGetDatum(scope), &instr_stack_resowner_desc);
+}
 
 /* General purpose instrumentation handling */
 Instrumentation *
@@ -139,12 +164,17 @@ InstrStartNode(NodeInstrumentation * instr)
 		!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
 		elog(ERROR, "InstrStartNode called twice in a row");
 
-	/* save buffer usage totals at node entry, if needed */
-	if (instr->need_bufusage)
-		instr->bufusage_start = pgBufferUsage;
+	if (instr->need_bufusage || instr->need_walusage)
+	{
+		/*
+		 * Ensure that we have an active pgInstrStack (InstrStartQuery must
+		 * have been called)
+		 */
+		Assert(pgInstrStack != NULL);
 
-	if (instr->need_walusage)
-		instr->walusage_start = pgWalUsage;
+		instr->stack.previous = pgInstrStack;
+		pgInstrStack = &instr->stack;
+	}
 }
 
 /* Exit from a plan node */
@@ -169,14 +199,12 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples)
 		INSTR_TIME_SET_ZERO(instr->starttime);
 	}
 
-	/* Add delta of buffer usage since entry to node's totals */
-	if (instr->need_bufusage)
-		BufferUsageAccumDiff(&instr->bufusage,
-							 &pgBufferUsage, &instr->bufusage_start);
-
-	if (instr->need_walusage)
-		WalUsageAccumDiff(&instr->walusage,
-						  &pgWalUsage, &instr->walusage_start);
+	if (instr->need_bufusage || instr->need_walusage)
+	{
+		/* Ensure that there is a stack entry above the top-most node */
+		Assert(instr->stack.previous != NULL);
+		pgInstrStack = instr->stack.previous;
+	}
 
 	/* Is this the first tuple of this cycle? */
 	if (!instr->running)
@@ -257,10 +285,65 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
 
 	/* Add delta of buffer usage since entry to node's totals */
 	if (dst->need_bufusage)
-		BufferUsageAdd(&dst->bufusage, &add->bufusage);
+		BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage);
 
 	if (dst->need_walusage)
+		WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
+}
+
+InstrStackResource *
+InstrStartQuery()
+{
+	InstrStack *usage = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack));
+	InstrStackResource *usageRes = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStackResource));
+	ResourceOwner owner = CurrentResourceOwner;
+
+	Assert(owner != NULL);
+
+	usageRes->owner = owner;
+
+	ResourceOwnerEnlarge(owner);
+	ResourceOwnerRememberInstrStack(owner, usageRes);
+
+	usage->previous = pgInstrStack;
+	pgInstrStack = usage;
+
+	return usageRes;
+}
+
+void
+InstrShutdownQuery(InstrStackResource * res)
+{
+	Assert(res != NULL);
+	Assert(res->owner != NULL);
+
+	pgInstrStack = res->previous;
+
+	ResourceOwnerForgetInstrStack(res->owner, res);
+}
+
+static void
+ResOwnerReleaseInstrStack(Datum res)
+{
+	/*
+	 * XXX: Registered resources are *not* called in reverse order, i.e. we'll
+	 * get what was first registered first at shutdown. To avoid handling
+	 * that, we are resetting the stack here on abort (instead of recovering
+	 * to previous).
+	 */
+	pgInstrStack = NULL;
+}
+
+void
+InstrNodeAddToCurrent(InstrStack * add)
+{
+	if (pgInstrStack != NULL)
+	{
+		InstrStack *dst = pgInstrStack;
+
+		BufferUsageAdd(&dst->bufusage, &add->bufusage);
 		WalUsageAdd(&dst->walusage, &add->walusage);
+	}
 }
 
 /* note current values during parallel executor startup */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 3a280f4caae..a98efab5f93 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -14,6 +14,7 @@
 #define INSTRUMENT_H
 
 #include "portability/instr_time.h"
+#include "utils/resowner.h"
 
 
 /*
@@ -66,6 +67,21 @@ typedef enum InstrumentOption
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
+/* Stack of WAL/buffer usage used for per-node instrumentation */
+typedef struct InstrStack
+{
+	struct InstrStack *previous;
+	BufferUsage bufusage;
+	WalUsage	walusage;
+}			InstrStack;
+
+/* Used to manage resetting of instrumentation stack on abort. */
+typedef struct InstrStackResource
+{
+	InstrStack *previous;
+	ResourceOwner owner;
+}			InstrStackResource;
+
 /*
  * General purpose instrumentation that can capture time, WAL/buffer usage and tuples
  *
@@ -91,6 +107,10 @@ typedef struct Instrumentation
 
 /*
  * Specialized instrumentation for per-node execution statistics
+ *
+ * Requires use of InstrStartQuery to initialize the stack used for WAL/buffer
+ * usage statistics, and cleanup through InstrShutdownQuery. Solely intended for
+ * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE).
  */
 typedef struct NodeInstrumentation
 {
@@ -105,8 +125,6 @@ typedef struct NodeInstrumentation
 	instr_time	counter;		/* accumulated runtime for this node */
 	double		firsttuple;		/* time for first tuple of this cycle */
 	double		tuplecount;		/* # of tuples emitted so far this cycle */
-	BufferUsage bufusage_start; /* buffer usage at start */
-	WalUsage	walusage_start; /* WAL usage at start */
 	/* Accumulated statistics across all completed cycles: */
 	double		startup;		/* total startup time (in seconds) */
 	double		total;			/* total time (in seconds) */
@@ -115,8 +133,7 @@ typedef struct NodeInstrumentation
 	double		nloops;			/* # of run cycles for this node */
 	double		nfiltered1;		/* # of tuples removed by scanqual or joinqual */
 	double		nfiltered2;		/* # of tuples removed by "other" quals */
-	BufferUsage bufusage;		/* total buffer usage */
-	WalUsage	walusage;		/* total WAL usage */
+	InstrStack	stack;			/* stack tracking buffer/WAL usage */
 }			NodeInstrumentation;
 
 typedef struct WorkerInstrumentation
@@ -127,6 +144,7 @@ typedef struct WorkerInstrumentation
 
 extern PGDLLIMPORT BufferUsage pgBufferUsage;
 extern PGDLLIMPORT WalUsage pgWalUsage;
+extern PGDLLIMPORT InstrStack * pgInstrStack;
 
 extern Instrumentation *InstrAlloc(int n, int instrument_options);
 extern void InstrStart(Instrumentation *instr);
@@ -135,11 +153,14 @@ extern void InstrStop(Instrumentation *instr, double nTuples);
 extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options,
 											bool async_mode);
 extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options);
+extern InstrStackResource * InstrStartQuery(void);
+extern void InstrShutdownQuery(InstrStackResource * res);
 extern void InstrStartNode(NodeInstrumentation * instr);
 extern void InstrStopNode(NodeInstrumentation * instr, double nTuples);
 extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
 extern void InstrEndLoop(NodeInstrumentation * instr);
 extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
+extern void InstrNodeAddToCurrent(InstrStack * stack);
 
 extern void InstrStartParallelQuery(void);
 extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
@@ -151,21 +172,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
 
 #define INSTR_BUFUSAGE_INCR(fld) do { \
 		pgBufferUsage.fld++; \
+		if (pgInstrStack) \
+			pgInstrStack->bufusage.fld++; \
 	} while(0)
 #define INSTR_BUFUSAGE_ADD(fld,val) do { \
 		pgBufferUsage.fld += val; \
+		if (pgInstrStack) \
+			pgInstrStack->bufusage.fld += val; \
 	} while(0)
 #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
 	INSTR_TIME_ADD(pgBufferUsage.fld, val); \
+	if (pgInstrStack) \
+		INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \
 	} while (0)
 #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
 	INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \
+	if (pgInstrStack) \
+		INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \
 	} while (0)
+
 #define INSTR_WALUSAGE_INCR(fld) do { \
 		pgWalUsage.fld++; \
+		if (pgInstrStack) \
+			pgInstrStack->walusage.fld++; \
 	} while(0)
 #define INSTR_WALUSAGE_ADD(fld,val) do { \
 		pgWalUsage.fld += val; \
+		if (pgInstrStack) \
+			pgInstrStack->walusage.fld += val; \
 	} while(0)
 
 #endif							/* INSTRUMENT_H */
-- 
2.47.1

