From 99b5fcbe6a7e097fd36bdee730e98eca41f5426b Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 3 Jan 2022 18:26:45 -0500
Subject: [PATCH v21 5/8] Add "buffers" to pgstat_reset_shared_counters

Backends count IO operations for various IO paths in their
PgBackendStatus. Upon exit, they send these counts to the stats
collector.

Prior to this commit, the IO operations stats from exited backends
persisted by the stats collector would have been been reset when
pgstat_reset_shared_counters() was invoked with target "bgwriter".
However the IO operations stats in each live backend's PgBackendStatus
would remain the same. Thus the totals calculated from both live and
exited backends would be incorrect after a reset.

Backends' PgBackendStatuses cannot be written to by another backend;
therefore, in order to calculate correct totals after a reset has
occurred, the backend sending the reset message to the stats collector
now reads the IO operation stats totals from live backends and sends
them to the stats collector to be persisted in an array of "resets"
which can be used to calculate the correct totals after a reset.

Because the IO operations statistics are broader in scope than those in
pg_stat_bgwriter, rename the reset target to "buffers". The "buffers"
target will reset all IO operations statistics and all statistics for
the pg_stat_bgwriter view maintained by the stats collector.

Author: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/20200124195226.lth52iydq2n2uilq%40alap3.anarazel.de
---
 doc/src/sgml/monitoring.sgml                |  2 +-
 src/backend/postmaster/pgstat.c             | 87 ++++++++++++++++++---
 src/backend/utils/activity/backend_status.c | 27 +++++++
 src/include/pgstat.h                        | 29 +++++--
 src/include/utils/backend_status.h          |  2 +
 5 files changed, 129 insertions(+), 18 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index bf7625d988..caa45cb5f5 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -5218,7 +5218,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
        </para>
        <para>
         Resets some cluster-wide statistics counters to zero, depending on the
-        argument.  The argument can be <literal>bgwriter</literal> to reset
+        argument.  The argument can be <literal>buffers</literal> to reset
         all the counters shown in
         the <structname>pg_stat_bgwriter</structname>
         view, <literal>archiver</literal> to reset all the counters shown in
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5eaf8b6ee7..a5b7cfa45d 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1509,6 +1509,36 @@ pgstat_reset_counters(void)
 	pgstat_send(&msg, sizeof(msg));
 }
 
+/*
+ * Helper function to collect and send live backends' current IO operations
+ * stats counters when a stats reset is initiated so that they may be deducted
+ * from future totals.
+ */
+static void
+pgstat_send_buffers_reset(PgStat_MsgResetsharedcounter *msg)
+{
+	PgStatIOPathOps ops[BACKEND_NUM_TYPES];
+
+	memset(ops, 0, sizeof(ops));
+	pgstat_report_live_backend_io_path_ops(ops);
+
+	/*
+	 * Iterate through the array of all IOOps for all IOPaths for each
+	 * BackendType.
+	 *
+	 * An individual message is sent for each backend type because sending all
+	 * IO operations in one message would exceed the PGSTAT_MAX_MSG_SIZE of
+	 * 1000.
+	 */
+	for (int i = 0; i < BACKEND_NUM_TYPES; i++)
+	{
+		msg->m_backend_resets.backend_type = idx_get_backend_type(i);
+		memcpy(&msg->m_backend_resets.iop, &ops[i],
+				sizeof(msg->m_backend_resets.iop));
+		pgstat_send(msg, sizeof(PgStat_MsgResetsharedcounter));
+	}
+}
+
 /* ----------
  * pgstat_reset_shared_counters() -
  *
@@ -1526,19 +1556,25 @@ pgstat_reset_shared_counters(const char *target)
 	if (pgStatSock == PGINVALID_SOCKET)
 		return;
 
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
+	if (strcmp(target, "buffers") == 0)
+	{
+		msg.m_resettarget = RESET_BUFFERS;
+		pgstat_send_buffers_reset(&msg);
+		return;
+	}
+
 	if (strcmp(target, "archiver") == 0)
 		msg.m_resettarget = RESET_ARCHIVER;
-	else if (strcmp(target, "bgwriter") == 0)
-		msg.m_resettarget = RESET_BGWRITER;
 	else if (strcmp(target, "wal") == 0)
 		msg.m_resettarget = RESET_WAL;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("unrecognized reset target: \"%s\"", target),
-				 errhint("Target must be \"archiver\", \"bgwriter\", or \"wal\".")));
+				 errhint(
+					 "Target must be \"archiver\", \"buffers\", or \"wal\".")));
 
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
 	pgstat_send(&msg, sizeof(msg));
 }
 
@@ -4425,6 +4461,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	ts = GetCurrentTimestamp();
 	globalStats.bgwriter.stat_reset_timestamp = ts;
+	globalStats.buffers.stat_reset_timestamp = ts;
 	archiverStats.stat_reset_timestamp = ts;
 	walStats.stat_reset_timestamp = ts;
 
@@ -5588,18 +5625,46 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 static void
 pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 {
-	if (msg->m_resettarget == RESET_BGWRITER)
-	{
-		/* Reset the global, bgwriter and checkpointer statistics for the cluster. */
-		memset(&globalStats, 0, sizeof(globalStats));
-		globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp();
-	}
-	else if (msg->m_resettarget == RESET_ARCHIVER)
+	if (msg->m_resettarget == RESET_ARCHIVER)
 	{
 		/* Reset the archiver statistics for the cluster. */
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_BUFFERS)
+	{
+		/*
+		 * Reset global stats for bgwriter, buffers, and checkpointer.
+		 *
+		 * Because the stats collector cannot write to live backends'
+		 * PgBackendStatuses, it maintains an array of "resets". The reset
+		 * message contains the current values of these counters for live
+		 * backends. The stats collector saves these in its "resets" array,
+		 * then zeroes out the exited backends' saved IO operations counters.
+		 * This is required to calculate an accurate total for each IO
+		 * operations counter post reset.
+		 */
+		BackendType backend_type = msg->m_backend_resets.backend_type;
+
+		/*
+		 * We reset each member individually (as opposed to resetting the
+		 * entire globalStats struct) because we need to preserve the resets
+		 * array (globalStats.buffers.resets).
+		 *
+		 * Though globalStats.buffers.ops, globalStats.bgwriter, and
+		 * globalStats.checkpointer only need to be reset once, doing so for
+		 * every message is less brittle and the extra cost is irrelevant given
+		 * how often stats are reset.
+		 */
+		memset(&globalStats.bgwriter, 0, sizeof(globalStats.bgwriter));
+		memset(&globalStats.checkpointer, 0, sizeof(globalStats.checkpointer));
+		memset(&globalStats.buffers.ops, 0, sizeof(globalStats.buffers.ops));
+		globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp();
+		globalStats.buffers.stat_reset_timestamp = GetCurrentTimestamp();
+		memcpy(&globalStats.buffers.resets[backend_type_get_idx(backend_type)],
+				&msg->m_backend_resets.iop.io_path_ops,
+				sizeof(msg->m_backend_resets.iop.io_path_ops));
+	}
 	else if (msg->m_resettarget == RESET_WAL)
 	{
 		/* Reset the WAL statistics for the cluster. */
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 79410e0b2c..87b9d0fc0d 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -636,6 +636,33 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	PGSTAT_END_WRITE_ACTIVITY(beentry);
 }
 
+/*
+ * Iterate through BackendStatusArray and capture live backends' stats on IOOps
+ * for all IOPaths, adding them to that backend type's member of the
+ * backend_io_path_ops structure.
+ */
+void
+pgstat_report_live_backend_io_path_ops(PgStatIOPathOps *backend_io_path_ops)
+{
+	PgBackendStatus *beentry = BackendStatusArray;
+
+	/*
+	 * Loop through live backends and capture reset values
+	 */
+	for (int i = 0; i < GetMaxBackends() + NUM_AUXPROCTYPES; i++, beentry++)
+	{
+		int idx;
+
+		/* Don't count dead backends or those with type B_INVALID. */
+		if (beentry->st_procpid == 0 || beentry->st_backendType == B_INVALID)
+			continue;
+
+		idx = backend_type_get_idx(beentry->st_backendType);
+		pgstat_sum_io_path_ops(backend_io_path_ops[idx].io_path_ops,
+				(IOOpCounters *) beentry->io_path_stats);
+	}
+}
+
 /* --------
  * pgstat_report_query_id() -
  *
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 431f273d23..e818a26780 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -141,7 +141,7 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER,
+	RESET_BUFFERS,
 	RESET_WAL
 } PgStat_Shared_Reset_Target;
 
@@ -357,7 +357,8 @@ typedef struct PgStatIOPathOps
 
 /*
  * Sent by a backend to the stats collector to report all IOOps for all IOPaths
- * for a given type of a backend. This will happen when the backend exits.
+ * for a given type of a backend. This will happen when the backend exits or
+ * when stats are reset.
  */
 typedef struct PgStat_MsgIOPathOps
 {
@@ -375,9 +376,12 @@ typedef struct PgStat_MsgIOPathOps
  */
 typedef struct PgStat_BackendIOPathOps
 {
+	TimestampTz stat_reset_timestamp;
 	PgStatIOPathOps ops[BACKEND_NUM_TYPES];
+	PgStatIOPathOps resets[BACKEND_NUM_TYPES];
 } PgStat_BackendIOPathOps;
 
+
 /* ----------
  * PgStat_MsgResetcounter		Sent by the backend to tell the collector
  *								to reset counters
@@ -389,15 +393,28 @@ typedef struct PgStat_MsgResetcounter
 	Oid			m_databaseid;
 } PgStat_MsgResetcounter;
 
-/* ----------
- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
- *								to reset a shared counter
- * ----------
+/*
+ * Sent by the backend to tell the collector to reset a shared counter.
+ *
+ * In addition to the message header and reset target, the message also
+ * contains an array with all of the IO operations for all IO paths done by a
+ * particular backend type.
+ *
+ * This is needed because the IO operation stats for live backends cannot be
+ * safely modified by other processes. Therefore, to correctly calculate the
+ * total IO operations for a particular backend type after a reset, the balance
+ * of IO operations for live backends at the time of prior resets must be
+ * subtracted from the total IO operations.
+ *
+ * To satisfy this requirement, the process initiating the reset will read the
+ * IO operations counters from live backends and send them to the stats
+ * collector which maintains an array of reset values.
  */
 typedef struct PgStat_MsgResetsharedcounter
 {
 	PgStat_MsgHdr m_hdr;
 	PgStat_Shared_Reset_Target m_resettarget;
+	PgStat_MsgIOPathOps m_backend_resets;
 } PgStat_MsgResetsharedcounter;
 
 /* ----------
diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h
index 3de1e7c8d3..7e59c063b9 100644
--- a/src/include/utils/backend_status.h
+++ b/src/include/utils/backend_status.h
@@ -375,6 +375,7 @@ extern void pgstat_bestart(void);
 extern void pgstat_clear_backend_activity_snapshot(void);
 
 /* Activity reporting functions */
+typedef struct PgStatIOPathOps PgStatIOPathOps;
 
 static inline void
 pgstat_inc_ioop(IOOp io_op, IOPath io_path)
@@ -402,6 +403,7 @@ pgstat_inc_ioop(IOOp io_op, IOPath io_path)
 	}
 }
 extern void pgstat_report_activity(BackendState state, const char *cmd_str);
+extern void pgstat_report_live_backend_io_path_ops(PgStatIOPathOps *backend_io_path_ops);
 extern void pgstat_report_query_id(uint64 query_id, bool force);
 extern void pgstat_report_tempfile(size_t filesize);
 extern void pgstat_report_appname(const char *appname);
-- 
2.32.0

