From b7848cac2c6d40193587b6e352b61b6d86025ca2 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 24 Nov 2021 12:07:37 -0500
Subject: [PATCH v16 5/7] Add system view tracking IO ops per backend type

Add pg_stat_buffers, a system view which tracks the number of IO
operations (allocs, writes, fsyncs, and extends) done through each IO
path (e.g. shared buffers, local buffers, unbuffered IO) by each type of
backend.

Some of these should always be zero. For example, checkpointer does not
use a BufferAccessStrategy (currently), so the "strategy" IO path for
checkpointer will be 0 for all IO operations (alloc, write, fsync, and
extend). All possible combinations of IO Path and IO Op are enumerated
in the view but not all are populated or even possible at this point.

All backends increment a counter in their PgBackendStatus when
performing an IO operation. On exit, backends send these stats to the
stats collector to be persisted.

When the pg_stat_buffers view is queried, one backend will sum live
backends' stats with saved stats from exited backends and subtract saved
reset stats, returning the total.

Each row of the view is stats for a particular backend type for a
particular IO path (e.g. shared buffer accesses by checkpointer) and
each column in the view is the total number of IO operations done (e.g.
writes).
So a cell in the view would be, for example, the number of shared
buffers written by checkpointer since the last stats reset.

Discussion: https://www.postgresql.org/message-id/flat/20210415235954.qcypb4urtovzkat5%40alap3.anarazel.de#724d5cce4bcb587f9167b80a5824bc5c
---
 doc/src/sgml/monitoring.sgml                | 110 +++++++++++++-
 src/backend/catalog/system_views.sql        |  11 ++
 src/backend/postmaster/pgstat.c             |  13 ++
 src/backend/utils/activity/backend_status.c |  19 ++-
 src/backend/utils/adt/pgstatfuncs.c         | 153 ++++++++++++++++++++
 src/include/catalog/pg_proc.dat             |   9 ++
 src/include/pgstat.h                        |   1 +
 src/include/utils/backend_status.h          |   5 +
 src/test/regress/expected/rules.out         |   8 +
 9 files changed, 326 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index bda3eef309..b16952d439 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -435,6 +435,15 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
      </entry>
      </row>
 
+     <row>
+      <entry><structname>pg_stat_buffers</structname><indexterm><primary>pg_stat_buffers</primary></indexterm></entry>
+      <entry>A row for each IO path for each backend type showing
+      statistics about backend IO operations. See
+       <link linkend="monitoring-pg-stat-buffers-view">
+       <structname>pg_stat_buffers</structname></link> for details.
+     </entry>
+     </row>
+
      <row>
       <entry><structname>pg_stat_wal</structname><indexterm><primary>pg_stat_wal</primary></indexterm></entry>
       <entry>One row only, showing statistics about WAL activity. See
@@ -3613,6 +3622,101 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
 
  </sect2>
 
+ <sect2 id="monitoring-pg-stat-buffers-view">
+  <title><structname>pg_stat_buffers</structname></title>
+
+  <indexterm>
+   <primary>pg_stat_buffers</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_stat_buffers</structname> view has a row for each backend
+   type for each possible IO path, containing global data for the cluster for
+   that backend and IO path.
+  </para>
+
+  <table id="pg-stat-buffers-view" xreflabel="pg_stat_buffers">
+   <title><structname>pg_stat_buffers</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>backend_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Type of backend (e.g. background worker, autovacuum worker).
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>io_path</structfield> <type>text</type>
+      </para>
+      <para>
+       IO path taken (e.g. shared buffers, direct).
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>alloc</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of buffers allocated.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>extend</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of buffers extended.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>fsync</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of buffers fsynced.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>write</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of buffers written.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stats_reset</structfield> <type>timestamp with time zone</type>
+      </para>
+      <para>
+       Time at which these statistics were last reset.
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+ </sect2>
+
  <sect2 id="monitoring-pg-stat-wal-view">
    <title><structname>pg_stat_wal</structname></title>
 
@@ -5213,8 +5317,10 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
         all the counters shown in
         the <structname>pg_stat_bgwriter</structname>
         view, <literal>archiver</literal> to reset all the counters shown in
-        the <structname>pg_stat_archiver</structname> view or <literal>wal</literal>
-        to reset all the counters shown in the <structname>pg_stat_wal</structname> view.
+        the <structname>pg_stat_archiver</structname> view, <literal>wal</literal>
+        to reset all the counters shown in the <structname>pg_stat_wal</structname> view, 
+        or <literal>buffers</literal> to reset all the counters shown in the
+        <structname>pg_stat_buffers</structname> view.
        </para>
        <para>
         This function is restricted to superusers by default, but other users
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 61b515cdb8..e214d23056 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1076,6 +1076,17 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_buffers AS
+SELECT
+       b.backend_type,
+       b.io_path,
+       b.alloc,
+       b.extend,
+       b.fsync,
+       b.write,
+       b.stats_reset
+FROM pg_stat_get_buffers() b;
+
 CREATE VIEW pg_stat_wal AS
     SELECT
         w.wal_records,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index cebbbc13da..4e7426d273 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -2916,6 +2916,19 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info,
 		rec->tuples_inserted + rec->tuples_updated;
 }
 
+/*
+ *	Support function for SQL-callable pgstat* functions. Returns a pointer to
+ *	the PgStat_BackendIOPathOps structure tracking IO op statistics for both
+ *	exited backends and reset arithmetic.
+ */
+PgStat_BackendIOPathOps *
+pgstat_fetch_exited_backend_buffers(void)
+{
+	backend_read_statsfile();
+
+	return &globalStats.buffers;
+}
+
 
 /* ----------
  * pgstat_fetch_stat_dbentry() -
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 29f6604488..285279b8ed 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -50,7 +50,7 @@ int			pgstat_track_activity_query_size = 1024;
 PgBackendStatus *MyBEEntry = NULL;
 
 
-static PgBackendStatus *BackendStatusArray = NULL;
+PgBackendStatus *BackendStatusArray = NULL;
 static char *BackendAppnameBuffer = NULL;
 static char *BackendClientHostnameBuffer = NULL;
 static char *BackendActivityBuffer = NULL;
@@ -236,6 +236,23 @@ CreateSharedBackendStatus(void)
 #endif
 }
 
+const char *
+GetIOPathDesc(IOPath io_path)
+{
+	switch (io_path)
+	{
+		case IOPATH_DIRECT:
+			return "direct";
+		case IOPATH_LOCAL:
+			return "local";
+		case IOPATH_SHARED:
+			return "shared";
+		case IOPATH_STRATEGY:
+			return "strategy";
+	}
+	return "unknown IO path";
+}
+
 /*
  * Initialize pgstats backend activity state, and set up our on-proc-exit
  * hook.  Called from InitPostgres and AuxiliaryProcessMain. For auxiliary
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index f529c1561a..74f0c22170 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1796,6 +1796,159 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
 	PG_RETURN_INT64(pgstat_fetch_stat_bgwriter()->buf_alloc);
 }
 
+/*
+* When adding a new column to the pg_stat_buffers view, add a new enum
+* value here above COLUMN_LENGTH.
+*/
+enum
+{
+	COLUMN_BACKEND_TYPE,
+	COLUMN_IO_PATH,
+	COLUMN_ALLOCS,
+	COLUMN_EXTENDS,
+	COLUMN_FSYNCS,
+	COLUMN_WRITES,
+	COLUMN_RESET_TIME,
+	COLUMN_LENGTH,
+};
+
+/*
+ * Helper function to get the correct row in the pg_stat_buffers view.
+ */
+static inline Datum *
+get_pg_stat_buffers_row(Datum all_values[BACKEND_NUM_TYPES - 1][IOPATH_NUM_TYPES][COLUMN_LENGTH],
+		BackendType backend_type, IOPath io_path)
+{
+	/*
+	 * Subtract 1 from backend_type to avoid having rows for B_INVALID
+	 * BackendType
+	 */
+	return all_values[backend_type - 1][io_path];
+}
+
+Datum
+pg_stat_get_buffers(PG_FUNCTION_ARGS)
+{
+	PgStat_BackendIOPathOps *backend_io_path_ops;
+	PgBackendStatus *beentry;
+	Datum		reset_time;
+
+	ReturnSetInfo *rsinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+
+	Datum		all_values[BACKEND_NUM_TYPES - 1][IOPATH_NUM_TYPES][COLUMN_LENGTH];
+	bool		all_nulls[BACKEND_NUM_TYPES - 1][IOPATH_NUM_TYPES][COLUMN_LENGTH];
+
+	rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+	tupstore = tuplestore_begin_heap((bool) (rsinfo->allowedModes & SFRM_Materialize_Random),
+			false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+	MemoryContextSwitchTo(oldcontext);
+
+	memset(all_values, 0, sizeof(all_values));
+	memset(all_nulls, 0, sizeof(all_nulls));
+
+	/*
+	 * Loop through all live backends and count their IO Ops for each IO Path
+	 */
+	beentry = BackendStatusArray;
+
+	for (int i = 0; i < MaxBackends + NUM_AUXPROCTYPES; i++)
+	{
+		IOOps	   *io_ops;
+
+		/* Don't count dead backends. They should already be counted */
+		if (beentry->st_procpid == 0)
+			continue;
+
+		io_ops = beentry->io_path_stats;
+
+		for (int io_path = 0; io_path < IOPATH_NUM_TYPES; io_path++)
+		{
+			Datum *row = get_pg_stat_buffers_row(all_values, beentry->st_backendType, io_path);
+
+			/*
+			 * COLUMN_RESET_TIME, COLUMN_BACKEND_TYPE, and COLUMN_IO_PATH will
+			 * all be set when looping through exited backends array
+			 */
+			row[COLUMN_ALLOCS] += pg_atomic_read_u64(&io_ops->allocs);
+			row[COLUMN_EXTENDS] += pg_atomic_read_u64(&io_ops->extends);
+			row[COLUMN_FSYNCS] += pg_atomic_read_u64(&io_ops->fsyncs);
+			row[COLUMN_WRITES] += pg_atomic_read_u64(&io_ops->writes);
+			io_ops++;
+		}
+
+		beentry++;
+	}
+
+	/* Add stats from all exited backends */
+	backend_io_path_ops = pgstat_fetch_exited_backend_buffers();
+
+	reset_time = TimestampTzGetDatum(backend_io_path_ops->stat_reset_timestamp);
+
+	/* 0 is not a valid BackendType */
+	for (int backend_type = 1; backend_type < BACKEND_NUM_TYPES; backend_type++)
+	{
+		PgStatIOOps *io_ops = backend_io_path_ops->ops[backend_type].io_path_ops;
+		PgStatIOOps *resets = backend_io_path_ops->resets[backend_type].io_path_ops;
+
+		Datum		backend_type_desc = CStringGetTextDatum(GetBackendTypeDesc(backend_type));
+
+		for (int io_path = 0; io_path < IOPATH_NUM_TYPES; io_path++)
+		{
+			Datum *row = get_pg_stat_buffers_row(all_values, backend_type, io_path);
+
+			row[COLUMN_BACKEND_TYPE] = backend_type_desc;
+			row[COLUMN_IO_PATH] = CStringGetTextDatum(GetIOPathDesc(io_path));
+			row[COLUMN_ALLOCS] += io_ops->allocs - resets->allocs;
+			row[COLUMN_EXTENDS] += io_ops->extends - resets->extends;
+			row[COLUMN_FSYNCS] += io_ops->fsyncs - resets->fsyncs;
+			row[COLUMN_WRITES] += io_ops->writes - resets->writes;
+			row[COLUMN_RESET_TIME] = reset_time;
+			io_ops++;
+			resets++;
+		}
+	}
+
+	for (int i = 0; i < BACKEND_NUM_TYPES - 1 ; i++)
+	{
+		for (int j = 0; j < IOPATH_NUM_TYPES; j++)
+		{
+			Datum	   *values = all_values[i][j];
+			bool	   *nulls = all_nulls[i][j];
+
+			tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+		}
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
+
 /*
  * Returns statistics of WAL activity
  */
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 79d787cd26..32253383ba 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5654,6 +5654,15 @@
   proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r',
   prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' },
 
+{ oid => '8459', descr => 'statistics: counts of all IO operations done to all IO paths by each type of backend.',
+  proname => 'pg_stat_get_buffers', provolatile => 's', proisstrict => 'f',
+  prorows => '52', proretset => 't',
+  proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,text,int8,int8,int8,int8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o}',
+  proargnames => '{backend_type,io_path,alloc,extend,fsync,write,stats_reset}',
+  prosrc => 'pg_stat_get_buffers' },
+
 { oid => '1136', descr => 'statistics: information about WAL activity',
   proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index e8ec0f9b48..00975bcac6 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1276,6 +1276,7 @@ extern void pgstat_sum_io_path_ops(PgStatIOOps *dest, IOOps *src);
  * generate the pgstat* views.
  * ----------
  */
+extern PgStat_BackendIOPathOps *pgstat_fetch_exited_backend_buffers(void);
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
 extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h
index 9c997cace8..dd983fc949 100644
--- a/src/include/utils/backend_status.h
+++ b/src/include/utils/backend_status.h
@@ -321,6 +321,7 @@ extern PGDLLIMPORT int pgstat_track_activity_query_size;
  * ----------
  */
 extern PGDLLIMPORT PgBackendStatus *MyBEEntry;
+extern PGDLLIMPORT PgBackendStatus *BackendStatusArray;
 
 
 /* ----------
@@ -336,6 +337,10 @@ extern void CreateSharedBackendStatus(void);
  * ----------
  */
 
+/* Utility functions */
+extern const char *GetIOPathDesc(IOPath io_path);
+
+
 /* Initialization functions */
 extern void pgstat_beinit(void);
 extern void pgstat_bestart(void);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index b58b062b10..5869ce442f 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1828,6 +1828,14 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
     pg_stat_get_buf_alloc() AS buffers_alloc,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
+pg_stat_buffers| SELECT b.backend_type,
+    b.io_path,
+    b.alloc,
+    b.extend,
+    b.fsync,
+    b.write,
+    b.stats_reset
+   FROM pg_stat_get_buffers() b(backend_type, io_path, alloc, extend, fsync, write, stats_reset);
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,
         CASE
-- 
2.32.0

