From 434c1ddf6a37d2ed9f6f93fa6d17c1eb934b0a85 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 16 Mar 2021 11:45:50 -0400
Subject: [PATCH v1] Add system view tracking shared buffers written

Add a system view which tracks
- number of shared buffers the checkpointer and bgwriter write out
- number of shared buffers a regular backend is forced to flush
- number of extends done by a regular backend through shared buffers
- number of buffers flushed by a backend or autovacuum using a
  BufferAccessStrategy which, were they not to use this strategy, could
  perhaps have been avoided if a clean shared buffer was available
- number of fsyncs done by a backend which could have been done by
  checkpointer if sync queue had not been full

All backends, on exit, will update a shared memory array with the
buffers they wrote or extended.
When the view is queried, add all live backend's statuses
to the totals in the shared memory array and return that as the full
total.

TODO:
- Some kind of test?
- Docs change
---
 src/backend/catalog/system_views.sql  |  14 +-
 src/backend/postmaster/checkpointer.c |  17 +-
 src/backend/postmaster/pgstat.c       | 234 +++++++++++++++++++++++++-
 src/backend/storage/buffer/bufmgr.c   |  77 +++++++++
 src/backend/storage/buffer/freelist.c |  29 +++-
 src/backend/storage/ipc/ipci.c        |   1 +
 src/backend/storage/smgr/smgr.c       |   2 +-
 src/backend/utils/adt/pgstatfuncs.c   |  54 +++++-
 src/include/catalog/pg_proc.dat       |  18 +-
 src/include/miscadmin.h               |  24 +++
 src/include/pgstat.h                  |  13 +-
 src/include/storage/buf_internals.h   |   3 +
 src/test/regress/expected/rules.out   |  11 +-
 src/test/regress/sql/stats.sql        |   1 +
 14 files changed, 453 insertions(+), 45 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 5f2541d316..238d0ed7db 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1062,8 +1062,6 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
         pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
         pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
-        pg_stat_get_buf_written_backend() AS buffers_backend,
-        pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
@@ -1080,6 +1078,18 @@ CREATE VIEW pg_stat_wal AS
         w.stats_reset
     FROM pg_stat_get_wal() w;
 
+CREATE VIEW pg_stat_buffers_written AS
+    SELECT
+        b.buffers_autovacuum_write,
+        b.buffers_autovacuum_write_strat,
+        b.buffers_backend_extend,
+        b.buffers_backend_write,
+        b.buffers_backend_write_strat,
+        b.buffers_backend_fsync,
+        b.buffers_bgwriter_write,
+        b.buffers_checkpointer_write
+FROM pg_stat_get_buffers_written() b;
+
 CREATE VIEW pg_stat_progress_analyze AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e7e6a2a459..3bdfc222ba 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -127,9 +127,6 @@ typedef struct
 	ConditionVariable start_cv; /* signaled when ckpt_started advances */
 	ConditionVariable done_cv;	/* signaled when ckpt_done advances */
 
-	uint32		num_backend_writes; /* counts user backend buffer writes */
-	uint32		num_backend_fsync;	/* counts user backend fsync calls */
-
 	int			num_requests;	/* current # of requests */
 	int			max_requests;	/* allocated array size */
 	CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
@@ -1092,10 +1089,6 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
 
 	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
-	/* Count all backend writes regardless of if they fit in the queue */
-	if (!AmBackgroundWriterProcess())
-		CheckpointerShmem->num_backend_writes++;
-
 	/*
 	 * If the checkpointer isn't running or the request queue is full, the
 	 * backend will have to perform its own fsync request.  But before forcing
@@ -1109,8 +1102,9 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
 		 * Count the subset of writes where backends have to do their own
 		 * fsync
 		 */
+		/* TODO: should we count fsyncs for all types of procs? */
 		if (!AmBackgroundWriterProcess())
-			CheckpointerShmem->num_backend_fsync++;
+			pgstat_increment_buffers_written(BA_Fsync);
 		LWLockRelease(CheckpointerCommLock);
 		return false;
 	}
@@ -1267,13 +1261,6 @@ AbsorbSyncRequests(void)
 
 	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
-	/* Transfer stats counts into pending pgstats message */
-	BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes;
-	BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync;
-
-	CheckpointerShmem->num_backend_writes = 0;
-	CheckpointerShmem->num_backend_fsync = 0;
-
 	/*
 	 * We try to avoid holding the lock for a long time by copying the request
 	 * array, and processing the requests after releasing the lock.
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 2f3f378e63..3c506bcc79 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -386,6 +386,10 @@ static void pgstat_recv_connstat(PgStat_MsgConn *msg, int len);
 static void pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+static int
+pgstat_get_index_buffers_written(BufferActionType buffer_action_type,
+								 BackendType backend_type);
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -2923,6 +2927,7 @@ static PgBackendGSSStatus *BackendGssStatusBuffer = NULL;
 #endif
 
 
+static int *BuffersWrittenCountersArray = NULL;
 /*
  * Report shared-memory space needed by CreateSharedBackendStatus.
  */
@@ -2955,6 +2960,19 @@ BackendStatusShmemSize(void)
 	return size;
 }
 
+void
+CreateSharedBuffersWrittenCounters(void)
+{
+	bool		found;
+	Size		size = 0;
+
+	size = mul_size(sizeof(int), BuffersWrittenCountersArrayLength);
+	BuffersWrittenCountersArray = (int *)
+		ShmemInitStruct("Buffers written by various backend types", size, &found);
+	if (!found)
+		MemSet(BuffersWrittenCountersArray, 0, size);
+}
+
 /*
  * Initialize the shared status array and several string buffers
  * during postmaster startup.
@@ -3253,6 +3271,10 @@ pgstat_bestart(void)
 	lbeentry.st_state = STATE_UNDEFINED;
 	lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID;
 	lbeentry.st_progress_command_target = InvalidOid;
+	lbeentry.num_extends = 0;
+	lbeentry.num_writes = 0;
+	lbeentry.num_writes_strat = 0;
+	lbeentry.num_fsyncs = 0;
 
 	/*
 	 * we don't zero st_progress_param here to save cycles; nobody should
@@ -3338,6 +3360,16 @@ pgstat_beshutdown_hook(int code, Datum arg)
 	beentry->st_procpid = 0;	/* mark invalid */
 
 	PGSTAT_END_WRITE_ACTIVITY(beentry);
+
+	/*
+	 * Because the stats tracking shared buffers written and extended do not
+	 * go through the stats collector, it didn't make sense to add them to
+	 * pgstat_report_stat() At least the DatabaseId should be valid. Otherwise
+	 * we can't be sure that the members were zero-initialized (TODO: is that
+	 * true?)
+	 */
+	if (OidIsValid(MyDatabaseId))
+		pgstat_record_dead_backend_buffers_written();
 }
 
 
@@ -6928,8 +6960,6 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 	globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
 	globalStats.buf_written_clean += msg->m_buf_written_clean;
 	globalStats.maxwritten_clean += msg->m_maxwritten_clean;
-	globalStats.buf_written_backend += msg->m_buf_written_backend;
-	globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
 	globalStats.buf_alloc += msg->m_buf_alloc;
 }
 
@@ -7467,3 +7497,203 @@ pgstat_count_slru_truncate(int slru_idx)
 {
 	slru_entry(slru_idx)->m_truncate += 1;
 }
+
+void
+pgstat_increment_buffers_written(BufferActionType ba_type)
+{
+	volatile PgBackendStatus *beentry   = MyBEEntry;
+	BackendType bt;
+
+	if (!beentry || !pgstat_track_activities)
+		return;
+	bt = beentry->st_backendType;
+	if (bt != B_CHECKPOINTER && bt != B_AUTOVAC_WORKER && bt != B_BG_WRITER && bt != B_BACKEND)
+		return;
+
+	PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+	if (ba_type == BA_Write)
+		beentry->num_writes++;
+	else if (ba_type == BA_Extend)
+		beentry->num_extends++;
+	else if (ba_type == BA_Write_Strat)
+		beentry->num_writes_strat++;
+	else if (ba_type == BA_Fsync)
+		beentry->num_fsyncs++;
+	PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+
+/*
+ * Used for a single backend of a BackendType when needing its stats on the
+ * various BufferActionTypes it has done.
+ */
+/*  TODO: should this be a size_t of some kind? */
+static int
+pgstat_get_index_buffers_written(BufferActionType buffer_action_type, BackendType backend_type)
+{
+	/*
+	 * The order is: BLACK HOLE - 0 buffers_autovacuum_write - 1
+	 * buffers_autovacuum_write_strat - 2 buffers_backend_extend - 3
+	 * buffers_backend_write - 4 buffers_backend_write_strat - 5
+	 * buffers_backend_fsync - 6 buffers_bgwriter_write - 7
+	 * buffers_checkpointer_write - 8
+	 *
+	 * This function is responsible for maintaining
+	 * BuffersWrittenCountersArray in the following order [ BLACK_HOLE,
+	 * buffers_autovaccum_write,buffers_autovacuum_write_strat,
+	 * buffers_backend_extend,buffers_backend_write,
+	 * buffers_backend_write_strat,buffers_backend_fsync,
+	 * buffers_bgwriter_write,buffers_checkpointer_write ]
+	 *
+	 * Note that if a BufferActionType is unimplemented for a particular
+	 * BackendType, B_BUFFERS_WRITTEN_BLACK_HOLE is returned
+	 */
+	Assert(buffer_action_type < BA_NUM_TYPES && buffer_action_type >= 0);
+
+	/* TODO: silence the compiler on -wswitch uncovered cases */
+	switch (backend_type)
+	{
+		case B_AUTOVAC_WORKER:
+			switch (buffer_action_type)
+			{
+				case BA_Write:
+					return B_AUTOVAC_WORKER_BA_WRITE;
+				case BA_Write_Strat:
+					return B_AUTOVAC_WORKER_BA_WRITE_STRAT;
+			}
+			break;
+		case B_BACKEND:
+			switch (buffer_action_type)
+			{
+				case BA_Extend:
+					return B_BACKEND_BA_EXTEND;
+				case BA_Write:
+					return B_BACKEND_BA_WRITE;
+				case BA_Write_Strat:
+					return B_BACKEND_BA_WRITE_STRAT;
+				case BA_Fsync:
+					return B_BACKEND_BA_FSYNC;
+			}
+			break;
+		case B_BG_WRITER:
+			switch (buffer_action_type)
+			{
+				case BA_Write:
+					return B_BG_WRITER_BA_WRITE;
+			}
+			break;
+		case B_CHECKPOINTER:
+			switch (buffer_action_type)
+			{
+				case BA_Write:
+					return B_CHECKPOINTER_WRITE;
+			}
+			break;
+		default:
+
+			/*
+			 * TODO: is this ERROR even a good idea? is it better to only
+			 * return the black hole?
+			 */
+			elog(ERROR, "Unrecognized backend type, %d, for buffers written counting.", backend_type);
+	}
+	return B_BUFFERS_WRITTEN_BLACK_HOLE;
+}
+
+/*
+ * Called for a single backend at the time of death to persist its I/O stats
+ * For now, only used by pgstat_beshutdown_hook(), however, could be of use
+ * elsewhere, so keep it public.
+ */
+void
+pgstat_record_dead_backend_buffers_written(void)
+{
+	BackendType bt;
+	volatile	PgBackendStatus *beentry = MyBEEntry;
+
+	if (beentry->st_procpid != 0)
+		return;
+	bt = beentry->st_backendType;
+	if (bt != B_CHECKPOINTER && bt != B_AUTOVAC_WORKER && bt != B_BG_WRITER && bt != B_BACKEND)
+		return;
+
+	for (;;)
+	{
+		int			before_changecount;
+		int			after_changecount;
+
+		pgstat_begin_read_activity(beentry, before_changecount);
+
+		/*
+		 * It is guaranteed that the index will be within bounds of the array
+		 * because pgstat_get_index_buffers_written() only returns indexes
+		 * within bounds of BuffersWrittenCountersArray
+		 */
+		BuffersWrittenCountersArray[pgstat_get_index_buffers_written(BA_Write_Strat, beentry->st_backendType)] += beentry->num_writes_strat;
+		BuffersWrittenCountersArray[pgstat_get_index_buffers_written(BA_Write, beentry->st_backendType)] += beentry->num_writes;
+		BuffersWrittenCountersArray[pgstat_get_index_buffers_written(BA_Extend, beentry->st_backendType)] += beentry->num_extends;
+		BuffersWrittenCountersArray[pgstat_get_index_buffers_written(BA_Fsync, beentry->st_backendType)] += beentry->num_fsyncs;
+
+		pgstat_end_read_activity(beentry, after_changecount);
+
+		if (pgstat_read_activity_complete(before_changecount, after_changecount))
+			break;
+
+		/* Make sure we can break out of loop if stuck... */
+		CHECK_FOR_INTERRUPTS();
+	}
+}
+
+/*
+ * Input parameter, length, is the length of the values array passed in
+ * Output parameter is values, an array to be filled
+ */
+void
+pgstat_recount_all_backends_buffers_written(Datum * values, int length)
+{
+	int			beid;
+	int			tot_backends = pgstat_fetch_stat_numbackends();
+
+	Assert(length == BuffersWrittenCountersArrayLength);
+
+	/*
+	 * Add stats from all exited backends
+	 *
+	 * TODO: I thought maybe it is okay to just access this lock-free since it
+	 * is only written to when a process dies in
+	 * pgstat_record_dead_backend_buffers_written() and is read at the time of
+	 * querying the view with the stats. It's okay if we don't have 100%
+	 * up-to-date stats. However, I was wondering about torn values and
+	 * platforms without 64bit "single copy atomicity"
+	 *
+	 * Because the values array is datums and
+	 * BuffersWrittenCountersArrayLength is int64s, can't do a simple memcpy
+	 *
+	 */
+	for (int i = 0; i < BuffersWrittenCountersArrayLength; i++)
+		values[i] += BuffersWrittenCountersArray[i];
+
+	/*
+	 * Loop through all live backends and count their writes
+	 */
+	for (beid = 1; beid <= tot_backends; beid++)
+	{
+		BackendType bt;
+		PgBackendStatus *beentry = pgstat_fetch_stat_beentry(beid);
+
+		if (beentry->st_procpid == 0)
+			continue;
+		bt = beentry->st_backendType;
+		if (bt != B_CHECKPOINTER && bt != B_AUTOVAC_WORKER && bt != B_BG_WRITER && bt != B_BACKEND)
+			continue;
+
+		values[pgstat_get_index_buffers_written(BA_Extend,
+												beentry->st_backendType)] += beentry->num_extends;
+		values[pgstat_get_index_buffers_written(BA_Write,
+												beentry->st_backendType)] += beentry->num_writes;
+		values[pgstat_get_index_buffers_written(BA_Write_Strat,
+												beentry->st_backendType)] += beentry->num_writes_strat;
+		values[pgstat_get_index_buffers_written(BA_Fsync,
+												beentry->st_backendType)] += beentry->num_fsyncs;
+	}
+}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 852138f9c9..f71648fa89 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -891,6 +891,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	if (isExtend)
 	{
+		/*
+		 * Extends counted here are only those that go through shared buffers
+		 */
+		pgstat_increment_buffers_written(BA_Extend);
+
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
 		/* don't set checksum for all-zero page */
@@ -1157,11 +1162,65 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 					if (XLogNeedsFlush(lsn) &&
 						StrategyRejectBuffer(strategy, buf))
 					{
+						/*
+						 * Unset the strat write flag, as we will not be writing
+						 * this particular buffer from our ring out and may end
+						 * up having to find a buffer from main shared buffers,
+						 * which, if it is dirty, we may have to write out, which
+						 * could have been prevented by checkpointing and background
+						 * writing
+						 */
+						StrategyUnChooseBufferFromRing(strategy);
 						/* Drop lock/pin and loop around for another buffer */
 						LWLockRelease(BufferDescriptorGetContentLock(buf));
 						UnpinBuffer(buf, true);
 						continue;
 					}
+
+					/*
+					 * TODO: there is certainly a better way to write this
+					 * logic
+					 */
+
+					/*
+					 * buffers_backend_write, buffers_backend_write_strat,
+					 * buffers_autovacuum_write, or
+					 * buffers_autovacuum_write_strat
+					 */
+					/* are all incremented in the next 20 or so lines */
+
+					/*
+					 * The dirty buffer that will be written out was selected
+					 * from the ring and we did not bother checking the
+					 * freelist or doing a clock sweep to look for a clean
+					 * buffer to use, thus, this write will be counted as a
+					 * strategy write -- one that may be unnecessary without a
+					 * strategy
+					 */
+					if (StrategyIsBufferFromRing(strategy))
+					{
+						pgstat_increment_buffers_written(BA_Write_Strat);
+					}
+
+					/*
+					 * If the dirty buffer was one we grabbed from the
+					 * freelist or through a clock sweep, it could have been
+					 * written out by bgwriter or checkpointer, thus, we will
+					 * count it as a regular write
+					 */
+					else
+						pgstat_increment_buffers_written(BA_Write);
+				}
+				else
+				{
+					/*
+					 * If strategy is NULL, we could only be doing a write.
+					 * Extend operations will be counted in smgrextend. That
+					 * is separate I/O than any flushing of dirty buffers. If
+					 * we add more Backend Access Types, perhaps we will need
+					 * additional checks here
+					 */
+					pgstat_increment_buffers_written(BA_Write);
 				}
 
 				/* OK, do the I/O */
@@ -2471,6 +2530,10 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 	 * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
 	 * buffer is clean by the time we've locked it.)
 	 */
+	/*
+	 * Increment buffers_bgwriter_write and buffers_checkpointer_write
+	 */
+	pgstat_increment_buffers_written(BA_Write);
 	PinBuffer_Locked(bufHdr);
 	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
@@ -2823,6 +2886,20 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	/*
 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
 	 */
+
+	/*
+	 * TODO: consider that if we did not need to distinguish between a buffer
+	 * flushed that was grabbed from the ring buffer and written out as part
+	 * of a strategy which was not from main Shared Buffers (and thus
+	 * preventable by bgwriter or checkpointer), then we could move all calls
+	 * to pgstat_increment_buffers_written() here except for the one for
+	 * extends, which would remain in ReadBuffer_common() before smgrextend()
+	 * (unless we decide to start counting other extends). That includes the
+	 * call to count buffers written by bgwriter and checkpointer which go
+	 * through FlushBuffer() but not BufferAlloc(). That would make it
+	 * simpler. Perhaps instead we can find somewhere else to indicate that
+	 * the buffer is from the ring of buffers to reuse.
+	 */
 	smgrwrite(reln,
 			  buf->tag.forkNum,
 			  buf->tag.blockNum,
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 6be80476db..4fbc7c4619 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -87,6 +87,14 @@ typedef struct BufferAccessStrategyData
 	 */
 	bool		current_was_in_ring;
 
+	/*
+	 * If we could chose a buffer from this list and we end up having to write
+	 * it out because it is dirty when we actually could have found a clean
+	 * buffer in either the freelist or through doing a clock sweep of shared
+	 * buffers, this flag will indicate that
+	 */
+	bool		chose_buffer_in_ring;
+
 	/*
 	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
 	 * have not yet selected a buffer for this ring slot.  For allocation
@@ -212,8 +220,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 	if (strategy != NULL)
 	{
 		buf = GetBufferFromRing(strategy, buf_state);
-		if (buf != NULL)
+		if (buf != NULL) {
+			StrategyChooseBufferBufferFromRing(strategy);
 			return buf;
+		}
 	}
 
 	/*
@@ -702,3 +712,20 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
 
 	return true;
 }
+void
+StrategyUnChooseBufferFromRing(BufferAccessStrategy strategy)
+{
+	strategy->chose_buffer_in_ring = false;
+}
+
+void
+StrategyChooseBufferBufferFromRing(BufferAccessStrategy strategy)
+{
+	strategy->chose_buffer_in_ring = true;
+}
+
+bool
+StrategyIsBufferFromRing(BufferAccessStrategy strategy)
+{
+	return strategy->chose_buffer_in_ring;
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 3e4ec53a97..e2e86705e0 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -240,6 +240,7 @@ CreateSharedMemoryAndSemaphores(void)
 		InitProcGlobal();
 	CreateSharedProcArray();
 	CreateSharedBackendStatus();
+	CreateSharedBuffersWrittenCounters();
 	TwoPhaseShmemInit();
 	BackgroundWorkerShmemInit();
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4dc24649df..a9a077af80 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -25,7 +25,7 @@
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
-
+#include "miscadmin.h"
 
 /*
  * This struct of function pointers defines the API between smgr.c and
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 9ffbca685c..4c2a2b92b1 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1774,23 +1774,59 @@ pg_stat_get_bgwriter_stat_reset_time(PG_FUNCTION_ARGS)
 }
 
 Datum
-pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS)
+pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT64(pgstat_fetch_global()->buf_written_backend);
+	PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc);
 }
 
 Datum
-pg_stat_get_buf_fsync_backend(PG_FUNCTION_ARGS)
+pg_stat_get_buffers_written(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT64(pgstat_fetch_global()->buf_fsync_backend);
-}
+	TupleDesc	tupdesc;
+	Datum		values[BuffersWrittenCountersArrayLength];
 
-Datum
-pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
-{
-	PG_RETURN_INT64(pgstat_fetch_global()->buf_alloc);
+	/*
+	 * Values will be filled from BuffersWrittenCountersArray, which has an
+	 * extra spot for data that is not needed, a black hole
+	 */
+	bool		nulls[BuffersWrittenCountersArrayLength];
+
+	/* Initialise values and NULL flags arrays */
+	MemSet(values, 0, sizeof(values));
+	MemSet(nulls, 0, sizeof(nulls));
+
+	/* Initialise attributes information in the tuple descriptor */
+	tupdesc = CreateTemplateTupleDesc(BuffersWrittenCountersArrayLength - 1);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "buffers_autovacuum_write",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "buffers_autovacuum_write_strat",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "buffers_backend_extend",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "buffers_backend_write",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 5, "buffers_backend_write_strat",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 6, "buffers_backend_fsync",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 7, "buffers_bgwriter_write",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 8, "buffers_checkpointer_write",
+					   INT8OID, -1, 0);
+
+	BlessTupleDesc(tupdesc);
+
+	/*
+	 * Fill values and NULLs. values will be filled with the number of writes
+	 * by all live regular backends and relevant auxiliary backends as well as
+	 * exited backends
+	 */
+	pgstat_recount_all_backends_buffers_written(values, BuffersWrittenCountersArrayLength);
+	/* Returns the record as Datum */
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values + 1, nulls)));
 }
 
+
 /*
  * Returns statistics of WAL activity
  */
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 69ffd0c3f4..b6c07175dc 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5366,6 +5366,15 @@
   proname => 'pg_stat_get_db_numbackends', provolatile => 's',
   proparallel => 'r', prorettype => 'int4', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_numbackends' },
+
+{ oid => '8459', descr => 'statistics: counts of buffers written by different types of backends',
+  proname => 'pg_stat_get_buffers_written', provolatile => 's', proisstrict => 'f',
+  proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int8,int8,int8,int8,int8,int8,int8,int8}',
+  proargmodes => '{o,o,o,o,o,o,o,o}',
+  proargnames => '{buffers_autovacuum_write,buffers_autovacuum_write_strat,buffers_backend_extend,buffers_backend_write,buffers_backend_write_strat,buffers_backend_fsync,buffers_bgwriter_write,buffers_checkpointer_write}',
+  prosrc => 'pg_stat_get_buffers_written' },
+
 { oid => '1942', descr => 'statistics: transactions committed',
   proname => 'pg_stat_get_db_xact_commit', provolatile => 's',
   proparallel => 'r', prorettype => 'int8', proargtypes => 'oid',
@@ -5544,15 +5553,6 @@
   proname => 'pg_stat_get_checkpoint_sync_time', provolatile => 's',
   proparallel => 'r', prorettype => 'float8', proargtypes => '',
   prosrc => 'pg_stat_get_checkpoint_sync_time' },
-{ oid => '2775', descr => 'statistics: number of buffers written by backends',
-  proname => 'pg_stat_get_buf_written_backend', provolatile => 's',
-  proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_buf_written_backend' },
-{ oid => '3063',
-  descr => 'statistics: number of backend buffer writes that did their own fsync',
-  proname => 'pg_stat_get_buf_fsync_backend', provolatile => 's',
-  proparallel => 'r', prorettype => 'int8', proargtypes => '',
-  prosrc => 'pg_stat_get_buf_fsync_backend' },
 { oid => '2859', descr => 'statistics: number of buffer allocations',
   proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r',
   prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' },
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..b52fe74b72 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -326,6 +326,30 @@ typedef enum BackendType
 
 extern BackendType MyBackendType;
 
+typedef enum BufferActionType
+{
+	BA_Extend,
+	BA_Write,
+	BA_Write_Strat,
+	BA_Fsync,
+	BA_NUM_TYPES,
+}			BufferActionType;
+
+/*  TODO: does this belong here? */
+typedef enum BuffersWrittenCountersIndex
+{
+	B_BUFFERS_WRITTEN_BLACK_HOLE = 0,
+	B_AUTOVAC_WORKER_BA_WRITE,
+	B_AUTOVAC_WORKER_BA_WRITE_STRAT,
+	B_BACKEND_BA_EXTEND,
+	B_BACKEND_BA_WRITE,
+	B_BACKEND_BA_WRITE_STRAT,
+	B_BACKEND_BA_FSYNC,
+	B_BG_WRITER_BA_WRITE,
+	B_CHECKPOINTER_WRITE,
+	BuffersWrittenCountersArrayLength,
+}			BuffersWrittenCountersIndex;
+
 extern const char *GetBackendTypeDesc(BackendType backendType);
 
 extern void SetDatabasePath(const char *path);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe6683cf5c..776520098e 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -456,8 +456,6 @@ typedef struct PgStat_MsgBgWriter
 	PgStat_Counter m_buf_written_checkpoints;
 	PgStat_Counter m_buf_written_clean;
 	PgStat_Counter m_maxwritten_clean;
-	PgStat_Counter m_buf_written_backend;
-	PgStat_Counter m_buf_fsync_backend;
 	PgStat_Counter m_buf_alloc;
 	PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
 	PgStat_Counter m_checkpoint_sync_time;
@@ -830,7 +828,6 @@ typedef struct PgStat_GlobalStats
 	PgStat_Counter buf_written_checkpoints;
 	PgStat_Counter buf_written_clean;
 	PgStat_Counter maxwritten_clean;
-	PgStat_Counter buf_written_backend;
 	PgStat_Counter buf_fsync_backend;
 	PgStat_Counter buf_alloc;
 	TimestampTz stat_reset_timestamp;
@@ -1263,6 +1260,10 @@ typedef struct PgBackendStatus
 	 */
 	ProgressCommandType st_progress_command;
 	Oid			st_progress_command_target;
+	int num_extends;
+	int num_writes;
+	int num_writes_strat;
+	int num_fsyncs;
 	int64		st_progress_param[PGSTAT_NUM_PROGRESS_PARAM];
 } PgBackendStatus;
 
@@ -1411,7 +1412,7 @@ extern SessionEndType pgStatSessionEndCause;
  */
 extern Size BackendStatusShmemSize(void);
 extern void CreateSharedBackendStatus(void);
-
+extern void CreateSharedBuffersWrittenCounters(void);
 extern void pgstat_init(void);
 extern int	pgstat_start(void);
 extern void pgstat_reset_all(void);
@@ -1634,4 +1635,8 @@ extern void pgstat_count_slru_truncate(int slru_idx);
 extern const char *pgstat_slru_name(int slru_idx);
 extern int	pgstat_slru_index(const char *name);
 
+extern void pgstat_increment_buffers_written(BufferActionType ba_type);
+extern void pgstat_record_dead_backend_buffers_written(void);
+extern void pgstat_recount_all_backends_buffers_written(Datum *values, int length);
+
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 33fcaf5c9a..2bec2cee45 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -314,6 +314,9 @@ extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 								 BufferDesc *buf);
+extern void StrategyUnChooseBufferFromRing(BufferAccessStrategy strategy);
+extern void StrategyChooseBufferBufferFromRing(BufferAccessStrategy strategy);
+extern bool StrategyIsBufferFromRing(BufferAccessStrategy strategy);
 
 extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
 extern void StrategyNotifyBgWriter(int bgwprocno);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 9b59a7b4a5..a3d5e60e74 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1823,10 +1823,17 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint,
     pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean,
     pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean,
-    pg_stat_get_buf_written_backend() AS buffers_backend,
-    pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
     pg_stat_get_buf_alloc() AS buffers_alloc,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
+pg_stat_buffers_written| SELECT b.buffers_autovacuum_write,
+    b.buffers_autovacuum_write_strat,
+    b.buffers_backend_extend,
+    b.buffers_backend_write,
+    b.buffers_backend_write_strat,
+    b.buffers_backend_fsync,
+    b.buffers_bgwriter_write,
+    b.buffers_checkpointer_write
+   FROM pg_stat_get_buffers_written() b(buffers_autovacuum_write, buffers_autovacuum_write_strat, buffers_backend_extend, buffers_backend_write, buffers_backend_write_strat, buffers_backend_fsync, buffers_bgwriter_write, buffers_checkpointer_write);
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,
         CASE
diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql
index feaaee6326..737b813c15 100644
--- a/src/test/regress/sql/stats.sql
+++ b/src/test/regress/sql/stats.sql
@@ -176,4 +176,5 @@ FROM prevstats AS pr;
 
 DROP TABLE trunc_stats_test, trunc_stats_test1, trunc_stats_test2, trunc_stats_test3, trunc_stats_test4;
 DROP TABLE prevstats;
+SELECT * FROM pg_stat_buffers_written;
 -- End of Stats Test
-- 
2.25.0

