From 74d1d60db67b817ebb2dd73b64ae6fecab4d468f Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 30 Jun 2026 18:44:02 +0500
Subject: [PATCH] Track relation writes that bypass shared buffers in
 pg_stat_io

Operations that populate new relation storage via the bulk write facility
(bulk_write.c) -- building a B-tree index, a CLUSTER or VACUUM FULL heap
rewrite, ALTER TABLE SET TABLESPACE, and so on -- write pages by calling
smgrextend()/smgrwrite() directly, bypassing shared buffers.  Until now
those writes were invisible in pg_stat_io; only the eventual fsync was
counted, under IOCONTEXT_NORMAL like all relation fsyncs.

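Count these writes and extends in a new IOCONTEXT_BYPASS context, for both
permanent and temporary relations, so that the I/O done while building, for
example, a B-tree index is observable.  The matching fsync is unchanged and
remains counted under IOCONTEXT_NORMAL.

Bump PGSTAT_FILE_FORMAT_ID, since the IOContext enum gains a new member.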
---
 doc/src/sgml/monitoring.sgml           | 18 ++++++++++--
 src/backend/storage/smgr/bulk_write.c  | 27 ++++++++++++++++++
 src/backend/utils/activity/pgstat_io.c | 22 +++++++++++++--
 src/include/pgstat.h                   |  3 +-
 src/test/regress/expected/stats.out    | 39 +++++++++++++++++++++++++-
 src/test/regress/sql/stats.sql         | 12 ++++++++
 6 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 08d5b824552..d9fe39e9886 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -2882,9 +2882,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
 
   <para>
    Currently, I/O on relations (e.g. tables, indexes) and WAL activity are
-   tracked. However, relation I/O which bypasses shared buffers
-   (e.g. when moving a table from one tablespace to another) is currently
-   not tracked.
+   tracked. Writes that bypass shared buffers while building new relation
+   storage are tracked in the <literal>bypass</literal>
+   <varname>context</varname>. However, some relation I/O which bypasses
+   shared buffers is still not tracked, such as the reads performed when
+   moving a table from one tablespace to another.
   </para>
 
   <table id="pg-stat-io-view" xreflabel="pg_stat_io">
@@ -2995,6 +2997,16 @@ description | Waiting for a newly initialized WAL file to reach durable storage
           done outside of shared buffers, such as <command>COPY</command>.
          </para>
         </listitem>
+        <listitem>
+         <para>
+          <literal>bypass</literal>: Write and extend I/O operations done
+          outside of shared buffers to populate newly built relation storage,
+          such as when building a B-tree index or rewriting a table's heap
+          during <command>VACUUM FULL</command> or <command>CLUSTER</command>.
+          The associated <literal>fsync</literal> is counted separately in
+          <varname>context</varname> <literal>normal</literal>.
+         </para>
+        </listitem>
        </itemizedlist>
       </entry>
      </row>
diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index f3c24082a69..6e502e9d766 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -38,6 +38,8 @@
 
 #include "access/xloginsert.h"
 #include "access/xlogrecord.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/bulk_write.h"
 #include "storage/proc.h"
@@ -243,10 +245,21 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 {
 	int			npending = bulkstate->npending;
 	PendingWrite *pending_writes = bulkstate->pending_writes;
+	IOObject	io_object;
 
 	if (npending == 0)
 		return;
 
+	/*
+	 * These writes bypass shared buffers, so they are not accounted for by
+	 * the buffer manager.  Count them in pg_stat_io ourselves, under the
+	 * dedicated IOCONTEXT_BYPASS context.  The matching fsync is issued
+	 * separately (see smgr_bulk_finish()) and is never counted here.
+	 * XXX(review): confirm an immediate sync is counted in IOCONTEXT_NORMAL.
+	 */
+	io_object = SmgrIsTemp(bulkstate->smgr) ?
+		IOOBJECT_TEMP_RELATION : IOOBJECT_RELATION;
+
 	if (npending > 1)
 		qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp);
 
@@ -283,6 +296,8 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
 		if (blkno >= bulkstate->relsize)
 		{
+			instr_time	io_start;
+
 			/*
 			 * If we have to write pages nonsequentially, fill in the space
 			 * with zeroes until we come back and overwrite.  This is not
@@ -292,19 +307,31 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 			 */
 			while (blkno > bulkstate->relsize)
 			{
+				io_start = pgstat_prepare_io_time(track_io_timing);
 				/* don't set checksum for all-zero page */
 				smgrextend(bulkstate->smgr, bulkstate->forknum,
 						   bulkstate->relsize,
 						   &zero_buffer,
 						   true);
+				pgstat_count_io_op_time(io_object, IOCONTEXT_BYPASS,
+										IOOP_EXTEND, io_start, 1, BLCKSZ);
 				bulkstate->relsize++;
 			}
 
+			io_start = pgstat_prepare_io_time(track_io_timing);
 			smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
+			pgstat_count_io_op_time(io_object, IOCONTEXT_BYPASS,
+									IOOP_EXTEND, io_start, 1, BLCKSZ);
 			bulkstate->relsize++;
 		}
 		else
+		{
+			instr_time	io_start = pgstat_prepare_io_time(track_io_timing);
+
 			smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
+			pgstat_count_io_op_time(io_object, IOCONTEXT_BYPASS,
+									IOOP_WRITE, io_start, 1, BLCKSZ);
+		}
 		pfree(page);
 	}
 
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 38bae7b15d2..58d8102dee4 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -245,6 +245,8 @@ pgstat_get_io_context_name(IOContext io_context)
 			return "bulkread";
 		case IOCONTEXT_BULKWRITE:
 			return "bulkwrite";
+		case IOCONTEXT_BYPASS:
+			return "bypass";
 		case IOCONTEXT_INIT:
 			return "init";
 		case IOCONTEXT_NORMAL:
@@ -414,9 +416,10 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 
 	/*
 	 * Currently, IO on temporary relations can only occur in the
-	 * IOCONTEXT_NORMAL IOContext.
+	 * IOCONTEXT_NORMAL and IOCONTEXT_BYPASS IOContexts.
 	 */
 	if (io_context != IOCONTEXT_NORMAL &&
+		io_context != IOCONTEXT_BYPASS &&
 		io_object == IOOBJECT_TEMP_RELATION)
 		return false;
 
@@ -434,7 +437,8 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 		bktype == B_WAL_SUMMARIZER || bktype == B_WAL_WRITER ||
 		bktype == B_WAL_RECEIVER;
 
-	if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
+	if (no_temp_rel &&
+		(io_context == IOCONTEXT_NORMAL || io_context == IOCONTEXT_BYPASS) &&
 		io_object == IOOBJECT_TEMP_RELATION)
 		return false;
 
@@ -454,6 +458,7 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 	if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
 		(io_context == IOCONTEXT_BULKREAD ||
 		 io_context == IOCONTEXT_BULKWRITE ||
+		 io_context == IOCONTEXT_BYPASS ||
 		 io_context == IOCONTEXT_VACUUM))
 		return false;
 
@@ -461,7 +466,8 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 		return false;
 
 	if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
-		io_context == IOCONTEXT_BULKWRITE)
+		(io_context == IOCONTEXT_BULKWRITE ||
+		 io_context == IOCONTEXT_BYPASS))
 		return false;
 
 	return true;
@@ -525,6 +531,16 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 	if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
 		return false;
 
+	/*
+	 * IOCONTEXT_BYPASS covers relation data written by bypassing shared
+	 * buffers (see bulk_write.c), which only ever extends or overwrites
+	 * relation blocks.  The matching fsync is counted separately under
+	 * IOCONTEXT_NORMAL (see below).
+	 */
+	if (io_context == IOCONTEXT_BYPASS &&
+		!(io_op == IOOP_EXTEND || io_op == IOOP_WRITE))
+		return false;
+
 	strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
 		io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
 
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index dfa2e837638..3df2156470a 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -218,7 +218,7 @@ typedef struct PgStat_TableXactStatus
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BCBC
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BCBD
 
 typedef struct PgStat_ArchiverStats
 {
@@ -289,6 +289,7 @@ typedef enum IOContext
 {
 	IOCONTEXT_BULKREAD,
 	IOCONTEXT_BULKWRITE,
+	IOCONTEXT_BYPASS,
 	IOCONTEXT_INIT,
 	IOCONTEXT_NORMAL,
 	IOCONTEXT_VACUUM,
diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
index bbb1db3c433..80e8af9aae5 100644
--- a/src/test/regress/expected/stats.out
+++ b/src/test/regress/expected/stats.out
@@ -29,9 +29,11 @@ autovacuum worker|wal|init
 autovacuum worker|wal|normal
 background worker|relation|bulkread
 background worker|relation|bulkwrite
+background worker|relation|bypass
 background worker|relation|init
 background worker|relation|normal
 background worker|relation|vacuum
+background worker|temp relation|bypass
 background worker|temp relation|normal
 background worker|wal|init
 background worker|wal|normal
@@ -45,46 +47,57 @@ checkpointer|wal|init
 checkpointer|wal|normal
 client backend|relation|bulkread
 client backend|relation|bulkwrite
+client backend|relation|bypass
 client backend|relation|init
 client backend|relation|normal
 client backend|relation|vacuum
+client backend|temp relation|bypass
 client backend|temp relation|normal
 client backend|wal|init
 client backend|wal|normal
 datachecksums launcher|relation|bulkread
 datachecksums launcher|relation|bulkwrite
+datachecksums launcher|relation|bypass
 datachecksums launcher|relation|init
 datachecksums launcher|relation|normal
 datachecksums launcher|relation|vacuum
+datachecksums launcher|temp relation|bypass
 datachecksums launcher|temp relation|normal
 datachecksums launcher|wal|init
 datachecksums launcher|wal|normal
 datachecksums worker|relation|bulkread
 datachecksums worker|relation|bulkwrite
+datachecksums worker|relation|bypass
 datachecksums worker|relation|init
 datachecksums worker|relation|normal
 datachecksums worker|relation|vacuum
+datachecksums worker|temp relation|bypass
 datachecksums worker|temp relation|normal
 datachecksums worker|wal|init
 datachecksums worker|wal|normal
 io worker|relation|bulkread
 io worker|relation|bulkwrite
+io worker|relation|bypass
 io worker|relation|init
 io worker|relation|normal
 io worker|relation|vacuum
+io worker|temp relation|bypass
 io worker|temp relation|normal
 io worker|wal|init
 io worker|wal|normal
 slotsync worker|relation|bulkread
 slotsync worker|relation|bulkwrite
+slotsync worker|relation|bypass
 slotsync worker|relation|init
 slotsync worker|relation|normal
 slotsync worker|relation|vacuum
+slotsync worker|temp relation|bypass
 slotsync worker|temp relation|normal
 slotsync worker|wal|init
 slotsync worker|wal|normal
 standalone backend|relation|bulkread
 standalone backend|relation|bulkwrite
+standalone backend|relation|bypass
 standalone backend|relation|init
 standalone backend|relation|normal
 standalone backend|relation|vacuum
@@ -92,6 +105,7 @@ standalone backend|wal|init
 standalone backend|wal|normal
 startup|relation|bulkread
 startup|relation|bulkwrite
+startup|relation|bypass
 startup|relation|init
 startup|relation|normal
 startup|relation|vacuum
@@ -101,9 +115,11 @@ walreceiver|wal|init
 walreceiver|wal|normal
 walsender|relation|bulkread
 walsender|relation|bulkwrite
+walsender|relation|bypass
 walsender|relation|init
 walsender|relation|normal
 walsender|relation|vacuum
+walsender|temp relation|bypass
 walsender|temp relation|normal
 walsender|wal|init
 walsender|wal|normal
@@ -111,7 +127,7 @@ walsummarizer|wal|init
 walsummarizer|wal|normal
 walwriter|wal|init
 walwriter|wal|normal
-(95 rows)
+(111 rows)
 \a
 -- ensure that both seqscan and indexscan plans are allowed
 SET enable_seqscan TO on;
@@ -1798,6 +1814,27 @@ SELECT :io_sum_bulkwrite_strategy_extends_after > :io_sum_bulkwrite_strategy_ext
  t
 (1 row)
 
+-- Test that writes done while building a relation by bypassing shared buffers
+-- (here, building a B-tree index) are tracked in pg_stat_io in the bypass
+-- IOContext.
+SELECT sum(extends) AS io_sum_bypass_extends_before
+  FROM pg_stat_io WHERE context = 'bypass' AND object = 'relation' \gset
+CREATE INDEX test_io_bulkwrite_strategy_idx
+  ON test_io_bulkwrite_strategy (i);
+SELECT pg_stat_force_next_flush();
+ pg_stat_force_next_flush 
+--------------------------
+ 
+(1 row)
+
+SELECT sum(extends) AS io_sum_bypass_extends_after
+  FROM pg_stat_io WHERE context = 'bypass' AND object = 'relation' \gset
+SELECT :io_sum_bypass_extends_after > :io_sum_bypass_extends_before;
+ ?column? 
+----------
+ t
+(1 row)
+
 -- Test IO stats reset
 SELECT pg_stat_have_stats('io', 0, 0);
  pg_stat_have_stats 
diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql
index 610fd21fae4..7fa67c98c4a 100644
--- a/src/test/regress/sql/stats.sql
+++ b/src/test/regress/sql/stats.sql
@@ -849,6 +849,18 @@ SELECT sum(extends) AS io_sum_bulkwrite_strategy_extends_after
   FROM pg_stat_io WHERE context = 'bulkwrite' \gset
 SELECT :io_sum_bulkwrite_strategy_extends_after > :io_sum_bulkwrite_strategy_extends_before;
 
+-- Test that writes done while building a relation by bypassing shared buffers
+-- (here, building a B-tree index) are tracked in pg_stat_io in the bypass
+-- IOContext.
+SELECT sum(extends) AS io_sum_bypass_extends_before
+  FROM pg_stat_io WHERE context = 'bypass' AND object = 'relation' \gset
+CREATE INDEX test_io_bulkwrite_strategy_idx
+  ON test_io_bulkwrite_strategy (i);
+SELECT pg_stat_force_next_flush();
+SELECT sum(extends) AS io_sum_bypass_extends_after
+  FROM pg_stat_io WHERE context = 'bypass' AND object = 'relation' \gset
+SELECT :io_sum_bypass_extends_after > :io_sum_bypass_extends_before;
+
 -- Test IO stats reset
 SELECT pg_stat_have_stats('io', 0, 0);
 SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) + sum(writebacks) + sum(hits) AS io_stats_pre_reset
-- 
2.50.1 (Apple Git-155)

