Hi,

On 2023-03-16 17:19:16 -0400, Melanie Plageman wrote:
> > > > I wonder if we should get rid of pgStatBlockReadTime, 
> > > > pgStatBlockWriteTime,
> > >
> > > And then have pg_stat_reset_shared('io') reset pg_stat_database IO
> > > stats?
> >
> > Yes.
> 
> I think this makes sense but I am hesitant to do it in this patchset,
> because it feels a bit hidden...maybe?

I'd not do it in the same commit, but I don't see a problem with doing it in
the same patchset.

Now that I think about it again, this wouldn't make pg_stat_reset_shared('io')
affect pg_stat_database - I was thinking we should use pgstat_io.c stats to
provide the information for pgstat_database.c, using its own pending counter.


> > No, I don't think I am suggesting that. What I am trying to suggest is that
> > PendingIOStats should contain instr_time, but that PgStat_IO should contain
> > PgStat_Counter as microseconds, as before.
> 
> So, I've modified the code to make a union of instr_time and
> PgStat_Counter in PgStat_BktypeIO. I am not quite sure if this is okay.
> I store in microsec and then in pg_stat_io, I multiply to get
> milliseconds for display.

Not a fan - what do we gain by having this union? It seems considerably
cleaner to have a struct local to pgstat_io.c that uses instr_time and have a
clean type in PgStat_BktypeIO.  In fact, the code worked after just changing
that.

I don't think it makes sense to have pgstat_io_start()/end() as well as
pgstat_count_io*. For one, the name seems in a "too general namespace" - why
not a pgstat_count*?


> I considered refactoring pgstat_io_end() to use INSTR_TIME_ACCUM_DIFF()
> like [1], but, in the end I actually think I would end up with more
> operations because of the various different counters needing to be
> updated. As it is now, I do a single subtract and a few adds (one for
> each of the different statistics objects tracking IO times
> (pgBufferUsage, pgStatBlockWrite/ReadTime). Whereas, I would need to do
> an accum diff for every one of those.

Right - that only INSTR_TIME_ACCUM_DIFF() only makes sense if there's just a
single counter to update.


WRT:
                                /* TODO: AFAICT, pgstat_count_buffer_write_time 
is only called */
                                /* for shared buffers whereas 
pgstat_count_buffer_read_time is */
                                /* called for temp relations and shared 
buffers. */
                                /*
                                 * is this intentional and should I match 
current behavior or
                                 * not?
                                 */

It's hard to see how that behaviour could be intentional.  Probably worth
fixing in a separate patch. I don't think we're going to backpatch, but it
would make this clearer nonetheless.


Incremental patch with some of the above changed attached.



Btw, it's quite nice how one now can attribute time more easily:

20 connections copying an 8MB file 50 times each:
SELECT reuses, evictions, writes, write_time, extends, extend_time FROM 
pg_stat_io WHERE backend_type = 'client backend' AND io_object = 'relation' AND 
io_context='bulkwrite';
┌────────┬───────────┬────────┬────────────┬─────────┬─────────────┐
│ reuses │ evictions │ writes │ write_time │ extends │ extend_time │
├────────┼───────────┼────────┼────────────┼─────────┼─────────────┤
│  36112 │         0 │  36112 │        141 │ 1523176 │        8676 │
└────────┴───────────┴────────┴────────────┴─────────┴─────────────┘

20 connections copying an 80MB file 5 times each:
┌─────────┬───────────┬─────────┬────────────┬─────────┬─────────────┐
│ reuses  │ evictions │ writes  │ write_time │ extends │ extend_time │
├─────────┼───────────┼─────────┼────────────┼─────────┼─────────────┤
│ 1318539 │         0 │ 1318539 │       5013 │ 1523339 │        7873 │
└─────────┴───────────┴─────────┴────────────┴─────────┴─────────────┘
(1 row)


Greetings,

Andres
>From 5d4aa3f6c651006f1ec960f59e24ebc8b5a8ca25 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Mon, 6 Mar 2023 10:41:51 -0500
Subject: [PATCH v6 1/2] Track IO times in pg_stat_io

Add IO timing for reads, writes, extends, and fsyncs to pg_stat_io.

Reviewed-by: Bertrand Drouvot <bertranddrouvot...@gmail.com>
Reviewed-by: Andres Freund <and...@anarazel.de>
Discussion: https://www.postgresql.org/message-id/flat/CAAKRu_ay5iKmnbXZ3DsauViF3eMxu4m1oNnJXqV_HyqYeg55Ww%40mail.gmail.com
---
 src/include/catalog/pg_proc.dat        |   6 +-
 src/include/pgstat.h                   |  14 +++-
 src/backend/catalog/system_views.sql   |   4 +
 src/backend/storage/buffer/bufmgr.c    |  56 ++++++--------
 src/backend/storage/buffer/localbuf.c  |   6 +-
 src/backend/storage/smgr/md.c          |  27 ++++---
 src/backend/utils/activity/pgstat.c    |  77 ++++++++++++++++++-
 src/backend/utils/activity/pgstat_io.c | 101 ++++++++++++++++++++-----
 src/backend/utils/adt/pgstatfuncs.c    |  48 ++++++++++--
 doc/src/sgml/monitoring.sgml           |  59 +++++++++++++++
 src/test/regress/expected/rules.out    |   6 +-
 11 files changed, 326 insertions(+), 78 deletions(-)

diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 5cf87aeb2c4..4148ec09ae7 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5744,9 +5744,9 @@
   proname => 'pg_stat_get_io', provolatile => 'v',
   prorows => '30', proretset => 't',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}',
+  proallargtypes => '{text,text,text,int8,float8,int8,float8,int8,float8,int8,int8,int8,int8,float8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{backend_type,io_object,io_context,reads,read_time,writes,write_time,extends,extend_time,op_bytes,evictions,reuses,fsyncs,fsync_time,stats_reset}',
   prosrc => 'pg_stat_get_io' },
 
 { oid => '1136', descr => 'statistics: information about WAL activity',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 1e418b682b5..288a8c7c0ef 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -304,7 +304,12 @@ typedef enum IOOp
 
 typedef struct PgStat_BktypeIO
 {
-	PgStat_Counter data[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	union
+	{
+		instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+		PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	};
 } PgStat_BktypeIO;
 
 typedef struct PgStat_IO
@@ -466,6 +471,11 @@ extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot);
 extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str);
 extern bool pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid);
 
+extern instr_time pgstat_io_start(void);
+
+extern void pgstat_io_end(instr_time io_start, IOObject io_object,
+						  IOContext io_context, IOOp io_op);
+
 
 /*
  * Functions in pgstat_archiver.c
@@ -498,6 +508,7 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
 extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *context_ops,
 										 BackendType bktype);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
+extern void pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
 extern PgStat_IO *pgstat_fetch_stat_io(void);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
@@ -507,6 +518,7 @@ extern bool pgstat_tracks_io_object(BackendType bktype,
 									IOObject io_object, IOContext io_context);
 extern bool pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 								IOContext io_context, IOOp io_op);
+extern IOOp pgstat_tracks_io_time(IOOp io_op);
 
 
 /*
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 34ca0e739f5..39391bc2fc0 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1123,12 +1123,16 @@ SELECT
        b.io_object,
        b.io_context,
        b.reads,
+       b.read_time,
        b.writes,
+       b.write_time,
        b.extends,
+       b.extend_time,
        b.op_bytes,
        b.evictions,
        b.reuses,
        b.fsyncs,
+       b.fsync_time,
        b.stats_reset
 FROM pg_stat_get_io() b;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0a05577b68d..64857f1ff68 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1000,12 +1000,17 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	if (isExtend)
 	{
+		instr_time	io_start;
+
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
+
+		io_start = pgstat_io_start();
+
 		/* don't set checksum for all-zero page */
 		smgrextend(smgr, forkNum, blockNum, bufBlock, false);
 
-		pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
+		pgstat_io_end(io_start, io_object, io_context, IOOP_EXTEND);
 
 		/*
 		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
@@ -1024,25 +1029,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			MemSet((char *) bufBlock, 0, BLCKSZ);
 		else
 		{
-			instr_time	io_start,
-						io_time;
+			instr_time	io_start;
 
-			if (track_io_timing)
-				INSTR_TIME_SET_CURRENT(io_start);
-			else
-				INSTR_TIME_SET_ZERO(io_start);
+			io_start = pgstat_io_start();
 
 			smgrread(smgr, forkNum, blockNum, bufBlock);
 
-			pgstat_count_io_op(io_object, io_context, IOOP_READ);
-
-			if (track_io_timing)
-			{
-				INSTR_TIME_SET_CURRENT(io_time);
-				INSTR_TIME_SUBTRACT(io_time, io_start);
-				pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-			}
+			pgstat_io_end(io_start, io_object, io_context, IOOP_READ);
 
 			/* check for garbage data */
 			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
@@ -2848,6 +2841,7 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
 	*blknum = bufHdr->tag.blockNum;
 }
 
+
 /*
  * FlushBuffer
  *		Physically write out a shared buffer.
@@ -2873,8 +2867,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 {
 	XLogRecPtr	recptr;
 	ErrorContextCallback errcallback;
-	instr_time	io_start,
-				io_time;
+	instr_time	io_start;
 	Block		bufBlock;
 	char	   *bufToWrite;
 	uint32		buf_state;
@@ -2949,10 +2942,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 */
 	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
-	if (track_io_timing)
-		INSTR_TIME_SET_CURRENT(io_start);
-	else
-		INSTR_TIME_SET_ZERO(io_start);
+	io_start = pgstat_io_start();
 
 	/*
 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -2981,17 +2971,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * When a strategy is not in use, the write can only be a "regular" write
 	 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
 	 */
-	pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
-
-	if (track_io_timing)
-	{
-		INSTR_TIME_SET_CURRENT(io_time);
-		INSTR_TIME_SUBTRACT(io_time, io_start);
-		pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-		INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-	}
-
-	pgBufferUsage.shared_blks_written++;
+	pgstat_io_end(io_start, IOOBJECT_RELATION, io_context, IOOP_WRITE);
 
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
@@ -3597,6 +3577,7 @@ FlushRelationBuffers(Relation rel)
 		for (i = 0; i < NLocBuffer; i++)
 		{
 			uint32		buf_state;
+			instr_time	io_start;
 
 			bufHdr = GetLocalBufferDescriptor(i);
 			if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
@@ -3616,6 +3597,12 @@ FlushRelationBuffers(Relation rel)
 
 				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
+				/*
+				 * TODO: is it okay that this formerly didn't count
+				 * pgBufferUsage or pgStatBlockReadTime
+				 */
+				io_start = pgstat_io_start();
+
 				smgrwrite(RelationGetSmgr(rel),
 						  BufTagGetForkNum(&bufHdr->tag),
 						  bufHdr->tag.blockNum,
@@ -3625,13 +3612,14 @@ FlushRelationBuffers(Relation rel)
 				buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-				pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+				pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
 			}
 		}
 
+
 		return;
 	}
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 5325ddb663d..af45a72934a 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -220,6 +220,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 */
 	if (buf_state & BM_DIRTY)
 	{
+		instr_time	io_start;
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
@@ -228,6 +229,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
+		io_start = pgstat_io_start();
+
 		/* And write... */
 		smgrwrite(oreln,
 				  BufTagGetForkNum(&bufHdr->tag),
@@ -239,8 +242,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		buf_state &= ~BM_DIRTY;
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-		pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
-		pgBufferUsage.local_blks_written++;
+		pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
 	}
 
 	/*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 352958e1feb..2440211f03d 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1030,6 +1030,17 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
 	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
 	{
+		instr_time	io_start = pgstat_io_start();
+
+		ereport(DEBUG1,
+				(errmsg_internal("could not forward fsync request because request queue is full")));
+
+		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+			ereport(data_sync_elevel(ERROR),
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							FilePathName(seg->mdfd_vfd))));
+
 		/*
 		 * We have no way of knowing if the current IOContext is
 		 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
@@ -1041,16 +1052,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 		 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
 		 * backend fsyncs.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
-
-		ereport(DEBUG1,
-				(errmsg_internal("could not forward fsync request because request queue is full")));
-
-		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-			ereport(data_sync_elevel(ERROR),
-					(errcode_for_file_access(),
-					 errmsg("could not fsync file \"%s\": %m",
-							FilePathName(seg->mdfd_vfd))));
+		pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
 	}
 }
 
@@ -1399,6 +1401,7 @@ int
 mdsyncfiletag(const FileTag *ftag, char *path)
 {
 	SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
+	instr_time	io_start;
 	File		file;
 	bool		need_to_close;
 	int			result,
@@ -1425,6 +1428,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		need_to_close = true;
 	}
 
+	io_start = pgstat_io_start();
+
 	/* Sync the file. */
 	result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
 	save_errno = errno;
@@ -1432,7 +1437,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 	if (need_to_close)
 		FileClose(file);
 
-	pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+	pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
 
 	errno = save_errno;
 	return result;
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index b125802b215..9f5f041d7c2 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -95,9 +95,11 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
+#include "executor/instrument.h"
 #include "lib/dshash.h"
 #include "pgstat.h"
 #include "port/atomics.h"
+#include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
@@ -108,7 +110,6 @@
 #include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 
-
 /* ----------
  * Timer definitions.
  *
@@ -906,6 +907,80 @@ pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
 	return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL;
 }
 
+instr_time
+pgstat_io_start(void)
+{
+	instr_time	io_start;
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+	else
+		INSTR_TIME_SET_ZERO(io_start);
+
+	return io_start;
+}
+
+void
+pgstat_io_end(instr_time io_start, IOObject io_object,
+			  IOContext io_context, IOOp io_op)
+{
+	instr_time	io_time;
+
+	if (track_io_timing)
+	{
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, io_start);
+		pgstat_count_io_time(io_object, io_context, io_op, io_time);
+
+		if (io_op == IOOP_WRITE)
+		{
+			if (io_object == IOOBJECT_RELATION)
+			{
+				/* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
+				/* for shared buffers whereas pgstat_count_buffer_read_time is */
+				/* called for temp relations and shared buffers. */
+				/*
+				 * is this intentional and should I match current behavior or
+				 * not?
+				 */
+				pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+			}
+		}
+		else if (io_op == IOOP_READ)
+		{
+			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+			{
+				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+			}
+		}
+	}
+
+	pgstat_count_io_op(io_object, io_context, io_op);
+
+	if (io_op == IOOP_WRITE)
+	{
+		if (io_object == IOOBJECT_RELATION)
+			pgBufferUsage.shared_blks_written++;
+		else if (io_object == IOOBJECT_TEMP_RELATION)
+			pgBufferUsage.local_blks_written++;
+	}
+
+	/*
+	 * TODO: this is normally done later in ReadBuffer_common() is it okay to
+	 * do here?
+	 */
+	else if (io_op == IOOP_READ)
+	{
+		if (io_object == IOOBJECT_RELATION)
+			pgBufferUsage.shared_blks_read++;
+		else if (io_object == IOOBJECT_TEMP_RELATION)
+			pgBufferUsage.local_blks_read++;
+	}
+}
+
+
 /*
  * Ensure snapshot for fixed-numbered 'kind' exists.
  *
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index af5d5546101..fe09cbf16d4 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -25,36 +25,47 @@ bool		have_iostats = false;
 
 /*
  * Check that stats have not been counted for any combination of IOObject,
- * IOContext, and IOOp which are not tracked for the passed-in BackendType. The
- * passed-in PgStat_BktypeIO must contain stats from the BackendType specified
- * by the second parameter. Caller is responsible for locking the passed-in
- * PgStat_BktypeIO, if needed.
+ * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
+ * the IOOp is not counted for this combination but IO time is otherwise
+ * tracked for this IOOp, check that IO time has not been counted for this
+ * combination. If stats are tracked for this combination and IO times are
+ * non-zero, counts should be non-zero.
+ *
+ * The passed-in PgStat_BktypeIO must contain stats from the BackendType
+ * specified by the second parameter. Caller is responsible for locking the
+ * passed-in PgStat_BktypeIO, if needed.
  */
 bool
 pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 							 BackendType bktype)
 {
-	bool		bktype_tracked = pgstat_tracks_io_bktype(bktype);
-
 	for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
 	{
 		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 		{
-			/*
-			 * Don't bother trying to skip to the next loop iteration if
-			 * pgstat_tracks_io_object() would return false here. We still
-			 * need to validate that each counter is zero anyway.
-			 */
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 			{
-				/* No stats, so nothing to validate */
-				if (backend_io->data[io_object][io_context][io_op] == 0)
-					continue;
+				/* we do track it */
+				if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
+				{
+					/* ensure that if IO times are non-zero, counts are > 0 */
+					if (backend_io->times[io_object][io_context][io_op] != 0 &&
+						backend_io->counts[io_object][io_context][io_op] <= 0)
+						return false;
 
-				/* There are stats and there shouldn't be */
-				if (!bktype_tracked ||
-					!pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
+					continue;
+				}
+
+				/* we don't track it, and it is not 0 */
+				if (backend_io->counts[io_object][io_context][io_op] != 0)
 					return false;
+
+				/* we don't track this IOOp, so make sure its IO time is zero */
+				if (pgstat_tracks_io_time(io_op) > -1)
+				{
+					if (backend_io->times[io_object][io_context][io_op] != 0)
+						return false;
+				}
 			}
 		}
 	}
@@ -70,7 +81,21 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 	Assert((unsigned int) io_op < IOOP_NUM_TYPES);
 	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
-	PendingIOStats.data[io_object][io_context][io_op]++;
+	PendingIOStats.counts[io_object][io_context][io_op]++;
+
+	have_iostats = true;
+}
+
+void
+pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time)
+{
+	Assert(io_object < IOOBJECT_NUM_TYPES);
+	Assert(io_context < IOCONTEXT_NUM_TYPES);
+	Assert(io_op < IOOP_NUM_TYPES);
+	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
+	Assert(pgstat_tracks_io_time(io_op) != -1);
+
+	INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], time);
 
 	have_iostats = true;
 }
@@ -114,8 +139,17 @@ pgstat_flush_io(bool nowait)
 		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 		{
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
-				bktype_shstats->data[io_object][io_context][io_op] +=
-					PendingIOStats.data[io_object][io_context][io_op];
+			{
+				instr_time	time;
+
+				bktype_shstats->counts[io_object][io_context][io_op] +=
+					PendingIOStats.counts[io_object][io_context][io_op];
+
+				time = PendingIOStats.pending_times[io_object][io_context][io_op];
+
+				bktype_shstats->times[io_object][io_context][io_op] +=
+					INSTR_TIME_GET_MICROSEC(time);
+			}
 		}
 	}
 
@@ -384,3 +418,30 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 
 	return true;
 }
+
+/*
+ * PgStat_BktypeIO->times contains IO times for IOOps. For simplicity this
+ * array has a spot for every IOOp. pgstat_tracks_io_time() is the source of
+ * truth for which IOOps have corresponding IO times.
+ */
+IOOp
+pgstat_tracks_io_time(IOOp io_op)
+{
+	switch (io_op)
+	{
+		case IOOP_READ:
+			return IOOP_READ;
+		case IOOP_WRITE:
+			return IOOP_WRITE;
+		case IOOP_EXTEND:
+			return IOOP_EXTEND;
+		case IOOP_FSYNC:
+			return IOOP_FSYNC;
+		case IOOP_EVICT:
+		case IOOP_REUSE:
+			return -1;
+	}
+
+	elog(ERROR, "unrecognized IOOp value: %d", io_op);
+	pg_unreachable();
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 35c6d465553..60fdda1a147 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1255,12 +1255,16 @@ typedef enum io_stat_col
 	IO_COL_IO_OBJECT,
 	IO_COL_IO_CONTEXT,
 	IO_COL_READS,
+	IO_COL_READ_TIME,
 	IO_COL_WRITES,
+	IO_COL_WRITE_TIME,
 	IO_COL_EXTENDS,
+	IO_COL_EXTEND_TIME,
 	IO_COL_CONVERSION,
 	IO_COL_EVICTIONS,
 	IO_COL_REUSES,
 	IO_COL_FSYNCS,
+	IO_COL_FSYNC_TIME,
 	IO_COL_RESET_TIME,
 	IO_NUM_COLUMNS,
 } io_stat_col;
@@ -1292,6 +1296,28 @@ pgstat_get_io_op_index(IOOp io_op)
 	pg_unreachable();
 }
 
+/*
+ * Get the number of the column containing IO times for the specified IOOp. If
+ * the specified IOOp is one for which IO time is not tracked, return -1. Note
+ * that this function assumes that IO time for an IOOp is displayed in the view
+ * in the column directly after the IOOp counts.
+ */
+static io_stat_col
+pgstat_get_io_time_index(IOOp io_op)
+{
+	if (pgstat_tracks_io_time(io_op) == -1)
+		return -1;
+
+	return pgstat_get_io_op_index(io_op) + 1;
+}
+
+static inline
+PgStat_Counter
+pg_stat_micro_to_millisecs(PgStat_Counter val_microsec)
+{
+	return val_microsec * 0.001;
+}
+
 Datum
 pg_stat_get_io(PG_FUNCTION_ARGS)
 {
@@ -1359,20 +1385,32 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 
 				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 				{
-					int			col_idx = pgstat_get_io_op_index(io_op);
+					PgStat_Counter count = bktype_stats->counts[io_obj][io_context][io_op];
+					int			i = pgstat_get_io_op_index(io_op);
 
 					/*
 					 * Some combinations of BackendType and IOOp, of IOContext
 					 * and IOOp, and of IOObject and IOOp are not tracked. Set
 					 * these cells in the view NULL.
 					 */
-					nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op);
+					if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op))
+						values[i] = Int64GetDatum(count);
+					else
+						nulls[i] = true;
+				}
 
-					if (nulls[col_idx])
+				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
+				{
+					PgStat_Counter time = bktype_stats->times[io_obj][io_context][io_op];
+					int			i = pgstat_get_io_time_index(io_op);
+
+					if (i == -1)
 						continue;
 
-					values[col_idx] =
-						Int64GetDatum(bktype_stats->data[io_obj][io_context][io_op]);
+					if (!nulls[pgstat_get_io_op_index(io_op)])
+						values[i] = Float8GetDatum(pg_stat_micro_to_millisecs(time));
+					else
+						nulls[i] = true;
 				}
 
 				tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 6249bb50d02..ad3667f2585 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3814,6 +3814,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>read_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in read operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3826,6 +3838,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>write_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in write operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3838,6 +3862,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>extend_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in extend operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3902,6 +3938,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>fsync_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in fsync operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3967,6 +4015,17 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
    </itemizedlist>
   </para>
 
+  <note>
+   <para>
+    Columns tracking I/O time will only be non-zero when <xref
+    linkend="guc-track-io-timing"/> is enabled. The user should be careful when
+    using these columns in combination with their corresponding operations to
+    ensure that <varname>track_io_timing</varname> was enabled for the entire
+    time since the last reset.
+   </para>
+  </note>
+
+
 
  </sect2>
 
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index e953d1f5159..54348513145 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1880,14 +1880,18 @@ pg_stat_io| SELECT backend_type,
     io_object,
     io_context,
     reads,
+    read_time,
     writes,
+    write_time,
     extends,
+    extend_time,
     op_bytes,
     evictions,
     reuses,
     fsyncs,
+    fsync_time,
     stats_reset
-   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset);
+   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, read_time, writes, write_time, extends, extend_time, op_bytes, evictions, reuses, fsyncs, fsync_time, stats_reset);
 pg_stat_progress_analyze| SELECT s.pid,
     s.datid,
     d.datname,
-- 
2.38.0

>From f4f1534c105766c64534810b62eba7fea674c2a5 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 20 Mar 2023 19:26:41 -0700
Subject: [PATCH v6 2/2] pgstat: io: increment pgstat_io hackery

---
 src/include/pgstat.h                   | 15 ++---
 src/backend/storage/buffer/bufmgr.c    | 21 ++++---
 src/backend/storage/buffer/localbuf.c  |  5 +-
 src/backend/storage/smgr/md.c          | 12 ++--
 src/backend/utils/activity/pgstat.c    | 77 +-------------------------
 src/backend/utils/activity/pgstat_io.c | 70 ++++++++++++++++++++---
 6 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 288a8c7c0ef..4adbf29fdfd 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -305,11 +305,7 @@ typedef enum IOOp
 typedef struct PgStat_BktypeIO
 {
 	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-	union
-	{
-		instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-		PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-	};
+	PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_BktypeIO;
 
 typedef struct PgStat_IO
@@ -471,11 +467,6 @@ extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot);
 extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str);
 extern bool pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid);
 
-extern instr_time pgstat_io_start(void);
-
-extern void pgstat_io_end(instr_time io_start, IOObject io_object,
-						  IOContext io_context, IOOp io_op);
-
 
 /*
  * Functions in pgstat_archiver.c
@@ -507,8 +498,10 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
 
 extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *context_ops,
 										 BackendType bktype);
+extern instr_time pgstat_prepare_io_time(void);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
-extern void pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
+extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
+
 extern PgStat_IO *pgstat_fetch_stat_io(void);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 64857f1ff68..532c7af992d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1005,12 +1005,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
 
-		io_start = pgstat_io_start();
+		io_start = pgstat_prepare_io_time();
 
 		/* don't set checksum for all-zero page */
 		smgrextend(smgr, forkNum, blockNum, bufBlock, false);
 
-		pgstat_io_end(io_start, io_object, io_context, IOOP_EXTEND);
+		pgstat_count_io_op_time(io_object, io_context,
+								IOOP_EXTEND, io_start);
 
 		/*
 		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
@@ -1031,11 +1032,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		{
 			instr_time	io_start;
 
-			io_start = pgstat_io_start();
+			io_start = pgstat_prepare_io_time();
 
 			smgrread(smgr, forkNum, blockNum, bufBlock);
 
-			pgstat_io_end(io_start, io_object, io_context, IOOP_READ);
+			pgstat_count_io_op_time(io_object, io_context,
+									IOOP_READ, io_start);
 
 			/* check for garbage data */
 			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
@@ -2942,7 +2944,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 */
 	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
-	io_start = pgstat_io_start();
+	io_start = pgstat_prepare_io_time();
 
 	/*
 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -2971,7 +2973,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * When a strategy is not in use, the write can only be a "regular" write
 	 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
 	 */
-	pgstat_io_end(io_start, IOOBJECT_RELATION, io_context, IOOP_WRITE);
+	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
+							IOOP_WRITE, io_start);
 
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
@@ -3601,7 +3604,7 @@ FlushRelationBuffers(Relation rel)
 				 * TODO: is it okay that this formerly didn't count
 				 * pgBufferUsage or pgStatBlockReadTime
 				 */
-				io_start = pgstat_io_start();
+				io_start = pgstat_prepare_io_time();
 
 				smgrwrite(RelationGetSmgr(rel),
 						  BufTagGetForkNum(&bufHdr->tag),
@@ -3612,7 +3615,9 @@ FlushRelationBuffers(Relation rel)
 				buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-				pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+				pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
+										IOCONTEXT_NORMAL, IOOP_WRITE,
+										io_start);
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index af45a72934a..23f2e2fb759 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -229,7 +229,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-		io_start = pgstat_io_start();
+		io_start = pgstat_prepare_io_time();
 
 		/* And write... */
 		smgrwrite(oreln,
@@ -242,7 +242,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		buf_state &= ~BM_DIRTY;
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-		pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+		pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
+								IOOP_WRITE, io_start);
 	}
 
 	/*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2440211f03d..51a9134d57f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1030,11 +1030,13 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
 	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
 	{
-		instr_time	io_start = pgstat_io_start();
+		instr_time	io_start;
 
 		ereport(DEBUG1,
 				(errmsg_internal("could not forward fsync request because request queue is full")));
 
+		io_start = pgstat_prepare_io_time();
+
 		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
@@ -1052,7 +1054,8 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 		 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
 		 * backend fsyncs.
 		 */
-		pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+		pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+								IOOP_FSYNC, io_start);
 	}
 }
 
@@ -1428,7 +1431,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		need_to_close = true;
 	}
 
-	io_start = pgstat_io_start();
+	io_start = pgstat_prepare_io_time();
 
 	/* Sync the file. */
 	result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
@@ -1437,7 +1440,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 	if (need_to_close)
 		FileClose(file);
 
-	pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+	pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+							IOOP_FSYNC, io_start);
 
 	errno = save_errno;
 	return result;
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 9f5f041d7c2..b125802b215 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -95,11 +95,9 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
-#include "executor/instrument.h"
 #include "lib/dshash.h"
 #include "pgstat.h"
 #include "port/atomics.h"
-#include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
@@ -110,6 +108,7 @@
 #include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 
+
 /* ----------
  * Timer definitions.
  *
@@ -907,80 +906,6 @@ pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
 	return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL;
 }
 
-instr_time
-pgstat_io_start(void)
-{
-	instr_time	io_start;
-
-	if (track_io_timing)
-		INSTR_TIME_SET_CURRENT(io_start);
-	else
-		INSTR_TIME_SET_ZERO(io_start);
-
-	return io_start;
-}
-
-void
-pgstat_io_end(instr_time io_start, IOObject io_object,
-			  IOContext io_context, IOOp io_op)
-{
-	instr_time	io_time;
-
-	if (track_io_timing)
-	{
-		INSTR_TIME_SET_CURRENT(io_time);
-		INSTR_TIME_SUBTRACT(io_time, io_start);
-		pgstat_count_io_time(io_object, io_context, io_op, io_time);
-
-		if (io_op == IOOP_WRITE)
-		{
-			if (io_object == IOOBJECT_RELATION)
-			{
-				/* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
-				/* for shared buffers whereas pgstat_count_buffer_read_time is */
-				/* called for temp relations and shared buffers. */
-				/*
-				 * is this intentional and should I match current behavior or
-				 * not?
-				 */
-				pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-			}
-		}
-		else if (io_op == IOOP_READ)
-		{
-			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-			if (io_object == IOOBJECT_RELATION)
-			{
-				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-			}
-		}
-	}
-
-	pgstat_count_io_op(io_object, io_context, io_op);
-
-	if (io_op == IOOP_WRITE)
-	{
-		if (io_object == IOOBJECT_RELATION)
-			pgBufferUsage.shared_blks_written++;
-		else if (io_object == IOOBJECT_TEMP_RELATION)
-			pgBufferUsage.local_blks_written++;
-	}
-
-	/*
-	 * TODO: this is normally done later in ReadBuffer_common() is it okay to
-	 * do here?
-	 */
-	else if (io_op == IOOP_READ)
-	{
-		if (io_object == IOOBJECT_RELATION)
-			pgBufferUsage.shared_blks_read++;
-		else if (io_object == IOOBJECT_TEMP_RELATION)
-			pgBufferUsage.local_blks_read++;
-	}
-}
-
-
 /*
  * Ensure snapshot for fixed-numbered 'kind' exists.
  *
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index fe09cbf16d4..08888fa0c5b 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -16,10 +16,19 @@
 
 #include "postgres.h"
 
+#include "executor/instrument.h"
+#include "storage/bufmgr.h"
 #include "utils/pgstat_internal.h"
 
 
-static PgStat_BktypeIO PendingIOStats;
+typedef struct PgStat_PendingIO
+{
+	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+} PgStat_PendingIO;
+
+
+static PgStat_PendingIO PendingIOStats;
 bool		have_iostats = false;
 
 
@@ -73,6 +82,19 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 	return true;
 }
 
+instr_time
+pgstat_prepare_io_time(void)
+{
+	instr_time	io_start;
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+	else
+		INSTR_TIME_SET_ZERO(io_start);
+
+	return io_start;
+}
+
 void
 pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 {
@@ -86,18 +108,48 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 	have_iostats = true;
 }
 
+/*
+ * Like pgstat_count_io_op() except it also accumulates time.
+ */
 void
-pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time)
+pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
+						instr_time start_time)
 {
-	Assert(io_object < IOOBJECT_NUM_TYPES);
-	Assert(io_context < IOCONTEXT_NUM_TYPES);
-	Assert(io_op < IOOP_NUM_TYPES);
-	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
-	Assert(pgstat_tracks_io_time(io_op) != -1);
+	if (track_io_timing)
+	{
+		instr_time	io_time;
 
-	INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], time);
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, start_time);
 
-	have_iostats = true;
+		if (io_op == IOOP_WRITE)
+		{
+			if (io_object == IOOBJECT_RELATION)
+			{
+				/* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
+				/* for shared buffers whereas pgstat_count_buffer_read_time is */
+				/* called for temp relations and shared buffers. */
+				/*
+				 * is this intentional and should I match current behavior or
+				 * not?
+				 */
+				pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+			}
+		}
+		else if (io_op == IOOP_READ)
+		{
+			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+			{
+				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+			}
+		}
+
+		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], io_time);
+	}
+
+	pgstat_count_io_op(io_object, io_context, io_op);
 }
 
 PgStat_IO *
-- 
2.38.0

Reply via email to