On Mon, Mar 20, 2023 at 10:34 PM Andres Freund <and...@anarazel.de> wrote:
>
> Hi,
>
> On 2023-03-16 17:19:16 -0400, Melanie Plageman wrote:
> > > > > I wonder if we should get rid of pgStatBlockReadTime, 
> > > > > pgStatBlockWriteTime,
> > > >
> > > > And then have pg_stat_reset_shared('io') reset pg_stat_database IO
> > > > stats?
> > >
> > > Yes.
> >
> > I think this makes sense but I am hesitant to do it in this patchset,
> > because it feels a bit hidden...maybe?
>
> I'd not do it in the same commit, but I don't see a problem with doing it in
> the same patchset.
>
> Now that I think about it again, this wouldn't make pg_stat_reset_shared('io')
> affect pg_stat_database - I was thinking we should use pgstat_io.c stats to
> provide the information for pgstat_database.c, using its own pending counter.

So, I've done this in the attached. But won't resetting pgstat_database
be a bit weird if you have built up some IO timing in the pending
counters and a flush happens right after the reset, so that the values
are suddenly way above 0 again?
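
For example (a purely hypothetical sequence, just to illustrate the
ordering -- the table name and sizes are made up, and the effect depends
on the temp table being large enough to force local buffer writes):

    SET track_io_timing = on;
    BEGIN;
    -- builds up unflushed write timing in this backend's pending counters
    CREATE TEMP TABLE tt AS SELECT g FROM generate_series(1, 1000000) g;
    SELECT pg_stat_reset();
    COMMIT;
    -- once this backend next flushes its stats (normally soon after the
    -- transaction ends), the pre-reset pending timings are added back in,
    -- so this can already be well above zero right after the reset:
    SELECT blk_write_time FROM pg_stat_database
      WHERE datname = current_database();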

> > I considered refactoring pgstat_io_end() to use INSTR_TIME_ACCUM_DIFF()
> > like [1], but in the end I actually think I would end up with more
> > operations because of the different counters needing to be updated. As
> > it is now, I do a single subtract and a few adds (one for each of the
> > different statistics objects tracking IO times (pgBufferUsage,
> > pgStatBlockWrite/ReadTime)). Whereas I would need to do an accum diff
> > for every one of those.
>
> Right - INSTR_TIME_ACCUM_DIFF() only makes sense if there's just a
> single counter to update.
>
>
> WRT:
>     /* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
>     /* for shared buffers whereas pgstat_count_buffer_read_time is */
>     /* called for temp relations and shared buffers. */
>     /*
>      * is this intentional and should I match current behavior or
>      * not?
>      */
>
> It's hard to see how that behaviour could be intentional.  Probably worth
> fixing in a separate patch. I don't think we're going to backpatch, but it
> would make this clearer nonetheless.


Attached v7 does this in separate commits.

Remaining feedback is about FlushLocalBuffers(). Is the idea simply to
get it into bufmgr.c because that is cleaner from an API perspective?
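
If so, is the below roughly the shape you had in mind? (Just a sketch --
the name, the exact signature, and where it lives are my assumptions;
the body would simply be the existing local-buffer loop moved as-is.)

    static void
    FlushLocalBuffers(Relation rel)
    {
        /*
         * The existing NLocBuffer loop from FlushRelationBuffers(),
         * unchanged: find dirty local buffers belonging to rel, write them
         * out with smgrwrite(), and count the IO as the patch does now.
         */
    }

    void
    FlushRelationBuffers(Relation rel)
    {
        if (RelationUsesLocalBuffers(rel))
        {
            FlushLocalBuffers(rel);
            return;
        }

        /* ... shared buffer path, unchanged ... */
    }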

- Melanie
From a7ba3cce6dbbde49efa5b20e2db5cd49c259d3ad Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 21 Mar 2023 18:20:44 -0400
Subject: [PATCH v7 2/4] FlushRelationBuffers() counts temp relation IO timing

Add pgstat_database and pgBufferUsage IO timing counting to
FlushRelationBuffers() for writes of temporary relations.
---
 src/backend/storage/buffer/bufmgr.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0a05577b68..dea2e8fe40 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3591,6 +3591,8 @@ FlushRelationBuffers(Relation rel)
 {
 	int			i;
 	BufferDesc *bufHdr;
+	instr_time	io_start,
+				io_time;
 
 	if (RelationUsesLocalBuffers(rel))
 	{
@@ -3616,17 +3618,33 @@ FlushRelationBuffers(Relation rel)
 
 				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
+				if (track_io_timing)
+					INSTR_TIME_SET_CURRENT(io_start);
+				else
+					INSTR_TIME_SET_ZERO(io_start);
+
 				smgrwrite(RelationGetSmgr(rel),
 						  BufTagGetForkNum(&bufHdr->tag),
 						  bufHdr->tag.blockNum,
 						  localpage,
 						  false);
 
+
 				buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
 				pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
 
+				if (track_io_timing)
+				{
+					INSTR_TIME_SET_CURRENT(io_time);
+					INSTR_TIME_SUBTRACT(io_time, io_start);
+					pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+					INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+				}
+
+				pgBufferUsage.local_blks_written++;
+
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
 			}
-- 
2.37.2

From 1088cde0ea0d39b7e55cd919f4c9151136f36e28 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 21 Mar 2023 16:00:55 -0400
Subject: [PATCH v7 1/4] Count IO time for temp relation writes

Both pgstat_database and pgBufferUsage write times failed to count
timing for flushes of dirty local buffers when acquiring a new local
buffer for a temporary relation block.
---
 src/backend/storage/buffer/localbuf.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 5325ddb663..80510411ae 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -114,6 +114,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	LocalBufferLookupEnt *hresult;
 	BufferDesc *bufHdr;
 	int			b;
+	instr_time	io_start,
+				io_time;
 	int			trycounter;
 	bool		found;
 	uint32		buf_state;
@@ -228,6 +230,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+		else
+			INSTR_TIME_SET_ZERO(io_start);
+
 		/* And write... */
 		smgrwrite(oreln,
 				  BufTagGetForkNum(&bufHdr->tag),
@@ -240,6 +247,15 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
 		pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+
+		if (track_io_timing)
+		{
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_SUBTRACT(io_time, io_start);
+			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+			INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+		}
+
 		pgBufferUsage.local_blks_written++;
 	}
 
-- 
2.37.2

From 664c25813452060536a5f4afe49feae44ed17187 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 21 Mar 2023 20:36:10 -0400
Subject: [PATCH v7 4/4] pgstat_database uses pgstat_io time counters

Use pgstat_io's pending counters to increment pgStatBlockWriteTime and
pgStatBlockReadTime.
---
 src/backend/utils/activity/pgstat_io.c | 14 ++++++++------
 src/include/pgstat.h                   |  4 ----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 0cf4b78947..c22c178078 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -124,13 +124,11 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 
 		if (io_op == IOOP_WRITE)
 		{
-			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
 			if (io_object == IOOBJECT_RELATION)
 				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
 		}
 		else if (io_op == IOOP_READ)
 		{
-			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
 			if (io_object == IOOBJECT_RELATION)
 				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 		}
@@ -181,15 +179,19 @@ pgstat_flush_io(bool nowait)
 		{
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 			{
-				instr_time	time;
+				PgStat_Counter time;
 
 				bktype_shstats->counts[io_object][io_context][io_op] +=
 					PendingIOStats.counts[io_object][io_context][io_op];
 
-				time = PendingIOStats.pending_times[io_object][io_context][io_op];
+				time = INSTR_TIME_GET_MICROSEC(PendingIOStats.pending_times[io_object][io_context][io_op]);
 
-				bktype_shstats->times[io_object][io_context][io_op] +=
-					INSTR_TIME_GET_MICROSEC(time);
+				bktype_shstats->times[io_object][io_context][io_op] += time;
+
+				if (io_op == IOOP_WRITE)
+					pgStatBlockWriteTime += time;
+				else if (io_op == IOOP_READ)
+					pgStatBlockReadTime += time;
 			}
 		}
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 4adbf29fdf..3904182075 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -526,10 +526,6 @@ extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
 extern void pgstat_report_checksum_failure(void);
 extern void pgstat_report_connect(Oid dboid);
 
-#define pgstat_count_buffer_read_time(n)							\
-	(pgStatBlockReadTime += (n))
-#define pgstat_count_buffer_write_time(n)							\
-	(pgStatBlockWriteTime += (n))
 #define pgstat_count_conn_active_time(n)							\
 	(pgStatActiveTime += (n))
 #define pgstat_count_conn_txn_idle_time(n)							\
-- 
2.37.2

From 12ce42fc1093ac37736e98ccb7d38ff030f435c2 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 21 Mar 2023 19:02:01 -0400
Subject: [PATCH v7 3/4] Track IO times in pg_stat_io

Add IO timing for reads, writes, extends, and fsyncs to pg_stat_io.

Reviewed-by: Bertrand Drouvot <bertranddrouvot...@gmail.com>
Reviewed-by: Andres Freund <and...@anarazel.de>
Discussion: https://www.postgresql.org/message-id/flat/CAAKRu_ay5iKmnbXZ3DsauViF3eMxu4m1oNnJXqV_HyqYeg55Ww%40mail.gmail.com
---
 doc/src/sgml/monitoring.sgml           |  59 ++++++++++
 src/backend/catalog/system_views.sql   |   4 +
 src/backend/storage/buffer/bufmgr.c    |  69 ++++--------
 src/backend/storage/buffer/localbuf.c  |  20 +---
 src/backend/storage/smgr/md.c          |  31 ++++--
 src/backend/utils/activity/pgstat_io.c | 142 +++++++++++++++++++++----
 src/backend/utils/adt/pgstatfuncs.c    |  48 ++++++++-
 src/include/catalog/pg_proc.dat        |   6 +-
 src/include/pgstat.h                   |   7 +-
 src/test/regress/expected/rules.out    |   6 +-
 10 files changed, 288 insertions(+), 104 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 6249bb50d0..ad3667f258 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3814,6 +3814,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>read_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in read operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3826,6 +3838,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>write_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in write operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3838,6 +3862,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>extend_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in extend operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3902,6 +3938,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>fsync_time</structfield> <type>double precision</type>
+       </para>
+       <para>
+        Time spent in fsync operations in milliseconds (if <xref
+        linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
@@ -3967,6 +4015,17 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
    </itemizedlist>
   </para>
 
+  <note>
+   <para>
+    Columns tracking I/O time will only be non-zero when <xref
+    linkend="guc-track-io-timing"/> is enabled. The user should be careful when
+    using these columns in combination with their corresponding operations to
+    ensure that <varname>track_io_timing</varname> was enabled for the entire
+    time since the last reset.
+   </para>
+  </note>
+
+
 
  </sect2>
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 34ca0e739f..39391bc2fc 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1123,12 +1123,16 @@ SELECT
        b.io_object,
        b.io_context,
        b.reads,
+       b.read_time,
        b.writes,
+       b.write_time,
        b.extends,
+       b.extend_time,
        b.op_bytes,
        b.evictions,
        b.reuses,
        b.fsyncs,
+       b.fsync_time,
        b.stats_reset
 FROM pg_stat_get_io() b;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dea2e8fe40..4467a24853 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1000,12 +1000,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	if (isExtend)
 	{
+		instr_time	io_start;
+
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
+
+		io_start = pgstat_prepare_io_time();
+
 		/* don't set checksum for all-zero page */
 		smgrextend(smgr, forkNum, blockNum, bufBlock, false);
 
-		pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
+		pgstat_count_io_op_time(io_object, io_context,
+								IOOP_EXTEND, io_start);
 
 		/*
 		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
@@ -1024,25 +1030,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			MemSet((char *) bufBlock, 0, BLCKSZ);
 		else
 		{
-			instr_time	io_start,
-						io_time;
+			instr_time	io_start;
 
-			if (track_io_timing)
-				INSTR_TIME_SET_CURRENT(io_start);
-			else
-				INSTR_TIME_SET_ZERO(io_start);
+			io_start = pgstat_prepare_io_time();
 
 			smgrread(smgr, forkNum, blockNum, bufBlock);
 
-			pgstat_count_io_op(io_object, io_context, IOOP_READ);
-
-			if (track_io_timing)
-			{
-				INSTR_TIME_SET_CURRENT(io_time);
-				INSTR_TIME_SUBTRACT(io_time, io_start);
-				pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-			}
+			pgstat_count_io_op_time(io_object, io_context,
+									IOOP_READ, io_start);
 
 			/* check for garbage data */
 			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
@@ -2848,6 +2843,7 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
 	*blknum = bufHdr->tag.blockNum;
 }
 
+
 /*
  * FlushBuffer
  *		Physically write out a shared buffer.
@@ -2873,8 +2869,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 {
 	XLogRecPtr	recptr;
 	ErrorContextCallback errcallback;
-	instr_time	io_start,
-				io_time;
+	instr_time	io_start;
 	Block		bufBlock;
 	char	   *bufToWrite;
 	uint32		buf_state;
@@ -2949,10 +2944,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 */
 	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
-	if (track_io_timing)
-		INSTR_TIME_SET_CURRENT(io_start);
-	else
-		INSTR_TIME_SET_ZERO(io_start);
+	io_start = pgstat_prepare_io_time();
 
 	/*
 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -2981,15 +2973,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * When a strategy is not in use, the write can only be a "regular" write
 	 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
 	 */
-	pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
-
-	if (track_io_timing)
-	{
-		INSTR_TIME_SET_CURRENT(io_time);
-		INSTR_TIME_SUBTRACT(io_time, io_start);
-		pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-		INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-	}
+	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
+							IOOP_WRITE, io_start);
 
 	pgBufferUsage.shared_blks_written++;
 
@@ -3591,14 +3576,13 @@ FlushRelationBuffers(Relation rel)
 {
 	int			i;
 	BufferDesc *bufHdr;
-	instr_time	io_start,
-				io_time;
 
 	if (RelationUsesLocalBuffers(rel))
 	{
 		for (i = 0; i < NLocBuffer; i++)
 		{
 			uint32		buf_state;
+			instr_time	io_start;
 
 			bufHdr = GetLocalBufferDescriptor(i);
 			if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
@@ -3618,10 +3602,7 @@ FlushRelationBuffers(Relation rel)
 
 				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-				if (track_io_timing)
-					INSTR_TIME_SET_CURRENT(io_start);
-				else
-					INSTR_TIME_SET_ZERO(io_start);
+				io_start = pgstat_prepare_io_time();
 
 				smgrwrite(RelationGetSmgr(rel),
 						  BufTagGetForkNum(&bufHdr->tag),
@@ -3629,19 +3610,12 @@ FlushRelationBuffers(Relation rel)
 						  localpage,
 						  false);
 
-
 				buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-				pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
-
-				if (track_io_timing)
-				{
-					INSTR_TIME_SET_CURRENT(io_time);
-					INSTR_TIME_SUBTRACT(io_time, io_start);
-					pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-					INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-				}
+				pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
+										IOCONTEXT_NORMAL, IOOP_WRITE,
+										io_start);
 
 				pgBufferUsage.local_blks_written++;
 
@@ -3650,6 +3624,7 @@ FlushRelationBuffers(Relation rel)
 			}
 		}
 
+
 		return;
 	}
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 80510411ae..86beb40a6f 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -114,8 +114,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	LocalBufferLookupEnt *hresult;
 	BufferDesc *bufHdr;
 	int			b;
-	instr_time	io_start,
-				io_time;
 	int			trycounter;
 	bool		found;
 	uint32		buf_state;
@@ -222,6 +220,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 */
 	if (buf_state & BM_DIRTY)
 	{
+		instr_time	io_start;
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
@@ -230,10 +229,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-		if (track_io_timing)
-			INSTR_TIME_SET_CURRENT(io_start);
-		else
-			INSTR_TIME_SET_ZERO(io_start);
+		io_start = pgstat_prepare_io_time();
 
 		/* And write... */
 		smgrwrite(oreln,
@@ -246,16 +242,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		buf_state &= ~BM_DIRTY;
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-		pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
-
-		if (track_io_timing)
-		{
-			INSTR_TIME_SET_CURRENT(io_time);
-			INSTR_TIME_SUBTRACT(io_time, io_start);
-			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-			INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-		}
-
+		pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
+								IOOP_WRITE, io_start);
 		pgBufferUsage.local_blks_written++;
 	}
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 352958e1fe..51a9134d57 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1030,6 +1030,19 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
 	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
 	{
+		instr_time	io_start;
+
+		ereport(DEBUG1,
+				(errmsg_internal("could not forward fsync request because request queue is full")));
+
+		io_start = pgstat_prepare_io_time();
+
+		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+			ereport(data_sync_elevel(ERROR),
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							FilePathName(seg->mdfd_vfd))));
+
 		/*
 		 * We have no way of knowing if the current IOContext is
 		 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
@@ -1041,16 +1054,8 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 		 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
 		 * backend fsyncs.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
-
-		ereport(DEBUG1,
-				(errmsg_internal("could not forward fsync request because request queue is full")));
-
-		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-			ereport(data_sync_elevel(ERROR),
-					(errcode_for_file_access(),
-					 errmsg("could not fsync file \"%s\": %m",
-							FilePathName(seg->mdfd_vfd))));
+		pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+								IOOP_FSYNC, io_start);
 	}
 }
 
@@ -1399,6 +1404,7 @@ int
 mdsyncfiletag(const FileTag *ftag, char *path)
 {
 	SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
+	instr_time	io_start;
 	File		file;
 	bool		need_to_close;
 	int			result,
@@ -1425,6 +1431,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		need_to_close = true;
 	}
 
+	io_start = pgstat_prepare_io_time();
+
 	/* Sync the file. */
 	result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
 	save_errno = errno;
@@ -1432,7 +1440,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 	if (need_to_close)
 		FileClose(file);
 
-	pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+	pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+							IOOP_FSYNC, io_start);
 
 	errno = save_errno;
 	return result;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index af5d554610..0cf4b78947 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -16,45 +16,65 @@
 
 #include "postgres.h"
 
+#include "executor/instrument.h"
+#include "storage/bufmgr.h"
 #include "utils/pgstat_internal.h"
 
 
-static PgStat_BktypeIO PendingIOStats;
+typedef struct PgStat_PendingIO
+{
+	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+}			PgStat_PendingIO;
+
+
+static PgStat_PendingIO PendingIOStats;
 bool		have_iostats = false;
 
 
 /*
  * Check that stats have not been counted for any combination of IOObject,
- * IOContext, and IOOp which are not tracked for the passed-in BackendType. The
- * passed-in PgStat_BktypeIO must contain stats from the BackendType specified
- * by the second parameter. Caller is responsible for locking the passed-in
- * PgStat_BktypeIO, if needed.
+ * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
+ * the IOOp is not counted for this combination but IO time is otherwise
+ * tracked for this IOOp, check that IO time has not been counted for this
+ * combination. If stats are tracked for this combination and IO times are
+ * non-zero, counts should be non-zero.
+ *
+ * The passed-in PgStat_BktypeIO must contain stats from the BackendType
+ * specified by the second parameter. Caller is responsible for locking the
+ * passed-in PgStat_BktypeIO, if needed.
  */
 bool
 pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 							 BackendType bktype)
 {
-	bool		bktype_tracked = pgstat_tracks_io_bktype(bktype);
-
 	for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
 	{
 		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 		{
-			/*
-			 * Don't bother trying to skip to the next loop iteration if
-			 * pgstat_tracks_io_object() would return false here. We still
-			 * need to validate that each counter is zero anyway.
-			 */
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 			{
-				/* No stats, so nothing to validate */
-				if (backend_io->data[io_object][io_context][io_op] == 0)
+				/* we do track it */
+				if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
+				{
+					/* ensure that if IO times are non-zero, counts are > 0 */
+					if (backend_io->times[io_object][io_context][io_op] != 0 &&
+						backend_io->counts[io_object][io_context][io_op] <= 0)
+						return false;
+
 					continue;
+				}
 
-				/* There are stats and there shouldn't be */
-				if (!bktype_tracked ||
-					!pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
+				/* we don't track it, and it is not 0 */
+				if (backend_io->counts[io_object][io_context][io_op] != 0)
 					return false;
+
+				/* we don't track this IOOp, so make sure its IO time is zero */
+				if (pgstat_tracks_io_time(io_op) > -1)
+				{
+					if (backend_io->times[io_object][io_context][io_op] != 0)
+						return false;
+				}
 			}
 		}
 	}
@@ -62,6 +82,19 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 	return true;
 }
 
+instr_time
+pgstat_prepare_io_time(void)
+{
+	instr_time	io_start;
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+	else
+		INSTR_TIME_SET_ZERO(io_start);
+
+	return io_start;
+}
+
 void
 pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 {
@@ -70,11 +103,44 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 	Assert((unsigned int) io_op < IOOP_NUM_TYPES);
 	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
-	PendingIOStats.data[io_object][io_context][io_op]++;
+	PendingIOStats.counts[io_object][io_context][io_op]++;
 
 	have_iostats = true;
 }
 
+/*
+ * Like pgstat_count_io_op() except it also accumulates time.
+ */
+void
+pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
+						instr_time start_time)
+{
+	if (track_io_timing)
+	{
+		instr_time	io_time;
+
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, start_time);
+
+		if (io_op == IOOP_WRITE)
+		{
+			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+		}
+		else if (io_op == IOOP_READ)
+		{
+			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+		}
+
+		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], io_time);
+	}
+
+	pgstat_count_io_op(io_object, io_context, io_op);
+}
+
 PgStat_IO *
 pgstat_fetch_stat_io(void)
 {
@@ -114,8 +180,17 @@ pgstat_flush_io(bool nowait)
 		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 		{
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
-				bktype_shstats->data[io_object][io_context][io_op] +=
-					PendingIOStats.data[io_object][io_context][io_op];
+			{
+				instr_time	time;
+
+				bktype_shstats->counts[io_object][io_context][io_op] +=
+					PendingIOStats.counts[io_object][io_context][io_op];
+
+				time = PendingIOStats.pending_times[io_object][io_context][io_op];
+
+				bktype_shstats->times[io_object][io_context][io_op] +=
+					INSTR_TIME_GET_MICROSEC(time);
+			}
 		}
 	}
 
@@ -384,3 +459,30 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 
 	return true;
 }
+
+/*
+ * PgStat_BktypeIO->times contains IO times for IOOps. For simplicity this
+ * array has a spot for every IOOp. pgstat_tracks_io_time() is the source of
+ * truth for which IOOps have corresponding IO times.
+ */
+IOOp
+pgstat_tracks_io_time(IOOp io_op)
+{
+	switch (io_op)
+	{
+		case IOOP_READ:
+			return IOOP_READ;
+		case IOOP_WRITE:
+			return IOOP_WRITE;
+		case IOOP_EXTEND:
+			return IOOP_EXTEND;
+		case IOOP_FSYNC:
+			return IOOP_FSYNC;
+		case IOOP_EVICT:
+		case IOOP_REUSE:
+			return -1;
+	}
+
+	elog(ERROR, "unrecognized IOOp value: %d", io_op);
+	pg_unreachable();
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 35c6d46555..60fdda1a14 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1255,12 +1255,16 @@ typedef enum io_stat_col
 	IO_COL_IO_OBJECT,
 	IO_COL_IO_CONTEXT,
 	IO_COL_READS,
+	IO_COL_READ_TIME,
 	IO_COL_WRITES,
+	IO_COL_WRITE_TIME,
 	IO_COL_EXTENDS,
+	IO_COL_EXTEND_TIME,
 	IO_COL_CONVERSION,
 	IO_COL_EVICTIONS,
 	IO_COL_REUSES,
 	IO_COL_FSYNCS,
+	IO_COL_FSYNC_TIME,
 	IO_COL_RESET_TIME,
 	IO_NUM_COLUMNS,
 } io_stat_col;
@@ -1292,6 +1296,28 @@ pgstat_get_io_op_index(IOOp io_op)
 	pg_unreachable();
 }
 
+/*
+ * Get the number of the column containing IO times for the specified IOOp. If
+ * the specified IOOp is one for which IO time is not tracked, return -1. Note
+ * that this function assumes that IO time for an IOOp is displayed in the view
+ * in the column directly after the IOOp counts.
+ */
+static io_stat_col
+pgstat_get_io_time_index(IOOp io_op)
+{
+	if (pgstat_tracks_io_time(io_op) == -1)
+		return -1;
+
+	return pgstat_get_io_op_index(io_op) + 1;
+}
+
+static inline
+double
+pg_stat_micro_to_millisecs(PgStat_Counter val_microsec)
+{
+	return val_microsec * 0.001;
+}
+
 Datum
 pg_stat_get_io(PG_FUNCTION_ARGS)
 {
@@ -1359,20 +1385,32 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 
 				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 				{
-					int			col_idx = pgstat_get_io_op_index(io_op);
+					PgStat_Counter count = bktype_stats->counts[io_obj][io_context][io_op];
+					int			i = pgstat_get_io_op_index(io_op);
 
 					/*
 					 * Some combinations of BackendType and IOOp, of IOContext
 					 * and IOOp, and of IOObject and IOOp are not tracked. Set
 					 * these cells in the view NULL.
 					 */
-					nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op);
+					if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op))
+						values[i] = Int64GetDatum(count);
+					else
+						nulls[i] = true;
+				}
 
-					if (nulls[col_idx])
+				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
+				{
+					PgStat_Counter time = bktype_stats->times[io_obj][io_context][io_op];
+					int			i = pgstat_get_io_time_index(io_op);
+
+					if (i == -1)
 						continue;
 
-					values[col_idx] =
-						Int64GetDatum(bktype_stats->data[io_obj][io_context][io_op]);
+					if (!nulls[pgstat_get_io_op_index(io_op)])
+						values[i] = Float8GetDatum(pg_stat_micro_to_millisecs(time));
+					else
+						nulls[i] = true;
 				}
 
 				tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 5cf87aeb2c..4148ec09ae 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5744,9 +5744,9 @@
   proname => 'pg_stat_get_io', provolatile => 'v',
   prorows => '30', proretset => 't',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}',
+  proallargtypes => '{text,text,text,int8,float8,int8,float8,int8,float8,int8,int8,int8,int8,float8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{backend_type,io_object,io_context,reads,read_time,writes,write_time,extends,extend_time,op_bytes,evictions,reuses,fsyncs,fsync_time,stats_reset}',
   prosrc => 'pg_stat_get_io' },
 
 { oid => '1136', descr => 'statistics: information about WAL activity',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 1e418b682b..4adbf29fdf 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -304,7 +304,8 @@ typedef enum IOOp
 
 typedef struct PgStat_BktypeIO
 {
-	PgStat_Counter data[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_BktypeIO;
 
 typedef struct PgStat_IO
@@ -497,7 +498,10 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
 
 extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *context_ops,
 										 BackendType bktype);
+extern instr_time pgstat_prepare_io_time(void);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
+extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
+
 extern PgStat_IO *pgstat_fetch_stat_io(void);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
@@ -507,6 +511,7 @@ extern bool pgstat_tracks_io_object(BackendType bktype,
 									IOObject io_object, IOContext io_context);
 extern bool pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 								IOContext io_context, IOOp io_op);
+extern IOOp pgstat_tracks_io_time(IOOp io_op);
 
 
 /*
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index e953d1f515..5434851314 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1880,14 +1880,18 @@ pg_stat_io| SELECT backend_type,
     io_object,
     io_context,
     reads,
+    read_time,
     writes,
+    write_time,
     extends,
+    extend_time,
     op_bytes,
     evictions,
     reuses,
     fsyncs,
+    fsync_time,
     stats_reset
-   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset);
+   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, read_time, writes, write_time, extends, extend_time, op_bytes, evictions, reuses, fsyncs, fsync_time, stats_reset);
 pg_stat_progress_analyze| SELECT s.pid,
     s.datid,
     d.datname,
-- 
2.37.2
