From b900ab6fe2b9fc22ebd2304ffc5aa51adcfd7898 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Wed, 18 Aug 2021 14:55:17 +0900
Subject: [PATCH v4 1/2] Track I/O timing for temp buffers

Previously, the output of EXPLAIN with BUFFERS option shows only I/O
timing spent reading and writing shared and local buffers. This
commit shows I/O timing for temp buffers in the output of EXPLAIN. It
is helpful for users in a case where it is the temp buffers I/O that
is using most of the time.

Same as shared/local buffer I/O timings, temp buffers I/O timing is
shown only when track_io_timing is enabled.
---
 doc/src/sgml/ref/explain.sgml         |   8 +-
 src/backend/commands/explain.c        |  42 ++++++++--
 src/backend/executor/instrument.c     |   6 ++
 src/backend/storage/file/buffile.c    |  29 ++++++-
 src/include/executor/instrument.h     |   6 +-
 src/test/regress/expected/explain.out | 112 ++++++++++++++------------
 src/test/regress/sql/explain.sql      |   8 +-
 7 files changed, 145 insertions(+), 66 deletions(-)

diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 4d758fb237..70c65c4ba1 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -174,10 +174,10 @@ ROLLBACK;
       Include information on buffer usage. Specifically, include the number of
       shared blocks hit, read, dirtied, and written, the number of local blocks
       hit, read, dirtied, and written, the number of temp blocks read and
-      written, and the time spent reading and writing data file blocks
-      (in milliseconds) if <xref linkend="guc-track-io-timing"/> is enabled.
-      A <emphasis>hit</emphasis> means that a read was avoided because the block was
-      found already in cache when needed.
+      written, and the time spent reading and writing data file blocks and
+      temp file blocks (in milliseconds) if <xref linkend="guc-track-io-timing"/>
+      is enabled.  A <emphasis>hit</emphasis> means that a read was avoided because
+      the block was found already in cache when needed.
       Shared blocks contain data from regular tables and indexes;
       local blocks contain data from temporary tables and indexes;
       while temp blocks contain short-term working data used in sorts, hashes,
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index cb13227db1..197a11d84b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3513,8 +3513,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 								usage->temp_blks_written > 0);
 		bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
 								  !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+		bool		has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) ||
+									   !INSTR_TIME_IS_ZERO(usage->temp_blk_write_time));
 		bool		show_planning = (planning && (has_shared ||
-												  has_local || has_temp || has_timing));
+												  has_local || has_temp || has_timing ||
+												  has_temp_timing));
 
 		if (show_planning)
 		{
@@ -3579,16 +3582,33 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 		}
 
 		/* As above, show only positive counter values. */
-		if (has_timing)
+		if (has_timing || has_temp_timing)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "I/O Timings:");
-			if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
-				appendStringInfo(es->str, " read=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
-			if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
-				appendStringInfo(es->str, " write=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+
+			if (has_timing)
+			{
+				appendStringInfoString(es->str, " shared/local");
+				if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+				if (has_temp_timing)
+					appendStringInfoChar(es->str, ',');
+			}
+			if (has_temp_timing)
+			{
+				appendStringInfoString(es->str, " temp");
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+			}
 			appendStringInfoChar(es->str, '\n');
 		}
 
@@ -3625,6 +3645,12 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			ExplainPropertyFloat("I/O Write Time", "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time),
 								 3, es);
+			ExplainPropertyFloat("Temp I/O Read Time", "ms",
+								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
+								 3, es);
+			ExplainPropertyFloat("Temp I/O Write Time", "ms",
+								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
+								 3, es);
 		}
 	}
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index c5ff02a842..ceff4727d4 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -237,6 +237,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	dst->temp_blks_written += add->temp_blks_written;
 	INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
 	INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
+	INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
+	INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
 }
 
 /* dst += add - sub */
@@ -259,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst,
 						  add->blk_read_time, sub->blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
 						  add->blk_write_time, sub->blk_write_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
+						  add->temp_blk_read_time, sub->temp_blk_read_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
+						  add->temp_blk_write_time, sub->temp_blk_write_time);
 }
 
 /* helper functions for WAL usage accumulation */
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 3108e96dc9..56b88594cc 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -429,6 +429,8 @@ static void
 BufFileLoadBuffer(BufFile *file)
 {
 	File		thisfile;
+	instr_time	io_start;
+	instr_time	io_time;
 
 	/*
 	 * Advance to next component file if necessary and possible.
@@ -440,10 +442,14 @@ BufFileLoadBuffer(BufFile *file)
 		file->curOffset = 0L;
 	}
 
+	thisfile = file->files[file->curFile];
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+
 	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
-	thisfile = file->files[file->curFile];
 	file->nbytes = FileRead(thisfile,
 							file->buffer.data,
 							sizeof(file->buffer),
@@ -458,6 +464,13 @@ BufFileLoadBuffer(BufFile *file)
 						FilePathName(thisfile))));
 	}
 
+	if (track_io_timing)
+	{
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, io_start);
+		INSTR_TIME_ADD(pgBufferUsage.temp_blk_read_time, io_time);
+	}
+
 	/* we choose not to advance curOffset here */
 
 	if (file->nbytes > 0)
@@ -485,6 +498,8 @@ BufFileDumpBuffer(BufFile *file)
 	while (wpos < file->nbytes)
 	{
 		off_t		availbytes;
+		instr_time	io_start;
+		instr_time	io_time;
 
 		/*
 		 * Advance to next component file if necessary and possible.
@@ -507,6 +522,10 @@ BufFileDumpBuffer(BufFile *file)
 			bytestowrite = (int) availbytes;
 
 		thisfile = file->files[file->curFile];
+
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+
 		bytestowrite = FileWrite(thisfile,
 								 file->buffer.data + wpos,
 								 bytestowrite,
@@ -517,6 +536,14 @@ BufFileDumpBuffer(BufFile *file)
 					(errcode_for_file_access(),
 					 errmsg("could not write to file \"%s\": %m",
 							FilePathName(thisfile))));
+
+		if (track_io_timing)
+		{
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_SUBTRACT(io_time, io_start);
+			INSTR_TIME_ADD(pgBufferUsage.temp_blk_write_time, io_time);
+		}
+
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 1b7157bdd1..6c080beb81 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -33,8 +33,10 @@ typedef struct BufferUsage
 	int64		local_blks_written; /* # of local disk blocks written */
 	int64		temp_blks_read; /* # of temp blocks read */
 	int64		temp_blks_written;	/* # of temp blocks written */
-	instr_time	blk_read_time;	/* time spent reading */
-	instr_time	blk_write_time; /* time spent writing */
+	instr_time	blk_read_time;	/* time spent reading blocks */
+	instr_time	blk_write_time; /* time spent writing blocks */
+	instr_time	temp_blk_read_time;	/* time spent reading temp blocks */
+	instr_time	temp_blk_write_time; /* time spent writing temp blocks */
 } BufferUsage;
 
 /*
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index bc36175921..49890625c8 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -85,56 +85,6 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
  Execution Time: N.N ms
 (3 rows)
 
-select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
-           explain_filter           
-------------------------------------
- [                                 +
-   {                               +
-     "Plan": {                     +
-       "Node Type": "Seq Scan",    +
-       "Parallel Aware": false,    +
-       "Async Capable": false,     +
-       "Relation Name": "int8_tbl",+
-       "Alias": "i8",              +
-       "Startup Cost": N.N,        +
-       "Total Cost": N.N,          +
-       "Plan Rows": N,             +
-       "Plan Width": N,            +
-       "Actual Startup Time": N.N, +
-       "Actual Total Time": N.N,   +
-       "Actual Rows": N,           +
-       "Actual Loops": N,          +
-       "Shared Hit Blocks": N,     +
-       "Shared Read Blocks": N,    +
-       "Shared Dirtied Blocks": N, +
-       "Shared Written Blocks": N, +
-       "Local Hit Blocks": N,      +
-       "Local Read Blocks": N,     +
-       "Local Dirtied Blocks": N,  +
-       "Local Written Blocks": N,  +
-       "Temp Read Blocks": N,      +
-       "Temp Written Blocks": N    +
-     },                            +
-     "Planning": {                 +
-       "Shared Hit Blocks": N,     +
-       "Shared Read Blocks": N,    +
-       "Shared Dirtied Blocks": N, +
-       "Shared Written Blocks": N, +
-       "Local Hit Blocks": N,      +
-       "Local Read Blocks": N,     +
-       "Local Dirtied Blocks": N,  +
-       "Local Written Blocks": N,  +
-       "Temp Read Blocks": N,      +
-       "Temp Written Blocks": N    +
-     },                            +
-     "Planning Time": N.N,         +
-     "Triggers": [                 +
-     ],                            +
-     "Execution Time": N.N         +
-   }                               +
- ]
-(1 row)
-
 select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
                      explain_filter                     
 --------------------------------------------------------
@@ -276,6 +226,68 @@ select explain_filter('explain (buffers, format json) select * from int8_tbl i8'
  ]
 (1 row)
 
+-- Check output including I/O timings. Since these fields are conditional
+-- we check them only in json format.
+set track_io_timing = on;
+select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
+           explain_filter           
+------------------------------------
+ [                                 +
+   {                               +
+     "Plan": {                     +
+       "Node Type": "Seq Scan",    +
+       "Parallel Aware": false,    +
+       "Async Capable": false,     +
+       "Relation Name": "int8_tbl",+
+       "Alias": "i8",              +
+       "Startup Cost": N.N,        +
+       "Total Cost": N.N,          +
+       "Plan Rows": N,             +
+       "Plan Width": N,            +
+       "Actual Startup Time": N.N, +
+       "Actual Total Time": N.N,   +
+       "Actual Rows": N,           +
+       "Actual Loops": N,          +
+       "Shared Hit Blocks": N,     +
+       "Shared Read Blocks": N,    +
+       "Shared Dirtied Blocks": N, +
+       "Shared Written Blocks": N, +
+       "Local Hit Blocks": N,      +
+       "Local Read Blocks": N,     +
+       "Local Dirtied Blocks": N,  +
+       "Local Written Blocks": N,  +
+       "Temp Read Blocks": N,      +
+       "Temp Written Blocks": N,   +
+       "I/O Read Time": N.N,       +
+       "I/O Write Time": N.N,      +
+       "Temp I/O Read Time": N.N,  +
+       "Temp I/O Write Time": N.N  +
+     },                            +
+     "Planning": {                 +
+       "Shared Hit Blocks": N,     +
+       "Shared Read Blocks": N,    +
+       "Shared Dirtied Blocks": N, +
+       "Shared Written Blocks": N, +
+       "Local Hit Blocks": N,      +
+       "Local Read Blocks": N,     +
+       "Local Dirtied Blocks": N,  +
+       "Local Written Blocks": N,  +
+       "Temp Read Blocks": N,      +
+       "Temp Written Blocks": N,   +
+       "I/O Read Time": N.N,       +
+       "I/O Write Time": N.N,      +
+       "Temp I/O Read Time": N.N,  +
+       "Temp I/O Write Time": N.N  +
+     },                            +
+     "Planning Time": N.N,         +
+     "Triggers": [                 +
+     ],                            +
+     "Execution Time": N.N         +
+   }                               +
+ ]
+(1 row)
+
+set track_io_timing = off;
 -- SETTINGS option
 -- We have to ignore other settings that might be imposed by the environment,
 -- so printing the whole Settings field unfortunately won't do.
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 5069fa8795..6c30e933ef 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -65,12 +65,18 @@ select explain_filter('explain select * from int8_tbl i8');
 select explain_filter('explain (analyze) select * from int8_tbl i8');
 select explain_filter('explain (analyze, verbose) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8');
-select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format yaml) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
 
+-- Check output including I/O timings. Since these fields are conditional
+-- we check them only in json format.
+
+set track_io_timing = on;
+select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
+set track_io_timing = off;
+
 -- SETTINGS option
 -- We have to ignore other settings that might be imposed by the environment,
 -- so printing the whole Settings field unfortunately won't do.
-- 
2.24.3 (Apple Git-128)

