From dc76ac40030ceafb54a11a8126b1702fb807a64a Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Wed, 18 Aug 2021 14:55:17 +0900
Subject: [PATCH v2] Track I/O timing for temp buffers

Previously, the output of EXPLAIN with BUFFERS option shows only I/O
timing spent reading and writing shared and local buffers. This
commit shows I/O timing for temp buffers in the output of EXPLAIN. It
is helpful for users in a case where it is the temp buffers I/O that
is using most of the time.

Same as shared/local buffer I/O timings, temp buffers I/O timing is
shown only when track_io_timing is enabled.
---
 doc/src/sgml/ref/explain.sgml      |  8 +++----
 src/backend/commands/explain.c     | 36 +++++++++++++++++++++++-------
 src/backend/executor/instrument.c  |  6 +++++
 src/backend/storage/file/buffile.c | 29 +++++++++++++++++++++++-
 src/include/executor/instrument.h  |  2 ++
 5 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 4d758fb237..70c65c4ba1 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -174,10 +174,10 @@ ROLLBACK;
       Include information on buffer usage. Specifically, include the number of
       shared blocks hit, read, dirtied, and written, the number of local blocks
       hit, read, dirtied, and written, the number of temp blocks read and
-      written, and the time spent reading and writing data file blocks
-      (in milliseconds) if <xref linkend="guc-track-io-timing"/> is enabled.
-      A <emphasis>hit</emphasis> means that a read was avoided because the block was
-      found already in cache when needed.
+      written, and the time spent reading and writing data file blocks and
+      temp file blocks (in milliseconds) if <xref linkend="guc-track-io-timing"/>
+      is enabled.  A <emphasis>hit</emphasis> means that a read was avoided because
+      the block was found already in cache when needed.
       Shared blocks contain data from regular tables and indexes;
       local blocks contain data from temporary tables and indexes;
       while temp blocks contain short-term working data used in sorts, hashes,
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 10644dfac4..c5b643cd31 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3501,8 +3501,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 								usage->temp_blks_written > 0);
 		bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
 								  !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+		bool		has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) ||
+									   !INSTR_TIME_IS_ZERO(usage->temp_blk_write_time));
 		bool		show_planning = (planning && (has_shared ||
-												  has_local || has_temp || has_timing));
+												  has_local || has_temp || has_timing ||
+												  has_temp_timing));
 
 		if (show_planning)
 		{
@@ -3567,16 +3570,33 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 		}
 
 		/* As above, show only positive counter values. */
-		if (has_timing)
+		if (has_timing || has_temp_timing)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "I/O Timings:");
-			if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
-				appendStringInfo(es->str, " read=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
-			if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
-				appendStringInfo(es->str, " write=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+
+			if (has_timing)
+			{
+				appendStringInfoString(es->str, " shared/local");
+				if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+				if (has_temp_timing)
+					appendStringInfoChar(es->str, ',');
+			}
+			if (has_temp_timing)
+			{
+				appendStringInfoString(es->str, " temp");
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+			}
 			appendStringInfoChar(es->str, '\n');
 		}
 
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 2b106d8473..5c3cc4b1cc 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -237,6 +237,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	dst->temp_blks_written += add->temp_blks_written;
 	INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
 	INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
+	INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
+	INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
 }
 
 /* dst += add - sub */
@@ -259,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst,
 						  add->blk_read_time, sub->blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
 						  add->blk_write_time, sub->blk_write_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
+						  add->temp_blk_read_time, sub->temp_blk_read_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
+						  add->temp_blk_write_time, sub->temp_blk_write_time);
 }
 
 /* helper functions for WAL usage accumulation */
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index a4be5fe513..0188fbc1b6 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -417,6 +417,8 @@ static void
 BufFileLoadBuffer(BufFile *file)
 {
 	File		thisfile;
+	instr_time	io_start;
+	instr_time	io_time;
 
 	/*
 	 * Advance to next component file if necessary and possible.
@@ -428,10 +430,14 @@ BufFileLoadBuffer(BufFile *file)
 		file->curOffset = 0L;
 	}
 
+	thisfile = file->files[file->curFile];
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+
 	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
-	thisfile = file->files[file->curFile];
 	file->nbytes = FileRead(thisfile,
 							file->buffer.data,
 							sizeof(file->buffer),
@@ -446,6 +452,13 @@ BufFileLoadBuffer(BufFile *file)
 						FilePathName(thisfile))));
 	}
 
+	if (track_io_timing)
+	{
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, io_start);
+		INSTR_TIME_ADD(pgBufferUsage.temp_blk_read_time, io_time);
+	}
+
 	/* we choose not to advance curOffset here */
 
 	if (file->nbytes > 0)
@@ -473,6 +486,8 @@ BufFileDumpBuffer(BufFile *file)
 	while (wpos < file->nbytes)
 	{
 		off_t		availbytes;
+		instr_time	io_start;
+		instr_time	io_time;
 
 		/*
 		 * Advance to next component file if necessary and possible.
@@ -495,6 +510,10 @@ BufFileDumpBuffer(BufFile *file)
 			bytestowrite = (int) availbytes;
 
 		thisfile = file->files[file->curFile];
+
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+
 		bytestowrite = FileWrite(thisfile,
 								 file->buffer.data + wpos,
 								 bytestowrite,
@@ -505,6 +524,14 @@ BufFileDumpBuffer(BufFile *file)
 					(errcode_for_file_access(),
 					 errmsg("could not write to file \"%s\": %m",
 							FilePathName(thisfile))));
+
+		if (track_io_timing)
+		{
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_SUBTRACT(io_time, io_start);
+			INSTR_TIME_ADD(pgBufferUsage.temp_blk_write_time, io_time);
+		}
+
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 2f9905b7c8..6f3cbcabef 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -35,6 +35,8 @@ typedef struct BufferUsage
 	int64		temp_blks_written;	/* # of temp blocks written */
 	instr_time	blk_read_time;	/* time spent reading */
 	instr_time	blk_write_time; /* time spent writing */
+	instr_time	temp_blk_read_time;	/* time spent reading tmp blocks */
+	instr_time	temp_blk_write_time; /* time spent writing tmp blocks */
 } BufferUsage;
 
 /*
-- 
2.24.3 (Apple Git-128)

