On Mon, Apr 24, 2023 at 6:13 PM Andres Freund <and...@anarazel.de> wrote:

> Hi,
>
> On 2023-04-24 17:37:48 -0400, Melanie Plageman wrote:
> > On Mon, Apr 24, 2023 at 02:14:32PM -0700, Andres Freund wrote:
> > > It starts blocking once "enough" IO is in flight. For things like an
> immediate
> > > checkpoint, that can happen fairly quickly, unless you have a very
> fast IO
> > > subsystem. So often it'll not matter whether we track smgrwriteback(),
> but
> > > when it matter, it can matter a lot.
> >
> > I see. So, it sounds like this is most likely to happen for checkpointer
> > and not likely to happen for other backends who call
> > ScheduleBufferTagForWriteback().
>
> It's more likely, but once the IO subsystem is busy, it'll also happen for
> other users ScheduleBufferTagForWriteback().
>
>
> > Also, it seems like this (given the current code) is only reachable for
> > permanent relations (i.e. not for IO object temp relation). If other
> backend
> > types than checkpointer may call smgrwriteback(), we likely have to
> consider
> > the IO context.
>
> I think we should take it into account - it'd e.g. interesting to see a
> COPY
> is bottlenecked on smgrwriteback() rather than just writing the data.
>

With the quick and dirty attached patch and using your example but with a
pgbench -T200 on my rather fast local NVMe SSD, you can still see quite
a difference.
This is with a stats reset before the checkpoint.

unpatched:

    backend_type     |    object     |  context  |  writes | write_time |
 fsyncs | fsync_time
---------------------+---------------+-----------+---------+------------+---------+------------
 background writer   | relation      | normal    |     443 |      1.408 |
    0 |          0
 checkpointer        | relation      | normal    |  187804 |    396.829 |
   47 |    254.226

patched:

    backend_type     |    object     |  context  |  writes |     write_time
    | fsyncs | fsync_time
---------------------+---------------+-----------+---------+--------------------+--------+------------
 background writer   | relation      | normal    |     917 |
4.4670000000000005 |      0 |          0
 checkpointer        | relation      | normal    |  375798 |
 977.354 |     48 |    202.514

I did compare client backend stats before and after pgbench and it made
basically no difference. I'll do a COPY example like you mentioned.

Patch needs cleanup/comments and a bit more work, but I could do with
a sanity check review on the approach.

- Melanie
From 6c5661ee5f0efbda9d246184f40cd799c21b5d2a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Mon, 24 Apr 2023 18:21:54 -0400
Subject: [PATCH v1] Add writeback to pg_stat_io writes.

---
 src/backend/postmaster/bgwriter.c     |  4 ++--
 src/backend/storage/buffer/buf_init.c |  2 +-
 src/backend/storage/buffer/bufmgr.c   | 12 ++++++++++--
 src/include/storage/buf_internals.h   |  4 +++-
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index caad642ec9..4c3540d6e1 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -129,7 +129,7 @@ BackgroundWriterMain(void)
 											 ALLOCSET_DEFAULT_SIZES);
 	MemoryContextSwitchTo(bgwriter_context);
 
-	WritebackContextInit(&wb_context, &bgwriter_flush_after);
+	WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &bgwriter_flush_after);
 
 	/*
 	 * If an exception is encountered, processing resumes here.
@@ -185,7 +185,7 @@ BackgroundWriterMain(void)
 		MemoryContextResetAndDeleteChildren(bgwriter_context);
 
 		/* re-initialize to avoid repeated errors causing problems */
-		WritebackContextInit(&wb_context, &bgwriter_flush_after);
+		WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &bgwriter_flush_after);
 
 		/* Now we can allow interrupts again */
 		RESUME_INTERRUPTS();
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 0057443f0c..1d80532f3b 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -146,7 +146,7 @@ InitBufferPool(void)
 	StrategyInitialize(!foundDescs);
 
 	/* Initialize per-backend file flush context */
-	WritebackContextInit(&BackendWritebackContext,
+	WritebackContextInit(&BackendWritebackContext, IOCONTEXT_NORMAL,
 						 &backend_flush_after);
 }
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1fa689052e..774dbadd08 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1685,6 +1685,7 @@ again:
 		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
 		LWLockRelease(content_lock);
 
+		BackendWritebackContext.io_context = io_context;
 		ScheduleBufferTagForWriteback(&BackendWritebackContext,
 									  &buf_hdr->tag);
 	}
@@ -2555,7 +2556,7 @@ BufferSync(int flags)
 	if (num_to_scan == 0)
 		return;					/* nothing to do */
 
-	WritebackContextInit(&wb_context, &checkpoint_flush_after);
+	WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &checkpoint_flush_after);
 
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
 
@@ -5433,10 +5434,11 @@ ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
  * writeback control will be performed.
  */
 void
-WritebackContextInit(WritebackContext *context, int *max_pending)
+WritebackContextInit(WritebackContext *context, IOContext io_context, int *max_pending)
 {
 	Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
 
+	context->io_context = io_context;
 	context->max_pending = max_pending;
 	context->nr_pending = 0;
 }
@@ -5491,6 +5493,7 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
 void
 IssuePendingWritebacks(WritebackContext *context)
 {
+	instr_time	io_start;
 	int			i;
 
 	if (context->nr_pending == 0)
@@ -5502,6 +5505,8 @@ IssuePendingWritebacks(WritebackContext *context)
 	 */
 	sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
 
+	io_start = pgstat_prepare_io_time();
+
 	/*
 	 * Coalesce neighbouring writes, but nothing else. For that we iterate
 	 * through the, now sorted, array of pending flushes, and look forward to
@@ -5555,6 +5560,9 @@ IssuePendingWritebacks(WritebackContext *context)
 		smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
 	}
 
+	pgstat_count_io_op_time(IOOBJECT_RELATION, context->io_context, IOOP_WRITE,
+			io_start, context->nr_pending);
+
 	context->nr_pending = 0;
 }
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 136cf8fbaf..0d6e93ffad 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -295,6 +295,8 @@ typedef struct PendingWriteback
 /* struct forward declared in bufmgr.h */
 typedef struct WritebackContext
 {
+	IOContext io_context;
+
 	/* pointer to the max number of writeback requests to coalesce */
 	int		   *max_pending;
 
@@ -387,7 +389,7 @@ extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
  * Internal buffer management routines
  */
 /* bufmgr.c */
-extern void WritebackContextInit(WritebackContext *context, int *max_pending);
+extern void WritebackContextInit(WritebackContext *context, IOContext io_context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
-- 
2.37.2

Reply via email to