Hi,

While discussing the next steps for AIO writes in Postgres, Andres
suggested that a good starting point would be to begin evicting more
than one buffer at a time in some of the buffer access strategies that
perform writes. This would make it easier to later combine these
writes and, eventually, issue them asynchronously.

The attached patch implements this behavior for the BAS_BULKWRITE
strategy. With the patch applied, I observe average performance
improvements of about 15-20% for parallel COPY FROM operations on the
same table.

After some analysis, this improvement appears to be primarily due to
reduced time spent by each backend waiting on the lock to flush WAL.

Since backends now issue more data file writes before each WAL flush
(using a heuristic that avoids eviction when it would require flushing
WAL), there is less interleaving between WAL flushes and data file
writes. With the patch applied, I observe client backends waiting
significantly less on the WALWriteLock. I also see lower f_await times
in iostat, suggesting reduced flush-related waiting at the kernel
level as well.

It's worth noting that for the serial COPY case (a single COPY FROM),
performance remains essentially unchanged with the patch. The benefit
seems to emerge only when multiple backends are concurrently writing
data and flushing WAL. In fact, the benefit diminishes as the number of
parallel COPY FROM operations performed at a time decreases.

The benchmark I did was simple:

-- make 16 source data files that are >= 1GB each

initdb
pg_ctl start
createdb

sudo fstrim -v /mnt/data

psql -c "drop table foo; create table foo(a int, b int) with
(autovacuum_enabled = off);"

time pgbench \
  --no-vacuum \
  -c 16 \
  -j 16 \
  -t 4 \
-f- <<EOF
COPY foo FROM '/mnt/data/foo:client_id.data';
EOF

master -> patch
6.2 minutes -> 5 minutes : ~20% reduction

A 15% improvement can be noticed with the same benchmark but 4 workers.

- Melanie
From 9f18bac7869810914e0dfde2fc14060293bcd5b4 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Mon, 30 Jun 2025 18:04:33 -0400
Subject: [PATCH v1 1/2] Eager evict bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring buffer in order to reuse
them. By eagerly evicting the buffers in a larger batch, we incur less
interleaving of WAL flushes and data file writes. The effect is mainly
noticeable with multiple parallel COPY FROMs. In this case, client
backends achieve higher write throughput and end up spending less time
waiting on acquiring the lock to flush WAL. Larger flush operations also
mean less time waiting on flush operations at the kernel level.

The heuristic for eager eviction is to evict only those buffers in the
strategy ring that can be flushed without first flushing WAL.
---
 src/backend/storage/buffer/bufmgr.c   | 72 +++++++++++++++++++++++++++
 src/backend/storage/buffer/freelist.c | 53 ++++++++++++++++++++
 src/include/storage/buf_internals.h   |  1 +
 src/include/storage/bufmgr.h          |  2 +
 4 files changed, 128 insertions(+)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6afdd28dba6..ca7d900e7ec 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2346,6 +2346,75 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
 	return true;
 }

+/*
+ * Pin and lock a shared buffer and then flush it, but only if doing so is
+ * "cheap", i.e. we're able to take the content lock without waiting and we
+ * don't have to flush WAL first. Note that, despite the "evict" wording in
+ * callers, the buffer is not invalidated here -- it is only written out
+ * (cleaned), so a later eviction of it will not need to write anything.
+ * This is appropriate for occasions in which we don't need to guarantee
+ * that the buffer is flushed.
+ *
+ * Returns true if the buffer was flushed, false otherwise.
+ */
+bool
+QuickCleanBuffer(BufferDesc *bufdesc, IOContext io_context)
+{
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	LWLock	   *content_lock;
+	Buffer		buffer;
+
+	buffer = BufferDescriptorGetBuffer(bufdesc);
+	buf_state = LockBufHdr(bufdesc);
+
+	/* Only shared buffers are expected here; local buffers have no locks */
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * No need to flush the buffer if it isn't dirty. We won't flush buffers
+	 * in use by other backends (nonzero refcount). We also skip buffers
+	 * with a usage count above one -- presumably because such buffers are
+	 * likely to be redirtied soon, making the write wasted work; TODO
+	 * confirm intent.
+	 */
+	if (!(buf_state & BM_DIRTY) ||
+		BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+		BUF_STATE_GET_USAGECOUNT(buf_state) > 1)
+	{
+		UnlockBufHdr(bufdesc, buf_state);
+		return false;
+	}
+
+	/* Prepare to track the pin before taking it */
+	ReservePrivateRefCountEntry();
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+
+	/* Releases buffer header lock before acquiring content lock */
+	PinBuffer_Locked(bufdesc);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+
+	/* This is best-effort, so don't wait for the content lock */
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+	{
+		UnpinBuffer(bufdesc);
+		return false;
+	}
+
+	CheckBufferIsPinnedOnce(buffer);
+
+	/* Need buffer header lock to get the LSN */
+	buf_state = LockBufHdr(bufdesc);
+	lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc, buf_state);
+
+	/*
+	 * The "cheap" criterion: give up rather than flush WAL up to this
+	 * buffer's LSN. Releases both the content lock and our pin.
+	 */
+	if (XLogNeedsFlush(lsn))
+	{
+		UnlockReleaseBuffer(buffer);
+		return false;
+	}
+
+	/* A shared content lock is sufficient to write out the buffer */
+	FlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context);
+
+	LWLockRelease(content_lock);
+
+	/* Schedule the block for kernel writeback, as GetVictimBuffer does */
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+
+	UnpinBuffer(bufdesc);
+	return true;
+}
+
 static Buffer
 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 {
@@ -2451,6 +2520,9 @@ again:

 		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
 									  &buf_hdr->tag);
+
+		if (strategy)
+			EvictStrategyRing(strategy);
 	}


diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 01909be0272..ab38c96e2de 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,31 @@ have_free_buffer(void)
 		return false;
 }

+/*
+ * Some BufferAccessStrategies support eager eviction -- which is evicting
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily evicting buffers directly before reusing them.
+ *
+ * Returns true if the given strategy supports eager eviction. Currently
+ * only BAS_BULKWRITE does.
+ */
+bool
+strategy_supports_eager_eviction(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			/* unreachable: elog(ERROR) does not return; quiets compiler */
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -780,6 +805,34 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	return NULL;
 }

+/*
+ * Evict all the buffers we can in the strategy ring. This encourages write
+ * batching at the kernel level and leaves a ring full of clean buffers. We'll
+ * skip evicting buffers that would require us to flush WAL.
+ *
+ * Note that QuickCleanBuffer() only writes buffers out, it does not
+ * invalidate them, so "evict" here means making the ring's buffers cheap
+ * to reuse. No-op for strategies without eager-eviction support.
+ */
+void
+EvictStrategyRing(BufferAccessStrategy strategy)
+{
+	IOContext	io_context;
+
+	if (!strategy_supports_eager_eviction(strategy))
+		return;
+
+	io_context = IOContextForStrategy(strategy);
+
+	for (int i = 0; i < strategy->nbuffers; i++)
+	{
+		BufferDesc *bufdesc;
+		Buffer		bufnum = strategy->buffers[i];
+
+		/* Ring slots that have never held a buffer are InvalidBuffer */
+		if (bufnum == InvalidBuffer)
+			continue;
+		bufdesc = GetBufferDescriptor(bufnum - 1);
+		/* Best-effort: return value deliberately ignored */
+		QuickCleanBuffer(bufdesc, io_context);
+	}
+}
+
+
 /*
  * AddBufferToRing -- add a buffer to the buffer ring
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 52a71b138f7..4d3f9552027 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -433,6 +433,7 @@ extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
+extern bool QuickCleanBuffer(BufferDesc *bufdesc, IOContext io_context);

 /* solely to make it easier to write tests */
 extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e7693..a4d122fa3c5 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -331,8 +331,10 @@ extern BufferAccessStrategy GetAccessStrategyWithSize(BufferAccessStrategyType b
 extern int	GetAccessStrategyBufferCount(BufferAccessStrategy strategy);
 extern int	GetAccessStrategyPinLimit(BufferAccessStrategy strategy);

+extern void EvictStrategyRing(BufferAccessStrategy strategy);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);

+extern bool strategy_supports_eager_eviction(BufferAccessStrategy strategy);

 /* inline functions */

--
2.43.0

Reply via email to