From 03d7037bf313576f347e240ba6d857e2b0890d9c Mon Sep 17 00:00:00 2001
From: Melih Mutlu <m.melihmutlu@gmail.com>
Date: Wed, 24 Jul 2024 17:19:07 +0300
Subject: [PATCH v2] Use pg_pwritev() in XLogWrite()

XLogWrite() had to issue a write() whenever it reached the last buffer
in the circular WAL buffers, before wrapping around to the first buffer,
because the pages across the wrap point are not contiguous in memory.

Vectored I/O allows us to write pages that are not contiguous in memory.
This patch uses pg_pwritev() in XLogWrite() so that it can wrap around
the WAL buffers without having to issue a write() unless necessary.
---
 src/backend/access/transam/xlog.c | 50 +++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ee0fb0e28f..72b04fd553 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2315,6 +2315,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 	bool		ispartialpage;
 	bool		last_iteration;
 	bool		finishing_seg;
+	bool		full_cycle;
 	int			curridx;
 	int			npages;
 	int			startidx;
@@ -2407,29 +2408,53 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 
 		/*
 		 * Dump the set if this will be the last loop iteration, or if we are
-		 * at the last page of the cache area (since the next page won't be
-		 * contiguous in memory), or if we are at the end of the logfile
-		 * segment.
+		 * done with a full cycle of our circular WAL buffers, or if we are at
+		 * the end of the logfile segment.
 		 */
 		last_iteration = WriteRqst.Write <= LogwrtResult.Write;
 
 		finishing_seg = !ispartialpage &&
 			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
 
+		full_cycle = curridx == (startidx - 1);
+
 		if (last_iteration ||
-			curridx == XLogCtl->XLogCacheBlck ||
+			full_cycle ||
 			finishing_seg)
 		{
-			char	   *from;
-			Size		nbytes;
 			Size		nleft;
 			ssize_t		written;
 			instr_time	start;
+			struct iovec iov[2];
+			int			iovcnt;
+
+			if (curridx < startidx)
+			{
+				Assert(curridx + 1 + XLogCtl->XLogCacheBlck - startidx + 1 == npages);
+
+				/*
+				 * From startidx up to the last buffer, after which the next
+				 * page is no longer contiguous in memory.
+				 */
+				iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
+				iov[0].iov_len = (XLogCtl->XLogCacheBlck - startidx + 1) * (Size) XLOG_BLCKSZ;
+
+				/* From first wal buffer to the current idx */
+				iov[1].iov_base = XLogCtl->pages;
+				iov[1].iov_len = (curridx + 1) * (Size) XLOG_BLCKSZ;
+
+				iovcnt = 2;
+			}
+			else
+			{
+				/* Contiguous case */
+				iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;;
+				iov[0].iov_len = npages * (Size) XLOG_BLCKSZ;
+				iovcnt = 1;
+			}
 
 			/* OK to write the page(s) */
-			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
-			nbytes = npages * (Size) XLOG_BLCKSZ;
-			nleft = nbytes;
+			nleft = npages * (Size) XLOG_BLCKSZ;
 			do
 			{
 				errno = 0;
@@ -2441,7 +2466,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 					INSTR_TIME_SET_ZERO(start);
 
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
-				written = pg_pwrite(openLogFile, from, nleft, startoffset);
+				written = pg_pwritev(openLogFile, iov, iovcnt, startoffset);
 				pgstat_report_wait_end();
 
 				/*
@@ -2476,9 +2501,10 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 									xlogfname, startoffset, nleft)));
 				}
 				nleft -= written;
-				from += written;
 				startoffset += written;
-			} while (nleft > 0);
+
+				iovcnt = compute_remaining_iovec(iov, iov, iovcnt, written);
+			} while (iovcnt > 0);
 
 			npages = 0;
 
-- 
2.34.1

