From d8b13074e674a0b61c65e7d0bf61c3ff95df8f55 Mon Sep 17 00:00:00 2001
From: Melih Mutlu <m.melihmutlu@gmail.com>
Date: Wed, 24 Jul 2024 17:19:07 +0300
Subject: [PATCH v1] Use pg_pwritev() in XLogWrite()

XLogWrite() had to issue a write() whenever it reached the last buffer
of the circular WAL buffers before circling back to the first buffer,
because the pages on either side of the wraparound are not contiguous
in memory.

Vectored I/O allows us to write pages that are not contiguous in memory
with a single call. This patch uses pg_pwritev() in XLogWrite() so that
it can circle back to the first buffer without having to issue a write()
unless one is otherwise necessary.
---
 src/backend/access/transam/xlog.c | 44 ++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ee0fb0e28f..05d46b9d14 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2315,6 +2315,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 	bool		ispartialpage;
 	bool		last_iteration;
 	bool		finishing_seg;
+	bool		full_cycle;
 	int			curridx;
 	int			npages;
 	int			startidx;
@@ -2407,17 +2408,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 
 		/*
 		 * Dump the set if this will be the last loop iteration, or if we are
-		 * at the last page of the cache area (since the next page won't be
-		 * contiguous in memory), or if we are at the end of the logfile
-		 * segment.
+		 * done with a full cycle of our circular wal buffers, or if we are at
+		 * the end of the logfile segment.
 		 */
 		last_iteration = WriteRqst.Write <= LogwrtResult.Write;
 
 		finishing_seg = !ispartialpage &&
 			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
 
+		/* 
+		 * Reaching the buffer right before the start buffer means that we
+		 * completed a full cycle in our circular wal buffers.
+		 */
+		full_cycle = curridx == (startidx - 1);
+
 		if (last_iteration ||
-			curridx == XLogCtl->XLogCacheBlck ||
+			full_cycle ||
 			finishing_seg)
 		{
 			char	   *from;
@@ -2425,9 +2431,35 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 			Size		nleft;
 			ssize_t		written;
 			instr_time	start;
+			struct iovec iov[2];	
+			int 		iovcnt;
+
+			if (curridx < startidx)
+			{
+				Assert(curridx + 1 + XLogCtl->XLogCacheBlck - startidx + 1 == npages);
+
+				/* 
+				 * First chunk: from startidx up to the last wal buffer, after
+				 * which the next page is no longer contiguous in memory.
+				 */
+				iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
+    			iov[0].iov_len = (XLogCtl->XLogCacheBlck - startidx + 1) * (Size) XLOG_BLCKSZ;
+
+				/* From first wal buffer to the current idx */
+    			iov[1].iov_base = XLogCtl->pages;
+    			iov[1].iov_len = (curridx + 1) * (Size) XLOG_BLCKSZ;
+
+				iovcnt = 2;
+			}
+			else
+			{
+				/* Contiguous case */
+				iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;;
+    			iov[0].iov_len = npages * (Size) XLOG_BLCKSZ;
+				iovcnt = 1;
+			}
 
 			/* OK to write the page(s) */
-			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
 			nbytes = npages * (Size) XLOG_BLCKSZ;
 			nleft = nbytes;
 			do
@@ -2441,7 +2473,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 					INSTR_TIME_SET_ZERO(start);
 
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
-				written = pg_pwrite(openLogFile, from, nleft, startoffset);
+				written = pg_pwritev(openLogFile, iov, iovcnt, startoffset);
 				pgstat_report_wait_end();
 
 				/*
-- 
2.34.1

