From 35ab77794c4731c7a6466726e83e880111c9519a Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 29 Sep 2022 04:40:25 +0000
Subject: [PATCH v5] Use pg_pwritev_with_retry() instead of write() in
 walmethods.c

Use pg_pwritev_with_retry() while prepadding a WAL segment
instead of write() in walmethods.c dir_open_for_write() to avoid
partial writes. As the pg_pwritev_with_retry() function uses
pwritev, we can avoid explicit lseek() on non-Windows platforms
to seek to the beginning of the WAL segment. It looks like on
Windows, we need an explicit lseek() call here despite using
pwrite() implementation from win32pwrite.c. Otherwise
an error occurs.

These changes are inline with how core postgres initializes the
WAL segment in XLogFileInitInternal().

Author: Bharath Rupireddy
Reviewed-by: Nathan Bossart
Discussion: https://www.postgresql.org/message-id/CALj2ACUq7nAb7%3DbJNbK3yYmp-SZhJcXFR_pLk8un6XgDzDF3OA%40mail.gmail.com
---
 src/backend/access/transam/xlog.c  | 33 ++-----------
 src/bin/pg_basebackup/walmethods.c | 26 +++++-----
 src/common/file_utils.c            | 77 ++++++++++++++++++++++++++++++
 src/include/common/file_utils.h    |  2 +
 4 files changed, 98 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8e15256db8..ecfb4b13fb 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2925,7 +2925,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 					 bool *added, char *path)
 {
 	char		tmppath[MAXPGPATH];
-	PGAlignedXLogBlock zbuffer;
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
@@ -2969,14 +2968,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
-	memset(zbuffer.data, 0, XLOG_BLCKSZ);
-
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int			blocks;
+		ssize_t rc;
 
 		/*
 		 * Zero-fill the file.  With this setting, we do this the hard way to
@@ -2987,29 +2983,10 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
+		rc = pg_pwritev_zeros(fd, wal_segment_size);
 
-		/* Prepare to write out a lot of copies of our zero buffer at once. */
-		for (int i = 0; i < lengthof(iov); ++i)
-		{
-			iov[i].iov_base = zbuffer.data;
-			iov[i].iov_len = XLOG_BLCKSZ;
-		}
-
-		/* Loop, writing as many blocks as we can for each system call. */
-		blocks = wal_segment_size / XLOG_BLCKSZ;
-		for (int i = 0; i < blocks;)
-		{
-			int			iovcnt = Min(blocks - i, lengthof(iov));
-			off_t		offset = i * XLOG_BLCKSZ;
-
-			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
-			{
-				save_errno = errno;
-				break;
-			}
-
-			i += iovcnt;
-		}
+		if (rc < 0)
+			save_errno = errno;
 	}
 	else
 	{
@@ -3018,7 +2995,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * enough.
 		 */
 		errno = 0;
-		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+		if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
 		{
 			/* if write didn't set errno, assume no disk space */
 			save_errno = errno ? errno : ENOSPC;
diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index bc2e83d02b..9eb4d35a40 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -220,28 +220,30 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	/* Do pre-padding on non-compressed files */
 	if (pad_to_size && wwmethod->compression_algorithm == PG_COMPRESSION_NONE)
 	{
-		PGAlignedXLogBlock zerobuf;
-		int			bytes;
+		ssize_t rc;
 
-		memset(zerobuf.data, 0, XLOG_BLCKSZ);
-		for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ)
+		rc = pg_pwritev_zeros(fd, pad_to_size);
+
+		if (rc < 0)
 		{
-			errno = 0;
-			if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
-			{
-				/* If write didn't set errno, assume problem is no disk space */
-				wwmethod->lasterrno = errno ? errno : ENOSPC;
-				close(fd);
-				return NULL;
-			}
+			wwmethod->lasterrno = errno;
+			close(fd);
+			return NULL;
 		}
 
+#ifdef WIN32
+		/*
+		 * WriteFile() on Windows changes the current file position, hence we
+		 * need an explicit lseek() here. See pg_pwrite() implementation in
+		 * win32pwrite.c for more details.
+		 */
 		if (lseek(fd, 0, SEEK_SET) != 0)
 		{
 			wwmethod->lasterrno = errno;
 			close(fd);
 			return NULL;
 		}
+#endif
 	}
 
 	/*
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 4a4bdc31c4..30ce31201f 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -525,3 +525,80 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 
 	return sum;
 }
+
+/*
+ * Function to zero-fill a file with given size. On failure, a negative value
+ * is returned and errno is set appropriately so that the caller can use it
+ * accordingly.
+ */
+ssize_t
+pg_pwritev_zeros(int fd, size_t size)
+{
+	PGAlignedXLogBlock	zbuffer;
+	struct iovec	iov[PG_IOV_MAX];
+	int		blocks;
+	size_t	block_sz;
+	size_t	remaining_size = 0;
+	int		i;
+	ssize_t	written;
+	ssize_t	total_written = 0;
+
+	/*
+	 * XXX: Writing more than one block of size XLOG_BLCKSZ bytes via
+	 * PGAlignedXLogBlock structure per vector buffer might improve write
+	 * performance on some platforms. However, tests (on some platforms, not
+	 * all) show not much improvements with varying block sizes. Hence we stick
+	 * to one block in PGAlignedXLogBlock structure.
+	 */
+	block_sz = sizeof(zbuffer.data);
+
+	memset(zbuffer.data, 0, block_sz);
+
+	/* Prepare to write out a lot of copies of our zero buffer at once. */
+	for (i = 0; i < lengthof(iov); ++i)
+	{
+		iov[i].iov_base = zbuffer.data;
+		iov[i].iov_len = block_sz;
+	}
+
+	/* Loop, writing as many blocks as we can for each system call. */
+	blocks = size / block_sz;
+	remaining_size = size % block_sz;
+	for (i = 0; i < blocks;)
+	{
+		int		iovcnt = Min(blocks - i, lengthof(iov));
+		off_t	offset = i * block_sz;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		i += iovcnt;
+		total_written += written;
+	}
+
+	/* Now, write the remaining size, if any, of the file with zeros. */
+	if (remaining_size > 0)
+	{
+		/* We'll never write more than one block here */
+		int		iovcnt = 1;
+
+		/* Jump on to the end of previously written blocks */
+		off_t	offset = i * block_sz;
+
+		iov[0].iov_base = zbuffer.data;
+		iov[0].iov_len = remaining_size;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		total_written += written;
+	}
+
+	Assert(total_written == size);
+
+	return total_written;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 2c5dbcb0b1..f8d0d475fa 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -44,4 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd,
 									 int iovcnt,
 									 off_t offset);
 
+extern ssize_t pg_pwritev_zeros(int fd, size_t size);
+
 #endif							/* FILE_UTILS_H */
-- 
2.34.1

