From 718b5b5e70bb32ebbbbaf9d0ea0304534827b73e Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 13 Oct 2022 07:24:55 +0000
Subject: [PATCH v9] Make WAL file initialization by pg_receivewal atomic

While initializing WAL files with zeros (i.e. padding) the
pg_receivewal can fail for any reason, for instance, no space left
on device or host server crash etc., which may leave partially
padded WAL files due to which the pg_receivewal won't come up after
the hostserver restart. The error it emits is "error: write-ahead
log file "foo.partial" has x bytes, should be 0 or y" (where y is
size of WAL file typically and x < y).

To fix this, one needs to manually intervene and delete the partially
padded WAL file which requires an internal understanding of what
the issue is and often a time-consuming process in production
environments.

The above problem can also exist for pg_basebackup as it uses the
same walmethods.c function for initialization.

To address the above problem, make WAL file initialization atomic
i.e. first create a temporary file, pad it and then rename it to the
target file. This is similar to what postgres does to make WAL file
initialization atomic.

Authors: Bharath Rupireddy, SATYANARAYANA NARLAPURAM
Reviewed-by: Cary Huang, mahendrakar s
Reviewed-by: Kyotaro Horiguchi, Michael Paquier
Discussion: https://www.postgresql.org/message-id/CAHg%2BQDcVUss6ocOmbLbV5f4YeGLhOCt%2B1x2yLNfG2H_eDwU8tw%40mail.gmail.com
---
 src/bin/pg_basebackup/walmethods.c | 60 +++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index bc2e83d02b..0c3f8fc917 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -118,7 +118,10 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 				   const char *temp_suffix, size_t pad_to_size)
 {
 	DirectoryMethodData *dir_data = (DirectoryMethodData *) wwmethod;
+	char		targetpath[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
+	char	   *path;
+	bool		createtempfile = false;
 	char	   *filename;
 	int			fd;
 	DirectoryMethodFile *f;
@@ -134,7 +137,7 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	clear_error(wwmethod);
 
 	filename = dir_get_file_name(wwmethod, pathname, temp_suffix);
-	snprintf(tmppath, sizeof(tmppath), "%s/%s",
+	snprintf(targetpath, sizeof(targetpath), "%s/%s",
 			 dir_data->basedir, filename);
 	pg_free(filename);
 
@@ -143,8 +146,32 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	 * the file descriptor is important for dir_sync() method as gzflush()
 	 * does not do any system calls to fsync() to make changes permanent on
 	 * disk.
+	 *
+	 * Create file atomically for padding and non-compression cases. This is
+	 * done by first creating a temporary file, initializing it and renaming it
+	 * to the actual file name.
 	 */
-	fd = open(tmppath, O_WRONLY | O_CREAT | PG_BINARY, pg_file_create_mode);
+	if (pad_to_size > 0 &&
+		wwmethod->compression_algorithm == PG_COMPRESSION_NONE)
+	{
+		snprintf(tmppath, MAXPGPATH, "%s/xlog.tmp", dir_data->basedir);
+
+		/*
+		 * Remove temporary file possibly left over from a crash or a previous
+		 * failure in this function.
+		 */
+		unlink(tmppath);
+
+		path = tmppath;
+		createtempfile = true;
+	}
+	else
+	{
+		path = targetpath;
+		createtempfile = false;
+	}
+
+	fd = open(path, O_WRONLY | O_CREAT | PG_BINARY, pg_file_create_mode);
 	if (fd < 0)
 	{
 		wwmethod->lasterrno = errno;
@@ -244,16 +271,37 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 		}
 	}
 
+	/* Rename the temporary file to actual file */
+	if (createtempfile)
+	{
+		int rc;
+
+		if (wwmethod->sync)
+			rc = durable_rename(tmppath, targetpath);
+		else
+			rc = rename(tmppath, targetpath);
+
+		if (rc != 0)
+		{
+			wwmethod->lasterrno = errno;
+			close(fd);
+			return NULL;
+		}
+	}
+
 	/*
 	 * fsync WAL file and containing directory, to ensure the file is
 	 * persistently created and zeroed (if padded). That's particularly
 	 * important when using synchronous mode, where the file is modified and
 	 * fsynced in-place, without a directory fsync.
+	 *
+	 * For padding/temporary file case, durable_rename would have already
+	 * fsynced file and containing directory.
 	 */
-	if (wwmethod->sync)
+	if (!createtempfile && wwmethod->sync)
 	{
-		if (fsync_fname(tmppath, false) != 0 ||
-			fsync_parent_path(tmppath) != 0)
+		if (fsync_fname(targetpath, false) != 0 ||
+			fsync_parent_path(targetpath) != 0)
 		{
 			wwmethod->lasterrno = errno;
 #ifdef HAVE_LIBZ
@@ -294,7 +342,7 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	f->base.currpos = 0;
 	f->base.pathname = pg_strdup(pathname);
 	f->fd = fd;
-	f->fullpath = pg_strdup(tmppath);
+	f->fullpath = pg_strdup(targetpath);
 	if (temp_suffix)
 		f->temp_suffix = pg_strdup(temp_suffix);
 
-- 
2.34.1

