From 6a254cd4831d64e6296907265b60fcdf86f5771a Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson <daniel@yesql.se>
Date: Thu, 17 Feb 2022 23:22:43 +0100
Subject: [PATCH v5 3/3] Quick exit on log stream child exit in pg_basebackup

If the log streaming child process (thread on Windows) dies during
backup then the whole backup will be aborted at the end of the
backup.  Instead, trap ungraceful termination of the log streaming
child and exit early.  This also adds a TAP test for simulating this
by terminating the responsible backend.

Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Magnus Hagander <magnus@hagander.net>
Discussion: https://postgr.es/m/0F69E282-97F9-4DB7-8D6D-F927AA6340C8@yesql.se
Discussion: https://postgr.es/m/VI1PR83MB0189818B82C19059CB62E26199A89@VI1PR83MB0189.EURPRD83.prod.outlook.com
---
 src/bin/pg_basebackup/pg_basebackup.c        | 47 +++++++++++++++++++-
 src/bin/pg_basebackup/t/010_pg_basebackup.pl | 31 +++++++++++++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 08b07d5a06..ac289331c8 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -174,6 +174,8 @@ static int	bgpipe[2] = {-1, -1};
 /* Handle to child process */
 static pid_t bgchild = -1;
 static bool in_log_streamer = false;
+/* Flag to indicate if child process exited unexpectedly */
+static volatile sig_atomic_t bgchild_exited = false;
 
 /* End position for xlog streaming, empty string if unknown yet */
 static XLogRecPtr xlogendptr;
@@ -277,6 +279,18 @@ disconnect_atexit(void)
 }
 
 #ifndef WIN32
+/*
+ * If the bgchild exits prematurely and raises a SIGCHLD signal, we can abort
+ * processing rather than wait until the backup has finished and error out at
+ * that time. On Windows, we use a background thread which can communicate
+ * without the need for a signal handler.
+ */
+static void
+sigchld_handler(SIGNAL_ARGS)
+{
+	bgchild_exited = true;
+}
+
 /*
  * On windows, our background thread dies along with the process. But on
  * Unix, if we have started a subprocess, we want to kill it off so it
@@ -285,7 +299,7 @@ disconnect_atexit(void)
 static void
 kill_bgchild_atexit(void)
 {
-	if (bgchild > 0)
+	if (bgchild > 0 && !bgchild_exited)
 		kill(bgchild, SIGTERM);
 }
 #endif
@@ -572,17 +586,28 @@ LogStreamerMain(logstreamer_param *param)
 											  stream.do_sync);
 
 	if (!ReceiveXlogStream(param->bgconn, &stream))
-
+	{
 		/*
 		 * Any errors will already have been reported in the function process,
 		 * but we need to tell the parent that we didn't shutdown in a nice
 		 * way.
 		 */
+#ifdef WIN32
+		/*
+		 * In order to signal the main thread of an ungraceful exit we
+		 * set the same flag that we use on Unix to signal SIGCHLD.
+		 */
+		bgchild_exited = true;
+#endif
 		return 1;
+	}
 
 	if (!stream.walmethod->finish())
 	{
 		pg_log_error("could not finish writing WAL files: %m");
+#ifdef WIN32
+		bgchild_exited = true;
+#endif
 		return 1;
 	}
 
@@ -1126,6 +1151,12 @@ ReceiveCopyData(PGconn *conn, WriteDataCallback callback,
 			exit(1);
 		}
 
+		if (bgchild_exited)
+		{
+			pg_log_error("background process terminated unexpectedly");
+			exit(1);
+		}
+
 		(*callback) (r, copybuf, callback_data);
 
 		PQfreemem(copybuf);
@@ -2874,6 +2905,18 @@ main(int argc, char **argv)
 	}
 	atexit(disconnect_atexit);
 
+#ifndef WIN32
+	/*
+	 * Trap SIGCHLD to be able to handle the WAL stream process exiting. There
+	 * is no SIGCHLD on Windows, there we rely on the background thread setting
+	 * the signal variable on unexpected but graceful exit. If the WAL stream
+	 * thread crashes on Windows it will bring down the entire process as it's
+	 * a thread, so there is nothing to catch should that happen. A crash on
+	 * UNIX will be caught by the signal handler.
+	 */
+	pqsignal(SIGCHLD, sigchld_handler);
+#endif
+
 	/*
 	 * Set umask so that directories/files are created with the same
 	 * permissions as directories/files in the source data directory.
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index 8c70e5b32b..42d42e65c2 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -778,4 +778,35 @@ SKIP:
 	rmtree("$tempdir/backup_gzip3");
 }
 
+# Test background stream process terminating before the basebackup has
+# finished, the main process should exit gracefully with an error message on
+# stderr.
+$node->safe_psql('postgres',
+	q{CREATE TABLE t AS SELECT a FROM generate_series(1,10000) AS a;});
+
+my $sigchld_bb_timeout = IPC::Run::timer(60);
+my ($sigchld_bb_stdin, $sigchld_bb_stdout, $sigchld_bb_stderr) = ('', '', '');
+my $sigchld_bb = IPC::Run::start(
+	[
+		@pg_basebackup_defs, '-X', 'stream', '-D', "$tempdir/sigchld",
+		'-r', '32', '-d', $node->connstr('postgres')
+	],
+	'<',
+	\$sigchld_bb_stdin,
+	'>',
+	\$sigchld_bb_stdout,
+	'2>',
+	\$sigchld_bb_stderr,
+	$sigchld_bb_timeout);
+
+is($node->poll_query_until('postgres',
+	"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE " .
+	"application_name = '010_pg_basebackup.pl' AND wait_event = 'WalSenderMain' " .
+	"AND backend_type = 'walsender'"), "1", "Walsender killed");
+
+ok(pump_until($sigchld_bb, $sigchld_bb_timeout, \$sigchld_bb_stderr,
+  qr/background process terminated unexpectedly/),
+  'background process exit message');
+$sigchld_bb->finish();
+
 done_testing();
-- 
2.24.3 (Apple Git-128)

