Good day, hackers.

Reading through changes committed to master, I noticed that after the
CleanupBackend/CleanupBackgroundWorker refactoring, background workers fail
to start again after postgres restarts with restart_after_crash = on.

The reason is that CleanupBackend and HandleChildCrash no longer set the
background worker's rw_pid to zero when the backend, well, crashed and so
never called shmem_exit to mark its PMChild slot inactive via
MarkPostmasterChildInactive.

The suggested solution is to let CleanupBackend finish its background-worker
related logic even after treating the child process as crashed. In earlier
versions the zeroing of pids happened in HandleChildCrash anyway, so there
should be no harm in doing the same actions here.
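
To illustrate the control flow, here is a small standalone sketch (not the
actual postmaster.c code; worker_pid and cleanup_worker_state() are
hypothetical stand-ins for rw->rw_pid and the bgworker bookkeeping in
CleanupBackend):

#include <stdbool.h>
#include <stdio.h>

static int worker_pid = 12345;		/* stand-in for rw->rw_pid */

static void
cleanup_worker_state(void)
{
	/* the real code resets rw->rw_pid to 0 here, among other things */
	worker_pid = 0;
}

static void
cleanup_child(bool crashed)
{
	if (crashed)
	{
		/* roughly where HandleChildCrash() is called */
		printf("handling crash\n");

		/*
		 * Returning here, as the code did before the fix, leaves
		 * worker_pid set, so the worker looks "still running" and is
		 * never restarted.
		 */
	}

	/* with the fix, the crash path falls through to the same cleanup */
	cleanup_worker_state();
}

int
main(void)
{
	cleanup_child(true);
	printf("worker_pid after cleanup: %d\n", worker_pid);	/* prints 0 */
	return 0;
}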

For fast reproduction I used the pg_prewarm extension, as it creates an
observable bgworker and is present in the postgres tree, so the TAP test is
easy to run.
-- 
best regards,
Andrey Rudometov
From 4ae584f3f3e352bb4b7a0cedf1f7efc27fe8154d Mon Sep 17 00:00:00 2001
From: Andrey Rudometov <unlimitedhik...@gmail.com>
Date: Wed, 11 Jun 2025 11:10:53 +0700
Subject: [PATCH v1 1/2] Reproduce bgw not starting after restart_after_crash

---
 contrib/pg_prewarm/t/002_bgw_crash_restart.pl | 176 ++++++++++++++++++
 src/backend/postmaster/postmaster.c           |   3 +
 2 files changed, 179 insertions(+)
 create mode 100644 contrib/pg_prewarm/t/002_bgw_crash_restart.pl

diff --git a/contrib/pg_prewarm/t/002_bgw_crash_restart.pl b/contrib/pg_prewarm/t/002_bgw_crash_restart.pl
new file mode 100644
index 00000000000..7376194707a
--- /dev/null
+++ b/contrib/pg_prewarm/t/002_bgw_crash_restart.pl
@@ -0,0 +1,176 @@
+
+# Copyright (c) 2021-2025, PostgreSQL Global Development Group
+
+#
+# This test ensures that a background worker is restarted after a crash restart.
+# Intended solely for reproduction; assembled from 001_basic.pl and 013_crash_restart.pl.
+#
+# The pg_prewarm extension is chosen as it creates a bgworker and is in-tree.
+#
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use Carp qw(croak);
+use Time::HiRes qw(usleep);
+
+my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
+
+my $node = PostgreSQL::Test::Cluster->new('primary');
+$node->init(allows_streaming => 1);
+$node->append_conf(
+	'postgresql.conf',
+	qq{shared_preload_libraries = 'pg_prewarm'
+    pg_prewarm.autoprewarm = true
+    pg_prewarm.autoprewarm_interval = 1});
+$node->start();
+
+# by default PostgreSQL::Test::Cluster doesn't restart after a crash
+$node->safe_psql(
+	'postgres',
+	q[ALTER SYSTEM SET restart_after_crash = 1;
+				   ALTER SYSTEM SET log_connections = receipt;
+				   SELECT pg_reload_conf();]);
+
+# setup pg_prewarm
+$node->safe_psql("postgres",
+		"CREATE EXTENSION pg_prewarm;\n"
+	  . "CREATE TABLE test(c1 int);\n"
+	  . "INSERT INTO test SELECT generate_series(1, 100);");
+$node->restart;
+
+# Wait for prewarm worker to show signs of life
+$node->wait_for_log(
+	"autoprewarm successfully prewarmed [1-9][0-9]* of [0-9]+ previously-loaded blocks"
+);
+
+# Run psql, keeping session alive, so we have an alive backend to kill.
+my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+my $killme = IPC::Run::start(
+	[
+		'psql', '--no-psqlrc', '--quiet', '--no-align', '--tuples-only',
+		'--set' => 'ON_ERROR_STOP=1',
+		'--file' => '-',
+		'--dbname' => $node->connstr('postgres')
+	],
+	'<' => \$killme_stdin,
+	'>' => \$killme_stdout,
+	'2>' => \$killme_stderr,
+	$psql_timeout);
+
+# Acquire pid of new backend
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+ok( pump_until(
+		$killme, $psql_timeout, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
+	"acquired pid for SIGKILL");
+my $pid = $killme_stdout;
+chomp($pid);
+$killme_stdout = '';
+$killme_stderr = '';
+
+# Need a second psql to check if crash-restart happened.
+my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
+my $monitor = IPC::Run::start(
+	[
+		'psql', '--no-psqlrc', '--quiet', '--no-align', '--tuples-only',
+		'--set' => 'ON_ERROR_STOP=1',
+		'--file', '-', '--dbname' => $node->connstr('postgres')
+	],
+	'<' => \$monitor_stdin,
+	'>' => \$monitor_stdout,
+	'2>' => \$monitor_stderr,
+	$psql_timeout);
+
+# Start longrunning query in second session; its failure will signal that
+# crash-restart has occurred.  The initial wait for the trivial select is to
+# be sure that psql successfully connected to backend.
+$monitor_stdin .= q[
+SELECT $$psql-connected$$;
+SELECT pg_sleep(3600);
+];
+ok( pump_until(
+		$monitor, $psql_timeout, \$monitor_stdout, qr/psql-connected/m),
+	'monitor connected');
+$monitor_stdout = '';
+$monitor_stderr = '';
+
+my $log_offset = -s $node->logfile;
+
+# Kill once with SIGQUIT - we expect psql to exit, emitting an error message first
+my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'QUIT', $pid);
+
+# Exactly one process should have been alive to be killed
+is($ret, 0, "killed process with SIGQUIT");
+
+# Check that psql sees the killed backend as having been terminated
+$killme_stdin .= q[
+SELECT 1;
+];
+ok( pump_until(
+		$killme,
+		$psql_timeout,
+		\$killme_stderr,
+		qr/WARNING:  terminating connection because of unexpected SIGQUIT signal|server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
+	),
+	"psql query died successfully after SIGQUIT");
+$killme_stderr = '';
+$killme_stdout = '';
+$killme->finish;
+
+# Wait till server restarts - we should get the WARNING here, but
+# sometimes the server is unable to send that, if interrupted while
+# sending.
+ok( pump_until(
+		$monitor,
+		$psql_timeout,
+		\$monitor_stderr,
+		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
+	),
+	"psql monitor died successfully after SIGQUIT");
+$monitor->finish;
+
+# Wait till server restarts
+is($node->poll_query_until('postgres', undef, ''),
+	"1", "reconnected after SIGQUIT");
+
+# Wait for prewarm worker to show signs of life
+wait_for_log_less(
+	$node,
+	"autoprewarm successfully prewarmed [1-9][0-9]* of [0-9]+ previously-loaded blocks",
+	$log_offset
+);
+
+$node->stop();
+
+done_testing();
+
+
+# There is no point in waiting for timeout_default, so make fewer attempts.
+sub wait_for_log_less
+{
+	my ($node, $regexp, $offset) = @_;
+	$offset = 0 unless defined $offset;
+
+	# Wait up to 5 seconds; a lone bgworker with a 1-second interval should start by then.
+	my $max_attempts = 10 * 5;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		my $log =
+		  PostgreSQL::Test::Utils::slurp_file($node->logfile, $offset);
+
+		return $offset + length($log) if ($log =~ m/$regexp/);
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		$attempts++;
+	}
+
+	croak "timed out waiting for match: $regexp";
+}
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 490f7ce3664..2d99f9d670d 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -4249,6 +4249,9 @@ maybe_start_bgworkers(void)
 
 		rw = dlist_container(RegisteredBgWorker, rw_lnode, iter.cur);
 
+		elog(LOG, "maybe_start_bgworkers: worker %s has pid %d",
+			 rw->rw_worker.bgw_function_name, rw->rw_pid);
+
 		/* ignore if already running */
 		if (rw->rw_pid != 0)
 			continue;
-- 
2.43.0

From 4b01649d52fac9b01566a7a569f27bf3e06841bb Mon Sep 17 00:00:00 2001
From: Andrey Rudometov <unlimitedhik...@gmail.com>
Date: Wed, 11 Jun 2025 11:18:45 +0700
Subject: [PATCH v1 2/2] Fix bgw not starting after restart_after_crash

If postgres treats a child as crashed, that child is a background
worker, and restart_after_crash is set, the bgworker's postmaster-local
data is not cleared; e.g. rw_pid remains set, so the bgworker is never
restarted later.
---
 src/backend/postmaster/postmaster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 2d99f9d670d..b6d46cd84a7 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2633,7 +2633,7 @@ CleanupBackend(PMChild *bp,
 	if (crashed)
 	{
 		HandleChildCrash(bp_pid, exitstatus, procname);
-		return;
+		logged = true;
 	}
 
 	/*
-- 
2.43.0
