From a9d8230639118d932883c51cc4b6ecf214840022 Mon Sep 17 00:00:00 2001
From: alterego655 <824662526@qq.com>
Date: Tue, 6 Jan 2026 20:55:43 +0800
Subject: [PATCH v1] Fix wait_for_catchup() failure when standby session is
 killed by recovery conflict

Commit f30848cb optimized wait_for_catchup() to use WAIT FOR LSN on the standby instead of polling pg_stat_replication on the primary. However, this introduced a failure mode: the WAIT FOR LSN session can be killed by recovery conflicts on the standby, causing the test helper to die unexpectedly.

This manifests as intermittent failures in tests like 031_recovery_conflict, where DROP TABLESPACE on the primary triggers ResolveRecoveryConflictWithTablespace() on the standby. That function kills all backends indiscriminately, including the unrelated WAIT FOR LSN session that happens to be connected at that moment.

Fix by wrapping the WAIT FOR LSN call in an eval block and falling back to the original polling approach when the session is killed by a recovery conflict. The fallback is selective:

- If WAIT FOR LSN returns 'success': return immediately

- If WAIT FOR LSN returns non-success (timeout, not_in_recovery): fail immediately with diagnostics

- If the session is killed by a recovery conflict (error contains "conflict with recovery"): fall back to polling on the primary

- For any other error: fail immediately to avoid masking real problems

The polling fallback is immune to standby-side conflicts because it queries pg_stat_replication on the primary, not the standby.
---
 src/test/perl/PostgreSQL/Test/Cluster.pm | 53 +++++++++++++++++++-----
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index a28ea89aa10..08379aeb8fb 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -3401,22 +3401,52 @@ sub wait_for_catchup
 			my $timeout = $PostgreSQL::Test::Utils::timeout_default;
 			my $wait_query =
 			  qq[WAIT FOR LSN '${target_lsn}' WITH (MODE '${wait_mode}', timeout '${timeout}s', no_throw);];
-			my $output = $standby_node->safe_psql('postgres', $wait_query);
-			chomp($output);
 
-			if ($output ne 'success')
+			# Try WAIT FOR LSN. If it succeeds, we're done. If it returns a
+			# non-success status (timeout, not_in_recovery), fail immediately.
+			# If the session is killed by a recovery conflict, fall back to
+			# polling on the upstream, which is immune to standby-side
+			# conflicts. Any other error fails immediately.
+			my $output;
+			local $@;
+			my $wait_succeeded = eval {
+				$output = $standby_node->safe_psql('postgres', $wait_query);
+				chomp($output);
+				1;
+			};
+
+			if ($wait_succeeded && $output eq 'success')
+			{
+				print "done\n";
+				return;
+			}
+
+			# If WAIT FOR LSN executed but returned non-success (e.g., timeout,
+			# not_in_recovery), fail immediately with diagnostic info. Falling
+			# back to polling would just waste time.
+			if ($wait_succeeded)
 			{
-				# Fetch additional detail for debugging purposes
 				my $details = $self->safe_psql('postgres',
 					"SELECT * FROM pg_catalog.pg_stat_replication");
-				diag qq(WAIT FOR LSN failed with status:
-	${output});
-				diag qq(Last pg_stat_replication contents:
-	${details});
-				croak "failed waiting for catchup";
+				diag qq(WAIT FOR LSN returned '$output'
+pg_stat_replication on upstream:
+${details});
+				croak "WAIT FOR LSN '$wait_mode' returned '$output'";
+			}
+
+			# WAIT FOR LSN was interrupted. Only fall back to polling if this
+			# looks like a recovery conflict - the canonical PostgreSQL error
+			# message contains "conflict with recovery". Other errors should
+			# fail immediately rather than being masked by a silent fallback.
+			if ($@ =~ /conflict with recovery/i)
+			{
+				diag qq(WAIT FOR LSN interrupted, falling back to polling:
+$@);
+			}
+			else
+			{
+				croak "WAIT FOR LSN failed: $@";
 			}
-			print "done\n";
-			return;
 		}
 	}
 
@@ -3424,6 +3454,7 @@ sub wait_for_catchup
 	# - 'sent' mode (no corresponding WAIT FOR LSN mode)
 	# - When standby_name is a string (e.g., subscription name)
 	# - When the standby is no longer in recovery (was promoted)
+	# - When the WAIT FOR LSN session was killed by a recovery conflict
 	my $query = qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming'
          FROM pg_catalog.pg_stat_replication
          WHERE application_name IN ('$standby_name', 'walreceiver')];
-- 
2.51.0

