From 5dce899cb4856908ea41a8817fa71135b505c0c2 Mon Sep 17 00:00:00 2001
From: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Date: Wed, 26 Mar 2025 19:03:50 +0900
Subject: [PATCH v5-PG17] Stabilize 035_standby_logical_decoding.pl

This test tries to invalidate slots on standby server, by running VACUUM on
primary and discarding needed tuples for slots. The problem is that
xl_running_xacts records are sotimetimes generated while testing, it advances
the catalog_xmin so that the invalidation might not happen in some cases.

The fix is to skip activating slots for some testcases.
---
 .../t/035_standby_logical_decoding.pl         | 53 +++++++++----------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index aeb79f51e71..752e31960ea 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -205,9 +205,6 @@ sub reactive_slots_change_hfs_and_wait_for_xmins
 
 	change_hot_standby_feedback_and_wait_for_xmins($hsf, $invalidated);
 
-	$handle =
-	  make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr);
-
 	# reset stat: easier to check for confl_active_logicalslot in pg_stat_database_conflicts
 	$node_standby->psql('testdb', q[select pg_stat_reset();]);
 }
@@ -215,7 +212,7 @@ sub reactive_slots_change_hfs_and_wait_for_xmins
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 sub check_for_invalidation
 {
-	my ($slot_prefix, $log_start, $test_name) = @_;
+	my ($slot_prefix, $log_start, $test_name, $checks_active_slot) = @_;
 
 	my $active_slot = $slot_prefix . 'activeslot';
 	my $inactive_slot = $slot_prefix . 'inactiveslot';
@@ -231,13 +228,17 @@ sub check_for_invalidation
 			$log_start),
 		"activeslot slot invalidation is logged $test_name");
 
-	# Verify that pg_stat_database_conflicts.confl_active_logicalslot has been updated
-	ok( $node_standby->poll_query_until(
-			'postgres',
-			"select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'",
-			't'),
-		'confl_active_logicalslot updated'
-	) or die "Timed out waiting confl_active_logicalslot to be updated";
+	if ($checks_active_slot)
+	{
+		# Verify that pg_stat_database_conflicts.confl_active_logicalslot has
+		# been updated
+		ok( $node_standby->poll_query_until(
+				'postgres',
+				"select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'",
+				't'),
+			'confl_active_logicalslot updated'
+		) or die "Timed out waiting confl_active_logicalslot to be updated";
+	}
 }
 
 # Launch $sql query, wait for a new snapshot that has a newer horizon and
@@ -250,7 +251,8 @@ sub check_for_invalidation
 # seeing a xl_running_xacts that would advance an active replication slot's
 # catalog_xmin.  Advancing the active replication slot's catalog_xmin
 # would break some tests that expect the active slot to conflict with
-# the catalog xmin horizon.
+# the catalog xmin horizon. We ensure that replication slots are not activated
+# for tests that might produce this race condition though.
 sub wait_until_vacuum_can_remove
 {
 	my ($vac_option, $sql, $to_vac) = @_;
@@ -550,10 +552,6 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
 $node_primary->safe_psql('testdb',
 	qq[INSERT INTO decoding_test(x,y) SELECT 100,'100';]);
 
-$node_standby->poll_query_until('testdb',
-	qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
-) or die "replication slot stats of vacuum_full_activeslot not updated";
-
 # This should trigger the conflict
 wait_until_vacuum_can_remove(
 	'full', 'CREATE TABLE conflict_test(x integer, y text);
@@ -562,19 +560,11 @@ wait_until_vacuum_can_remove(
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class');
+check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('vacuum_full_', 'rows_removed');
 
-# Ensure that replication slot stats are not removed after invalidation.
-is( $node_standby->safe_psql(
-		'testdb',
-		qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
-	),
-	't',
-	'replication slot stats not removed after invalidation');
-
 $handle =
   make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr);
 
@@ -651,7 +641,7 @@ wait_until_vacuum_can_remove(
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class');
+check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('row_removal_', 'rows_removed');
@@ -687,7 +677,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('shared_row_removal_', $logstart,
-	'with vacuum on pg_authid');
+	'with vacuum on pg_authid', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('shared_row_removal_', 'rows_removed');
@@ -711,6 +701,11 @@ $logstart = -s $node_standby->logfile;
 reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
 	'no_conflict_', 0, 1);
 
+# This scenario won't produce the race condition by a xl_running_xacts, so
+# activate the slot. See comments atop wait_until_vacuum_can_remove().
+make_slot_active($node_standby, 'no_conflict_', 1, \$stdout,
+	\$stderr);
+
 # This should not trigger a conflict
 wait_until_vacuum_can_remove(
 	'', 'CREATE TABLE conflict_test(x integer, y text);
@@ -779,7 +774,7 @@ $node_primary->safe_psql('testdb', qq[UPDATE prun SET s = 'E';]);
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('pruning_', $logstart, 'with on-access pruning');
+check_for_invalidation('pruning_', $logstart, 'with on-access pruning', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('pruning_', 'rows_removed');
@@ -823,7 +818,7 @@ $node_primary->restart;
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('wal_level_', $logstart, 'due to wal_level');
+check_for_invalidation('wal_level_', $logstart, 'due to wal_level', 1);
 
 # Verify reason for conflict is 'wal_level_insufficient' in pg_replication_slots
 check_slots_conflict_reason('wal_level_', 'wal_level_insufficient');
-- 
2.43.5

