From ac405310591719db0c37356e4487181bed8b3f71 Mon Sep 17 00:00:00 2001
From: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Date: Wed, 26 Mar 2025 19:03:50 +0900
Subject: [PATCH vPG17] Stabilize 035_standby_logical_decoding.pl by avoiding
 active slots.

This test tries to invalidate slots on the standby server by running VACUUM on
the primary and discarding tuples needed by the slots. The problem is that
xl_running_xacts records are sometimes generated while testing; they advance
the catalog_xmin so that the invalidation might not happen in some cases.

The fix is to skip using the active slots for some testcases.
---
 .../t/035_standby_logical_decoding.pl         | 201 ++++++++----------
 1 file changed, 91 insertions(+), 110 deletions(-)

diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index aeb79f51e71..d68a8f9b828 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -44,27 +44,37 @@ sub wait_for_xmins
 # Create the required logical slots on standby.
 sub create_logical_slots
 {
-	my ($node, $slot_prefix) = @_;
+	my ($node, $slot_prefix, $needs_active_slot) = @_;
 
-	my $active_slot = $slot_prefix . 'activeslot';
 	my $inactive_slot = $slot_prefix . 'inactiveslot';
 	$node->create_logical_slot_on_standby($node_primary, qq($inactive_slot),
 		'testdb');
-	$node->create_logical_slot_on_standby($node_primary, qq($active_slot),
-		'testdb');
+
+	if ($needs_active_slot)
+	{
+		my $active_slot = $slot_prefix . 'activeslot';
+
+		$node->create_logical_slot_on_standby($node_primary, qq($active_slot),
+			'testdb');
+	}
 }
 
 # Drop the logical slots on standby.
 sub drop_logical_slots
 {
-	my ($slot_prefix) = @_;
-	my $active_slot = $slot_prefix . 'activeslot';
+	my ($slot_prefix, $needs_active_slot) = @_;
 	my $inactive_slot = $slot_prefix . 'inactiveslot';
 
 	$node_standby->psql('postgres',
 		qq[SELECT pg_drop_replication_slot('$inactive_slot')]);
-	$node_standby->psql('postgres',
-		qq[SELECT pg_drop_replication_slot('$active_slot')]);
+
+	if ($needs_active_slot)
+	{
+		my $active_slot = $slot_prefix . 'activeslot';
+
+		$node_standby->psql('postgres',
+			qq[SELECT pg_drop_replication_slot('$active_slot')]);
+	}
 }
 
 # Acquire one of the standby logical slots created by create_logical_slots().
@@ -171,42 +181,46 @@ sub change_hot_standby_feedback_and_wait_for_xmins
 # Check reason for conflict in pg_replication_slots.
 sub check_slots_conflict_reason
 {
-	my ($slot_prefix, $reason) = @_;
+	my ($slot_prefix, $reason, $needs_active_slot) = @_;
 
-	my $active_slot = $slot_prefix . 'activeslot';
 	my $inactive_slot = $slot_prefix . 'inactiveslot';
 
-	$res = $node_standby->safe_psql(
-		'postgres', qq(
-			 select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;)
-	);
-
-	is($res, "$reason", "$active_slot reason for conflict is $reason");
-
 	$res = $node_standby->safe_psql(
 		'postgres', qq(
 			 select invalidation_reason from pg_replication_slots where slot_name = '$inactive_slot' and conflicting;)
 	);
 
 	is($res, "$reason", "$inactive_slot reason for conflict is $reason");
+
+	if ($needs_active_slot)
+	{
+		my $active_slot = $slot_prefix . 'activeslot';
+
+		$res = $node_standby->safe_psql(
+			'postgres', qq(
+				select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;)
+		);
+
+		is($res, "$reason", "$active_slot reason for conflict is $reason");
+	}
 }
 
-# Drop the slots, re-create them, change hot_standby_feedback,
-# check xmin and catalog_xmin values, make slot active and reset stat.
+# Create slots, change hot_standby_feedback, check xmin and catalog_xmin
+# values, make the slot active if $needs_active_slot is set, and reset stat.
 sub reactive_slots_change_hfs_and_wait_for_xmins
 {
-	my ($previous_slot_prefix, $slot_prefix, $hsf, $invalidated) = @_;
-
-	# drop the logical slots
-	drop_logical_slots($previous_slot_prefix);
+	my ($slot_prefix, $hsf, $invalidated, $needs_active_slot) = @_;
 
 	# create the logical slots
-	create_logical_slots($node_standby, $slot_prefix);
+	create_logical_slots($node_standby, $slot_prefix, $needs_active_slot);
 
 	change_hot_standby_feedback_and_wait_for_xmins($hsf, $invalidated);
 
-	$handle =
-	  make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr);
+	if ($needs_active_slot)
+	{
+		$handle =
+		  make_slot_active($node_standby, $slot_prefix, 1, \$stdout, \$stderr);
+	}
 
 	# reset stat: easier to check for confl_active_logicalslot in pg_stat_database_conflicts
 	$node_standby->psql('testdb', q[select pg_stat_reset();]);
@@ -215,9 +229,8 @@ sub reactive_slots_change_hfs_and_wait_for_xmins
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 sub check_for_invalidation
 {
-	my ($slot_prefix, $log_start, $test_name) = @_;
+	my ($slot_prefix, $log_start, $test_name, $checks_active_slot) = @_;
 
-	my $active_slot = $slot_prefix . 'activeslot';
 	my $inactive_slot = $slot_prefix . 'inactiveslot';
 
 	# message should be issued
@@ -226,31 +239,29 @@ sub check_for_invalidation
 			$log_start),
 		"inactiveslot slot invalidation is logged $test_name");
 
-	ok( $node_standby->log_contains(
-			"invalidating obsolete replication slot \"$active_slot\"",
-			$log_start),
-		"activeslot slot invalidation is logged $test_name");
-
-	# Verify that pg_stat_database_conflicts.confl_active_logicalslot has been updated
-	ok( $node_standby->poll_query_until(
-			'postgres',
-			"select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'",
-			't'),
-		'confl_active_logicalslot updated'
-	) or die "Timed out waiting confl_active_logicalslot to be updated";
+	if ($checks_active_slot)
+	{
+		my $active_slot = $slot_prefix . 'activeslot';
+
+		ok( $node_standby->log_contains(
+				"invalidating obsolete replication slot \"$active_slot\"",
+				$log_start),
+			"activeslot slot invalidation is logged $test_name");
+
+		# Verify that pg_stat_database_conflicts.confl_active_logicalslot has been updated
+		ok( $node_standby->poll_query_until(
+				'postgres',
+				"select (confl_active_logicalslot = 1) from pg_stat_database_conflicts where datname = 'testdb'",
+				't'),
+			'confl_active_logicalslot updated'
+		) or die "Timed out waiting confl_active_logicalslot to be updated";
+	}
 }
 
 # Launch $sql query, wait for a new snapshot that has a newer horizon and
 # launch a VACUUM.  $vac_option is the set of options to be passed to the
 # VACUUM command, $sql the sql to launch before triggering the vacuum and
 # $to_vac the relation to vacuum.
-#
-# Note that pg_current_snapshot() is used to get the horizon.  It does
-# not generate a Transaction/COMMIT WAL record, decreasing the risk of
-# seeing a xl_running_xacts that would advance an active replication slot's
-# catalog_xmin.  Advancing the active replication slot's catalog_xmin
-# would break some tests that expect the active slot to conflict with
-# the catalog xmin horizon.
 sub wait_until_vacuum_can_remove
 {
 	my ($vac_option, $sql, $to_vac) = @_;
@@ -389,7 +400,7 @@ $node_standby->safe_psql('postgres',
 ##################################################
 
 # create the logical slots
-create_logical_slots($node_standby, 'behaves_ok_');
+create_logical_slots($node_standby, 'behaves_ok_', 1);
 
 $node_primary->safe_psql('testdb',
 	qq[CREATE TABLE decoding_test(x integer, y text);]);
@@ -539,21 +550,19 @@ $node_subscriber->stop;
 # active slot is invalidated.
 ##################################################
 
+# drop the logical slots used by previous tests
+drop_logical_slots('behaves_ok_', 1);
+
 # One way to produce recovery conflict is to create/drop a relation and
 # launch a vacuum full on pg_class with hot_standby_feedback turned off on
 # the standby.
-reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
-	0, 1);
+reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 0, 1, 0);
 
 # Ensure that replication slot stats are not empty before triggering the
 # conflict.
 $node_primary->safe_psql('testdb',
 	qq[INSERT INTO decoding_test(x,y) SELECT 100,'100';]);
 
-$node_standby->poll_query_until('testdb',
-	qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
-) or die "replication slot stats of vacuum_full_activeslot not updated";
-
 # This should trigger the conflict
 wait_until_vacuum_can_remove(
 	'full', 'CREATE TABLE conflict_test(x integer, y text);
@@ -562,27 +571,11 @@ wait_until_vacuum_can_remove(
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class');
+check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('vacuum_full_', 'rows_removed');
 
-# Ensure that replication slot stats are not removed after invalidation.
-is( $node_standby->safe_psql(
-		'testdb',
-		qq[SELECT total_txns > 0 FROM pg_stat_replication_slots WHERE slot_name = 'vacuum_full_activeslot']
-	),
-	't',
-	'replication slot stats not removed after invalidation');
-
-$handle =
-  make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr);
-
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-	"can no longer get changes from replication slot \"vacuum_full_activeslot\""
-);
-
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
@@ -602,7 +595,7 @@ check_slots_conflict_reason('vacuum_full_', 'rows_removed');
 my $restart_lsn = $node_standby->safe_psql(
 	'postgres',
 	"SELECT restart_lsn FROM pg_replication_slots
-		WHERE slot_name = 'vacuum_full_activeslot' AND conflicting;"
+		WHERE slot_name = 'vacuum_full_inactiveslot' AND conflicting;"
 );
 
 chomp($restart_lsn);
@@ -634,14 +627,16 @@ ok(!-f "$standby_walfile",
 # Scenario 2: conflict due to row removal with hot_standby_feedback off.
 ##################################################
 
+# drop the logical slots used by previous tests
+drop_logical_slots('vacuum_full_', 0);
+
 # get the position to search from in the standby logfile
 my $logstart = -s $node_standby->logfile;
 
 # One way to produce recovery conflict is to create/drop a relation and
 # launch a vacuum on pg_class with hot_standby_feedback turned off on the
 # standby.
-reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
-	0, 1);
+reactive_slots_change_hfs_and_wait_for_xmins('row_removal_', 0, 1, 0);
 
 # This should trigger the conflict
 wait_until_vacuum_can_remove(
@@ -651,32 +646,26 @@ wait_until_vacuum_can_remove(
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class');
+check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('row_removal_', 'rows_removed');
 
-$handle =
-  make_slot_active($node_standby, 'row_removal_', 0, \$stdout, \$stderr);
-
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-	"can no longer get changes from replication slot \"row_removal_activeslot\""
-);
-
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a shared catalog table
 # Scenario 3: conflict due to row removal with hot_standby_feedback off.
 ##################################################
 
+# drop the logical slots used by previous tests
+drop_logical_slots('row_removal_', 0);
+
 # get the position to search from in the standby logfile
 $logstart = -s $node_standby->logfile;
 
 # One way to produce recovery conflict on a shared catalog table is to
 # create/drop a role and launch a vacuum on pg_authid with
 # hot_standby_feedback turned off on the standby.
-reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
-	'shared_row_removal_', 0, 1);
+reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', 0, 1, 0);
 
 # Trigger the conflict
 wait_until_vacuum_can_remove(
@@ -687,29 +676,23 @@ $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('shared_row_removal_', $logstart,
-	'with vacuum on pg_authid');
+	'with vacuum on pg_authid', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('shared_row_removal_', 'rows_removed');
 
-$handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout,
-	\$stderr);
-
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-	"can no longer get changes from replication slot \"shared_row_removal_activeslot\""
-);
-
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a non catalog table
 # Scenario 4: No conflict expected.
 ##################################################
 
+# drop the logical slots used by previous tests
+drop_logical_slots('shared_row_removal_', 0);
+
 # get the position to search from in the standby logfile
 $logstart = -s $node_standby->logfile;
 
-reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
-	'no_conflict_', 0, 1);
+reactive_slots_change_hfs_and_wait_for_xmins('no_conflict_', 0, 1, 1);
 
 # This should not trigger a conflict
 wait_until_vacuum_can_remove(
@@ -758,13 +741,15 @@ $node_standby->restart;
 # Scenario 5: conflict due to on-access pruning.
 ##################################################
 
+# drop the logical slots used by previous tests
+drop_logical_slots('no_conflict_', 1);
+
 # get the position to search from in the standby logfile
 $logstart = -s $node_standby->logfile;
 
 # One way to produce recovery conflict is to trigger an on-access pruning
 # on a relation marked as user_catalog_table.
-reactive_slots_change_hfs_and_wait_for_xmins('no_conflict_', 'pruning_', 0,
-	0);
+reactive_slots_change_hfs_and_wait_for_xmins('pruning_', 0, 0, 0);
 
 # This should trigger the conflict
 $node_primary->safe_psql('testdb',
@@ -779,17 +764,11 @@ $node_primary->safe_psql('testdb', qq[UPDATE prun SET s = 'E';]);
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('pruning_', $logstart, 'with on-access pruning');
+check_for_invalidation('pruning_', $logstart, 'with on-access pruning', 0);
 
 # Verify reason for conflict is 'rows_removed' in pg_replication_slots
 check_slots_conflict_reason('pruning_', 'rows_removed');
 
-$handle = make_slot_active($node_standby, 'pruning_', 0, \$stdout, \$stderr);
-
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-	"can no longer get changes from replication slot \"pruning_activeslot\"");
-
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
@@ -802,10 +783,10 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 $logstart = -s $node_standby->logfile;
 
 # drop the logical slots
-drop_logical_slots('pruning_');
+drop_logical_slots('pruning_', 0);
 
 # create the logical slots
-create_logical_slots($node_standby, 'wal_level_');
+create_logical_slots($node_standby, 'wal_level_', 1);
 
 $handle =
   make_slot_active($node_standby, 'wal_level_', 1, \$stdout, \$stderr);
@@ -823,7 +804,7 @@ $node_primary->restart;
 $node_primary->wait_for_replay_catchup($node_standby);
 
 # Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('wal_level_', $logstart, 'due to wal_level');
+check_for_invalidation('wal_level_', $logstart, 'due to wal_level', 1);
 
 # Verify reason for conflict is 'wal_level_insufficient' in pg_replication_slots
 check_slots_conflict_reason('wal_level_', 'wal_level_insufficient');
@@ -855,10 +836,10 @@ check_pg_recvlogical_stderr($handle,
 ##################################################
 
 # drop the logical slots
-drop_logical_slots('wal_level_');
+drop_logical_slots('wal_level_', 1);
 
 # create the logical slots
-create_logical_slots($node_standby, 'drop_db_');
+create_logical_slots($node_standby, 'drop_db_', 1);
 
 $handle = make_slot_active($node_standby, 'drop_db_', 1, \$stdout, \$stderr);
 
@@ -922,14 +903,14 @@ $node_cascading_standby->append_conf(
 $node_cascading_standby->start;
 
 # create the logical slots
-create_logical_slots($node_standby, 'promotion_');
+create_logical_slots($node_standby, 'promotion_', 1);
 
 # Wait for the cascading standby to catchup before creating the slots
 $node_standby->wait_for_replay_catchup($node_cascading_standby,
 	$node_primary);
 
 # create the logical slots on the cascading standby too
-create_logical_slots($node_cascading_standby, 'promotion_');
+create_logical_slots($node_cascading_standby, 'promotion_', 1);
 
 # Make slots actives
 $handle =
-- 
2.43.5

