From f67f9fea45c6a3319aa05c8c1feac133178fd9e6 Mon Sep 17 00:00:00 2001
From: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Date: Wed, 26 Mar 2025 14:19:50 +0900
Subject: [PATCH v4] Stabilize 035_standby_logical_decoding.pl by using the
 injection_points.

This test tries to invalidate slots on standby server, by running VACUUM on
primary and discarding needed tuples for slots. The problem is that
xl_running_xacts records are sometimes generated while testing, which advances
the catalog_xmin so that the invalidation might not happen in some cases.

The fix is to skip generating the record when a new injection point is
attached on the instance.

This failure can happen since logical decoding is allowed on the standby server.
But the interface of injection_points we use exists only on master, so we do
not backpatch.
---
 src/backend/storage/ipc/standby.c             | 16 ++++++
 .../t/035_standby_logical_decoding.pl         | 57 ++++++++++++++-----
 2 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 5acb4508f85..0e621e9996a 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -31,6 +31,7 @@
 #include "storage/sinvaladt.h"
 #include "storage/standby.h"
 #include "utils/hsearch.h"
+#include "utils/injection_point.h"
 #include "utils/ps_status.h"
 #include "utils/timeout.h"
 #include "utils/timestamp.h"
@@ -1287,6 +1288,21 @@ LogStandbySnapshot(void)
 
 	Assert(XLogStandbyInfoActive());
 
+	/* For testing slot invalidation */
+#ifdef USE_INJECTION_POINTS
+	if (IS_INJECTION_POINT_ATTACHED("log-running-xacts"))
+	{
+		/*
+	 * RUNNING_XACTS could move slots' xmin forward and cause random
+	 * failures in some tests. Skip generating the record to avoid it.
+		 *
+		 * XXX What value should we return here? Originally this returns the
+	 * inserted location of the RUNNING_XACTS record. Based on that, here
+		 * returns the latest insert location for now.
+		 */
+		return GetInsertRecPtr();
+	}
+#endif
 	/*
 	 * Get details of any AccessExclusiveLocks being held at the moment.
 	 */
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index c31cab06f1c..96dd0340e8d 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -10,6 +10,11 @@ use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
 use Test::More;
 
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
 my ($stdout, $stderr, $cascading_stdout, $cascading_stderr, $handle);
 
 my $node_primary = PostgreSQL::Test::Cluster->new('primary');
@@ -241,16 +246,19 @@ sub check_for_invalidation
 # VACUUM command, $sql the sql to launch before triggering the vacuum and
 # $to_vac the relation to vacuum.
 #
-# Note that pg_current_snapshot() is used to get the horizon.  It does
-# not generate a Transaction/COMMIT WAL record, decreasing the risk of
-# seeing a xl_running_xacts that would advance an active replication slot's
-# catalog_xmin.  Advancing the active replication slot's catalog_xmin
-# would break some tests that expect the active slot to conflict with
-# the catalog xmin horizon.
+# Note that injection_point is used to avoid seeing a xl_running_xacts that
+# would advance an active replication slot's catalog_xmin. Advancing the active
+# replication slot's catalog_xmin would break some tests that expect the active
+# slot to conflict with the catalog xmin horizon.
 sub wait_until_vacuum_can_remove
 {
 	my ($vac_option, $sql, $to_vac) = @_;
 
+	# Note that from this point the checkpointer and bgwriter will skip writing
+	# xl_running_xacts record.
+	$node_primary->safe_psql('testdb',
+		"SELECT injection_points_attach('log-running-xacts', 'error');");
+
 	# Get the current xid horizon,
 	my $xid_horizon = $node_primary->safe_psql('testdb',
 		qq[select pg_snapshot_xmin(pg_current_snapshot());]);
@@ -268,6 +276,12 @@ sub wait_until_vacuum_can_remove
 	$node_primary->safe_psql(
 		'testdb', qq[VACUUM $vac_option verbose $to_vac;
 										  INSERT INTO flush_wal DEFAULT VALUES;]);
+
+	$node_primary->wait_for_replay_catchup($node_standby);
+
+	# Resume generating the xl_running_xacts record
+	$node_primary->safe_psql('testdb',
+		"SELECT injection_points_detach('log-running-xacts');");
 }
 
 ########################
@@ -285,6 +299,14 @@ autovacuum = off
 $node_primary->dump_info;
 $node_primary->start;
 
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node_primary->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+
 $node_primary->psql('postgres', q[CREATE DATABASE testdb]);
 
 $node_primary->safe_psql('testdb',
@@ -528,6 +550,9 @@ is($result, qq(10), 'check replicated inserts after subscription on standby');
 $node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub");
 $node_subscriber->stop;
 
+# Create the injection_points extension
+$node_primary->safe_psql('testdb', 'CREATE EXTENSION injection_points;');
+
 ##################################################
 # Recovery conflict: Invalidate conflicting slots, including in-use slots
 # Scenario 1: hot_standby_feedback off and vacuum FULL
@@ -557,8 +582,6 @@ wait_until_vacuum_can_remove(
 	'full', 'CREATE TABLE conflict_test(x integer, y text);
 								 DROP TABLE conflict_test;', 'pg_class');
 
-$node_primary->wait_for_replay_catchup($node_standby);
-
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class');
 
@@ -656,8 +679,6 @@ wait_until_vacuum_can_remove(
 	'', 'CREATE TABLE conflict_test(x integer, y text);
 							 DROP TABLE conflict_test;', 'pg_class');
 
-$node_primary->wait_for_replay_catchup($node_standby);
-
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class');
 
@@ -690,8 +711,6 @@ wait_until_vacuum_can_remove(
 	'', 'CREATE ROLE create_trash;
 							 DROP ROLE create_trash;', 'pg_authid');
 
-$node_primary->wait_for_replay_catchup($node_standby);
-
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('shared_row_removal_', $logstart,
 	'with vacuum on pg_authid');
@@ -724,8 +743,6 @@ wait_until_vacuum_can_remove(
 							 INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
 							 UPDATE conflict_test set x=1, y=1;', 'conflict_test');
 
-$node_primary->wait_for_replay_catchup($node_standby);
-
 # message should not be issued
 ok( !$node_standby->log_contains(
 		"invalidating obsolete slot \"no_conflict_inactiveslot\"", $logstart),
@@ -773,6 +790,14 @@ $logstart = -s $node_standby->logfile;
 reactive_slots_change_hfs_and_wait_for_xmins('no_conflict_', 'pruning_', 0,
 	0);
 
+# Injection_point is used to avoid seeing an xl_running_xacts even here. In
+# scenario 5, we verify the case that the backend process detects the page has
+# enough tuples; thus, page pruning happens. If the record is generated just
+# before doing on-access pruning, the catalog_xmin of the active slot would be
+# updated; hence, the conflict would not occur.
+$node_primary->safe_psql('testdb',
+	"SELECT injection_points_attach('log-running-xacts', 'error');");
+
 # This should trigger the conflict
 $node_primary->safe_psql('testdb',
 	qq[CREATE TABLE prun(id integer, s char(2000)) WITH (fillfactor = 75, user_catalog_table = true);]
@@ -785,6 +810,10 @@ $node_primary->safe_psql('testdb', qq[UPDATE prun SET s = 'E';]);
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
+# Resume generating the xl_running_xacts record
+$node_primary->safe_psql('testdb',
+	"SELECT injection_points_detach('log-running-xacts');");
+
 # Check invalidation in the logfile and in pg_stat_database_conflicts
 check_for_invalidation('pruning_', $logstart, 'with on-access pruning');
 
-- 
2.43.5

