Hi,

On Tue, Feb 20, 2024 at 04:03:53PM +0000, Bertrand Drouvot wrote:
> Hi,
> 
> On Tue, Feb 20, 2024 at 02:33:44PM +0900, Michael Paquier wrote:
> > On Tue, Feb 20, 2024 at 08:51:17AM +0900, Michael Paquier wrote:
> > > Prefixing these with "initial_" is fine, IMO.  That shows the
> > > intention that these come from the slot's data before doing the
> > > termination.  So I'm OK with what's been proposed in v3.
> > 
> > I was looking at that a second time, and just concluded that this is
> > OK, so I've applied that down to 16, wordsmithing a bit the comments.
> 
> Thanks!
> FWIW, I've started to write a POC regarding the test we mentioned up-thread.
> 
> The POC test is based on what has been submitted by Michael in [1]. The POC 
> test
> seems to work fine and it seems that nothing more is needed in [1] (at some 
> point
> I thought I would need the ability to wake up multiple "wait" injection points
> in sequence but that was not necessary).
> 
> I'll polish and propose my POC test once [1] is pushed (unless you're curious
> about it before).

Though [1] mentioned up-thread is not pushed yet, I'm Sharing the POC patch now
(see the attached).

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
>From 05a4eace2002ad5387b89eb5c133a984de3f8c2d Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot...@gmail.com>
Date: Tue, 20 Feb 2024 08:11:47 +0000
Subject: [PATCH v1] Add regression test for 818fefd8fd

---
 src/backend/replication/slot.c                |   8 ++
 .../t/035_standby_logical_decoding.pl         | 103 +++++++++++++++++-
 2 files changed, 109 insertions(+), 2 deletions(-)
   7.3% src/backend/replication/
  92.6% src/test/recovery/t/

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 0f173f63a2..a7aeb9ca3e 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -53,6 +53,7 @@
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/builtins.h"
+#include "utils/injection_point.h"
 
 /*
  * Replication slot on-disk data structure.
@@ -1647,6 +1648,13 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
 				ReportSlotInvalidation(conflict, true, active_pid,
 									   slotname, restart_lsn,
 									   oldestLSN, snapshotConflictHorizon);
+				/*
+				 * This injection needs to be before the kill to ensure that
+				 * the slot is still "active". It also has to be after
+				 * ReportSlotInvalidation() to ensure the invalidation message
+				 * is reported.
+				 */
+				INJECTION_POINT("TerminateProcessHoldingSlot");
 
 				if (MyBackendType == B_STARTUP)
 					(void) SendProcSignal(active_pid,
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 0710bccc17..3a2505bf5e 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -733,14 +733,113 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 
 ##################################################
 # Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 6: incorrect wal_level on primary.
+# Scenario 6: Ensure race condition described in 818fefd8fd is fixed.
+##################################################
+SKIP:
+{
+    skip "Injection points not supported by this build", 1
+      if ($ENV{enable_injection_points} ne 'yes');
+
+	# Get the position to search from in the standby logfile
+	$logstart = -s $node_standby->logfile;
+
+	# Drop the slots, re-create them, change hot_standby_feedback,
+	# check xmin and catalog_xmin values, make slot active and reset stat.
+	reactive_slots_change_hfs_and_wait_for_xmins('pruning_', 'injection_', 0, 1);
+
+	# Create the injection_points extension
+	$node_primary->safe_psql('testdb', 'CREATE EXTENSION injection_points;');
+
+	# Wait until the extension has been created on the standby
+	$node_primary->wait_for_replay_catchup($node_standby);
+
+	# Attach the injection point
+	$node_standby->safe_psql('testdb',
+	       "SELECT injection_points_attach('TerminateProcessHoldingSlot', 'wait');");
+
+	# Trigger a conflict and insert an XLOG_RUNNING_XACTS before triggering
+	# the vacuum.
+	$node_primary->safe_psql('testdb', qq[CREATE TABLE injection_test(x integer);
+										  DROP TABLE injection_test;
+										  SELECT pg_log_standby_snapshot();]);
+
+	# Now launch the vacuum
+	wait_until_vacuum_can_remove('', 'CREATE TABLE injection_test2(x integer);', 'pg_class');
+
+	# Note: $node_primary->wait_for_replay_catchup($node_standby) would be
+	# hanging here due to the injection point, so check the log instead.
+
+	my $terminated = 0;
+	for (my $i = 0; $i < 10 * $PostgreSQL::Test::Utils::timeout_default; $i++)
+	{
+		if ($node_standby->log_contains(
+			'terminating process .* to release replication slot \"injection_activeslot\"', $logstart))
+		{
+			$terminated = 1;
+			last;
+		}
+		usleep(100_000);
+	}
+	ok($terminated, 'terminating process holding the active slot is logged with injection point');
+
+	# Note that at this point the process holding the active slot is about to
+	# be killed: the termination message has been emitted and the process will
+	# be killed once we stop waiting on the injection point.
+
+	# Extract xid_horizon from the logfile
+	my $log_contents = slurp_file($node_standby->logfile, $logstart);
+	(my $xid_horizon) = $log_contents =~ m/The slot conflicted with xid horizon ([0-9]*)./
+		or die "could not get xid horizon";
+
+	# Wait until catalog_xmin advances after the xid_horizon. That means that
+	# no conflict would be reported without the fix in 818fefd8fd.
+	$node_standby->poll_query_until('testdb',
+		"SELECT (SELECT catalog_xmin::text::int - $xid_horizon from pg_catalog.pg_replication_slots where slot_name = 'injection_activeslot') > 0"
+	) or die "catalog_xmin did not advance";
+
+	# Get the position to search from in the standby logfile
+	$logstart = -s $node_standby->logfile;
+
+	# Wakeup the injection point
+	$node_standby->safe_psql('testdb', "SELECT injection_points_wakeup('TerminateProcessHoldingSlot');");
+
+	# Wait for the standby to catchup
+	$node_primary->wait_for_replay_catchup($node_standby);
+
+	# Check invalidation in the logfile for the active slot
+	ok( $node_standby->log_contains(
+			"invalidating obsolete replication slot \"injection_activeslot\"",
+			$logstart),
+		"activeslot slot invalidation is logged with injection point");
+
+	# Verify conflict_reason is 'rows_removed' in pg_replication_slots
+	check_slots_conflict_reason('injection_', 'rows_removed');
+
+	# Detach from the injection point
+	$node_standby->safe_psql('testdb',
+	       "SELECT injection_points_detach('TerminateProcessHoldingSlot');");
+
+	# Turn hot_standby_feedback back on
+	change_hot_standby_feedback_and_wait_for_xmins(1, 1);
+}
+
+##################################################
+# Recovery conflict: Invalidate conflicting slots, including in-use slots
+# Scenario 7: incorrect wal_level on primary.
 ##################################################
 
 # get the position to search from in the standby logfile
 $logstart = -s $node_standby->logfile;
 
 # drop the logical slots
-drop_logical_slots('pruning_');
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	drop_logical_slots('pruning_');
+}
+else
+{
+	drop_logical_slots('injection_');
+}
 
 # create the logical slots
 create_logical_slots($node_standby, 'wal_level_');
-- 
2.34.1

Reply via email to