From d1874ef6a51b344a21aa614a09b23052c5a01950 Mon Sep 17 00:00:00 2001
From: Joao Foltran <joao@foltrandba.com>
Date: Mon, 15 Dec 2025 15:21:35 -0300
Subject: [PATCH] Allow physical replication slots to recover after
 invalidation

Commit f41d8468 introduced an ERROR when trying to acquire an
invalidated replication slot. While this is correct for logical
replication slots (which cannot safely recover after invalidation), it
breaks recovery workflows for physical replication slots.

Physical replication slots can recover if the required WAL becomes
available through restore_command or other archive recovery mechanisms.
In PostgreSQL 17 and earlier, invalidated physical slots could be
reacquired and streaming would resume once the standby caught up via
archive recovery.
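
For example, a standby configured with

    primary_slot_name = 'test_slot'
    restore_command = 'cp /path/to/archive/%f %p'

(paths illustrative; the added regression test uses an equivalent setup)
could fall behind, have its slot invalidated, and still reconnect once
archive recovery had replayed past the removed WAL.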

This commit restores that behavior by:
- Allowing physical slots to be acquired when invalidated (a LOG message
  is emitted instead)
- Keeping the ERROR for logical slots (which cannot safely recover)

This maintains safety guarantees for logical replication while allowing
physical replication to recover in common operational scenarios.
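
With this change, a walsender acquiring an invalidated physical slot
reports, for example:

    LOG:  acquiring invalidated physical replication slot "test_slot"
    DETAIL:  This slot has been invalidated due to "wal_removed" but may recover if required WAL becomes available.

on the primary, instead of rejecting the connection with:

    ERROR:  can no longer access replication slot "test_slot"
    DETAIL:  This replication slot has been invalidated due to "wal_removed".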

Reported-by: Joao Foltran <joao@foltrandba.com>
---
 src/backend/replication/slot.c                |  22 ++-
 .../t/050_physical_slot_invalidation.pl       | 179 ++++++++++++++++++
 2 files changed, 195 insertions(+), 6 deletions(-)
 create mode 100644 src/test/recovery/t/050_physical_slot_invalidation.pl
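
(Note to reviewers, not part of the commit message: the new TAP test can
be run on its own from a configured source tree with, e.g.,

    cd src/test/recovery
    make check PROVE_TESTS='t/050_physical_slot_invalidation.pl'

assuming the tree was configured with --enable-tap-tests.)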

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 682eccd116c..d415e1fe170 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -723,13 +723,23 @@ retry:
 	 * the possible race condition with the checkpointer that can otherwise
 	 * invalidate the slot immediately after the check.
 	 */
+
 	if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE)
-		ereport(ERROR,
-				errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				errmsg("can no longer access replication slot \"%s\"",
-					   NameStr(s->data.name)),
-				errdetail("This replication slot has been invalidated due to \"%s\".",
-						  GetSlotInvalidationCauseName(s->data.invalidated)));
+	{
+		if (SlotIsLogical(s))
+			ereport(ERROR,
+					errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					errmsg("can no longer access replication slot \"%s\"",
+						   NameStr(s->data.name)),
+					errdetail("This replication slot has been invalidated due to \"%s\".",
+							  GetSlotInvalidationCauseName(s->data.invalidated)));
+		else
+			ereport(LOG,
+					errmsg("acquiring invalidated physical replication slot \"%s\"",
+						   NameStr(s->data.name)),
+					errdetail("This slot has been invalidated due to \"%s\" but may recover if required WAL becomes available.",
+							  GetSlotInvalidationCauseName(s->data.invalidated)));
+	}
 
 	/* Let everybody know we've modified this slot */
 	ConditionVariableBroadcast(&s->active_cv);
diff --git a/src/test/recovery/t/050_physical_slot_invalidation.pl b/src/test/recovery/t/050_physical_slot_invalidation.pl
new file mode 100644
index 00000000000..a567a309a07
--- /dev/null
+++ b/src/test/recovery/t/050_physical_slot_invalidation.pl
@@ -0,0 +1,179 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test physical replication slot recovery after invalidation.
+#
+# This test verifies that physical replication slots can be reacquired
+# after being invalidated due to max_slot_wal_keep_size, as long as the
+# required WAL is available through archive recovery.
+#
+# Prior to this fix, PostgreSQL 18 raised an ERROR when trying to acquire
+# an invalidated physical slot, breaking recovery workflows that worked in
+# PostgreSQL 17.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Create primary node with archive mode
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+
+# Configure primary with small max_slot_wal_keep_size to trigger invalidation
+$node_primary->append_conf('postgresql.conf', qq(
+max_wal_senders = 10
+max_replication_slots = 10
+max_slot_wal_keep_size = 64MB
+wal_level = replica
+min_wal_size = 32MB
+max_wal_size = 64MB
+));
+
+$node_primary->start;
+
+# Create a physical replication slot
+$node_primary->safe_psql('postgres',
+	"SELECT pg_create_physical_replication_slot('test_slot');");
+
+# Check initial slot state
+my $slot_info = $node_primary->safe_psql('postgres',
+	"SELECT slot_name, slot_type, active, restart_lsn, invalidation_reason FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+note("Initial slot state:\n$slot_info");
+
+# Create standby using the slot
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+$node_standby->append_conf('postgresql.conf', qq(
+primary_slot_name = 'test_slot'
+));
+
+# Configure restore_command to allow recovery from archive
+my $archive_dir = $node_primary->archive_dir;
+$node_standby->append_conf('postgresql.conf', qq(
+restore_command = 'cp $archive_dir/%f %p'
+));
+
+$node_standby->start;
+
+# Wait for standby to catch up
+$node_primary->wait_for_catchup($node_standby, 'replay');
+note("Standby is caught up and streaming");
+
+# Check slot state while standby is connected
+$slot_info = $node_primary->safe_psql('postgres',
+	"SELECT slot_name, slot_type, active, restart_lsn, invalidation_reason FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+note("Slot state while standby connected:\n$slot_info");
+
+# Stop the standby to let it fall behind
+$node_standby->stop;
+note("Standby stopped. Generating WAL to invalidate slot...");
+
+# Generate enough WAL to exceed max_slot_wal_keep_size (64MB)
+for my $i (1..10)
+{
+	$node_primary->safe_psql('postgres', qq(
+		CREATE TABLE test_table_$i (id int, data text);
+		INSERT INTO test_table_$i
+		SELECT generate_series(1, 100000),
+		       repeat('x', 1000);
+		DROP TABLE test_table_$i;
+	));
+	$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+}
+
+# Force a checkpoint to trigger slot invalidation, then wait for it
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+$node_primary->poll_query_until('postgres', "SELECT invalidation_reason IS NOT NULL FROM pg_replication_slots WHERE slot_name = 'test_slot'");
+
+# Verify slot is invalidated and get full slot information
+my $invalidation_reason = $node_primary->safe_psql('postgres',
+	"SELECT invalidation_reason FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+
+is($invalidation_reason, 'wal_removed',
+	'Slot should be invalidated due to wal_removed');
+
+# Get detailed slot information after invalidation
+$slot_info = $node_primary->safe_psql('postgres',
+	"SELECT slot_name, slot_type, active, restart_lsn, invalidation_reason, wal_status FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+note("Slot state after invalidation:\n$slot_info");
+
+note("Slot invalidated. Attempting to restart standby...");
+
+# Restart the standby - it should be able to reconnect despite invalidation
+# because the required WAL is available via archive recovery
+my $standby_log_offset = -s $node_standby->logfile;
+my $primary_log_offset = -s $node_primary->logfile;
+
+$node_standby->start;
+
+# Wait for the walreceiver to reconnect and resume streaming
+$node_standby->poll_query_until('postgres', "SELECT count(*) > 0 FROM pg_stat_wal_receiver");
+
+# Check both standby and primary logs
+my $standby_log = slurp_file($node_standby->logfile, $standby_log_offset);
+my $primary_log = slurp_file($node_primary->logfile, $primary_log_offset);
+
+# Check if streaming connection was established
+my $wal_receiver_count = $node_standby->safe_psql('postgres',
+	"SELECT count(*) FROM pg_stat_wal_receiver;");
+
+# Get slot state after reconnection attempt
+$slot_info = $node_primary->safe_psql('postgres',
+	"SELECT slot_name, slot_type, active, restart_lsn, invalidation_reason, wal_status FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+note("Slot state after reconnection attempt:\n$slot_info");
+
+# Parse slot information
+my $slot_active = $node_primary->safe_psql('postgres',
+	"SELECT active FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+
+# With the fix, physical slots should be able to reconnect
+# The LOG message appears on the primary (walsender) side
+like($primary_log,
+	qr/acquiring invalidated physical replication slot/,
+	'LOG message about acquiring invalidated physical slot on primary');
+
+unlike($standby_log,
+	qr/FATAL.*cannot acquire invalidated replication slot|FATAL.*can no longer access replication slot/,
+	'No FATAL error on standby when acquiring invalidated physical slot');
+
+is($wal_receiver_count, '1',
+	'Standby successfully establishes streaming replication');
+
+# Verify streaming is actually working
+like($standby_log,
+	qr/started streaming WAL from primary/,
+	'Standby successfully reconnected and resumed streaming');
+
+is($slot_active, 't',
+	'Slot becomes active again');
+
+# Wait for standby to catch up
+$node_primary->wait_for_catchup($node_standby, 'replay');
+
+# Get final slot state after catchup
+$slot_info = $node_primary->safe_psql('postgres',
+	"SELECT slot_name, slot_type, active, restart_lsn, invalidation_reason, wal_status FROM pg_replication_slots WHERE slot_name = 'test_slot';");
+note("Final slot state after catchup:\n$slot_info");
+
+# Verify the standby's replay position reaches the primary's current LSN
+my $primary_lsn = $node_primary->safe_psql('postgres',
+	"SELECT pg_current_wal_lsn();");
+my $caught_up = $node_standby->poll_query_until('postgres',
+	"SELECT pg_last_wal_replay_lsn() >= '$primary_lsn'::pg_lsn");
+
+ok($caught_up,
+	'Standby caught up to primary after recovering from invalidation');
+
+note("Physical slot successfully recovered after invalidation");
+
+# Verify standby is in recovery mode
+my $is_in_recovery = $node_standby->safe_psql('postgres',
+	"SELECT pg_is_in_recovery();");
+is($is_in_recovery, 't', 'Standby remains in recovery mode');
+
+done_testing();
-- 
2.50.1 (Apple Git-155)

