From f068d4e59354d401da09126431ab03f21321782a Mon Sep 17 00:00:00 2001
From: Zhijie Hou <houzj.fnst@cn.fujitsu.com>
Date: Mon, 28 Apr 2025 14:52:00 +0800
Subject: [PATCH v2] Fix assertion failure when decoding synced two-phase
 enabled slots.

Currently, slot synchronization skips updating the confirmed_lsn if it
detects that the catalog_xmin or restart_lsn of the synced slot has already
surpassed those of the remote slot on the primary. This behavior poses a
problem when two-phase commit is enabled on the remote slot. If two_phase_at
is synced without also syncing the latest confirmed_lsn, transactions
prepared between the old confirmed_lsn and two_phase_at may be unexpectedly
decoded and sent to subscribers following a promotion.

To fix this, when the catalog_xmin or restart_lsn of the synced slot is ahead
of the remote slot, we postpone syncing the slot configuration, including
two_phase_at, until the remote slot's restart_lsn and catalog_xmin are ahead.
---
 src/backend/replication/logical/slotsync.c | 36 ++++++++++++++--------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index e22d41891e6..03478cfb94c 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -196,14 +196,14 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		 * restart_lsn or the initial xmin_horizon computed for the local slot
 		 * is ahead of the remote slot.
 		 *
-		 * If the slot is persistent, restart_lsn of the synced slot could
-		 * still be ahead of the remote slot. Since we use slot advance
-		 * functionality to keep snapbuild/slot updated, it is possible that
-		 * the restart_lsn is advanced to a later position than it has on the
-		 * primary. This can happen when slot advancing machinery finds
-		 * running xacts record after reaching the consistent state at a later
-		 * point than the primary where it serializes the snapshot and updates
-		 * the restart_lsn.
+		 * If the slot is persistent, both restart_lsn and catalog_xmin of the
+		 * synced slot could still be ahead of the remote slot. Since we use
+		 * slot advance functionality to keep snapbuild/slot updated, it is
+		 * possible that the restart_lsn and catalog_xmin are advanced to a
+		 * later position than they have on the primary. This can happen when
+		 * slot advancing machinery finds running xacts record after reaching
+		 * the consistent state at a later point than the primary where it
+		 * serializes the snapshot and updates the restart_lsn.
 		 *
 		 * We LOG the message if the slot is temporary as it can help the user
 		 * to understand why the slot is not sync-ready. In the case of a
@@ -221,16 +221,28 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 
 		if (remote_slot_precedes)
 			*remote_slot_precedes = true;
+
+		/*
+		 * Return immediately without updating the configuration. This is
+		 * necessary when two-phase commit is enabled on the remote slot.
+		 * Syncing only two_phase_at, without also syncing the latest
+		 * confirmed_lsn, might lead to transactions between the old
+		 * confirmed_lsn and two_phase_at being unexpectedly decoded and sent
+		 * to the subscriber. Therefore, we postpone syncing the latest
+		 * confirmed_lsn and two_phase_at until the remote slot's restart_lsn
+		 * and catalog_xmin are ahead.
+		 */
+		return;
 	}
 
 	/*
 	 * Attempt to sync LSNs and xmins only if remote slot is ahead of local
 	 * slot.
 	 */
-	else if (remote_slot->confirmed_lsn > slot->data.confirmed_flush ||
-			 remote_slot->restart_lsn > slot->data.restart_lsn ||
-			 TransactionIdFollows(remote_slot->catalog_xmin,
-								  slot->data.catalog_xmin))
+	if (remote_slot->confirmed_lsn > slot->data.confirmed_flush ||
+		remote_slot->restart_lsn > slot->data.restart_lsn ||
+		TransactionIdFollows(remote_slot->catalog_xmin,
+							 slot->data.catalog_xmin))
 	{
 		/*
 		 * We can't directly copy the remote slot's LSN or xmin unless there
-- 
2.30.0.windows.2

