From ba8f606ec202ecd1f9bedee5624b9ba0ecec0e9c Mon Sep 17 00:00:00 2001
From: Shveta Malik <shveta.malik@gmail.com>
Date: Mon, 26 May 2025 11:39:17 +0530
Subject: [PATCH v5] Improve log messages and docs for slotsync

---
 doc/src/sgml/func.sgml                     |  6 +-
 doc/src/sgml/logicaldecoding.sgml          | 86 +++++++++++++++++++++-
 src/backend/replication/logical/slotsync.c |  6 +-
 3 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index c67688cbf5f..8d7d9a2f3e8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -29698,7 +29698,7 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
       </row>
 
       <row>
-       <entry role="func_table_entry"><para role="func_signature">
+       <entry id="pg-logical-slot-get-binary-changes" role="func_table_entry"><para role="func_signature">
         <indexterm>
          <primary>pg_logical_slot_get_binary_changes</primary>
         </indexterm>
@@ -29970,7 +29970,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
         standby server. Temporary synced slots, if any, cannot be used for
         logical decoding and must be dropped after promotion. See
         <xref linkend="logicaldecoding-replication-slots-synchronization"/> for details.
-        Note that this function cannot be executed if
+        Note that this function is primarily intended for testing and
+        debugging purposes and should be used with caution. Additionally,
+        this function cannot be executed if
         <link linkend="guc-sync-replication-slots"><varname>
         sync_replication_slots</varname></link> is enabled and the slotsync
         worker is already running to perform the synchronization of slots.
diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml
index dd9e83b08ea..8a227bc585c 100644
--- a/doc/src/sgml/logicaldecoding.sgml
+++ b/doc/src/sgml/logicaldecoding.sgml
@@ -370,10 +370,10 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU
      <function>pg_create_logical_replication_slot</function></link>, or by
      using the <link linkend="sql-createsubscription-params-with-failover">
      <literal>failover</literal></link> option of
-     <command>CREATE SUBSCRIPTION</command> during slot creation, and then calling
-     <link linkend="pg-sync-replication-slots">
-     <function>pg_sync_replication_slots</function></link>
-     on the standby. By setting <link linkend="guc-sync-replication-slots">
+     <command>CREATE SUBSCRIPTION</command> during slot creation.
+     Additionally, enabling <link linkend="guc-sync-replication-slots">
+     <varname>sync_replication_slots</varname></link> on the standby
+     is required. By enabling <link linkend="guc-sync-replication-slots">
      <varname>sync_replication_slots</varname></link>
      on the standby, the failover slots can be synchronized periodically in
      the slotsync worker. For the synchronization to work, it is mandatory to
@@ -398,6 +398,84 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU
      receiving the WAL up to the latest flushed position on the primary server.
     </para>
 
+    <note>
+     <para>
+      While enabling <link linkend="guc-sync-replication-slots">
+      <varname>sync_replication_slots</varname></link> allows for automatic
+      periodic synchronization of failover slots, they can also be manually
+      synchronized using the <link linkend="pg-sync-replication-slots">
+      <function>pg_sync_replication_slots</function></link> function on the standby.
+      However, this function is primarily intended for testing and debugging and
+      should be used with caution. Unlike automatic synchronization, it does not
+      include cyclic retries, making it more prone to synchronization failures,
+      particularly during initial sync scenarios where the required WAL files
+      or catalog rows for the slot may have already been removed or are at risk
+      of being removed on the standby. In contrast, automatic synchronization
+      via <varname>sync_replication_slots</varname> provides continuous slot
+      updates, enabling seamless failover and supporting high availability.
+      Therefore, it is the recommended method for synchronizing slots.
+     </para>
+    </note>
+
+    <para>
+      When slot synchronization is configured as recommended,
+      and the initial synchronization is performed either automatically or
+      manually via <function>pg_sync_replication_slots</function>, the standby can persist the
+      synchronized slot only if the following condition is met: The logical
+      replication slot on the primary must retain WALs and system catalog
+      rows that are still available on the standby. This ensures data
+      integrity and allows logical replication to continue smoothly after
+      promotion.
+      If the required WALs or catalog rows have already been purged from the
+      standby, the slot will not be persisted to avoid data loss. In such
+      cases, the following log message may appear:
+<programlisting>
+     LOG:  could not synchronize replication slot "failover_slot"
+     DETAIL:  Synchronization could lead to data loss as the remote slot needs WAL at LSN 0/3003F28 and catalog xmin 754, but the standby has LSN 0/3003F28 and catalog xmin 756.
+</programlisting>
+     If the logical replication slot is actively used by a consumer, no
+     manual intervention is needed; the slot will advance automatically,
+     and synchronization will resume in the next cycle. However, if no
+     consumer is configured, it is advisable to manually advance the slot
+     on the primary using <link linkend="pg-logical-slot-get-changes">
+     <function>pg_logical_slot_get_changes</function></link> or
+     <link linkend="pg-logical-slot-get-binary-changes">
+     <function>pg_logical_slot_get_binary_changes</function></link>,
+     allowing synchronization to proceed.
+    </para>
+
+    <para>
+     When slot-synchronization setup is done as recommended, and
+     slot-synchronization is performed the very first time either automatically
+     or by <link linkend="pg-sync-replication-slots">
+     <function>pg_sync_replication_slots</function></link>,
+     then for the synchronized slot to be created and persisted on the standby,
+     one condition must be met. The logical replication slot on the primary
+     must reach a state where the WALs and system catalog rows retained by
+     the slot are also present on the corresponding standby server. This is
+     needed to prevent any data loss and to allow logical replication to continue
+     seamlessly through the synchronized slot if needed after promotion.
+     If the WALs and system catalog rows retained by the slot on the primary have
+     already been purged from the standby server, and synchronization is attempted
+     for the first time, then, to prevent the data loss described above, persistence
+     and synchronization of the newly created slot will be skipped, and the following
+     log message may appear on the standby:
+<programlisting>
+     LOG:  could not synchronize replication slot "failover_slot"
+     DETAIL:  Synchronization could lead to data loss as the remote slot needs WAL at LSN 0/3003F28 and catalog xmin 754, but the standby has LSN 0/3003F28 and catalog xmin 756.
+</programlisting>
+     If the logical replication slot is actively used by a consumer, no further
+     manual action is needed by the user, as the slot on the primary will be advanced
+     automatically, and synchronization will proceed in the next cycle. However,
+     if no logical replication consumer is set up yet, to advance the slot, it
+     is recommended to manually run the <link linkend="pg-logical-slot-get-changes">
+     <function>pg_logical_slot_get_changes</function></link> or
+     <link linkend="pg-logical-slot-get-binary-changes">
+     <function>pg_logical_slot_get_binary_changes</function></link> on the primary
+     slot and allow synchronization to proceed.
+    </para>
+
+
     <para>
      The ability to resume logical replication after failover depends upon the
      <link linkend="view-pg-replication-slots">pg_replication_slots</link>.<structfield>synced</structfield>
diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index 656e66e0ae0..f1dcbebfa1a 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -211,9 +211,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		 * impact the users, so we used DEBUG1 level to log the message.
 		 */
 		ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1,
-				errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot",
+				errmsg("could not synchronize replication slot \"%s\"",
 					   remote_slot->name),
-				errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.",
+				errdetail("Synchronization could lead to data loss as the remote slot needs WAL at LSN %X/%X and catalog xmin %u, but the standby has LSN %X/%X and catalog xmin %u.",
 						  LSN_FORMAT_ARGS(remote_slot->restart_lsn),
 						  remote_slot->catalog_xmin,
 						  LSN_FORMAT_ARGS(slot->data.restart_lsn),
@@ -593,7 +593,7 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
 	{
 		ereport(LOG,
 				errmsg("could not synchronize replication slot \"%s\"", remote_slot->name),
-				errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.",
+				errdetail("Synchronization could lead to data loss as standby could not build a consistent snapshot to decode WALs at LSN %X/%X.",
 						  LSN_FORMAT_ARGS(slot->data.restart_lsn)));
 
 		return false;
-- 
2.34.1

