From 1d9bc77d86afa5a05dc3cc20845bab379465f9f1 Mon Sep 17 00:00:00 2001
From: Shveta Malik <shveta.malik@gmail.com>
Date: Mon, 26 May 2025 11:39:17 +0530
Subject: [PATCH v3] Improve log messages and docs for slotsync

---
 doc/src/sgml/func.sgml                     |  6 ++-
 doc/src/sgml/logicaldecoding.sgml          | 53 ++++++++++++++++++++--
 src/backend/replication/logical/slotsync.c |  8 ++--
 3 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index c67688cbf5f..8d7d9a2f3e8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -29698,7 +29698,7 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
       </row>
 
       <row>
-       <entry role="func_table_entry"><para role="func_signature">
+       <entry id="pg-logical-slot-get-binary-changes" role="func_table_entry"><para role="func_signature">
         <indexterm>
          <primary>pg_logical_slot_get_binary_changes</primary>
         </indexterm>
@@ -29970,7 +29970,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
         standby server. Temporary synced slots, if any, cannot be used for
         logical decoding and must be dropped after promotion. See
         <xref linkend="logicaldecoding-replication-slots-synchronization"/> for details.
-        Note that this function cannot be executed if
+        Note that this function is primarily intended for testing and
+        debugging purposes and should be used with caution. Additionally,
+        this function cannot be executed if
         <link linkend="guc-sync-replication-slots"><varname>
         sync_replication_slots</varname></link> is enabled and the slotsync
         worker is already running to perform the synchronization of slots.
diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml
index 3f2bcd45a1e..a4379021d4a 100644
--- a/doc/src/sgml/logicaldecoding.sgml
+++ b/doc/src/sgml/logicaldecoding.sgml
@@ -370,10 +370,10 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU
      <function>pg_create_logical_replication_slot</function></link>, or by
      using the <link linkend="sql-createsubscription-params-with-failover">
      <literal>failover</literal></link> option of
-     <command>CREATE SUBSCRIPTION</command> during slot creation, and then calling
-     <link linkend="pg-sync-replication-slots">
-     <function>pg_sync_replication_slots</function></link>
-     on the standby. By setting <link linkend="guc-sync-replication-slots">
+     <command>CREATE SUBSCRIPTION</command> during slot creation.
+     Additionally, enabling <link linkend="guc-sync-replication-slots">
+     <varname>sync_replication_slots</varname></link> on the standby
+     is required. By enabling <link linkend="guc-sync-replication-slots">
      <varname>sync_replication_slots</varname></link>
      on the standby, the failover slots can be synchronized periodically in
      the slotsync worker. For the synchronization to work, it is mandatory to
@@ -398,6 +398,51 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU
      receiving the WAL up to the latest flushed position on the primary server.
     </para>
 
+    <para>
+     Apart from enabling <link linkend="guc-sync-replication-slots">
+     <varname>sync_replication_slots</varname></link> to synchronize slots
+     periodically, failover slots can be manually synchronized by invoking
+     <link linkend="pg-sync-replication-slots">
+     <function>pg_sync_replication_slots</function></link> on the standby.
+     However, this function is primarily intended for testing and debugging
+     purposes and should be used with caution. The recommended approach to
+     synchronize slots is by enabling <link linkend="guc-sync-replication-slots">
+     <varname>sync_replication_slots</varname></link> on the standby, as it
+     ensures continuous and automatic synchronization of replication slots,
+     facilitating seamless failover and high availability.
+    </para>
+
+    <para>
+     When slot-synchronization setup is done as recommended, and
+     slot-synchronization is performed the very first time either automatically
+     or by <link linkend="pg-sync-replication-slots">
+     <function>pg_sync_replication_slots</function></link>,
+     then for the synchronized slot to be created and persisted on the standby, one
+     condition must be met. The logical replication slot on the primary must have
+     advanced to a catalog change position (catalog_xmin) and WAL LSN (restart_lsn)
+     for which sufficient data is retained on the corresponding standby server. This is
+     needed to prevent any data loss and to allow logical replication to continue
+     seamlessly through the synchronized slot if needed after promotion.
+     If the primary slot is still lagging behind when synchronization is attempted
+     for the first time, then, to prevent the data loss described above, persistence
+     and synchronization of the newly created slot will be skipped, and the following
+     log message may appear on the standby:
+<programlisting>
+     LOG:  could not synchronize replication slot "failover_slot" to prevent data loss
+     DETAIL:  The remote slot needs WAL at LSN 0/3003F28 and catalog xmin 754, but the standby has LSN 0/3003F28 and catalog xmin 766.
+</programlisting>
+     If the logical replication slot is actively used by a consumer, no further
+     manual action is needed by the user, as the slot on the primary will be advanced
+     automatically, and synchronization will proceed in the next cycle. However,
+     if no logical replication consumer is set up yet, it is recommended to advance
+     the slot by manually running <link linkend="pg-logical-slot-get-changes">
+     <function>pg_logical_slot_get_changes</function></link> or
+     <link linkend="pg-logical-slot-get-binary-changes">
+     <function>pg_logical_slot_get_binary_changes</function></link> on the primary
+     slot and allow synchronization to proceed.
+    </para>
+
+
     <para>
      The ability to resume logical replication after failover depends upon the
      <link linkend="view-pg-replication-slots">pg_replication_slots</link>.<structfield>synced</structfield>
diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index 656e66e0ae0..ef4a60ce189 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -211,9 +211,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		 * impact the users, so we used DEBUG1 level to log the message.
 		 */
 		ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1,
-				errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot",
+				errmsg("could not synchronize replication slot \"%s\" to prevent data loss",
 					   remote_slot->name),
-				errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.",
+				errdetail("The remote slot needs WAL at LSN %X/%X and catalog xmin %u, but the standby has LSN %X/%X and catalog xmin %u.",
 						  LSN_FORMAT_ARGS(remote_slot->restart_lsn),
 						  remote_slot->catalog_xmin,
 						  LSN_FORMAT_ARGS(slot->data.restart_lsn),
@@ -592,8 +592,8 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
 	if (!found_consistent_snapshot)
 	{
 		ereport(LOG,
-				errmsg("could not synchronize replication slot \"%s\"", remote_slot->name),
-				errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.",
+				errmsg("could not synchronize replication slot \"%s\" to prevent data loss", remote_slot->name),
+				errdetail("Standby does not have enough data to decode WALs at LSN %X/%X.",
 						  LSN_FORMAT_ARGS(slot->data.restart_lsn)));
 
 		return false;
-- 
2.34.1

