From f5aadf23829b9c8380b8f5912eacbe90b3c7ccd4 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 2 May 2022 07:53:57 +0000
Subject: [PATCH v1] Add last failed connection error message to
 pg_stat_wal_receiver

In production environments, WAL receiver connection attempts to the
primary may fail for many reasons (the primary is down, the network is
broken, authentication tokens have changed, primary_conninfo has been
modified, socket errors, and so on). Although the error message is
emitted to the server logs, it is also useful to expose the last
connection error message via pg_stat_wal_receiver and
pg_stat_get_wal_receiver(). This helps users in production environments
analyse WAL receiver connection issues, since accessing and sifting
through server logs can be quite cumbersome for end users.
---
 doc/src/sgml/monitoring.sgml          | 11 +++++++++++
 src/backend/catalog/system_views.sql  |  3 ++-
 src/backend/replication/walreceiver.c | 19 +++++++++++++++++++
 src/include/catalog/pg_proc.dat       |  6 +++---
 src/include/replication/walreceiver.h | 12 ++++++++++++
 src/test/regress/expected/rules.out   |  5 +++--
 6 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 56d9b375ec..6098efc6ee 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -2972,6 +2972,17 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
        with security-sensitive fields obfuscated.
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>last_conn_error</structfield> <type>text</type>
+      </para>
+      <para>
+       Error message of the most recent failed attempt by this WAL receiver
+       to connect to the primary, or NULL if no connection attempt has
+       failed.
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 0fc614e32c..876ff386de 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -916,7 +916,8 @@ CREATE VIEW pg_stat_wal_receiver AS
             s.slot_name,
             s.sender_host,
             s.sender_port,
-            s.conninfo
+            s.conninfo,
+            s.last_conn_error
     FROM pg_stat_get_wal_receiver() s
     WHERE s.pid IS NOT NULL;
 
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index 3c9411e221..35524943cb 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -287,9 +287,22 @@ WalReceiverMain(void)
 							cluster_name[0] ? cluster_name : "walreceiver",
 							&err);
 	if (!wrconn)
+	{
+		/*
+		 * Place the error message into WAL receiver shared memory so that
+		 * users can easily read it via the WAL receiver stats function.
+		 *
+		 * Can the error message ever exceed MAXCONNERRORLENGTH bytes? Most
+		 * of the common error messages that libpq emits aren't that long,
+		 * but if a message does not fit in MAXCONNERRORLENGTH bytes, it is
+		 * okay to store a truncated copy.
+		 */
+		strlcpy(WalRcv->last_conn_error, err, MAXCONNERRORLENGTH);
+
 		ereport(ERROR,
 				(errcode(ERRCODE_CONNECTION_FAILURE),
 				 errmsg("could not connect to the primary server: %s", err)));
+	}
 
 	/*
 	 * Save user-visible connection string.  This clobbers the original
@@ -1358,6 +1371,7 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
 	int			sender_port = 0;
 	char		slotname[NAMEDATALEN];
 	char		conninfo[MAXCONNINFO];
+	char		last_conn_error[MAXCONNERRORLENGTH];
 
 	/* Take a lock to ensure value consistency */
 	SpinLockAcquire(&WalRcv->mutex);
@@ -1376,6 +1390,7 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
 	strlcpy(sender_host, (char *) WalRcv->sender_host, sizeof(sender_host));
 	sender_port = WalRcv->sender_port;
 	strlcpy(conninfo, (char *) WalRcv->conninfo, sizeof(conninfo));
+	strlcpy(last_conn_error, (char *) WalRcv->last_conn_error, sizeof(last_conn_error));
 	SpinLockRelease(&WalRcv->mutex);
 
 	/*
@@ -1462,6 +1477,10 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
 			nulls[14] = true;
 		else
 			values[14] = CStringGetTextDatum(conninfo);
+		if (*last_conn_error == '\0')
+			nulls[15] = true;
+		else
+			values[15] = CStringGetTextDatum(last_conn_error);
 	}
 
 	/* Returns the record as Datum */
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 6d378ff785..19dc0ef411 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5365,9 +5365,9 @@
 { oid => '3317', descr => 'statistics: information about WAL receiver',
   proname => 'pg_stat_get_wal_receiver', proisstrict => 'f', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{int4,text,pg_lsn,int4,pg_lsn,pg_lsn,int4,timestamptz,timestamptz,pg_lsn,timestamptz,text,text,int4,text}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{pid,status,receive_start_lsn,receive_start_tli,written_lsn,flushed_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,sender_host,sender_port,conninfo}',
+  proallargtypes => '{int4,text,pg_lsn,int4,pg_lsn,pg_lsn,int4,timestamptz,timestamptz,pg_lsn,timestamptz,text,text,int4,text,text}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,status,receive_start_lsn,receive_start_tli,written_lsn,flushed_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,sender_host,sender_port,conninfo,last_conn_error}',
   prosrc => 'pg_stat_get_wal_receiver' },
 { oid => '6169', descr => 'statistics: information about replication slot',
   proname => 'pg_stat_get_replication_slot', provolatile => 's',
diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h
index 81184aa92f..c31a5d76af 100644
--- a/src/include/replication/walreceiver.h
+++ b/src/include/replication/walreceiver.h
@@ -36,6 +36,11 @@ extern PGDLLIMPORT bool hot_standby_feedback;
  */
 #define MAXCONNINFO		1024
 
+/*
+ * MAXCONNERRORLENGTH: maximum size of a stored connection error message.
+ */
+#define MAXCONNERRORLENGTH	1024
+
 /* Can we allow the standby to accept replication connection from another standby? */
 #define AllowCascadeReplication() (EnableHotStandby && max_wal_senders > 0)
 
@@ -158,6 +163,13 @@ typedef struct
 	 * store semantics, so use sig_atomic_t.
 	 */
 	sig_atomic_t force_reply;	/* used as a bool */
+
+	/*
+	 * Error message of the last failed attempt to connect to the primary,
+	 * truncated to fit in MAXCONNERRORLENGTH bytes.  An empty string (not
+	 * NULL) means none is recorded; the stats function reports it as NULL.
+	 */
+	char	last_conn_error[MAXCONNERRORLENGTH];
 } WalRcvData;
 
 extern PGDLLIMPORT WalRcvData *WalRcv;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 21effe8315..9b1484d32a 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2199,8 +2199,9 @@ pg_stat_wal_receiver| SELECT s.pid,
     s.slot_name,
     s.sender_host,
     s.sender_port,
-    s.conninfo
-   FROM pg_stat_get_wal_receiver() s(pid, status, receive_start_lsn, receive_start_tli, written_lsn, flushed_lsn, received_tli, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time, slot_name, sender_host, sender_port, conninfo)
+    s.conninfo,
+    s.last_conn_error
+   FROM pg_stat_get_wal_receiver() s(pid, status, receive_start_lsn, receive_start_tli, written_lsn, flushed_lsn, received_tli, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time, slot_name, sender_host, sender_port, conninfo, last_conn_error)
   WHERE (s.pid IS NOT NULL);
 pg_stat_xact_all_tables| SELECT c.oid AS relid,
     n.nspname AS schemaname,
-- 
2.25.1

