From 543713209087881e82c3a63731e149e92499260c Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Wed, 6 Mar 2024 08:45:54 +0000
Subject: [PATCH v8 3/4] Track inactive replication slot information

Currently postgres doesn't track metrics like the time at which
the slot became inactive, and the total number of times the slot
became inactive in its lifetime. This commit adds two new metrics
last_inactive_at of type timestamptz and inactive_count of type numeric
to ReplicationSlotPersistentData. Whenever a slot becomes
inactive, the current timestamp and inactive count are persisted
to disk.

These metrics are useful in the following ways:
- To improve replication slot monitoring tools. For instance, one
can build a monitoring tool that signals a) when replication slots
is lying inactive for a day or so using last_inactive_at metric,
b) when a replication slot is becoming inactive too frequently
using last_inactive_at metric.

- To implement timeout-based inactive replication slot management
capability in postgres.

Increases SLOT_VERSION due to the added two new metrics.
---
 doc/src/sgml/system-views.sgml       | 20 +++++++++++++
 src/backend/catalog/system_views.sql |  4 ++-
 src/backend/replication/slot.c       | 43 ++++++++++++++++++++++------
 src/backend/replication/slotfuncs.c  | 15 +++++++++-
 src/include/catalog/pg_proc.dat      |  6 ++--
 src/include/replication/slot.h       |  6 ++++
 src/test/regress/expected/rules.out  |  6 ++--
 7 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index c519b4a7f8..7909623453 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -2740,6 +2740,26 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        ID of role
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>last_inactive_at</structfield> <type>timestamptz</type>
+      </para>
+      <para>
+        The time at which the slot became inactive.
+        <literal>NULL</literal> if the slot is currently actively being
+        used.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>inactive_count</structfield> <type>numeric</type>
+      </para>
+      <para>
+        The total number of times the slot became inactive in its lifetime.
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 1dbfcef9f1..763a4e668b 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1025,7 +1025,9 @@ CREATE VIEW pg_replication_slots AS
             L.two_phase,
             L.invalidation_reason,
             L.failover,
-            L.synced
+            L.synced,
+            L.last_inactive_at,
+            L.inactive_count
     FROM pg_get_replication_slots() AS L
             LEFT JOIN pg_database D ON (L.datoid = D.oid);
 
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index f05990aeb8..9e323b58b3 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -109,7 +109,7 @@ StaticAssertDecl(lengthof(SlotInvalidationCauses) == (RS_INVAL_MAX_CAUSES + 1),
 	sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize
 
 #define SLOT_MAGIC		0x1051CA1	/* format identifier */
-#define SLOT_VERSION	5		/* version for new files */
+#define SLOT_VERSION	6		/* version for new files */
 
 /* Control array for replication slot management */
 ReplicationSlotCtlData *ReplicationSlotCtl = NULL;
@@ -364,6 +364,8 @@ ReplicationSlotCreate(const char *name, bool db_specific,
 	slot->data.two_phase_at = InvalidXLogRecPtr;
 	slot->data.failover = failover;
 	slot->data.synced = synced;
+	slot->data.last_inactive_at = 0;
+	slot->data.inactive_count = 0;
 
 	/* and then data only present in shared memory */
 	slot->just_dirtied = false;
@@ -590,6 +592,17 @@ retry:
 
 	if (am_walsender)
 	{
+		if (s->data.persistency == RS_PERSISTENT)
+		{
+			SpinLockAcquire(&s->mutex);
+			s->data.last_inactive_at = 0;
+			SpinLockRelease(&s->mutex);
+
+			/* Write this slot to disk */
+			ReplicationSlotMarkDirty();
+			ReplicationSlotSave();
+		}
+
 		ereport(log_replication_commands ? LOG : DEBUG1,
 				SlotIsLogical(s)
 				? errmsg("acquired logical replication slot \"%s\"",
@@ -657,16 +670,20 @@ ReplicationSlotRelease(void)
 		ConditionVariableBroadcast(&slot->active_cv);
 	}
 
-	MyReplicationSlot = NULL;
-
-	/* might not have been set when we've been a plain slot */
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-	MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING;
-	ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
-	LWLockRelease(ProcArrayLock);
-
 	if (am_walsender)
 	{
+		if (slot->data.persistency == RS_PERSISTENT)
+		{
+			SpinLockAcquire(&slot->mutex);
+			slot->data.last_inactive_at = GetCurrentTimestamp();
+			slot->data.inactive_count++;
+			SpinLockRelease(&slot->mutex);
+
+			/* Write this slot to disk */
+			ReplicationSlotMarkDirty();
+			ReplicationSlotSave();
+		}
+
 		ereport(log_replication_commands ? LOG : DEBUG1,
 				is_logical
 				? errmsg("released logical replication slot \"%s\"",
@@ -676,6 +693,14 @@ ReplicationSlotRelease(void)
 
 		pfree(slotname);
 	}
+
+	MyReplicationSlot = NULL;
+
+	/* might not have been set when we've been a plain slot */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING;
+	ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+	LWLockRelease(ProcArrayLock);
 }
 
 /*
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 758498d29d..3e287cba66 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -239,10 +239,11 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
 Datum
 pg_get_replication_slots(PG_FUNCTION_ARGS)
 {
-#define PG_GET_REPLICATION_SLOTS_COLS 17
+#define PG_GET_REPLICATION_SLOTS_COLS 19
 	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
 	XLogRecPtr	currlsn;
 	int			slotno;
+	char		buf[256];
 
 	/*
 	 * We don't require any special permission to see this function's data
@@ -418,6 +419,18 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
 
 		values[i++] = BoolGetDatum(slot_contents.data.synced);
 
+		if (slot_contents.data.last_inactive_at > 0)
+			values[i++] = TimestampTzGetDatum(slot_contents.data.last_inactive_at);
+		else
+			nulls[i++] = true;
+
+		/* Convert to numeric. */
+		snprintf(buf, sizeof buf, UINT64_FORMAT, slot_contents.data.inactive_count);
+		values[i++] = DirectFunctionCall3(numeric_in,
+										  CStringGetDatum(buf),
+										  ObjectIdGetDatum(0),
+										  Int32GetDatum(-1));
+
 		Assert(i == PG_GET_REPLICATION_SLOTS_COLS);
 
 		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 69140a0bf0..0071ce4cf8 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11120,9 +11120,9 @@
   proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f',
   proretset => 't', provolatile => 's', prorettype => 'record',
   proargtypes => '',
-  proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,text,bool,bool}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,invalidation_reason,failover,synced}',
+  proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,text,bool,bool,timestamptz,numeric}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,invalidation_reason,failover,synced,last_inactive_at,inactive_count}',
   prosrc => 'pg_get_replication_slots' },
 { oid => '3786', descr => 'set up a logical replication slot',
   proname => 'pg_create_logical_replication_slot', provolatile => 'v',
diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h
index 4b7ae36f11..7d668918b0 100644
--- a/src/include/replication/slot.h
+++ b/src/include/replication/slot.h
@@ -129,6 +129,12 @@ typedef struct ReplicationSlotPersistentData
 	 * for logical slots on the primary server.
 	 */
 	bool		failover;
+
+	/* When did this slot become inactive last time? */
+	TimestampTz last_inactive_at;
+
+	/* How many times the slot has been inactive? */
+	uint64		inactive_count;
 } ReplicationSlotPersistentData;
 
 /*
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 08b0a34d55..b63f5ea5da 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1475,8 +1475,10 @@ pg_replication_slots| SELECT l.slot_name,
     l.two_phase,
     l.invalidation_reason,
     l.failover,
-    l.synced
-   FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, invalidation_reason, failover, synced)
+    l.synced,
+    l.last_inactive_at,
+    l.inactive_count
+   FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, invalidation_reason, failover, synced, last_inactive_at, inactive_count)
      LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
 pg_roles| SELECT pg_authid.rolname,
     pg_authid.rolsuper,
-- 
2.34.1

