From 8de9d904d70c362ca2af00bd4e73c2ad3bda9b6b Mon Sep 17 00:00:00 2001
From: Shinya Kato <shinya11.kato@gmail.com>
Date: Fri, 6 Mar 2026 16:10:59 +0900
Subject: [PATCH v2] Fix spurious NULL lag in pg_stat_replication

Previously, ProcessStandbyReplyMessage() cleared the replication lag
times (making them show up as NULL in pg_stat_replication) whenever the
standby reported fully-applied WAL in two consecutive reply messages.
This heuristic was too aggressive: in bursty reply patterns, one
message could consume all lag tracker samples, and the next message --
arriving before new samples accumulated -- would see no samples and
trigger clearing, even though the standby was still actively replaying
WAL.

Add two additional conditions before clearing lag times: (1) all three
LagTrackerRead() calls must return -1, indicating no new lag samples,
and (2) the write/flush/apply positions must be unchanged from the
previous reply.  Together with the existing fully-applied check, which
must still hold in consecutive replies, this ensures lag is cleared
only when the standby is truly idle.

Author: Shinya Kato <shinya11.kato@gmail.com>
Reviewed-by: Fujii Masao <masao.fujii@gmail.com>
Discussion: https://postgr.es/m/CAOzEurTzcUrEzrH97DD7+Yz=HGPU81kzWQonKZvqBwYhx2G9_A@mail.gmail.com
---
 src/backend/replication/walsender.c | 34 ++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 2cde8ebc729..59dcfa340a5 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -2456,11 +2456,16 @@ ProcessStandbyReplyMessage(void)
 	TimeOffset	writeLag,
 				flushLag,
 				applyLag;
-	bool		clearLagTimes;
+	bool		clearLagTimes,
+				noLagSamples,
+				positionsUnchanged;
 	TimestampTz now;
 	TimestampTz replyTime;
 
 	static bool fullyAppliedLastTime = false;
+	static XLogRecPtr prevWritePtr = InvalidXLogRecPtr;
+	static XLogRecPtr prevFlushPtr = InvalidXLogRecPtr;
+	static XLogRecPtr prevApplyPtr = InvalidXLogRecPtr;
 
 	/* the caller already consumed the msgtype byte */
 	writePtr = pq_getmsgint64(&reply_message);
@@ -2492,16 +2497,25 @@ ProcessStandbyReplyMessage(void)
 	flushLag = LagTrackerRead(SYNC_REP_WAIT_FLUSH, flushPtr, now);
 	applyLag = LagTrackerRead(SYNC_REP_WAIT_APPLY, applyPtr, now);
 
+	/* Precompute inputs for clearLagTimes decision below. */
+	noLagSamples = (writeLag == -1 && flushLag == -1 && applyLag == -1);
+	positionsUnchanged = (writePtr == prevWritePtr &&
+						  flushPtr == prevFlushPtr &&
+						  applyPtr == prevApplyPtr);
+
 	/*
-	 * If the standby reports that it has fully replayed the WAL in two
-	 * consecutive reply messages, then the second such message must result
-	 * from wal_receiver_status_interval expiring on the standby.  This is a
-	 * convenient time to forget the lag times measured when it last
-	 * wrote/flushed/applied a WAL record, to avoid displaying stale lag data
-	 * until more WAL traffic arrives.
+	 * If the standby reports that it has fully replayed the WAL, there are
+	 * no new lag samples, and positions remain unchanged across two
+	 * consecutive reply messages, forget the lag times measured when it last
+	 * wrote/flushed/applied a WAL record.  This avoids displaying stale lag
+	 * data until more WAL traffic arrives.
+	 *
+	 * The position-unchanged check prevents spuriously clearing lag in
+	 * bursty reply patterns, where one reply consumes all lag tracker
+	 * samples and the next arrives before new samples accumulate.
 	 */
 	clearLagTimes = false;
-	if (applyPtr == sentPtr)
+	if (applyPtr == sentPtr && noLagSamples && positionsUnchanged)
 	{
 		if (fullyAppliedLastTime)
 			clearLagTimes = true;
@@ -2510,6 +2524,10 @@ ProcessStandbyReplyMessage(void)
 	else
 		fullyAppliedLastTime = false;
 
+	prevWritePtr = writePtr;
+	prevFlushPtr = flushPtr;
+	prevApplyPtr = applyPtr;
+
 	/* Send a reply if the standby requested one. */
 	if (replyRequested)
 		WalSndKeepalive(false, InvalidXLogRecPtr);
-- 
2.47.3

