From eea58a421a9a67ad725d708ecc32e694474f3ad2 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Thu, 6 Jun 2024 14:56:53 +0900
Subject: [PATCH v2] Fix infinite loop in walsender during publisher shutdown

When a publisher server is shutting down, there can be a case where
the last WAL record at that point is a continuation record with its
latter part not yet flushed. In such cases, the walsender attempts to
read this unflushed part and ends up in an infinite loop. To prevent
this situation, modify the logical WAL sender to consider itself
caught up in this case. The records that are not fully flushed at this
point are generally not significant, so simply ignoring them should
not cause any issues.
---
 src/backend/replication/walsender.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index c623b07cf0..635396c138 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -3426,8 +3426,15 @@ XLogSendLogical(void)
 			flushPtr = GetFlushRecPtr(NULL);
 	}
 
-	/* If EndRecPtr is still past our flushPtr, it means we caught up. */
-	if (logical_decoding_ctx->reader->EndRecPtr >= flushPtr)
+	/*
+	 * If EndRecPtr is still past our flushPtr, it means we caught up.  When
+	 * the server is shutting down, the latter part of a continuation record
+	 * may be missing.  If got_STOPPING is true, assume we are caught up if the
+	 * last record is missing its continuation part at flushPtr.
+	 */
+	if (logical_decoding_ctx->reader->EndRecPtr >= flushPtr ||
+		(got_STOPPING &&
+		 logical_decoding_ctx->reader->missingContrecPtr == flushPtr))
 		WalSndCaughtUp = true;
 
 	/*
-- 
2.34.1

