From eefe06ea57ba19725b345f8e93c1239747815ce7 Mon Sep 17 00:00:00 2001
From: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Date: Wed, 2 Jul 2025 09:58:52 +0200
Subject: Don't keep closed WAL segments in page cache after replay

The recovery process reads the WAL segments, applies changes and closes the
segment. When closed, the segments will still be in page cache memory until
they are evicted due to inactivity. The segments may be re-read if
archive_mode is set to always, summarize_wal is enabled or if the standby
is used for replication and has an active walsender.

Outside of those circumstances, the WAL segments won't be re-read and
keeping them in the page cache generates unnecessary memory pressure.

If the standby doesn't archive WAL, doesn't have summarize_wal enabled and
doesn't have an active walsender, a POSIX_FADV_DONTNEED hint is issued before
closing a replayed WAL segment to immediately free any cached pages.
---
 src/backend/access/transam/xlogrecovery.c | 20 ++++++++++++++++++++
 src/backend/storage/lmgr/proc.c           | 23 +++++++++++++++++++++--
 src/backend/utils/init/postinit.c         |  2 +-
 src/include/storage/proc.h                | 10 +++++++++-
 4 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 93d38914854..6891d52124b 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -48,6 +48,7 @@
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walsummarizer.h"
 #include "replication/slot.h"
 #include "replication/slotsync.h"
 #include "replication/walreceiver.h"
@@ -55,6 +56,7 @@
 #include "storage/ipc.h"
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/spin.h"
 #include "utils/datetime.h"
@@ -3341,6 +3343,24 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 			}
 		}
 
+		/*
+		 * Once replayed, WAL segment files may be re-read in several cases:
+		 * archive_mode is set to always, summarize_wal is enabled or the
+		 * standby acts as a walsender for either logical or physical
+		 * replication. Outside of those conditions, the WAL segment files
+		 * shouldn't be re-read and we can signal the kernel to release any
+		 * cached pages.
+		 */
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+		{
+			int			nfree;
+
+			if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS &&
+				!summarize_wal &&
+				(max_wal_senders == 0 || HaveNFreeProcs(max_wal_senders, &nfree, PROC_FREE_WALSENDER)))
+				(void) posix_fadvise(readFile, 0, 0, POSIX_FADV_DONTNEED);
+		}
+#endif
 		close(readFile);
 		readFile = -1;
 		readSource = XLOG_FROM_ANY;
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index e9ef0fbfe32..f75a5191f0d 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -779,17 +779,36 @@ GetStartupBufferPinWaitBufId(void)
  * Note: this is designed on the assumption that N will generally be small.
  */
 bool
-HaveNFreeProcs(int n, int *nfree)
+HaveNFreeProcs(int n, int *nfree, ProcFreeList proc_free_list)
 {
 	dlist_iter	iter;
+	dlist_head *free_list;
 
 	Assert(n > 0);
 	Assert(nfree);
 
+	switch (proc_free_list)
+	{
+		case PROC_FREE_PROCS:
+			free_list = &ProcGlobal->freeProcs;
+			break;
+		case PROC_FREE_AUTOVAC:
+			free_list = &ProcGlobal->autovacFreeProcs;
+			break;
+		case PROC_FREE_BGWORKER:
+			free_list = &ProcGlobal->bgworkerFreeProcs;
+			break;
+		case PROC_FREE_WALSENDER:
+			free_list = &ProcGlobal->walsenderFreeProcs;
+			break;
+		default:
+			elog(ERROR, "invalid free list: %d", (int) proc_free_list);
+	}
+
 	SpinLockAcquire(ProcStructLock);
 
 	*nfree = 0;
-	dlist_foreach(iter, &ProcGlobal->freeProcs)
+	dlist_foreach(iter, free_list)
 	{
 		(*nfree)++;
 		if (*nfree == n)
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index c86ceefda94..894c1eb4fa1 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -936,7 +936,7 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	 */
 	if (AmRegularBackendProcess() && !am_superuser &&
 		(SuperuserReservedConnections + ReservedConnections) > 0 &&
-		!HaveNFreeProcs(SuperuserReservedConnections + ReservedConnections, &nfree))
+		!HaveNFreeProcs(SuperuserReservedConnections + ReservedConnections, &nfree, PROC_FREE_PROCS))
 	{
 		if (nfree < SuperuserReservedConnections)
 			ereport(FATAL,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 9f9b3fcfbf1..a77c31493d5 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -141,6 +141,14 @@ typedef enum
 	PROC_WAIT_STATUS_ERROR,
 } ProcWaitStatus;
 
+typedef enum					/* selects the free list HaveNFreeProcs() scans */
+{
+	PROC_FREE_PROCS,			/* ProcGlobal->freeProcs */
+	PROC_FREE_AUTOVAC,			/* ProcGlobal->autovacFreeProcs */
+	PROC_FREE_BGWORKER,			/* ProcGlobal->bgworkerFreeProcs */
+	PROC_FREE_WALSENDER,		/* ProcGlobal->walsenderFreeProcs */
+}			ProcFreeList;
+
 /*
  * Each backend has a PGPROC struct in shared memory.  There is also a list of
  * currently-unused PGPROC structs that will be reallocated to new backends.
@@ -489,7 +497,7 @@ extern void InitAuxiliaryProcess(void);
 extern void SetStartupBufferPinWaitBufId(int bufid);
 extern int	GetStartupBufferPinWaitBufId(void);
 
-extern bool HaveNFreeProcs(int n, int *nfree);
+extern bool HaveNFreeProcs(int n, int *nfree, ProcFreeList proc_free_list);
 extern void ProcReleaseLocks(bool isCommit);
 
 extern ProcWaitStatus JoinWaitQueue(LOCALLOCK *locallock,
-- 
2.50.0

