From a09e289393ce4b54efcb22aab7b6bed78bc41450 Mon Sep 17 00:00:00 2001
From: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Date: Mon, 23 Mar 2026 08:41:44 +0100
Subject: Propagate errormsg to XLogFindNextRecord caller

Currently, XLogFindNextRecord errormsg is ignored and callers will only
output a generic 'could not find a valid record' message without
details.
Additionally, invalid page header won't go through XLogReadRecord,
leaving the error in state->errormsg_buf.

This patch propagates XLogFindNextRecord's error message to the callers
and displays it. In case of an invalid page header, the errormsg is
filled with errormsg_buf content.
---
 contrib/pg_walinspect/pg_walinspect.c   | 16 ++++++++++----
 src/backend/access/transam/xlogreader.c | 24 +++++++++++++++++---
 src/backend/postmaster/walsummarizer.c  | 17 ++++++++++-----
 src/bin/pg_waldump/pg_waldump.c         | 12 +++++++---
 src/bin/pg_waldump/t/001_basic.pl       | 29 +++++++++++++++++++++++++
 src/include/access/xlogreader.h         |  3 ++-
 6 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index f6f5f0792d2..4cf6e41e2f5 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -99,6 +99,7 @@ InitXLogReaderState(XLogRecPtr lsn)
 	XLogReaderState *xlogreader;
 	ReadLocalXLogPageNoWaitPrivate *private_data;
 	XLogRecPtr	first_valid_record;
+	char	   *errormsg;
 
 	/*
 	 * Reading WAL below the first page of the first segments isn't allowed.
@@ -126,12 +127,19 @@ InitXLogReaderState(XLogRecPtr lsn)
 				 errdetail("Failed while allocating a WAL reading processor.")));
 
 	/* first find a valid recptr to start from */
-	first_valid_record = XLogFindNextRecord(xlogreader, lsn);
+	first_valid_record = XLogFindNextRecord(xlogreader, lsn, &errormsg);
 
 	if (!XLogRecPtrIsValid(first_valid_record))
-		ereport(ERROR,
-				errmsg("could not find a valid record after %X/%08X",
-					   LSN_FORMAT_ARGS(lsn)));
+	{
+		if (errormsg)
+			ereport(ERROR,
+					errmsg("could not find a valid record after %X/%08X: %s",
+						   LSN_FORMAT_ARGS(lsn), errormsg));
+		else
+			ereport(ERROR,
+					errmsg("could not find a valid record after %X/%08X",
+						   LSN_FORMAT_ARGS(lsn)));
+	}
 
 	return xlogreader;
 }
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 8cb2110cb99..8849610db00 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1390,14 +1390,21 @@ XLogReaderResetError(XLogReaderState *state)
  *
  * This positions the reader, like XLogBeginRead(), so that the next call to
  * XLogReadRecord() will read the next valid record.
+ *
+ * On failure, InvalidXLogRecPtr is returned, and *errormsg is set to a string
+ * with details of the failure.
+ *
+ * When set, *errormsg points to an internal buffer that's valid until the next
+ * call to XLogReadRecord.
  */
 XLogRecPtr
-XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
+XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
 {
 	XLogRecPtr	tmpRecPtr;
 	XLogRecPtr	found = InvalidXLogRecPtr;
 	XLogPageHeader header;
-	char	   *errormsg;
+
+	*errormsg = NULL;
 
 	Assert(XLogRecPtrIsValid(RecPtr));
 
@@ -1482,7 +1489,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
 	 * or we just jumped over the remaining data of a continuation.
 	 */
 	XLogBeginRead(state, tmpRecPtr);
-	while (XLogReadRecord(state, &errormsg) != NULL)
+	while (XLogReadRecord(state, errormsg) != NULL)
 	{
 		/* past the record we've found, break out */
 		if (RecPtr <= state->ReadRecPtr)
@@ -1497,6 +1504,17 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
 err:
 	XLogReaderInvalReadState(state);
 
+	/*
+	 * We may have reported errors due to invalid WAL header, propagate the
+	 * error message to the caller.
+	 */
+	if (state->errormsg_deferred)
+	{
+		if (state->errormsg_buf[0] != '\0')
+			*errormsg = state->errormsg_buf;
+		state->errormsg_deferred = false;
+	}
+
 	return InvalidXLogRecPtr;
 }
 
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index e1aa102f41d..0c0670f7da9 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -915,6 +915,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	WalSummaryIO io;
 	BlockRefTable *brtab = CreateEmptyBlockRefTable();
 	bool		fast_forward = true;
+	char	   *errormsg;
 
 	/* Initialize private data for xlogreader. */
 	private_data = palloc0_object(SummarizerReadLocalXLogPrivate);
@@ -966,7 +967,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	}
 	else
 	{
-		summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
+		summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn, &errormsg);
 		if (!XLogRecPtrIsValid(summary_start_lsn))
 		{
 			/*
@@ -995,9 +996,16 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 				switch_lsn = xlogreader->EndRecPtr;
 			}
 			else
-				ereport(ERROR,
-						errmsg("could not find a valid record after %X/%08X",
-							   LSN_FORMAT_ARGS(start_lsn)));
+			{
+				if (errormsg)
+					ereport(ERROR,
+							errmsg("could not find a valid record after %X/%08X: %s",
+								   LSN_FORMAT_ARGS(start_lsn), errormsg));
+				else
+					ereport(ERROR,
+							errmsg("could not find a valid record after %X/%08X",
+								   LSN_FORMAT_ARGS(start_lsn)));
+			}
 		}
 
 		/* We shouldn't go backward. */
@@ -1010,7 +1018,6 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	while (1)
 	{
 		int			block_id;
-		char	   *errormsg;
 		XLogRecord *record;
 		uint8		rmid;
 
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index f82507ef696..630d9859882 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -1384,11 +1384,17 @@ main(int argc, char **argv)
 		pg_fatal("out of memory while allocating a WAL reading processor");
 
 	/* first find a valid recptr to start from */
-	first_record = XLogFindNextRecord(xlogreader_state, private.startptr);
+	first_record = XLogFindNextRecord(xlogreader_state, private.startptr, &errormsg);
 
 	if (!XLogRecPtrIsValid(first_record))
-		pg_fatal("could not find a valid record after %X/%08X",
-				 LSN_FORMAT_ARGS(private.startptr));
+	{
+		if (errormsg)
+			pg_fatal("could not find a valid record after %X/%08X: %s",
+					 LSN_FORMAT_ARGS(private.startptr), errormsg);
+		else
+			pg_fatal("could not find a valid record after %X/%08X",
+					 LSN_FORMAT_ARGS(private.startptr));
+	}
 
 	/*
 	 * Display a message that we're skipping data if `from` wasn't a pointer
diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl
index 11df7e092bf..7ff9e918ff2 100644
--- a/src/bin/pg_waldump/t/001_basic.pl
+++ b/src/bin/pg_waldump/t/001_basic.pl
@@ -10,6 +10,7 @@ use Test::More;
 use List::Util qw(shuffle);
 
 my $tar = $ENV{TAR};
+use File::Copy;
 
 program_help_ok('pg_waldump');
 program_version_ok('pg_waldump');
@@ -246,6 +247,34 @@ command_like(
 	qr/^$/,
 	'no output with --quiet option');
 
+# Wrong WAL version. We copy an existing wal file and set the
+# page's magic value to 0000.
+{
+	# The broken WAL file is created by copying a valid WAL file and
+	# overwriting its magic number with 0000.
+	my $broken_wal_dir = PostgreSQL::Test::Utils::tempdir_short();
+	my $broken_wal = "$broken_wal_dir/$start_walfile";
+	copy($node->data_dir . '/pg_wal/' . $start_walfile, $broken_wal)
+		|| die "copying $start_walfile $!";
+
+	my $fh;
+	open($fh, '+<', $broken_wal)
+	  or BAIL_OUT("open failed: $!");
+	binmode $fh;
+	sysseek($fh, 0, 0)
+		or BAIL_OUT("sysseek failed: $!");
+	syswrite($fh, pack("S", 0))
+		or BAIL_OUT("syswrite failed: $!");
+	close($fh)
+	  or BAIL_OUT("close failed: $!");
+
+	command_fails_like(
+		[ 'pg_waldump', $broken_wal ],
+			qr/invalid magic number 0000/i,
+			'detailed error message shown for invalid WAL page magic'
+	);
+}
+
 # Test for: Display a message that we're skipping data if `from`
 # wasn't a pointer to the start of a record.
 sub test_pg_waldump_skip_bytes
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 80f1a548e08..c2a48bfe809 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -342,7 +342,8 @@ extern void XLogReaderSetDecodeBuffer(XLogReaderState *state,
 
 /* Position the XLogReader to given record */
 extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
-extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
+extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr,
+                                     char **errormsg);
 
 /* Return values from XLogPageReadCB. */
 typedef enum XLogPageReadResult
-- 
2.53.0

