From e3887b375923a4aff4c28f99946545ffcd11b125 Mon Sep 17 00:00:00 2001
From: Nitin Jadhav <nitinjadhavpostgres@gmail.com>
Date: Thu, 4 Dec 2025 06:09:26 +0000
Subject: [PATCH] Fix crash during recovery when redo segment is missing

The issue was that PostgreSQL did not PANIC when the redo LSN and
checkpoint LSN were in separate segments, and the file containing the
redo LSN was missing. This resulted in a crash.

The fix ensures the REDO location exists after reading the checkpoint
record in InitWalRecovery(). If the REDO location is missing, it logs
a PANIC error. Additionally, a new test script,
050_redo_segment_missing.pl, has been added to validate this.
---
 src/backend/access/transam/xlogrecovery.c     |  14 ++-
 .../recovery/t/050_redo_segment_missing.pl    | 103 ++++++++++++++++++
 2 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100644 src/test/recovery/t/050_redo_segment_missing.pl

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 21b8f179ba0..a402bfa550e 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -787,9 +787,21 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 									  CheckPointTLI);
 		if (record != NULL)
 		{
+			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
 			ereport(DEBUG1,
 					errmsg_internal("checkpoint record is at %X/%08X",
 									LSN_FORMAT_ARGS(CheckPointLoc)));
+
+			/* Make sure that REDO location exists. */
+			if (RedoStartLSN < CheckPointLoc)
+			{
+				XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
+				if (!ReadRecord(xlogprefetcher, LOG, false, RedoStartTLI))
+					ereport(PANIC,
+						errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
+							LSN_FORMAT_ARGS(RedoStartLSN), LSN_FORMAT_ARGS(CheckPointLoc)));
+			}
 		}
 		else
 		{
@@ -803,8 +815,6 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 					errmsg("could not locate a valid checkpoint record at %X/%08X",
 						   LSN_FORMAT_ARGS(CheckPointLoc)));
 		}
-		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
-		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
 	}
 
 	if (ArchiveRecoveryRequested)
diff --git a/src/test/recovery/t/050_redo_segment_missing.pl b/src/test/recovery/t/050_redo_segment_missing.pl
new file mode 100644
index 00000000000..a7bd9a71125
--- /dev/null
+++ b/src/test/recovery/t/050_redo_segment_missing.pl
@@ -0,0 +1,103 @@
+# Copyright (c) 2021-2025, PostgreSQL Global Development Group
+#
+# Evaluates PostgreSQL's recovery behavior when a WAL segment
+# containing the redo record is missing. It initializes a
+# PostgreSQL instance, configures it, and executes a series of
+# operations to mimic a scenario where a WAL segment is absent.
+# Then checks that PostgreSQL logs the correct error message
+# when it fails to locate the redo record.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Time::HiRes qw(usleep);
+use Test::More;
+
+my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->start;
+
+# Set checkpoint_timeout to the minimum value (30s) to quickly create
+# a checkpoint for testing. Populate some data to ensure the checkpoint
+# operation takes enough time, allowing the execution of pg_switch_wal().
+# This will create a new WAL segment, and the checkpoint record will be
+# written to a different WAL segment.
+$node->safe_psql(
+	'postgres',
+	q[
+		ALTER SYSTEM SET checkpoint_timeout = '30s';
+		SELECT pg_reload_conf();
+		CREATE TABLE test_table(id int);
+		INSERT INTO test_table (id) SELECT generate_series(1, 100);
+	]
+);
+
+# Wait for the checkpoint to start
+my $logstart = -s $node->logfile;
+my $checkpoint_start = 0;
+foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default) {
+	if ($node->log_contains("checkpoint starting: time", $logstart)) {
+		$checkpoint_start = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_start, 1, 'checkpoint has started');
+
+# Switch the WAL to ensure that the WAL segment containing the checkpoint
+# record is different from the one containing the redo record.
+$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Wait for the checkpoint to complete
+my $checkpoint_complete = 0;
+foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default) {
+	if ($node->log_contains("checkpoint complete", $logstart)) {
+		$checkpoint_complete = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_complete, 1, 'checkpoint has completed');
+
+my $redo_lsn = $node->safe_psql('postgres', "SELECT redo_lsn FROM pg_control_checkpoint()");
+my $redo_walfile_name = $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
+my $checkpoint_lsn = $node->safe_psql('postgres', "SELECT checkpoint_lsn FROM pg_control_checkpoint()");
+my $checkpoint_walfile_name = $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+
+if ($redo_walfile_name eq $checkpoint_walfile_name) {
+	die "redo record wal file is the same as checkpoint record wal file, aborting test";
+}
+
+# Remove the WAL segment containing the redo record
+unlink $node->data_dir . "/pg_wal/$redo_walfile_name"
+	or die "Could not remove WAL file: $!";
+
+$node->stop('immediate');
+
+# Use run_log instead of node->start because this test expects
+# that the server ends with an error during recovery.
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $node->data_dir,
+		'--log' => $node->logfile,
+		'start',
+	]);
+
+# Wait for postgres to terminate
+foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default)
+{
+	last if !-f $node->data_dir . '/postmaster.pid';
+	usleep(100_000);
+}
+
+# Confirm that the recovery fails with an expected error
+my $logfile = slurp_file($node->logfile());
+ok( $logfile =~
+	qr/PANIC:  could not find redo location .* referenced by checkpoint record at .*/,
+	"ends with PANIC because it could not find redo location"
+);
+
+done_testing();
\ No newline at end of file
-- 
2.43.0

