From b01ee0c2408669ced7154be9f0de71e8771a6a8c Mon Sep 17 00:00:00 2001
From: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Date: Tue, 16 Dec 2025 10:48:12 +0100
Subject: Fix 'unexpected data beyond EOF' on replica restart

On restart, a replica can fail with an 'unexpected data beyond EOF in
block 200 of relation T/D/R' error. This can happen under the following
circumstances:

- A relation has a size of 400 blocks.
  - Blocks 201 to 400 are empty.
  - Block 200 has two rows.
  - Blocks 100 to 199 are empty.
- A restartpoint is done
- Vacuum truncates the relation to 200 blocks
- A FPW deletes a row in block 200
- A checkpoint is done
- A FPW deletes the last row in block 200
- Vacuum truncates the relation to 100 blocks
- The replica restarts

When the replica restarts:
- The relation on disk is reduced to 100 blocks due to having applied
  the truncate before restart.
- The first truncate to 200 blocks is replayed. It silently fails, but
  it still updates the cached size to 200 blocks.
- The first FPW on block 200 is applied. XLogReadBufferForRedo relies
  on the cached size and incorrectly assumes the page exists in the
  file, and thus doesn't extend the relation.
- The Checkpoint Online is replayed, calling smgrdestroyall which will
  discard the cached size.
- The second FPW on block 200 is applied. This time, the detected size
  is 100 blocks, so an extend is attempted. However, block 200 is
  already present in the buffer table due to the first FPW. This
  triggers 'unexpected data beyond EOF' since the page isn't new.

This patch fixes the issue by moving the smgr_cached_nblocks updates
into mdtruncate. If the truncate size is greater than the old size
during recovery, the cached size is set to the old size. Otherwise, on
successful truncate, the cached size is set to the truncate size.
---
 src/backend/storage/smgr/md.c   | 26 +++++++++++++++++++++++++-
 src/backend/storage/smgr/smgr.c | 12 ------------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2ccb0faceb5..d0d116f42ef 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1280,18 +1280,33 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum,
 	BlockNumber priorblocks;
 	int			curopensegs;
 
+	/* Make sure the cached size is invalid if we encounter an error. */
+	reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+
 	if (nblocks > curnblk)
 	{
-		/* Bogus request ... but no complaint if InRecovery */
+		/*
+		 * This can happen when a relation was truncated multiple times and
+		 * the restartpoint is located before the truncates. On restart, the
+		 * relation on disk will have the size of the second truncate. As the
+		 * first truncate has a higher nblocks, mdtruncate will be called with
+		 * nblocks > curnblk during startup.
+		 */
 		if (InRecovery)
+		{
+			reln->smgr_cached_nblocks[forknum] = curnblk;
 			return;
+		}
 		ereport(ERROR,
 				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
 						relpath(reln->smgr_rlocator, forknum).str,
 						nblocks, curnblk)));
 	}
 	if (nblocks == curnblk)
+	{
+		reln->smgr_cached_nblocks[forknum] = curnblk;
 		return;					/* no work */
+	}
 
 	/*
 	 * Truncate segments, starting at the last one. Starting at the end makes
@@ -1357,6 +1372,15 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		}
 		curopensegs--;
 	}
+
+	/*
+	 * We might as well update the local smgr_cached_nblocks values. The smgr
+	 * cache inval message that smgrtruncate() sent will cause other backends
+	 * to invalidate their copies of smgr_cached_nblocks, and these ones too
+	 * at the next command boundary. But ensure they aren't outright wrong
+	 * until then.
+	 */
+	reln->smgr_cached_nblocks[forknum] = nblocks;
 }
 
 /*
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index bce37a36d51..b017266316e 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -898,20 +898,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
 	/* Do the truncation */
 	for (i = 0; i < nforks; i++)
 	{
-		/* Make the cached size is invalid if we encounter an error. */
-		reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
-
 		smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
 											   old_nblocks[i], nblocks[i]);
-
-		/*
-		 * We might as well update the local smgr_cached_nblocks values. The
-		 * smgr cache inval message that this function sent will cause other
-		 * backends to invalidate their copies of smgr_cached_nblocks, and
-		 * these ones too at the next command boundary. But ensure they aren't
-		 * outright wrong until then.
-		 */
-		reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
 	}
 }
 
-- 
2.51.0