Hello Michael, Kyotaro,

Please find attached the patch fixing the problem and the updated TAP test
that addresses the nit.

-- 
Regards,
--
Alexander Kukushkin

Attachment: 042_no_contrecord_switch.pl
Description: Perl program

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 853b540945..f98ad350a9 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -3301,6 +3301,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	uint32		targetPageOff;
 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
 	int			r;
+	bool		timelineChanged = false;
 
 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
 	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
@@ -3425,6 +3426,16 @@ retry:
 	Assert(targetPageOff == readOff);
 	Assert(reqLen <= readLen);
 
+	/*
+	 * Check whether the timeline changed while we were trying to read the
+	 * page. If the page is still invalid after a timeline change, we want to
+	 * return XLREAD_FAIL instead of retrying, because we might be trying to
+	 * decode a record that spans multiple pages. If another replica was
+	 * promoted in the meantime and switched to a new WAL segment, then the
+	 * page we are trying to read might never be written.
+	 */
+	timelineChanged = xlogreader->seg.ws_tli != curFileTLI;
+
 	xlogreader->seg.ws_tli = curFileTLI;
 
 	/*
@@ -3494,8 +3505,12 @@ next_record_is_invalid:
 	readLen = 0;
 	readSource = XLOG_FROM_ANY;
 
-	/* In standby-mode, keep trying */
-	if (StandbyMode)
+	/*
+	 * In standby-mode, keep trying, but only if the timeline didn't change:
+	 * after a timeline change the new primary might have overwritten
+	 * earlier parts of a record that spans multiple pages.
+	 */
+	if (StandbyMode && !timelineChanged)
 		goto retry;
 	else
 		return XLREAD_FAIL;

Reply via email to