On 2021-Sep-03, Andres Freund wrote:
> > +#ifdef WAL_DEBUG
> > + ereport(LOG,
> > + (errmsg_internal("recovery overwriting broken
> > contrecord at %X/%X (EndRecPtr: %X/%X)",
> > +
> > LSN_FORMAT_ARGS(abortedContrecordPtr),
> > +
> > LSN_FORMAT_ARGS(EndRecPtr))));
> > +#endif
>
> "broken" sounds a bit off. But then, it's just WAL_DEBUG. Which made me
> realize, isn't this missing a
> if (XLOG_DEBUG)?
Attached are the same patches as last night, except I added a test for
XLOG_DEBUG where pertinent. (The elog(PANIC) is not made conditional on
that, since it's a cross-check rather than informative.) Also fixed the
silly pg_rewind mistake I made.
I'll work on the new xlog record early next week.
--
Álvaro Herrera Valdivia, Chile — https://www.EnterpriseDB.com/
"I can see support will not be a problem. 10 out of 10." (Simon Wittber)
(http://archives.postgresql.org/pgsql-general/2004-12/msg00159.php)
>From 4173fc5000c9101604e6c64a795322771cb0687a Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <[email protected]>
Date: Thu, 2 Sep 2021 17:21:46 -0400
Subject: [PATCH v4 1/4] Implement FIRST_IS_ABORTED_CONTRECORD
---
src/backend/access/transam/xlog.c | 55 +++++++++++++++++++++++--
src/backend/access/transam/xlogreader.c | 39 +++++++++++++++++-
src/include/access/xlog_internal.h | 14 ++++++-
src/include/access/xlogreader.h | 3 ++
4 files changed, 105 insertions(+), 6 deletions(-)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e51a7a749d..49912483d5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -586,6 +586,8 @@ typedef struct XLogCtlData
XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
+ XLogRecPtr abortedContrecordPtr; /* LSN of incomplete record at end of
+ * WAL */
/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
XLogRecPtr unloggedLSN;
@@ -848,6 +850,7 @@ static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr; /* start of last record read */
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
+static XLogRecPtr abortedContrecordPtr; /* end+1 of incomplete record */
/*
* Local copies of equivalent fields in the control file. When running
@@ -2246,6 +2249,31 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
if (!Insert->forcePageWrites)
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
+ /*
+ * If the last page ended with an aborted partial continuation record,
+ * mark the new page to indicate that the partial record can be
+ * omitted.
+ *
+ * This happens only once at the end of recovery, so there's no race
+ * condition here.
+ */
+ if (XLogCtl->abortedContrecordPtr >= NewPageBeginPtr)
+ {
+#ifdef WAL_DEBUG
+ if (XLogCtl->abortedContrecordPtr != NewPageBeginPtr)
+ elog(PANIC, "inconsistent aborted contrecord location %X/%X, expected %X/%X",
+ LSN_FORMAT_ARGS(XLogCtl->abortedContrecordPtr),
+ LSN_FORMAT_ARGS(NewPageBeginPtr));
+ if (XLOG_DEBUG)
+ ereport(LOG,
+ (errmsg_internal("setting XLP_FIRST_IS_ABORTED_PARTIAL flag at %X/%X",
+ LSN_FORMAT_ARGS(NewPageBeginPtr))));
+#endif
+ NewPage->xlp_info |= XLP_FIRST_IS_ABORTED_PARTIAL;
+
+ XLogCtl->abortedContrecordPtr = InvalidXLogRecPtr;
+ }
+
/*
* If first page of an XLOG segment file, make it a long header.
*/
@@ -4392,6 +4420,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
record = XLogReadRecord(xlogreader, &errormsg);
ReadRecPtr = xlogreader->ReadRecPtr;
EndRecPtr = xlogreader->EndRecPtr;
+ abortedContrecordPtr = xlogreader->abortedContrecordPtr;
if (record == NULL)
{
if (readFile >= 0)
@@ -7691,10 +7720,30 @@ StartupXLOG(void)
/*
* Re-fetch the last valid or last applied record, so we can identify the
* exact endpoint of what we consider the valid portion of WAL.
+ *
+ * When recovery ended in an incomplete record, continue writing from the
+ * point where it went missing. This leaves behind an initial part of
+ * broken record, which rescues downstream which have already received
+ * that first part.
*/
- XLogBeginRead(xlogreader, LastRec);
- record = ReadRecord(xlogreader, PANIC, false);
- EndOfLog = EndRecPtr;
+ if (XLogRecPtrIsInvalid(abortedContrecordPtr))
+ {
+ XLogBeginRead(xlogreader, LastRec);
+ record = ReadRecord(xlogreader, PANIC, false);
+ EndOfLog = EndRecPtr;
+ }
+ else
+ {
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG)
+ ereport(LOG,
+ (errmsg_internal("recovery overwriting broken contrecord at %X/%X (EndRecPtr: %X/%X)",
+ LSN_FORMAT_ARGS(abortedContrecordPtr),
+ LSN_FORMAT_ARGS(EndRecPtr))));
+#endif
+ EndOfLog = abortedContrecordPtr;
+ XLogCtl->abortedContrecordPtr = abortedContrecordPtr;
+ }
/*
* EndOfLogTLI is the TLI in the filename of the XLOG segment containing
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5cf74e181a..dbfa6d3562 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -278,6 +278,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
total_len;
uint32 targetRecOff;
uint32 pageHeaderSize;
+ bool assembled;
bool gotheader;
int readOff;
@@ -293,6 +294,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
state->errormsg_buf[0] = '\0';
ResetDecoder(state);
+ state->abortedContrecordPtr = InvalidXLogRecPtr;
RecPtr = state->EndRecPtr;
@@ -319,7 +321,9 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
randAccess = true;
}
+restart:
state->currRecPtr = RecPtr;
+ assembled = false;
targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ);
targetRecOff = RecPtr % XLOG_BLCKSZ;
@@ -415,6 +419,8 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
char *buffer;
uint32 gotlen;
+ assembled = true;
+
/*
* Enlarge readRecordBuf as needed.
*/
@@ -442,14 +448,28 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
readOff = ReadPageInternal(state, targetPagePtr,
Min(total_len - gotlen + SizeOfXLogShortPHD,
XLOG_BLCKSZ));
-
if (readOff < 0)
goto err;
Assert(SizeOfXLogShortPHD <= readOff);
- /* Check that the continuation on next page looks valid */
pageHeader = (XLogPageHeader) state->readBuf;
+
+ /*
+ * If we were expecting a continuation record and got an "aborted
+ * partial" flag, that means the continuation record was lost.
+ * Ignore the record we were reading, since we now know it's broken
+ * and lost forever, and restart the read by assuming the address
+ * to read is the location where we found this flag.
+ */
+ if (pageHeader->xlp_info & XLP_FIRST_IS_ABORTED_PARTIAL)
+ {
+ ResetDecoder(state);
+ RecPtr = targetPagePtr;
+ goto restart;
+ }
+
+ /* Check that the continuation on next page looks valid */
if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
{
report_invalid_record(state,
@@ -551,6 +571,21 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
return NULL;
err:
+ if (assembled)
+ {
+ /*
+ * We get here when a record that spans multiple pages needs to be
+ * assembled, but something went wrong -- perhaps a contrecord piece
+ * was lost. We deal with this by setting abortedContrecordPtr to the
+ * location of the piece we failed to read, or the start of the page
+ * we read where validation failed. If caller is WAL replay, it will
+ * know that recovery ended and that this is where to start writing
+ * future WAL marking the next piece with XLP_FIRST_IS_ABORTED_PARTIAL,
+ * which will in turn signal downstream WAL consumers that the broken
+ * WAL record here is to be ignored.
+ */
+ state->abortedContrecordPtr = targetPagePtr;
+ }
/*
* Invalidate the read state. We might read from a different source after
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 3b5eceff65..9bc72b4c95 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -76,8 +76,20 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
#define XLP_LONG_HEADER 0x0002
/* This flag indicates backup blocks starting in this page are optional */
#define XLP_BKP_REMOVABLE 0x0004
+/*
+ * This flag marks a record that replaces a missing contrecord.
+ * When on WAL replay we expect a continuation record at the start of
+ * a page that is not there, recovery ends but the checkpoint record
+ * that follows is marked with this flag, which indicates WAL readers
+ * that the incomplete record is to be skipped, and that WAL reading
+ * is to be resumed here. This is useful for downstream consumers of
+ * WAL which have already received (the first half of) the original
+ * broken WAL record, such as via archive_command or physical streaming
+ * replication, which we cannot "rewind".
+ */
+#define XLP_FIRST_IS_ABORTED_PARTIAL 0x0008
/* All defined flag bits in xlp_info (used for validity checking of header) */
-#define XLP_ALL_FLAGS 0x0007
+#define XLP_ALL_FLAGS 0x000F
#define XLogPageHeaderSize(hdr) \
(((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 21d200d3df..96e5eab1c9 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -175,6 +175,9 @@ struct XLogReaderState
XLogRecPtr ReadRecPtr; /* start of last record read */
XLogRecPtr EndRecPtr; /* end+1 of last record read */
+ /* end+1 of incomplete record at end of WAL */
+ XLogRecPtr abortedContrecordPtr;
+
/* ----------------------------------------
* Decoded representation of current record
--
2.30.2
>From d541acd6dbe253d62ca6d14cc10fd12e30c1583f Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <[email protected]>
Date: Fri, 3 Sep 2021 17:56:11 -0400
Subject: [PATCH v4 2/4] crosscheck that FIRST_IS_CONTRECORD is not together
with FIRST_IS_ABORTED_PARTIAL
---
src/backend/access/transam/xlogreader.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index dbfa6d3562..0098c42d2b 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -464,6 +464,26 @@ restart:
*/
if (pageHeader->xlp_info & XLP_FIRST_IS_ABORTED_PARTIAL)
{
+ /*
+ * If we see XLP_FIRST_IS_CONTRECORD together with
+ * XLP_FIRST_IS_ABORTED_PARTIAL, something has gone wrong
+ * while writing this record. However, we don't get too angry
+ * about it for now. (If the record is really corrupt, the
+ * header will fail the CRC check later.)
+ */
+ if (pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)
+ {
+#ifndef FRONTEND
+ ereport(WARNING,
+ (errmsg_internal("unexpected flag XLP_FIRST_IS_CONTRECORD found together with XLP_FIRST_IS_ABORTED_PARTIAL in %X/%X",
+ LSN_FORMAT_ARGS(targetPagePtr))));
+#else
+ fprintf(stderr,
+ "unexpected flag XLP_FIRST_IS_CONTRECORD found together with XLP_FIRST_IS_ABORTED_PARTIAL in %X/%X\n",
+ LSN_FORMAT_ARGS(targetPagePtr));
+#endif
+ }
+
ResetDecoder(state);
RecPtr = targetPagePtr;
goto restart;
--
2.30.2
>From 2df5c59b5680ba5f8350c82b5c6b287e36e73468 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <[email protected]>
Date: Thu, 2 Sep 2021 18:03:35 -0400
Subject: [PATCH v4 3/4] debugging changes
---
src/backend/access/transam/xlog.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 49912483d5..7f19b355da 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -726,6 +726,8 @@ typedef struct XLogCtlData
XLogRecPtr lastFpwDisableRecPtr;
slock_t info_lck; /* locks shared variables shown above */
+
+ bool crossSeg;
} XLogCtlData;
static XLogCtlData *XLogCtl = NULL;
@@ -1161,6 +1163,13 @@ XLogInsertRecord(XLogRecData *rdata,
*/
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
{
+ XLogSegNo StartSeg, EndSeg;
+
+ XLByteToSeg(StartPos, StartSeg, wal_segment_size);
+ XLByteToSeg(EndPos, EndSeg, wal_segment_size);
+ if (StartSeg != EndSeg)
+ XLogCtl->crossSeg = true;
+
SpinLockAcquire(&XLogCtl->info_lck);
/* advance global request to include new block(s) */
if (XLogCtl->LogwrtRqst.Write < EndPos)
@@ -2624,11 +2633,21 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
issue_xlog_fsync(openLogFile, openLogSegNo);
+ if (XLogCtl->crossSeg)
+ {
+ static int c = 0;
+ struct stat b;
+
+ if (stat("/tmp/hoge", &b) == 0)
+ Assert(c++ < 1);
+ }
+
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
+ /* XXX can we give this responsibility to WAL writer? */
if (XLogArchivingActive())
XLogArchiveNotifySeg(openLogSegNo);
--
2.30.2
>From 868a1d220de8b95ae65a811bdff42a4e374016c5 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <[email protected]>
Date: Fri, 3 Sep 2021 17:55:21 -0400
Subject: [PATCH v4 4/4] upgrade assert to if, in pg_rewind
---
src/bin/pg_rewind/parsexlog.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 59ebac7d6a..b9351291fe 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -102,7 +102,9 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
* If 'endpoint' didn't point exactly at a record boundary, the caller
* messed up.
*/
- Assert(xlogreader->EndRecPtr == endpoint);
+ if (xlogreader->EndRecPtr != endpoint)
+ pg_fatal("end pointer %X/%X is not a valid end point; expected %X/%X",
+ LSN_FORMAT_ARGS(endpoint), LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
XLogReaderFree(xlogreader);
if (xlogreadfd != -1)
--
2.30.2