On Wed, Jun 3, 2015 at 8:24 AM, Robert Haas <robertmh...@gmail.com> wrote:
> On Tue, Jun 2, 2015 at 5:22 PM, Andres Freund <and...@anarazel.de> wrote:
>>> > Hm. If GetOldestMultiXactOnDisk() gets the starting point by scanning
>>> > the disk it'll always get one at a segment boundary, right? I'm not sure
>>> > that's actually ok; because the value at the beginning of the segment
>>> > can very well end up being a 0, as MaybeExtendOffsetSlru() will have
>>> > filled the page with zeros.
>>> >
>>> > I think that should be harmless, the worst that can happen is that
>>> > oldestOffset errorneously is 0, which should be correct, even though
>>> > possibly overly conservative, in these cases.
>>>
>>> Uh oh.  That seems like a real bad problem for this approach.  What
>>> keeps that from being the opposite of too conservative?  There's no
>>> "safe" value in a circular numbering space.
>>
>> I think it *might* (I'm really jetlagged) be fine because that'll only
>> happen after a upgrade from < 9.3. And in that case we initialize
>> nextOffset to 0. That ought to safe us?
>
> That's pretty much betting the farm on the bugs we know about today
> being the only ones there are.  That seems imprudent.

So here's a patch taking a different approach.  In this approach, if
the multixact whose members we want to look up doesn't exist, we don't
use a later one (that might or might not be valid).  Instead, we
attempt to cope with the unknown.  That means:

1. In TruncateMultiXact(), we don't truncate.

2. If setting the offset stop limit (the point where we refuse to
create new multixact space), we don't arm the stop point.  This means
that if you're in this situation, you run without member wraparound
protection until it's corrected.  A message gets logged once per
checkpoint telling you that you have this problem, and another message
gets logged when things get straightened out and the guards are
enabled.

3. If setting the vacuum force point, we assume that it's appropriate
to immediately force vacuum.

I've only tested this very lightly - this is just to see what you and
Noah and others think of the approach.  As compared with the previous
approach, it has the advantage of making minimal assumptions about the
sanity of what's on disk.  It has the disadvantage that, for some
people, the member-wraparound guard won't be enabled at startup -- but
note that those people can't start 9.3.7/9.4.2 *at all*, so currently
they are either running without member wraparound protection anyway
(if they haven't upgraded to those releases) or they're down entirely.
Another disadvantage is that we'll be triggering what may be quite a
bit of autovacuum activity for some people, which could be painful.
On the plus side, they'll hopefully end up with sane relminmxid and
datminmxid guards afterwards.

Thoughts?

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 9568ff1..4400fc5 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -198,13 +198,24 @@ typedef struct MultiXactStateData
 	/* next-to-be-assigned offset */
 	MultiXactOffset nextOffset;
 
+	/* Have we completed multixact startup? */
+	bool		finishedStartup;
+
 	/*
-	 * Oldest multixact that is still on disk.  Anything older than this
-	 * should not be consulted.  These values are updated by vacuum.
+	 * Oldest multixact that is still potentially referenced by a relation.
+	 * Anything older than this should not be consulted.  These values are
+	 * updated by vacuum.
 	 */
 	MultiXactId oldestMultiXactId;
 	Oid			oldestMultiXactDB;
+
+	/*
+	 * Oldest multixact offset that is potentially referenced by a
+	 * multixact referenced by a relation.  We don't always know this value,
+	 * so there's a flag here to indicate whether or not we currently do.
+	 */
 	MultiXactOffset oldestOffset;
+	bool		oldestOffsetKnown;
 
 	/*
 	 * This is what the previous checkpoint stored as the truncate position.
@@ -221,6 +232,7 @@ typedef struct MultiXactStateData
 
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;
+	bool offsetStopLimitKnown;
 
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
@@ -350,10 +362,11 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
 						MultiXactOffset offset2);
 static void ExtendMultiXactOffset(MultiXactId multi);
 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static void DetermineSafeOldestOffset(MultiXactId oldestMXact);
+static void DetermineSafeOldestOffset(MultiXactOffset oldestMXact);
 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
 						 MultiXactOffset start, uint32 distance);
-static MultiXactOffset find_multixact_start(MultiXactId multi);
+static bool SetOffsetVacuumLimit(bool finish_setup);
+static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
 static void WriteMZeroPageXlogRec(int pageno, uint8 info);
 
 
@@ -1078,7 +1091,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 	 *----------
 	 */
 #define OFFSET_WARN_SEGMENTS	20
-	if (MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
+	if (MultiXactState->offsetStopLimitKnown &&
+		MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
 								 nmembers))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
@@ -1090,7 +1104,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 						   MultiXactState->offsetStopLimit - nextOffset - 1),
 				 errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.",
 						 MultiXactState->oldestMultiXactDB)));
-	else if (MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
+	else if (MultiXactState->offsetStopLimitKnown &&
+			 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
 									  nextOffset,
 									  nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
 		ereport(WARNING,
@@ -1956,12 +1971,6 @@ StartupMultiXact(void)
 	 */
 	pageno = MXOffsetToMemberPage(offset);
 	MultiXactMemberCtl->shared->latest_page_number = pageno;
-
-	/*
-	 * compute the oldest member we need to keep around to avoid old member
-	 * data overrun.
-	 */
-	DetermineSafeOldestOffset(MultiXactState->oldestMultiXactId);
 }
 
 /*
@@ -1975,6 +1984,7 @@ TrimMultiXact(void)
 {
 	MultiXactId multi = MultiXactState->nextMXact;
 	MultiXactOffset offset = MultiXactState->nextOffset;
+	MultiXactId	oldestMXact;
 	int			pageno;
 	int			entryno;
 	int			flagsoff;
@@ -2047,6 +2057,13 @@ TrimMultiXact(void)
 	}
 
 	LWLockRelease(MultiXactMemberControlLock);
+
+	if (SetOffsetVacuumLimit(true) && IsUnderPostmaster)
+		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	oldestMXact = MultiXactState->lastCheckpointedOldest;
+	LWLockRelease(MultiXactGenLock);
+	DetermineSafeOldestOffset(oldestMXact);
 }
 
 /*
@@ -2146,8 +2163,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 	MultiXactId multiStopLimit;
 	MultiXactId multiWrapLimit;
 	MultiXactId curMulti;
-	MultiXactOffset oldestOffset;
-	MultiXactOffset nextOffset;
+	bool		needs_offset_vacuum;
 
 	Assert(MultiXactIdIsValid(oldest_datminmxid));
 
@@ -2200,35 +2216,6 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 	if (multiVacLimit < FirstMultiXactId)
 		multiVacLimit += FirstMultiXactId;
 
-	/*
-	 * Determine the offset of the oldest multixact that might still be
-	 * referenced.  Normally, we can read the offset from the multixact
-	 * itself, but there's an important special case: if there are no
-	 * multixacts in existence at all, oldest_datminmxid obviously can't point
-	 * to one.  It will instead point to the multixact ID that will be
-	 * assigned the next time one is needed.
-	 *
-	 * NB: oldest_dataminmxid is the oldest multixact that might still be
-	 * referenced from a table, unlike in DetermineSafeOldestOffset, where we
-	 * do this same computation based on the oldest value that might still
-	 * exist in the SLRU.  This is because here we're trying to compute a
-	 * threshold for activating autovacuum, which can only remove references
-	 * to multixacts, whereas there we are computing a threshold for creating
-	 * new multixacts, which requires the old ones to have first been
-	 * truncated away by a checkpoint.
-	 */
-	LWLockAcquire(MultiXactGenLock, LW_SHARED);
-	if (MultiXactState->nextMXact == oldest_datminmxid)
-	{
-		oldestOffset = MultiXactState->nextOffset;
-		LWLockRelease(MultiXactGenLock);
-	}
-	else
-	{
-		LWLockRelease(MultiXactGenLock);
-		oldestOffset = find_multixact_start(oldest_datminmxid);
-	}
-
 	/* Grab lock for just long enough to set the new limit values */
 	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 	MultiXactState->oldestMultiXactId = oldest_datminmxid;
@@ -2237,9 +2224,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 	MultiXactState->multiWarnLimit = multiWarnLimit;
 	MultiXactState->multiStopLimit = multiStopLimit;
 	MultiXactState->multiWrapLimit = multiWrapLimit;
-	MultiXactState->oldestOffset = oldestOffset;
 	curMulti = MultiXactState->nextMXact;
-	nextOffset = MultiXactState->nextOffset;
 	LWLockRelease(MultiXactGenLock);
 
 	/* Log the info */
@@ -2247,6 +2232,9 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 	 (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
 			 multiWrapLimit, oldest_datoid)));
 
+	/* Set limits for offset vacuum. */
+	needs_offset_vacuum = SetOffsetVacuumLimit(false);
+
 	/*
 	 * If past the autovacuum force point, immediately signal an autovac
 	 * request.  The reason for this is that autovac only processes one
@@ -2255,8 +2243,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 	 * another iteration immediately if there are still any old databases.
 	 */
 	if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
-		 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD)) &&
-		IsUnderPostmaster && !InRecovery)
+		 needs_offset_vacuum) && IsUnderPostmaster && !InRecovery)
 		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
 
 	/* Give an immediate warning if past the wrap warn point */
@@ -2509,9 +2496,30 @@ GetOldestMultiXactId(void)
  * prevent overrun of old data in the members SLRU area.
  */
 static void
-DetermineSafeOldestOffset(MultiXactId oldestMXact)
+DetermineSafeOldestOffset(MultiXactOffset oldestMXact)
 {
+	MultiXactOffset lastCheckpointedOldest;
 	MultiXactOffset oldestOffset;
+	MultiXactOffset nextOffset;
+	MultiXactOffset offsetStopLimit;
+	MultiXactOffset prevOffsetStopLimit;
+	MultiXactId		nextMXact;
+	bool			finishedStartup;
+	bool			prevOffsetStopLimitKnown;
+
+	/* Fetch values from shared memory. */
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	lastCheckpointedOldest = MultiXactState->lastCheckpointedOldest;
+	finishedStartup = MultiXactState->finishedStartup;
+	nextMXact = MultiXactState->nextMXact;
+	nextOffset = MultiXactState->nextOffset;
+	prevOffsetStopLimit = MultiXactState->offsetStopLimit;
+	prevOffsetStopLimitKnown = MultiXactState->offsetStopLimitKnown;
+	LWLockRelease(MultiXactGenLock);
+
+	/* Don't worry about this until after we've started up. */
+	if (!finishedStartup)
+		return;
 
 	/*
 	 * Determine the offset of the oldest multixact.  Normally, we can read
@@ -2521,30 +2529,133 @@ DetermineSafeOldestOffset(MultiXactId oldestMXact)
 	 * ID that will be assigned the next time one is needed.
 	 *
 	 * NB: oldestMXact should be the oldest multixact that still exists in the
-	 * SLRU, unlike in SetMultiXactIdLimit, where we do this same computation
+	 * SLRU, unlike in SetOffsetVacuumLimit, where we do this same computation
 	 * based on the oldest value that might be referenced in a table.
 	 */
-	LWLockAcquire(MultiXactGenLock, LW_SHARED);
-	if (MultiXactState->nextMXact == oldestMXact)
-	{
-		oldestOffset = MultiXactState->nextOffset;
-		LWLockRelease(MultiXactGenLock);
-	}
+	if (nextMXact == oldestMXact)
+		oldestOffset = nextOffset;
 	else
 	{
-		LWLockRelease(MultiXactGenLock);
-		oldestOffset = find_multixact_start(oldestMXact);
+		bool		oldestOffsetKnown;
+
+		oldestOffsetKnown = find_multixact_start(oldestMXact, &oldestOffset);
+		if (!oldestOffsetKnown)
+		{
+			ereport(LOG,
+					(errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
+						oldestMXact)));
+			return;
+		}
 	}
 
 	/* move back to start of the corresponding segment */
-	oldestOffset -= oldestOffset %
-		(MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
+	offsetStopLimit = oldestOffset - (oldestOffset %
+		(MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
+	/* always leave one segment before the wraparound point */
+	offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
+
+	/* if nothing has changed, we're done */
+	if (prevOffsetStopLimitKnown && offsetStopLimit == prevOffsetStopLimit)
+		return;
 
 	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
-	/* always leave one segment before the wraparound point */
-	MultiXactState->offsetStopLimit = oldestOffset -
-		(MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
+	MultiXactState->offsetStopLimit = oldestOffset;
+	MultiXactState->offsetStopLimitKnown = true;
 	LWLockRelease(MultiXactGenLock);
+
+	if (!prevOffsetStopLimitKnown && IsUnderPostmaster)
+		ereport(LOG,
+				(errmsg("MultiXact member wraparound protections are now enabled")));
+	ereport(DEBUG1,
+			(errmsg("MultiXact member stop limit is now %u based on MultiXact %u",
+				offsetStopLimit, oldestMXact)));
+}
+
+/*
+ * Determine how aggressively we need to vacuum in order to prevent member
+ * wraparound.
+ *
+ * To determine the oldest multixact ID, we look at oldestMultiXactId, not
+ * lastCheckpointedOldest.  That's because vacuuming can't help with anything
+ * older than oldestMultiXactId; anything older than that isn't referenced
+ * by any table.  Offsets older than oldestMultiXactId but not as old as
+ * lastCheckpointedOldest will go away after the next checkpoint.
+ *
+ * The return value is true if emergency autovacuum is required and false
+ * otherwise.
+ */
+static bool
+SetOffsetVacuumLimit(bool finish_setup)
+{
+	MultiXactId	oldestMultiXactId;
+	MultiXactId nextMXact;
+	bool		finishedStartup;
+	MultiXactOffset oldestOffset = 0;		/* placate compiler */
+	MultiXactOffset nextOffset;
+	bool		oldestOffsetKnown = false;
+	MultiXactOffset prevOldestOffset;
+	bool		prevOldestOffsetKnown;
+
+	/* Read relevant fields from shared memory. */
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	oldestMultiXactId = MultiXactState->oldestMultiXactId;
+	nextMXact = MultiXactState->nextMXact;
+	nextOffset = MultiXactState->nextOffset;
+	finishedStartup = MultiXactState->finishedStartup;
+	prevOldestOffset = MultiXactState->oldestOffset;
+	prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
+	LWLockRelease(MultiXactGenLock);
+
+	/* Don't do this until after any recovery is complete. */
+	if (!finishedStartup && !finish_setup)
+		return false;
+
+	/*
+	 * If no multixacts exist, then oldestMultiXactId will be the next
+	 * multixact that will be created, rather than an existing multixact.
+	 */
+	if (oldestMultiXactId == nextMXact)
+	{
+		/*
+		 * When the next multixact gets created, it will be stored at the
+		 * next offset.
+		 */
+		oldestOffset = nextOffset;
+		oldestOffsetKnown = true;
+	}
+	else
+	{
+		/*
+		 * Figure out where the oldest existing multixact's offsets are stored.
+		 * Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, the
+		 * supposedly-earliest multixact might not really exist.  We are
+		 * careful not to fail in that case.
+		 */
+		oldestOffsetKnown =
+			find_multixact_start(oldestMultiXactId, &oldestOffset);
+	}
+
+	/*
+	 * Except when initializing the system for the first time, there's no
+	 * need to update anything if we don't know the oldest offset or if it
+	 * hasn't changed.
+	 */
+	if (finish_setup ||
+		(oldestOffsetKnown && prevOldestOffset != oldestOffset))
+	{
+		/* Install the new limits. */
+		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+		MultiXactState->oldestOffset = oldestOffset;
+		MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
+		MultiXactState->finishedStartup = true;
+		LWLockRelease(MultiXactGenLock);
+	}
+
+	/*
+	 * Do we need an emergency autovacuum?  If we're not sure, assume yes.
+	 */
+	return !oldestOffsetKnown ||
+		(nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
 }
 
 /*
@@ -2597,9 +2708,12 @@ MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
 
 /*
  * Find the starting offset of the given MultiXactId.
+ *
+ * Returns false if the file containing the multi does not exist on disk.
+ * Otherwise, returns true and sets *result to the starting member offset.
  */
-static MultiXactOffset
-find_multixact_start(MultiXactId multi)
+static bool
+find_multixact_start(MultiXactId multi, MultiXactOffset *result)
 {
 	MultiXactOffset offset;
 	int			pageno;
@@ -2610,6 +2724,9 @@ find_multixact_start(MultiXactId multi)
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	if (!SlruPageExists(MultiXactOffsetCtl, pageno))
+		return false;
+
 	/* lock is acquired by SimpleLruReadPage_ReadOnly */
 	slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
@@ -2834,8 +2951,13 @@ TruncateMultiXact(void)
 	/*
 	 * First, compute the safe truncation point for MultiXactMember. This is
 	 * the starting offset of the oldest multixact.
+	 *
+	 * Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
+	 * oldestOffset might point to a multixact that does not exist.  If so,
+	 * don't truncate anything until that gets cleaned up.
 	 */
-	oldestOffset = find_multixact_start(oldestMXact);
+	if (!find_multixact_start(oldestMXact, &oldestOffset))
+		return;
 
 	/*
 	 * To truncate MultiXactMembers, we need to figure out the active page
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 5fcea11..897d6e4 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1314,3 +1314,38 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 
 	return retval;
 }
+
+/*
+ * Test whether an SLRU page exists.
+ */
+bool
+SlruPageExists(SlruCtl ctl, int pageno)
+{
+	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+	int			offset = rpageno * BLCKSZ;
+	char		path[MAXPGPATH];
+	int			fd;
+	struct stat stat;
+	bool		result;
+
+	SlruFileName(ctl, path, segno);
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		return false;
+
+	if (fstat(fd, &stat))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat \"%s\": %m", path)));
+
+	result = (stat.st_size < offset + BLCKSZ);
+
+	if (CloseTransientFile(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat \"%s\": %m", path)));
+
+	return result;
+}
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 9c7f019..43bf2a4 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -151,6 +151,7 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
 											  void *data);
 extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
 extern void SlruDeleteSegment(SlruCtl ctl, char *filename);
+extern bool SlruPageExists(SlruCtl ctl, int pageno);
 
 /* SlruScanDirectory public callbacks */
 extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to