Re: [HACKERS] pgsql: Fix a couple of bugs in MultiXactId freezing

Alvaro Herrera Thu, 12 Dec 2013 13:26:24 -0800

Andres Freund wrote:
> On 2013-12-11 22:08:41 -0300, Alvaro Herrera wrote:
> > Andres Freund wrote:


> > > I worry that this MultiXactIdSetOldestMember() will be problematic in
> > > longrunning vacuums like a anti-wraparound vacuum of a huge
> > > table. There's no real need to set MultiXactIdSetOldestMember() here,
> > > since we will not become the member of a multi. So I think you should
> > > either move the Assert() in MultiXactIdCreateFromMembers() to it's other
> > > callers, or add a parameter to skip it.
> > 
> > I would like to have the Assert() work automatically, that is, check the
> > PROC_IN_VACUUM flag in MyProc->vacuumflags ... but this probably won't
> > work with CLUSTER.  That said, I think we *should* call SetOldestMember
> > in CLUSTER.  So maybe both things should be conditional on
> > PROC_IN_VACUUM.
> 
> Why should it be dependent on cluster? SetOldestMember() defines the
> oldest multi we can be a member of. Even in cluster, the freezing will
> not make us a member of a multi. If the transaction does something else
> requiring SetOldestMember(), that will do it?

One last thing (I hope).  It's not real easy to disable this check,
because it actually lives in GetNewMultiXactId.  It would uglify the API
a lot if we were to pass a flag down two layers of routines; and moving
it to higher-level routines doesn't seem all that appropriate either.
I'm thinking we can have a new flag in MyPgXact->vacuumFlags, so
heap_prepare_freeze_tuple does this:

        PG_TRY();
        {
            /* set flag to let multixact.c know what we're doing */
            MyPgXact->vacuumFlags |= PROC_FREEZING_MULTI;
            newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
                                        cutoff_xid, cutoff_multi, &flags);
        }
        PG_CATCH();
        {
            MyPgXact->vacuumFlags &= ~PROC_FREEZING_MULTI;
            PG_RE_THROW();
        }
        PG_END_TRY();
        MyPgXact->vacuumFlags &= ~PROC_FREEZING_MULTI;

and GetNewMultiXactId tests it to avoid the assert:

    /*
     * MultiXactIdSetOldestMember() must have been called already, but don't
     * check while freezing MultiXactIds.
     */
    Assert((MyPggXact->vacuumFlags & PROC_FREEZING_MULTI) ||
           MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));

This avoids the API uglification issues, but introduces a setjmp call
for every tuple to be frozen.  I don't think this is an excessive cost
to pay; after all, this is going to happen only for tuples for which
heap_tuple_needs_freeze already returned true; and for those we're
already going to do a lot of other work.

Attached is the whole series of patches for 9.3.  (master is the same,
only with an additional patch that removes the legacy WAL record.)

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

>From d266b3cef8598c3383d3ba17105d17fc1f384f7d Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 10 Dec 2013 17:56:02 -0300
Subject: [PATCH 1/4] Fix freezing of multixacts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Andres Freund and Álvaro
---
 src/backend/access/heap/heapam.c       |  679 +++++++++++++++++++++++++-------
 src/backend/access/rmgrdesc/heapdesc.c |    9 +
 src/backend/access/transam/multixact.c |   18 +-
 src/backend/commands/vacuumlazy.c      |   31 +-
 src/include/access/heapam_xlog.h       |   43 +-
 src/include/access/multixact.h         |    3 +
 6 files changed, 628 insertions(+), 155 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 1a0dd21..24d843a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5238,14 +5238,261 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
 		CacheInvalidateHeapTuple(relation, tuple, NULL);
 }
 
+#define		FRM_NOOP				0x0001
+#define		FRM_INVALIDATE_XMAX		0x0002
+#define		FRM_RETURN_IS_XID		0x0004
+#define		FRM_RETURN_IS_MULTI		0x0008
+#define		FRM_MARK_COMMITTED		0x0010
 
 /*
- * heap_freeze_tuple
+ * FreezeMultiXactId
+ *		Determine what to do during freezing when a tuple is marked by a
+ *		MultiXactId.
+ *
+ * NB -- this might have the side-effect of creating a new MultiXactId!
+ *
+ * "flags" is an output value; it's used to tell caller what to do on return.
+ * Possible flags are:
+ * FRM_NOOP
+ *		don't do anything -- keep existing Xmax
+ * FRM_INVALIDATE_XMAX
+ *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
+ * FRM_RETURN_IS_XID
+ *		The Xid return value is a single update Xid to set as xmax.
+ * FRM_MARK_COMMITTED
+ *		Xmax can be marked as HEAP_XMAX_COMMITTED
+ * FRM_RETURN_IS_MULTI
+ *		The return value is a new MultiXactId to set as new Xmax.
+ *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
+ */
+static TransactionId
+FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
+				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
+				  uint16 *flags)
+{
+	TransactionId xid = InvalidTransactionId;
+	int			i;
+	MultiXactMember *members;
+	int			nmembers;
+	bool		need_replace;
+	int			nnewmembers;
+	MultiXactMember *newmembers;
+	bool		has_lockers;
+	TransactionId update_xid;
+	bool		update_committed;
+
+	*flags = 0;
+
+	/* We should only be called in Multis */
+	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
+
+	if (!MultiXactIdIsValid(multi))
+	{
+		/* Ensure infomask bits are appropriately set/reset */
+		*flags |= FRM_INVALIDATE_XMAX;
+		return InvalidTransactionId;
+	}
+	else if (MultiXactIdPrecedes(multi, cutoff_multi))
+	{
+		/*
+		 * This old multi cannot possibly have members still running.  If it
+		 * was a locker only, it can be removed without any further
+		 * consideration; but if it contained an update, we might need to
+		 * preserve it.
+		 */
+		Assert(!MultiXactIdIsRunning(multi));
+		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
+		{
+			*flags |= FRM_INVALIDATE_XMAX;
+			xid = InvalidTransactionId; /* not strictly necessary */
+		}
+		else
+		{
+			/* replace multi by update xid */
+			xid = MultiXactIdGetUpdateXid(multi, t_infomask);
+
+			/* wasn't only a lock, xid needs to be valid */
+			Assert(TransactionIdIsValid(xid));
+
+			/*
+			 * If the xid is older than the cutoff, it has to have aborted,
+			 * otherwise the tuple would have gotten pruned away.
+			 */
+			if (TransactionIdPrecedes(xid, cutoff_xid))
+			{
+				Assert(!TransactionIdDidCommit(xid));
+				*flags |= FRM_INVALIDATE_XMAX;
+				xid = InvalidTransactionId;		/* not strictly necessary */
+			}
+			else
+				*flags |= FRM_RETURN_IS_XID;
+		}
+
+		return xid;
+	}
+
+	/*
+	 * This multixact might have or might not have members still running, but
+	 * we know it's valid and is newer than the cutoff point for multis.
+	 * However, some member(s) of it may be below the cutoff for Xids, so we
+	 * need to walk the whole members array to figure out what to do, if
+	 * anything.
+	 */
+
+	nmembers = GetMultiXactIdMembers(multi, &members, false);
+	if (nmembers <= 0)
+	{
+		/* Nothing worth keeping */
+		*flags |= FRM_INVALIDATE_XMAX;
+		return InvalidTransactionId;
+	}
+
+	/* is there anything older than the cutoff? */
+	need_replace = false;
+	for (i = 0; i < nmembers; i++)
+	{
+		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
+		{
+			need_replace = true;
+			break;
+		}
+	}
+
+	/*
+	 * In the simplest case, there is no member older than the cutoff; we can
+	 * keep the existing MultiXactId as is.
+	 */
+	if (!need_replace)
+	{
+		*flags |= FRM_NOOP;
+		pfree(members);
+		return InvalidTransactionId;
+	}
+
+	/*
+	 * If the multi needs to be updated, figure out which members do we need
+	 * to keep.
+	 */
+	nnewmembers = 0;
+	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
+	has_lockers = false;
+	update_xid = InvalidTransactionId;
+	update_committed = false;
+
+	for (i = 0; i < nmembers; i++)
+	{
+		if (ISUPDATE_from_mxstatus(members[i].status))
+		{
+			/*
+			 * It's an update; should we keep it?  If the transaction is known
+			 * aborted then it's okay to ignore it, otherwise not.  (Note this
+			 * is just an optimization and not needed for correctness, so it's
+			 * okay to get this test wrong; for example, in case an updater is
+			 * crashed, or a running transaction in the process of aborting.)
+			 */
+			if (!TransactionIdDidAbort(members[i].xid))
+			{
+				newmembers[nnewmembers++] = members[i];
+				Assert(!TransactionIdIsValid(update_xid));
+
+				/*
+				 * Tell caller to set HEAP_XMAX_COMMITTED hint while we have
+				 * the Xid in cache.  Again, this is just an optimization, so
+				 * it's not a problem if the transaction is still running and
+				 * in the process of committing.
+				 */
+				if (TransactionIdDidCommit(update_xid))
+					update_committed = true;
+
+				update_xid = newmembers[i].xid;
+			}
+
+			/*
+			 * Checking for very old update Xids is critical: if the update
+			 * member of the multi is older than cutoff_xid, we must remove
+			 * it, because otherwise a later liveliness check could attempt
+			 * pg_clog access for a page that was truncated away by the
+			 * current vacuum.	Note that if the update had committed, we
+			 * wouldn't be freezing this tuple because it would have gotten
+			 * removed (HEAPTUPLE_DEAD) by HeapTupleSatisfiesVacuum; so it
+			 * either aborted or crashed.  Therefore, ignore this update_xid.
+			 */
+			if (TransactionIdPrecedes(update_xid, cutoff_xid))
+			{
+				update_xid = InvalidTransactionId;
+				update_committed = false;
+
+			}
+		}
+		else
+		{
+			/* We only keep lockers if they are still running */
+			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
+				TransactionIdIsInProgress(members[i].xid))
+			{
+				/* running locker cannot possibly be older than the cutoff */
+				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
+				newmembers[nnewmembers++] = members[i];
+				has_lockers = true;
+			}
+		}
+	}
+
+	pfree(members);
+
+	if (nnewmembers == 0)
+	{
+		/* nothing worth keeping!? Tell caller to remove the whole thing */
+		*flags |= FRM_INVALIDATE_XMAX;
+		xid = InvalidTransactionId;
+	}
+	else if (TransactionIdIsValid(update_xid) && !has_lockers)
+	{
+		/*
+		 * If there's a single member and it's an update, pass it back alone
+		 * without creating a new Multi.  (XXX we could do this when there's a
+		 * single remaining locker, too, but that would complicate the API too
+		 * much; moreover, the case with the single updater is more
+		 * interesting, because those are longer-lived.)
+		 */
+		Assert(nnewmembers == 1);
+		*flags |= FRM_RETURN_IS_XID;
+		if (update_committed)
+			*flags |= FRM_MARK_COMMITTED;
+		xid = update_xid;
+	}
+	else
+	{
+		/*
+		 * Create a new multixact with the surviving members of the previous
+		 * one, to set as new Xmax in the tuple.
+		 *
+		 * If this is the first possibly-multixact-able operation in the
+		 * current transaction, set my per-backend OldestMemberMXactId
+		 * setting. We can be certain that the transaction will never become a
+		 * member of any older MultiXactIds than that.
+		 */
+		MultiXactIdSetOldestMember();
+		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
+		*flags |= FRM_RETURN_IS_MULTI;
+	}
+
+	pfree(newmembers);
+
+	return xid;
+}
+
+/*
+ * heap_prepare_freeze_tuple
  *
  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the specified cutoff XID.  If so, replace them with
- * FrozenTransactionId or InvalidTransactionId as appropriate, and return
- * TRUE.  Return FALSE if nothing was changed.
+ * are older than the specified cutoff XID and cutoff MultiXactId.	If so,
+ * setup enough state (in the *frz output argument) to later execute and
+ * WAL-log what we would need to do, and return TRUE.  Return FALSE if nothing
+ * is to be changed.
+ *
+ * Caller is responsible for setting the offset field, if appropriate.	This
+ * is only necessary if the freeze is to be WAL-logged.
  *
  * It is assumed that the caller has checked the tuple with
  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
@@ -5254,54 +5501,44 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
  * XID older than it could neither be running nor seen as running by any
  * open transaction.  This ensures that the replacement will not change
- * anyone's idea of the tuple state.  Also, since we assume the tuple is
- * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
- * to assume that it is either committed good or aborted, as appropriate;
- * so we need no external state checks to decide what to do.  (This is good
- * because this function is applied during WAL recovery, when we don't have
- * access to any such state, and can't depend on the hint bits to be set.)
- * There is an exception we make which is to assume GetMultiXactIdMembers can
- * be called during recovery.
- *
+ * anyone's idea of the tuple state.
  * Similarly, cutoff_multi must be less than or equal to the smallest
  * MultiXactId used by any transaction currently open.
  *
  * If the tuple is in a shared buffer, caller must hold an exclusive lock on
  * that buffer.
  *
- * Note: it might seem we could make the changes without exclusive lock, since
- * TransactionId read/write is assumed atomic anyway.  However there is a race
- * condition: someone who just fetched an old XID that we overwrite here could
- * conceivably not finish checking the XID against pg_clog before we finish
- * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
- * exclusive lock ensures no other backend is in process of checking the
- * tuple status.  Also, getting exclusive lock makes it safe to adjust the
- * infomask bits.
- *
- * NB: Cannot rely on hint bits here, they might not be set after a crash or
- * on a standby.
+ * NB: It is not enough to set hint bits to indicate something is
+ * committed/invalid -- they might not be set on a standby, or after crash
+ * recovery.  We really need to remove old xids.
  */
 bool
-heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
-				  MultiXactId cutoff_multi)
+heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+						  TransactionId cutoff_multi,
+						  xl_heap_freeze_tuple *frz)
+
 {
 	bool		changed = false;
 	bool		freeze_xmax = false;
 	TransactionId xid;
 
+	frz->frzflags = 0;
+	frz->t_infomask2 = tuple->t_infomask2;
+	frz->t_infomask = tuple->t_infomask;
+	frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
+
 	/* Process xmin */
 	xid = HeapTupleHeaderGetXmin(tuple);
 	if (TransactionIdIsNormal(xid) &&
 		TransactionIdPrecedes(xid, cutoff_xid))
 	{
-		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+		frz->frzflags |= XLH_FREEZE_XMIN;
 
 		/*
 		 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
 		 * already be set here, but there's a small chance not.
 		 */
-		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
-		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+		frz->t_infomask |= HEAP_XMIN_COMMITTED;
 		changed = true;
 	}
 
@@ -5318,91 +5555,35 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		if (!MultiXactIdIsValid(xid))
-		{
-			/* no xmax set, ignore */
-			;
-		}
-		else if (MultiXactIdPrecedes(xid, cutoff_multi))
-		{
-			/*
-			 * This old multi cannot possibly be running.  If it was a locker
-			 * only, it can be removed without much further thought; but if it
-			 * contained an update, we need to preserve it.
-			 */
-			if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
-				freeze_xmax = true;
-			else
-			{
-				TransactionId update_xid;
+		TransactionId newxmax;
+		uint16		flags;
 
-				update_xid = HeapTupleGetUpdateXid(tuple);
-
-				/*
-				 * The multixact has an update hidden within.  Get rid of it.
-				 *
-				 * If the update_xid is below the cutoff_xid, it necessarily
-				 * must be an aborted transaction.  In a primary server, such
-				 * an Xmax would have gotten marked invalid by
-				 * HeapTupleSatisfiesVacuum, but in a replica that is not
-				 * called before we are, so deal with it in the same way.
-				 *
-				 * If not below the cutoff_xid, then the tuple would have been
-				 * pruned by vacuum, if the update committed long enough ago,
-				 * and we wouldn't be freezing it; so it's either recently
-				 * committed, or in-progress.  Deal with this by setting the
-				 * Xmax to the update Xid directly and remove the IS_MULTI
-				 * bit.  (We know there cannot be running lockers in this
-				 * multi, because it's below the cutoff_multi value.)
-				 */
+		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
+									cutoff_xid, cutoff_multi, &flags);
 
-				if (TransactionIdPrecedes(update_xid, cutoff_xid))
-				{
-					Assert(InRecovery || TransactionIdDidAbort(update_xid));
-					freeze_xmax = true;
-				}
-				else
-				{
-					Assert(InRecovery || !TransactionIdIsInProgress(update_xid));
-					tuple->t_infomask &= ~HEAP_XMAX_BITS;
-					HeapTupleHeaderSetXmax(tuple, update_xid);
-					changed = true;
-				}
-			}
+		if (flags & FRM_INVALIDATE_XMAX)
+			freeze_xmax = true;
+		else if (flags & FRM_RETURN_IS_XID)
+		{
+			frz->t_infomask &= ~HEAP_XMAX_BITS;
+			frz->xmax = newxmax;
+			if (flags & FRM_MARK_COMMITTED)
+				frz->t_infomask &= HEAP_XMAX_COMMITTED;
+			changed = true;
 		}
-		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+		else if (flags & FRM_RETURN_IS_MULTI)
 		{
-			/* newer than the cutoff, so don't touch it */
-			;
+			frz->t_infomask &= ~HEAP_XMAX_BITS;
+			frz->xmax = newxmax;
+
+			GetMultiXactIdHintBits(newxmax,
+								   &frz->t_infomask,
+								   &frz->t_infomask2);
+			changed = true;
 		}
 		else
 		{
-			TransactionId	update_xid;
-
-			/*
-			 * This is a multixact which is not marked LOCK_ONLY, but which
-			 * is newer than the cutoff_multi.  If the update_xid is below the
-			 * cutoff_xid point, then we can just freeze the Xmax in the
-			 * tuple, removing it altogether.  This seems simple, but there
-			 * are several underlying assumptions:
-			 *
-			 * 1. A tuple marked by an multixact containing a very old
-			 * committed update Xid would have been pruned away by vacuum; we
-			 * wouldn't be freezing this tuple at all.
-			 *
-			 * 2. There cannot possibly be any live locking members remaining
-			 * in the multixact.  This is because if they were alive, the
-			 * update's Xid would had been considered, via the lockers'
-			 * snapshot's Xmin, as part the cutoff_xid.
-			 *
-			 * 3. We don't create new MultiXacts via MultiXactIdExpand() that
-			 * include a very old aborted update Xid: in that function we only
-			 * include update Xids corresponding to transactions that are
-			 * committed or in-progress.
-			 */
-			update_xid = HeapTupleGetUpdateXid(tuple);
-			if (TransactionIdPrecedes(update_xid, cutoff_xid))
-				freeze_xmax = true;
+			Assert(flags & FRM_NOOP);
 		}
 	}
 	else if (TransactionIdIsNormal(xid) &&
@@ -5413,17 +5594,17 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 
 	if (freeze_xmax)
 	{
-		HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+		frz->xmax = InvalidTransactionId;
 
 		/*
 		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
 		 * LOCKED.	Normalize to INVALID just to be sure no one gets confused.
 		 * Also get rid of the HEAP_KEYS_UPDATED bit.
 		 */
-		tuple->t_infomask &= ~HEAP_XMAX_BITS;
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		HeapTupleHeaderClearHotUpdated(tuple);
-		tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		frz->t_infomask &= ~HEAP_XMAX_BITS;
+		frz->t_infomask |= HEAP_XMAX_INVALID;
+		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
+		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
 		changed = true;
 	}
 
@@ -5443,16 +5624,16 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 			 * xvac transaction succeeded.
 			 */
 			if (tuple->t_infomask & HEAP_MOVED_OFF)
-				HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+				frz->frzflags |= XLH_INVALID_XVAC;
 			else
-				HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+				frz->frzflags |= XLH_FREEZE_XVAC;
 
 			/*
 			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
 			 * will already be set here, but there's a small chance not.
 			 */
 			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+			frz->t_infomask |= HEAP_XMIN_COMMITTED;
 			changed = true;
 		}
 	}
@@ -5461,6 +5642,68 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 }
 
 /*
+ * heap_execute_freeze_tuple
+ *		Execute the prepared freezing of a tuple.
+ *
+ * Caller is responsible for ensuring that no other backend can access the
+ * storage underlying this tuple, either by holding an exclusive lock on the
+ * buffer containing it (which is what lazy VACUUM does), or by having it by
+ * in private storage (which is what CLUSTER and friends do).
+ *
+ * Note: it might seem we could make the changes without exclusive lock, since
+ * TransactionId read/write is assumed atomic anyway.  However there is a race
+ * condition: someone who just fetched an old XID that we overwrite here could
+ * conceivably not finish checking the XID against pg_clog before we finish
+ * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
+ * exclusive lock ensures no other backend is in process of checking the
+ * tuple status.  Also, getting exclusive lock makes it safe to adjust the
+ * infomask bits.
+ *
+ * NB: All code in here must be safe to execute during crash recovery!
+ */
+void
+heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
+{
+	if (frz->frzflags & XLH_FREEZE_XMIN)
+		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+
+	HeapTupleHeaderSetXmax(tuple, frz->xmax);
+
+	if (frz->frzflags & XLH_FREEZE_XVAC)
+		HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+
+	if (frz->frzflags & XLH_INVALID_XVAC)
+		HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+
+	tuple->t_infomask = frz->t_infomask;
+	tuple->t_infomask2 = frz->t_infomask2;
+}
+
+/*
+ * heap_freeze_tuple - freeze tuple inplace without WAL logging.
+ *
+ * Useful for callers like CLUSTER that perform their own WAL logging.
+ */
+bool
+heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+				  TransactionId cutoff_multi)
+{
+	xl_heap_freeze_tuple frz;
+	bool		do_freeze;
+
+	do_freeze = heap_prepare_freeze_tuple(tuple, cutoff_xid, cutoff_multi, &frz);
+
+	/*
+	 * Note that because this is not a WAL-logged operation, we don't need to
+	 * fill in the offset in the freeze record.
+	 */
+
+	if (do_freeze)
+		heap_execute_freeze_tuple(tuple, &frz);
+	return do_freeze;
+}
+
+/*
  * For a given MultiXactId, return the hint bits that should be set in the
  * tuple's infomask.
  *
@@ -5763,16 +6006,26 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		}
 		else if (MultiXactIdPrecedes(multi, cutoff_multi))
 			return true;
-		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
-		{
-			/* only-locker multis don't need internal examination */
-			;
-		}
 		else
 		{
-			if (TransactionIdPrecedes(HeapTupleGetUpdateXid(tuple),
-									  cutoff_xid))
-				return true;
+			MultiXactMember *members;
+			int			nmembers;
+			int			i;
+
+			/* need to check whether any member of the mxact is too old */
+
+			nmembers = GetMultiXactIdMembers(multi, &members, false);
+
+			for (i = 0; i < nmembers; i++)
+			{
+				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
+				{
+					pfree(members);
+					return true;
+				}
+			}
+			if (nmembers > 0)
+				pfree(members);
 		}
 	}
 	else
@@ -6022,27 +6275,26 @@ log_heap_clean(Relation reln, Buffer buffer,
 }
 
 /*
- * Perform XLogInsert for a heap-freeze operation.	Caller must already
- * have modified the buffer and marked it dirty.
+ * Perform XLogInsert for a heap-freeze operation.	Caller must have already
+ * modified the buffer and marked it dirty.
  */
 XLogRecPtr
-log_heap_freeze(Relation reln, Buffer buffer,
-				TransactionId cutoff_xid, MultiXactId cutoff_multi,
-				OffsetNumber *offsets, int offcnt)
+log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
+				xl_heap_freeze_tuple *tuples, int ntuples)
 {
-	xl_heap_freeze xlrec;
+	xl_heap_freeze_page xlrec;
 	XLogRecPtr	recptr;
 	XLogRecData rdata[2];
 
 	/* Caller should not call me on a non-WAL-logged relation */
 	Assert(RelationNeedsWAL(reln));
 	/* nor when there are no tuples to freeze */
-	Assert(offcnt > 0);
+	Assert(ntuples > 0);
 
 	xlrec.node = reln->rd_node;
 	xlrec.block = BufferGetBlockNumber(buffer);
 	xlrec.cutoff_xid = cutoff_xid;
-	xlrec.cutoff_multi = cutoff_multi;
+	xlrec.ntuples = ntuples;
 
 	rdata[0].data = (char *) &xlrec;
 	rdata[0].len = SizeOfHeapFreeze;
@@ -6050,17 +6302,17 @@ log_heap_freeze(Relation reln, Buffer buffer,
 	rdata[0].next = &(rdata[1]);
 
 	/*
-	 * The tuple-offsets array is not actually in the buffer, but pretend that
-	 * it is.  When XLogInsert stores the whole buffer, the offsets array need
+	 * The freeze plan array is not actually in the buffer, but pretend that
+	 * it is.  When XLogInsert stores the whole buffer, the freeze plan need
 	 * not be stored too.
 	 */
-	rdata[1].data = (char *) offsets;
-	rdata[1].len = offcnt * sizeof(OffsetNumber);
+	rdata[1].data = (char *) tuples;
+	rdata[1].len = ntuples * sizeof(xl_heap_freeze_tuple);
 	rdata[1].buffer = buffer;
 	rdata[1].buffer_std = true;
 	rdata[1].next = NULL;
 
-	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
+	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE, rdata);
 
 	return recptr;
 }
@@ -6402,6 +6654,99 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
 	XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace);
 }
 
+/*
+ * Freeze a single tuple for XLOG_HEAP2_FREEZE
+ *
+ * NB: This type of record aren't generated anymore, since bugs around
+ * multixacts couldn't be fixed without a more robust type of freezing. This
+ * is kept around to be able to perform PITR.
+ */
+static bool
+heap_xlog_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+					   MultiXactId cutoff_multi)
+{
+	bool		changed = false;
+	TransactionId xid;
+
+	xid = HeapTupleHeaderGetXmin(tuple);
+	if (TransactionIdIsNormal(xid) &&
+		TransactionIdPrecedes(xid, cutoff_xid))
+	{
+		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+
+		/*
+		 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
+		 * already be set here, but there's a small chance not.
+		 */
+		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
+		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+		changed = true;
+	}
+
+	/*
+	 * Note that this code handles IS_MULTI Xmax values, too, but only to mark
+	 * the tuple as not updated if the multixact is below the cutoff Multixact
+	 * given; it doesn't remove dead members of a very old multixact.
+	 */
+	xid = HeapTupleHeaderGetRawXmax(tuple);
+	if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) ?
+		(MultiXactIdIsValid(xid) &&
+		 MultiXactIdPrecedes(xid, cutoff_multi)) :
+		(TransactionIdIsNormal(xid) &&
+		 TransactionIdPrecedes(xid, cutoff_xid)))
+	{
+		HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+
+		/*
+		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
+		 * LOCKED.	Normalize to INVALID just to be sure no one gets confused.
+		 * Also get rid of the HEAP_KEYS_UPDATED bit.
+		 */
+		tuple->t_infomask &= ~HEAP_XMAX_BITS;
+		tuple->t_infomask |= HEAP_XMAX_INVALID;
+		HeapTupleHeaderClearHotUpdated(tuple);
+		tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		changed = true;
+	}
+
+	/*
+	 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
+	 * we support having MOVED_OFF/MOVED_IN tuples in the database.
+	 */
+	if (tuple->t_infomask & HEAP_MOVED)
+	{
+		xid = HeapTupleHeaderGetXvac(tuple);
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, cutoff_xid))
+		{
+			/*
+			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
+			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
+			 * xvac transaction succeeded.
+			 */
+			if (tuple->t_infomask & HEAP_MOVED_OFF)
+				HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+			else
+				HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+
+			/*
+			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
+			 * will already be set here, but there's a small chance not.
+			 */
+			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
+			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+			changed = true;
+		}
+	}
+
+	return changed;
+}
+
+/*
+ * NB: This type of record aren't generated anymore, since bugs around
+ * multixacts couldn't be fixed without a more robust type of freezing. This
+ * is kept around to be able to perform PITR.
+ */
 static void
 heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -6450,7 +6795,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
 			ItemId		lp = PageGetItemId(page, *offsets);
 			HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
 
-			(void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
+			(void) heap_xlog_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
 			offsets++;
 		}
 	}
@@ -6574,6 +6919,63 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
 	}
 }
 
+/*
+ * Replay XLOG_HEAP2_FREEZE_PAGE records
+ */
+static void
+heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
+	TransactionId cutoff_xid = xlrec->cutoff_xid;
+	Buffer		buffer;
+	Page		page;
+	int			ntup;
+
+	/*
+	 * In Hot Standby mode, ensure that there's no queries running which still
+	 * consider the frozen xids as running.
+	 */
+	if (InHotStandby)
+		ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node);
+
+	/* If we have a full-page image, restore it and we're done */
+	if (record->xl_info & XLR_BKP_BLOCK(0))
+	{
+		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+		return;
+	}
+
+	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+	if (!BufferIsValid(buffer))
+		return;
+
+	page = (Page) BufferGetPage(buffer);
+
+	if (lsn <= PageGetLSN(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		return;
+	}
+
+	/* now execute freeze plan for each frozen tuple */
+	for (ntup = 0; ntup < xlrec->ntuples; ntup++)
+	{
+		xl_heap_freeze_tuple *xlrec_tp;
+		ItemId		lp;
+		HeapTupleHeader tuple;
+
+		xlrec_tp = &xlrec->tuples[ntup];
+		lp = PageGetItemId(page, xlrec_tp->offset);		/* offsets are one-based */
+		tuple = (HeapTupleHeader) PageGetItem(page, lp);
+
+		heap_execute_freeze_tuple(tuple, xlrec_tp);
+	}
+
+	PageSetLSN(page, lsn);
+	MarkBufferDirty(buffer);
+	UnlockReleaseBuffer(buffer);
+}
+
 static void
 heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -7429,6 +7831,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
 		case XLOG_HEAP2_CLEAN:
 			heap_xlog_clean(lsn, record);
 			break;
+		case XLOG_HEAP2_FREEZE_PAGE:
+			heap_xlog_freeze_page(lsn, record);
+			break;
 		case XLOG_HEAP2_CLEANUP_INFO:
 			heap_xlog_cleanup_info(lsn, record);
 			break;
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index bc8b985..d527aa6 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -149,6 +149,15 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
 						 xlrec->node.relNode, xlrec->block,
 						 xlrec->latestRemovedXid);
 	}
+	else if (info == XLOG_HEAP2_FREEZE_PAGE)
+	{
+		xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec;
+
+		appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u",
+						 xlrec->node.spcNode, xlrec->node.dbNode,
+						 xlrec->node.relNode, xlrec->block,
+						 xlrec->cutoff_xid, xlrec->ntuples);
+	}
 	else if (info == XLOG_HEAP2_CLEANUP_INFO)
 	{
 		xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 2081470..ed7101f 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -286,7 +286,6 @@ static MemoryContext MXactContext = NULL;
 
 /* internal MultiXactId management */
 static void MultiXactIdSetOldestVisible(void);
-static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 				   int nmembers, MultiXactMember *members);
 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
@@ -344,7 +343,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
 	members[1].xid = xid2;
 	members[1].status = status2;
 
-	newMulti = CreateMultiXactId(2, members);
+	newMulti = MultiXactIdCreateFromMembers(2, members);
 
 	debug_elog3(DEBUG2, "Create: %s",
 				mxid_to_string(newMulti, 2, members));
@@ -407,7 +406,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 		 */
 		member.xid = xid;
 		member.status = status;
-		newMulti = CreateMultiXactId(1, &member);
+		newMulti = MultiXactIdCreateFromMembers(1, &member);
 
 		debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
 					multi, newMulti);
@@ -459,7 +458,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 
 	newMembers[j].xid = xid;
 	newMembers[j++].status = status;
-	newMulti = CreateMultiXactId(j, newMembers);
+	newMulti = MultiXactIdCreateFromMembers(j, newMembers);
 
 	pfree(members);
 	pfree(newMembers);
@@ -664,16 +663,16 @@ ReadNextMultiXactId(void)
 }
 
 /*
- * CreateMultiXactId
- *		Make a new MultiXactId
+ * MultiXactIdCreateFromMembers
+ *		Make a new MultiXactId from the specified set of members
  *
  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
  * given TransactionIds as members.  Returns the newly created MultiXactId.
  *
  * NB: the passed members[] array will be sorted in-place.
  */
-static MultiXactId
-CreateMultiXactId(int nmembers, MultiXactMember *members)
+MultiXactId
+MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
 {
 	MultiXactId multi;
 	MultiXactOffset offset;
@@ -760,7 +759,8 @@ CreateMultiXactId(int nmembers, MultiXactMember *members)
  * RecordNewMultiXact
  *		Write info about a new multixact into the offsets and members files
  *
- * This is broken out of CreateMultiXactId so that xlog replay can use it.
+ * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
+ * use it.
  */
 static void
 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index ff6bd8e..01b6f46 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -424,6 +424,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	Buffer		vmbuffer = InvalidBuffer;
 	BlockNumber next_not_all_visible_block;
 	bool		skipping_all_visible_blocks;
+	xl_heap_freeze_tuple *frozen;
 
 	pg_rusage_init(&ru0);
 
@@ -446,6 +447,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	vacrelstats->latestRemovedXid = InvalidTransactionId;
 
 	lazy_space_alloc(vacrelstats, nblocks);
+	frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
 
 	/*
 	 * We want to skip pages that don't require vacuuming according to the
@@ -500,7 +502,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		bool		tupgone,
 					hastup;
 		int			prev_dead_count;
-		OffsetNumber frozen[MaxOffsetNumber];
 		int			nfrozen;
 		Size		freespace;
 		bool		all_visible_according_to_vm;
@@ -893,9 +894,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 				 * Each non-removable tuple must be checked to see if it needs
 				 * freezing.  Note we already have exclusive buffer lock.
 				 */
-				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
-									  MultiXactCutoff))
-					frozen[nfrozen++] = offnum;
+				if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
+										  MultiXactCutoff, &frozen[nfrozen]))
+					frozen[nfrozen++].offset = offnum;
 			}
 		}						/* scan along page */
 
@@ -906,15 +907,33 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		 */
 		if (nfrozen > 0)
 		{
+			START_CRIT_SECTION();
+
 			MarkBufferDirty(buf);
+
+			/* execute collected freezes */
+			for (i = 0; i < nfrozen; i++)
+			{
+				ItemId		itemid;
+				HeapTupleHeader htup;
+
+				itemid = PageGetItemId(page, frozen[i].offset);
+				htup = (HeapTupleHeader) PageGetItem(page, itemid);
+
+				heap_execute_freeze_tuple(htup, &frozen[i]);
+			}
+
+			/* Now WAL-log freezing if neccessary */
 			if (RelationNeedsWAL(onerel))
 			{
 				XLogRecPtr	recptr;
 
 				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
-										 MultiXactCutoff, frozen, nfrozen);
+										 frozen, nfrozen);
 				PageSetLSN(page, recptr);
 			}
+
+			END_CRIT_SECTION();
 		}
 
 		/*
@@ -1015,6 +1034,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			RecordPageWithFreeSpace(onerel, blkno, freespace);
 	}
 
+	pfree(frozen);
+
 	/* save stats for use later */
 	vacrelstats->scanned_tuples = num_tuples;
 	vacrelstats->tuples_deleted = tups_vacuumed;
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 4381778..138b879 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -50,7 +50,7 @@
  */
 #define XLOG_HEAP2_FREEZE		0x00
 #define XLOG_HEAP2_CLEAN		0x10
-/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
+#define XLOG_HEAP2_FREEZE_PAGE	0x20
 #define XLOG_HEAP2_CLEANUP_INFO 0x30
 #define XLOG_HEAP2_VISIBLE		0x40
 #define XLOG_HEAP2_MULTI_INSERT 0x50
@@ -239,7 +239,7 @@ typedef struct xl_heap_inplace
 
 #define SizeOfHeapInplace	(offsetof(xl_heap_inplace, target) + SizeOfHeapTid)
 
-/* This is what we need to know about tuple freezing during vacuum */
+/* This is what we need to know about tuple freezing during vacuum (legacy) */
 typedef struct xl_heap_freeze
 {
 	RelFileNode node;
@@ -251,6 +251,35 @@ typedef struct xl_heap_freeze
 
 #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))
 
+/*
+ * a 'freeze plan' struct that represents what we need to know about a single
+ * tuple being frozen during vacuum
+ */
+#define		XLH_FREEZE_XMIN		0x01
+#define		XLH_FREEZE_XVAC		0x02
+#define		XLH_INVALID_XVAC	0x04
+
+typedef struct xl_heap_freeze_tuple
+{
+	TransactionId xmax;
+	OffsetNumber offset;
+	uint16		t_infomask2;
+	uint16		t_infomask;
+	uint8		frzflags;
+} xl_heap_freeze_tuple;
+
+/*
+ * This is what we need to know about a block being frozen during vacuum
+ */
+typedef struct xl_heap_freeze_block
+{
+	RelFileNode node;
+	BlockNumber block;
+	TransactionId cutoff_xid;
+	uint16		ntuples;
+	xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
+} xl_heap_freeze_page;
+
 /* This is what we need to know about setting a visibility map bit */
 typedef struct xl_heap_visible
 {
@@ -277,8 +306,14 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
 			   OffsetNumber *nowunused, int nunused,
 			   TransactionId latestRemovedXid);
 extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
-				TransactionId cutoff_xid, MultiXactId cutoff_multi,
-				OffsetNumber *offsets, int offcnt);
+				TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
+				int ntuples);
+extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
+						  TransactionId cutoff_xid,
+						  TransactionId cutoff_multi,
+						  xl_heap_freeze_tuple *frz);
+extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
+						  xl_heap_freeze_tuple *xlrec_tp);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
 				 Buffer vm_buffer, TransactionId cutoff_xid);
 extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 6085ea3..0e3b273 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -81,6 +81,9 @@ extern MultiXactId MultiXactIdCreate(TransactionId xid1,
 				  MultiXactStatus status2);
 extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
 				  MultiXactStatus status);
+extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
+							 MultiXactMember *members);
+
 extern MultiXactId ReadNextMultiXactId(void);
 extern bool MultiXactIdIsRunning(MultiXactId multi);
 extern void MultiXactIdSetOldestMember(void);
-- 
1.7.10.4

>From 22f88d30be9f9d97febb335599f2235918685278 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Wed, 11 Dec 2013 13:44:12 -0300
Subject: [PATCH 2/4] fixups for 9.3

---
 src/backend/access/heap/heapam.c |    2 +-
 src/include/access/heapam_xlog.h |    4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 24d843a..9509480 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6297,7 +6297,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
 	xlrec.ntuples = ntuples;
 
 	rdata[0].data = (char *) &xlrec;
-	rdata[0].len = SizeOfHeapFreeze;
+	rdata[0].len = SizeOfHeapFreezePage;
 	rdata[0].buffer = InvalidBuffer;
 	rdata[0].next = &(rdata[1]);
 
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 138b879..8d25245 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -271,7 +271,7 @@ typedef struct xl_heap_freeze_tuple
 /*
  * This is what we need to know about a block being frozen during vacuum
  */
-typedef struct xl_heap_freeze_block
+typedef struct xl_heap_freeze_page
 {
 	RelFileNode node;
 	BlockNumber block;
@@ -280,6 +280,8 @@ typedef struct xl_heap_freeze_block
 	xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
 } xl_heap_freeze_page;
 
+#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples)
+
 /* This is what we need to know about setting a visibility map bit */
 typedef struct xl_heap_visible
 {
-- 
1.7.10.4

>From 35fdab06ceefcd0b4399550d76f9e134a1ae5880 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Thu, 12 Dec 2013 17:27:54 -0300
Subject: [PATCH 3/4] fixup XidIsInProgress before transam.c tests

---
 src/backend/access/heap/heapam.c |   81 ++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9509480..9616c18 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5381,47 +5381,76 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 
 	for (i = 0; i < nmembers; i++)
 	{
+		/*
+		 * Determine whether to keep this member or ignore it.
+		 */
 		if (ISUPDATE_from_mxstatus(members[i].status))
 		{
+			TransactionId	xid = members[i].xid;
+
 			/*
 			 * It's an update; should we keep it?  If the transaction is known
-			 * aborted then it's okay to ignore it, otherwise not.  (Note this
-			 * is just an optimization and not needed for correctness, so it's
-			 * okay to get this test wrong; for example, in case an updater is
-			 * crashed, or a running transaction in the process of aborting.)
+			 * aborted then it's okay to ignore it, otherwise not.  However,
+			 * if the Xid is older than the cutoff_xid, we must remove it,
+			 * because otherwise we might allow a very old Xid to persist
+			 * which would later cause pg_clog lookups problems, because the
+			 * corresponding SLRU segment might be about to be truncated away.
+			 * (Note that such an old updater cannot possibly be committed,
+			 * because HeapTupleSatisfiesVacuum would have returned
+			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.)
+			 *
+			 * Note the TransactionIdDidAbort() test is just an optimization
+			 * and not strictly necessary for correctness.
+			 *
+			 * As with all tuple visibility routines, it's critical to test
+			 * TransactionIdIsInProgress before the transam.c routines,
+			 * because of race conditions explained in detail in tqual.c.
 			 */
-			if (!TransactionIdDidAbort(members[i].xid))
+			if (TransactionIdIsCurrentTransactionId(xid) ||
+				TransactionIdIsInProgress(xid))
 			{
-				newmembers[nnewmembers++] = members[i];
 				Assert(!TransactionIdIsValid(update_xid));
-
+				update_xid = xid;
+			}
+			else if (!TransactionIdDidAbort(xid))
+			{
 				/*
-				 * Tell caller to set HEAP_XMAX_COMMITTED hint while we have
-				 * the Xid in cache.  Again, this is just an optimization, so
-				 * it's not a problem if the transaction is still running and
-				 * in the process of committing.
+				 * Test whether to tell caller to set HEAP_XMAX_COMMITTED
+				 * while we have the Xid still in cache.  Note this can only
+				 * be done if the transaction is known not running.
 				 */
-				if (TransactionIdDidCommit(update_xid))
+				if (TransactionIdDidCommit(xid))
 					update_committed = true;
-
-				update_xid = newmembers[i].xid;
+				Assert(!TransactionIdIsValid(update_xid));
+				update_xid = xid;
 			}
 
 			/*
-			 * Checking for very old update Xids is critical: if the update
-			 * member of the multi is older than cutoff_xid, we must remove
-			 * it, because otherwise a later liveliness check could attempt
-			 * pg_clog access for a page that was truncated away by the
-			 * current vacuum.	Note that if the update had committed, we
-			 * wouldn't be freezing this tuple because it would have gotten
-			 * removed (HEAPTUPLE_DEAD) by HeapTupleSatisfiesVacuum; so it
-			 * either aborted or crashed.  Therefore, ignore this update_xid.
+			 * If we determined that it's an Xid corresponding to an update
+			 * that must be retained, additionally add it to the list of
+			 * members of the new Multis, in case we end up using that.  (We
+			 * might still decide to use only an update Xid and not a multi,
+			 * but it's easier to maintain the list as we walk the old members
+			 * list.)
+			 *
+			 * It is possible to end up with a very old updater Xid that
+			 * crashed and thus did not mark itself as aborted in pg_clog.
+			 * That would manifest as a pre-cutoff Xid.  Make sure to ignore
+			 * it.
 			 */
-			if (TransactionIdPrecedes(update_xid, cutoff_xid))
+			if (TransactionIdIsValid(update_xid))
 			{
-				update_xid = InvalidTransactionId;
-				update_committed = false;
-
+				if (!TransactionIdPrecedes(update_xid, cutoff_xid))
+				{
+					newmembers[nnewmembers++] = members[i];
+				}
+				else
+				{
+					/* cannot have committed: would be HEAPTUPLE_DEAD */
+					Assert(!TransactionIdDidCommit(update_xid));
+					update_xid = InvalidTransactionId;
+					update_committed = false;
+				}
 			}
 		}
 		else
-- 
1.7.10.4

>From fdfd992ff907e75d4c1c009b9b191748678d5daf Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Thu, 12 Dec 2013 18:18:36 -0300
Subject: [PATCH 4/4] set the PROC_FREEZING_MULTI flag to avoid assert

---
 src/backend/access/heap/heapam.c       |   20 ++++++++++++++++----
 src/backend/access/transam/multixact.c |    9 +++++++--
 src/include/storage/proc.h             |    3 ++-
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9616c18..bf9300d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -60,6 +60,7 @@
 #include "storage/freespace.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -5520,8 +5521,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
  * WAL-log what we would need to do, and return TRUE.  Return FALSE if nothing
  * is to be changed.
  *
- * Caller is responsible for setting the offset field, if appropriate.	This
- * is only necessary if the freeze is to be WAL-logged.
+ * Caller is responsible for setting the offset field, if appropriate.
  *
  * It is assumed that the caller has checked the tuple with
  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
@@ -5587,8 +5587,20 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		TransactionId newxmax;
 		uint16		flags;
 
-		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
-									cutoff_xid, cutoff_multi, &flags);
+		PG_TRY();
+		{
+			/* set flag to let multixact.c know what we're doing */
+			MyPgXact->vacuumFlags |= PROC_FREEZING_MULTI;
+			newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
+										cutoff_xid, cutoff_multi, &flags);
+		}
+		PG_CATCH();
+		{
+			MyPgXact->vacuumFlags &= ~PROC_FREEZING_MULTI;
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
+		MyPgXact->vacuumFlags &= ~PROC_FREEZING_MULTI;
 
 		if (flags & FRM_INVALIDATE_XMAX)
 			freeze_xmax = true;
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index ed7101f..0166978 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -77,6 +77,7 @@
 #include "postmaster/autovacuum.h"
 #include "storage/lmgr.h"
 #include "storage/pmsignal.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
@@ -864,8 +865,12 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 
 	debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
 
-	/* MultiXactIdSetOldestMember() must have been called already */
-	Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
+	/*
+	 * MultiXactIdSetOldestMember() must have been called already, but don't
+	 * check while freezing MultiXactIds.
+	 */
+	Assert((MyPgXact->vacuumFlags & PROC_FREEZING_MULTI) ||
+		   MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
 
 	/* safety check, we should never get this far in a HS slave */
 	if (RecoveryInProgress())
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 3b04d3c..7e53a3a 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -42,9 +42,10 @@ struct XidCache
 #define		PROC_IN_VACUUM		0x02	/* currently running lazy vacuum */
 #define		PROC_IN_ANALYZE		0x04	/* currently running analyze */
 #define		PROC_VACUUM_FOR_WRAPAROUND 0x08		/* set by autovac only */
+#define		PROC_FREEZING_MULTI	0x10	/* set while freezing multis */
 
 /* flags reset at EOXact */
-#define		PROC_VACUUM_STATE_MASK (0x0E)
+#define		PROC_VACUUM_STATE_MASK (0x1E)
 
 /*
  * We allow a small number of "weak" relation locks (AccesShareLock,
-- 
1.7.10.4

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] pgsql: Fix a couple of bugs in MultiXactId freezing

Reply via email to