Here's a revamped version of this patch. One thing I didn't do here is revert the exporting of CreateMultiXactId, but I don't see any way to avoid that.
Andres mentioned the idea of sharing some code between heap_prepare_freeze_tuple and heap_tuple_needs_freeze, but I haven't explored that. -- Álvaro Herrera http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
*** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 5238,5251 **** heap_inplace_update(Relation relation, HeapTuple tuple) CacheInvalidateHeapTuple(relation, tuple, NULL); } /* ! * heap_freeze_tuple * * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) ! * are older than the specified cutoff XID. If so, replace them with ! * FrozenTransactionId or InvalidTransactionId as appropriate, and return ! * TRUE. Return FALSE if nothing was changed. * * It is assumed that the caller has checked the tuple with * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD --- 5238,5448 ---- CacheInvalidateHeapTuple(relation, tuple, NULL); } + #define FRM_NOOP 0x0001 + #define FRM_INVALIDATE_XMAX 0x0002 + #define FRM_RETURN_IS_XID 0x0004 + #define FRM_RETURN_IS_MULTI 0x0008 + #define FRM_MARK_COMMITTED 0x0010 /* ! * FreezeMultiXactId ! * Determine what to do during freezing when a tuple is marked by a ! * MultiXactId. ! * ! * "flags" is an output value; it's used to tell caller what to do on return. ! * ! * Possible flags are: ! * FRM_NOOP ! * don't do anything -- keep existing Xmax ! * FRM_INVALIDATE_XMAX ! * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag. ! * FRM_RETURN_IS_XID ! * The Xid return value is a single update Xid to set as xmax. ! * FRM_MARK_COMMITTED ! * Xmax can be marked as HEAP_XMAX_COMMITTED ! * FRM_RETURN_IS_MULTI ! * The return value is a new MultiXactId to set as new Xmax. ! * (caller must obtain proper infomask bits using GetMultiXactIdHintBits) ! */ ! static TransactionId ! FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, ! TransactionId cutoff_xid, MultiXactId cutoff_multi, ! uint16 *flags) ! { ! TransactionId xid = InvalidTransactionId; ! int i; ! MultiXactMember *members; ! int nmembers; ! bool need_replace; ! int nnewmembers; ! MultiXactMember *newmembers; ! bool has_lockers; ! TransactionId update_xid; ! bool update_committed; ! ! *flags = 0; ! ! 
if (!MultiXactIdIsValid(multi)) ! { ! /* Ensure infomask bits are appropriately set/reset */ ! *flags |= FRM_INVALIDATE_XMAX; ! return InvalidTransactionId; ! } ! else if (MultiXactIdPrecedes(multi, cutoff_multi)) ! { ! /* ! * This old multi cannot possibly have members still running. If it ! * was a locker only, it can be removed without any further ! * consideration; but if it contained an update, we might need to ! * preserve it. ! */ ! if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)) ! { ! *flags |= FRM_INVALIDATE_XMAX; ! return InvalidTransactionId; ! } ! else ! { ! /* replace multi by update xid */ ! xid = MultiXactIdGetUpdateXid(multi, t_infomask); ! ! /* wasn't only a lock, xid needs to be valid */ ! Assert(TransactionIdIsValid(xid)); ! ! /* ! * If the xid is older than the cutoff, it has to have aborted, ! * otherwise the tuple would have gotten pruned away. ! */ ! if (TransactionIdPrecedes(xid, cutoff_xid)) ! { ! Assert(!TransactionIdDidCommit(xid)); ! *flags |= FRM_INVALIDATE_XMAX; ! xid = InvalidTransactionId; ! } ! else ! { ! *flags |= FRM_RETURN_IS_XID; ! } ! } ! ! /* ! * This old multi has been fully resolved; don't fall through to the ! * members scan below, which is only for multis above the cutoff. ! */ ! return xid; ! } ! ! /* ! * This multixact might have or might not have members still running, ! * but we know it's valid and is newer than the cutoff point for ! * multis. However, some member(s) of it may be below the cutoff for ! * Xids, so we need to walk the whole members array to figure out what ! * to do, if anything. ! */ ! ! nmembers = GetMultiXactIdMembers(multi, &members, false); ! if (nmembers <= 0) ! { ! /* Nothing worth keeping */ ! *flags |= FRM_INVALIDATE_XMAX; ! return InvalidTransactionId; ! } ! ! /* is there anything older than the cutoff? */ ! need_replace = false; ! for (i = 0; i < nmembers; i++) ! { ! if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) ! { ! need_replace = true; ! break; ! } ! } ! ! /* ! * In the simplest case, there is no member older than the cutoff; we can ! * keep the existing MultiXactId as is. ! */ ! if (!need_replace) ! { ! *flags |= FRM_NOOP; ! 
pfree(members); ! return InvalidTransactionId; ! } ! ! /* ! * If the multi needs to be updated, figure out which members do we need ! * to keep. ! */ ! nnewmembers = 0; ! newmembers = palloc(sizeof(MultiXactMember) * nmembers); ! has_lockers = false; ! update_xid = InvalidTransactionId; ! update_committed = false; ! ! for (i = 0; i < nmembers; i++) ! { ! if (ISUPDATE_from_mxstatus(members[i].status) && ! !TransactionIdDidAbort(members[i].xid)) ! { ! /* if it's an update, we must keep unless it aborted */ ! newmembers[nnewmembers++] = members[i]; ! Assert(!TransactionIdIsValid(update_xid)); ! update_xid = members[i].xid; ! /* tell caller to set hint while we have the Xid in cache */ ! if (TransactionIdDidCommit(update_xid)) ! update_committed = true; ! } ! ! /* We only keep lockers if they are still running */ ! else if (TransactionIdIsCurrentTransactionId(members[i].xid) || ! TransactionIdIsInProgress(members[i].xid)) ! { ! newmembers[nnewmembers++] = members[i]; ! has_lockers = true; ! } ! } ! ! pfree(members); ! ! if (nnewmembers == 0) ! { ! /* nothing worth keeping!? Tell caller to remove the whole thing */ ! *flags |= FRM_INVALIDATE_XMAX; ! xid = InvalidTransactionId; ! } ! else if (TransactionIdIsValid(update_xid) && !has_lockers) ! { ! /* ! * If there's a single member and it's an update, pass it back alone ! * without creating a new Multi. (XXX we could do this when there's a ! * single remaining locker, too, but that would complicate the API too ! * much; moreover, the case with the single updater is more ! * interesting, because those are longer-lived.) ! */ ! Assert(nnewmembers == 1); ! *flags |= FRM_RETURN_IS_XID; ! if (update_committed) ! *flags |= FRM_MARK_COMMITTED; ! xid = update_xid; ! } ! else ! { ! /* Note this is WAL-logged */ ! xid = CreateMultiXactId(nnewmembers, newmembers); ! *flags |= FRM_RETURN_IS_MULTI; ! } ! ! pfree(newmembers); ! ! return xid; ! } ! ! /* ! 
* heap_prepare_freeze_tuple * * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) ! * are older than the specified cutoff XID and cutoff MultiXactId. If so, ! * setup enough state (in the *frz output argument) to later execute and ! * WAL-log what we would need to do, and return TRUE. Return FALSE if nothing ! * is to be changed. ! * ! * Caller is responsible for setting the offset field, if appropriate. This ! * is only necessary if the freeze is to be WAL-logged. * * It is assumed that the caller has checked the tuple with * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD *************** *** 5254,5307 **** heap_inplace_update(Relation relation, HeapTuple tuple) * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any * XID older than it could neither be running nor seen as running by any * open transaction. This ensures that the replacement will not change ! * anyone's idea of the tuple state. Also, since we assume the tuple is ! * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us ! * to assume that it is either committed good or aborted, as appropriate; ! * so we need no external state checks to decide what to do. (This is good ! * because this function is applied during WAL recovery, when we don't have ! * access to any such state, and can't depend on the hint bits to be set.) ! * There is an exception we make which is to assume GetMultiXactIdMembers can ! * be called during recovery. ! * * Similarly, cutoff_multi must be less than or equal to the smallest * MultiXactId used by any transaction currently open. * * If the tuple is in a shared buffer, caller must hold an exclusive lock on * that buffer. * ! * Note: it might seem we could make the changes without exclusive lock, since ! * TransactionId read/write is assumed atomic anyway. However there is a race ! * condition: someone who just fetched an old XID that we overwrite here could ! 
* conceivably not finish checking the XID against pg_clog before we finish ! * the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting ! * exclusive lock ensures no other backend is in process of checking the ! * tuple status. Also, getting exclusive lock makes it safe to adjust the ! * infomask bits. ! * ! * NB: Cannot rely on hint bits here, they might not be set after a crash or ! * on a standby. */ bool ! heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, ! MultiXactId cutoff_multi) { bool changed = false; bool freeze_xmax = false; TransactionId xid; /* Process xmin */ xid = HeapTupleHeaderGetXmin(tuple); if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, cutoff_xid)) { ! HeapTupleHeaderSetXmin(tuple, FrozenTransactionId); ! /* * Might as well fix the hint bits too; usually XMIN_COMMITTED will * already be set here, but there's a small chance not. */ ! Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID)); ! tuple->t_infomask |= HEAP_XMIN_COMMITTED; changed = true; } --- 5451,5492 ---- * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any * XID older than it could neither be running nor seen as running by any * open transaction. This ensures that the replacement will not change ! * anyone's idea of the tuple state. * Similarly, cutoff_multi must be less than or equal to the smallest * MultiXactId used by any transaction currently open. * * If the tuple is in a shared buffer, caller must hold an exclusive lock on * that buffer. * ! * NB: It is not enough to set hint bits to indicate something is ! * committed/invalid -- they might not be set on a standby, or after crash ! * recovery. We really need to remove old xids. */ bool ! heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, ! TransactionId cutoff_multi, xl_heap_freeze_tuple *frz) ! 
{ bool changed = false; bool freeze_xmax = false; TransactionId xid; + frz->frzflags = 0; + frz->t_infomask2 = tuple->t_infomask2; + frz->t_infomask = tuple->t_infomask; + frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + /* Process xmin */ xid = HeapTupleHeaderGetXmin(tuple); if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, cutoff_xid)) { ! frz->frzflags |= XLH_FREEZE_XMIN; /* * Might as well fix the hint bits too; usually XMIN_COMMITTED will * already be set here, but there's a small chance not. */ ! frz->t_infomask |= HEAP_XMIN_COMMITTED; changed = true; } *************** *** 5318,5408 **** heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { ! if (!MultiXactIdIsValid(xid)) ! { ! /* no xmax set, ignore */ ! ; ! } ! else if (MultiXactIdPrecedes(xid, cutoff_multi)) ! { ! /* ! * This old multi cannot possibly be running. If it was a locker ! * only, it can be removed without much further thought; but if it ! * contained an update, we need to preserve it. ! */ ! if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) ! freeze_xmax = true; ! else ! { ! TransactionId update_xid; ! update_xid = HeapTupleGetUpdateXid(tuple); ! /* ! * The multixact has an update hidden within. Get rid of it. ! * ! * If the update_xid is below the cutoff_xid, it necessarily ! * must be an aborted transaction. In a primary server, such ! * an Xmax would have gotten marked invalid by ! * HeapTupleSatisfiesVacuum, but in a replica that is not ! * called before we are, so deal with it in the same way. ! * ! * If not below the cutoff_xid, then the tuple would have been ! * pruned by vacuum, if the update committed long enough ago, ! * and we wouldn't be freezing it; so it's either recently ! * committed, or in-progress. Deal with this by setting the ! * Xmax to the update Xid directly and remove the IS_MULTI ! * bit. (We know there cannot be running lockers in this ! * multi, because it's below the cutoff_multi value.) ! */ ! ! 
if (TransactionIdPrecedes(update_xid, cutoff_xid)) ! { ! Assert(InRecovery || TransactionIdDidAbort(update_xid)); ! freeze_xmax = true; ! } ! else ! { ! Assert(InRecovery || !TransactionIdIsInProgress(update_xid)); ! tuple->t_infomask &= ~HEAP_XMAX_BITS; ! HeapTupleHeaderSetXmax(tuple, update_xid); ! changed = true; ! } ! } ! } ! else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { ! /* newer than the cutoff, so don't touch it */ ; } ! else { ! TransactionId update_xid; ! /* ! * This is a multixact which is not marked LOCK_ONLY, but which ! * is newer than the cutoff_multi. If the update_xid is below the ! * cutoff_xid point, then we can just freeze the Xmax in the ! * tuple, removing it altogether. This seems simple, but there ! * are several underlying assumptions: ! * ! * 1. A tuple marked by an multixact containing a very old ! * committed update Xid would have been pruned away by vacuum; we ! * wouldn't be freezing this tuple at all. ! * ! * 2. There cannot possibly be any live locking members remaining ! * in the multixact. This is because if they were alive, the ! * update's Xid would had been considered, via the lockers' ! * snapshot's Xmin, as part the cutoff_xid. ! * ! * 3. We don't create new MultiXacts via MultiXactIdExpand() that ! * include a very old aborted update Xid: in that function we only ! * include update Xids corresponding to transactions that are ! * committed or in-progress. ! */ ! update_xid = HeapTupleGetUpdateXid(tuple); ! if (TransactionIdPrecedes(update_xid, cutoff_xid)) ! freeze_xmax = true; } } else if (TransactionIdIsNormal(xid) && --- 5503,5536 ---- if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { ! TransactionId newxmax; ! uint16 flags; ! newxmax = FreezeMultiXactId(xid, tuple->t_infomask, ! cutoff_xid, cutoff_multi, &flags); ! if (flags & FRM_NOOP) { ! /* nothing to do in this case */ ; } ! if (flags & FRM_INVALIDATE_XMAX) ! freeze_xmax = true; ! else if (flags & FRM_RETURN_IS_XID) { ! frz->t_infomask &= ~HEAP_XMAX_BITS; ! 
frz->xmax = newxmax; ! if (flags & FRM_MARK_COMMITTED) ! frz->t_infomask |= HEAP_XMAX_COMMITTED; ! changed = true; ! } ! else if (flags & FRM_RETURN_IS_MULTI) ! { ! frz->t_infomask &= ~HEAP_XMAX_BITS; ! frz->xmax = newxmax; ! GetMultiXactIdHintBits(newxmax, ! &frz->t_infomask, ! &frz->t_infomask2); ! changed = true; ! } ! } ! else if (TransactionIdIsNormal(xid) && *************** *** 5413,5429 **** heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, if (freeze_xmax) { ! HeapTupleHeaderSetXmax(tuple, InvalidTransactionId); /* * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + * LOCKED. Normalize to INVALID just to be sure no one gets confused. * Also get rid of the HEAP_KEYS_UPDATED bit. */ ! tuple->t_infomask &= ~HEAP_XMAX_BITS; ! tuple->t_infomask |= HEAP_XMAX_INVALID; ! HeapTupleHeaderClearHotUpdated(tuple); ! tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED; changed = true; } --- 5541,5557 ---- if (freeze_xmax) { ! frz->xmax = InvalidTransactionId; /* * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + * LOCKED. Normalize to INVALID just to be sure no one gets confused. * Also get rid of the HEAP_KEYS_UPDATED bit. */ ! frz->t_infomask &= ~HEAP_XMAX_BITS; ! frz->t_infomask |= HEAP_XMAX_INVALID; ! frz->t_infomask2 &= ~HEAP_HOT_UPDATED; ! frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; changed = true; } *************** *** 5443,5458 **** heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * xvac transaction succeeded. */ if (tuple->t_infomask & HEAP_MOVED_OFF) ! HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); else ! HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); /* * Might as well fix the hint bits too; usually XMIN_COMMITTED * will already be set here, but there's a small chance not. */ Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID)); ! tuple->t_infomask |= HEAP_XMIN_COMMITTED; changed = true; } } --- 5571,5586 ---- * xvac transaction succeeded. */ if (tuple->t_infomask & HEAP_MOVED_OFF) ! frz->frzflags |= XLH_INVALID_XVAC; else ! 
frz->frzflags |= XLH_FREEZE_XVAC; /* * Might as well fix the hint bits too; usually XMIN_COMMITTED * will already be set here, but there's a small chance not. */ Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID)); ! frz->t_infomask |= HEAP_XMIN_COMMITTED; changed = true; } } *************** *** 5461,5466 **** heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, --- 5589,5656 ---- } /* + * heap_execute_freeze_tuple + * Execute the prepared freezing of a tuple. + * + * Caller is responsible for ensuring that no other backend can access the + * storage underlying this tuple, either by holding an exclusive lock on the + * buffer containing it (which is what lazy VACUUM does), or by having it + * in private storage (which is what CLUSTER and friends do). + * + * Note: it might seem we could make the changes without exclusive lock, since + * TransactionId read/write is assumed atomic anyway. However there is a race + * condition: someone who just fetched an old XID that we overwrite here could + * conceivably not finish checking the XID against pg_clog before we finish + * the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting + * exclusive lock ensures no other backend is in process of checking the + * tuple status. Also, getting exclusive lock makes it safe to adjust the + * infomask bits. + * + * NB: All code in here must be safe to execute during crash recovery! 
+ */ + void + heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz) + { + if (frz->frzflags & XLH_FREEZE_XMIN) + HeapTupleHeaderSetXmin(tuple, FrozenTransactionId); + + HeapTupleHeaderSetXmax(tuple, frz->xmax); + + if (frz->frzflags & XLH_FREEZE_XVAC) + HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); + + if (frz->frzflags & XLH_INVALID_XVAC) + HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); + + tuple->t_infomask = frz->t_infomask; + tuple->t_infomask2 = frz->t_infomask2; + } + + /* + * heap_freeze_tuple - freeze tuple inplace without WAL logging. + * + * Useful for callers like CLUSTER that perform their own WAL logging. + */ + bool + heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, + TransactionId cutoff_multi) + { + xl_heap_freeze_tuple frz; + bool do_freeze; + + do_freeze = heap_prepare_freeze_tuple(tuple, cutoff_xid, cutoff_multi, &frz); + + /* + * Note that because this is not a WAL-logged operation, we don't need + * to fill in the offset in the freeze record. + */ + + if (do_freeze) + heap_execute_freeze_tuple(tuple, &frz); + return do_freeze; + } + + /* * For a given MultiXactId, return the hint bits that should be set in the * tuple's infomask. * *************** *** 5763,5778 **** heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, } else if (MultiXactIdPrecedes(multi, cutoff_multi)) return true; - else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - /* only-locker multis don't need internal examination */ - ; - } else { ! if (TransactionIdPrecedes(HeapTupleGetUpdateXid(tuple), ! cutoff_xid)) ! return true; } } else --- 5953,5978 ---- } else if (MultiXactIdPrecedes(multi, cutoff_multi)) return true; else { ! MultiXactMember *members; ! int nmembers; ! int i; ! ! /* need to check whether any member of the mxact is too old */ ! ! nmembers = GetMultiXactIdMembers(multi, &members, false); ! ! for (i = 0; i < nmembers; i++) ! { ! if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) ! 
{ ! pfree(members); ! return true; ! } ! } ! if (nmembers > 0) ! pfree(members); } } else *************** *** 6022,6048 **** log_heap_clean(Relation reln, Buffer buffer, } /* ! * Perform XLogInsert for a heap-freeze operation. Caller must already ! * have modified the buffer and marked it dirty. */ XLogRecPtr ! log_heap_freeze(Relation reln, Buffer buffer, ! TransactionId cutoff_xid, MultiXactId cutoff_multi, ! OffsetNumber *offsets, int offcnt) { ! xl_heap_freeze xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); /* nor when there are no tuples to freeze */ ! Assert(offcnt > 0); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); xlrec.cutoff_xid = cutoff_xid; ! xlrec.cutoff_multi = cutoff_multi; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapFreeze; --- 6222,6247 ---- } /* ! * Perform XLogInsert for a heap-freeze operation. Caller must have already ! * modified the buffer and marked it dirty. */ XLogRecPtr ! log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, ! xl_heap_freeze_tuple *tuples, int ntuples) { ! xl_heap_freeze_page xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); /* nor when there are no tuples to freeze */ ! Assert(ntuples > 0); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); xlrec.cutoff_xid = cutoff_xid; ! xlrec.ntuples = ntuples; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapFreeze; *************** *** 6050,6066 **** log_heap_freeze(Relation reln, Buffer buffer, rdata[0].next = &(rdata[1]); /* ! * The tuple-offsets array is not actually in the buffer, but pretend that ! * it is. When XLogInsert stores the whole buffer, the offsets array need * not be stored too. */ ! rdata[1].data = (char *) offsets; ! 
rdata[1].len = offcnt * sizeof(OffsetNumber); rdata[1].buffer = buffer; rdata[1].buffer_std = true; rdata[1].next = NULL; ! recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata); return recptr; } --- 6249,6265 ---- rdata[0].next = &(rdata[1]); /* ! * The freeze plan array is not actually in the buffer, but pretend that ! * it is. When XLogInsert stores the whole buffer, the freeze plan need * not be stored too. */ ! rdata[1].data = (char *) tuples; ! rdata[1].len = ntuples * SizeOfHeapFreezeTuple; rdata[1].buffer = buffer; rdata[1].buffer_std = true; rdata[1].next = NULL; ! recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE, rdata); return recptr; } *************** *** 6402,6407 **** heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) --- 6601,6699 ---- XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace); } + /* + * Freeze a single tuple for XLOG_HEAP2_FREEZE + * + * NB: This type of record aren't generated anymore, since bugs around + * multixacts couldn't be fixed without a more robust type of freezing. This + * is kept around to be able to perform PITR. + */ + static bool + heap_xlog_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, + MultiXactId cutoff_multi) + { + bool changed = false; + TransactionId xid; + + xid = HeapTupleHeaderGetXmin(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + { + HeapTupleHeaderSetXmin(tuple, FrozenTransactionId); + + /* + * Might as well fix the hint bits too; usually XMIN_COMMITTED will + * already be set here, but there's a small chance not. + */ + Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID)); + tuple->t_infomask |= HEAP_XMIN_COMMITTED; + changed = true; + } + + /* + * Note that this code handles IS_MULTI Xmax values, too, but only to mark + * the tuple as not updated if the multixact is below the cutoff Multixact + * given; it doesn't remove dead members of a very old multixact. 
+ */ + xid = HeapTupleHeaderGetRawXmax(tuple); + if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) ? + (MultiXactIdIsValid(xid) && + MultiXactIdPrecedes(xid, cutoff_multi)) : + (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid))) + { + HeapTupleHeaderSetXmax(tuple, InvalidTransactionId); + + /* + * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + + * LOCKED. Normalize to INVALID just to be sure no one gets confused. + * Also get rid of the HEAP_KEYS_UPDATED bit. + */ + tuple->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderClearHotUpdated(tuple); + tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED; + changed = true; + } + + /* + * Old-style VACUUM FULL is gone, but we have to keep this code as long as + * we support having MOVED_OFF/MOVED_IN tuples in the database. + */ + if (tuple->t_infomask & HEAP_MOVED) + { + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + { + /* + * If a MOVED_OFF tuple is not dead, the xvac transaction must + * have failed; whereas a non-dead MOVED_IN tuple must mean the + * xvac transaction succeeded. + */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); + else + HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); + + /* + * Might as well fix the hint bits too; usually XMIN_COMMITTED + * will already be set here, but there's a small chance not. + */ + Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID)); + tuple->t_infomask |= HEAP_XMIN_COMMITTED; + changed = true; + } + } + + return changed; + } + + /* + * NB: This type of record aren't generated anymore, since bugs around + * multixacts couldn't be fixed without a more robust type of freezing. This + * is kept around to be able to perform PITR. 
+ */ static void heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) { *************** *** 6450,6456 **** heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) ItemId lp = PageGetItemId(page, *offsets); HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp); ! (void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi); offsets++; } } --- 6742,6748 ---- ItemId lp = PageGetItemId(page, *offsets); HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp); ! (void) heap_xlog_freeze_tuple(tuple, cutoff_xid, cutoff_multi); offsets++; } } *************** *** 6574,6579 **** heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) --- 6866,6928 ---- } } + /* + * Replay XLOG_HEAP2_FREEZE_PAGE records + */ + static void + heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record) + { + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record); + TransactionId cutoff_xid = xlrec->cutoff_xid; + Buffer buffer; + Page page; + int ntup; + + /* + * In Hot Standby mode, ensure that there's no queries running which still + * consider the frozen xids as running. 
+ */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node); + + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); + return; + } + + buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + if (!BufferIsValid(buffer)) + return; + + page = (Page) BufferGetPage(buffer); + + if (lsn <= PageGetLSN(page)) + { + UnlockReleaseBuffer(buffer); + return; + } + + /* now execute freeze plan for each frozen tuple */ + for (ntup = 0; ntup < xlrec->ntuples; ntup++) + { + xl_heap_freeze_tuple *xlrec_tp; + ItemId lp; + HeapTupleHeader tuple; + + xlrec_tp = &xlrec->tuples[ntup]; + lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */ + tuple = (HeapTupleHeader) PageGetItem(page, lp); + + heap_execute_freeze_tuple(tuple, xlrec_tp); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + static void heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) { *************** *** 7429,7434 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record) --- 7778,7786 ---- case XLOG_HEAP2_CLEAN: heap_xlog_clean(lsn, record); break; + case XLOG_HEAP2_FREEZE_PAGE: + heap_xlog_freeze_page(lsn, record); + break; case XLOG_HEAP2_CLEANUP_INFO: heap_xlog_cleanup_info(lsn, record); break; *** a/src/backend/access/rmgrdesc/heapdesc.c --- b/src/backend/access/rmgrdesc/heapdesc.c *************** *** 149,154 **** heap2_desc(StringInfo buf, uint8 xl_info, char *rec) --- 149,163 ---- xlrec->node.relNode, xlrec->block, xlrec->latestRemovedXid); } + if (info == XLOG_HEAP2_FREEZE_PAGE) + { + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec; + + appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->cutoff_xid, xlrec->ntuples); + } else if (info == XLOG_HEAP2_CLEANUP_INFO) { xl_heap_cleanup_info 
*xlrec = (xl_heap_cleanup_info *) rec; *** a/src/backend/access/rmgrdesc/mxactdesc.c --- b/src/backend/access/rmgrdesc/mxactdesc.c *************** *** 41,47 **** out_member(StringInfo buf, MultiXactMember *member) appendStringInfoString(buf, "(upd) "); break; default: ! appendStringInfoString(buf, "(unk) "); break; } } --- 41,47 ---- appendStringInfoString(buf, "(upd) "); break; default: ! appendStringInfo(buf, "(unk%d) ", member->status); break; } } *** a/src/backend/access/transam/multixact.c --- b/src/backend/access/transam/multixact.c *************** *** 286,292 **** static MemoryContext MXactContext = NULL; /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); - static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members); static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); --- 286,291 ---- *************** *** 672,678 **** ReadNextMultiXactId(void) * * NB: the passed members[] array will be sorted in-place. */ ! static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members) { MultiXactId multi; --- 671,677 ---- * * NB: the passed members[] array will be sorted in-place. */ ! 
MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members) { MultiXactId multi; *** a/src/backend/commands/vacuumlazy.c --- b/src/backend/commands/vacuumlazy.c *************** *** 424,429 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 424,430 ---- Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; + xl_heap_freeze_tuple *frozen; pg_rusage_init(&ru0); *************** *** 446,451 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 447,453 ---- vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); + frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage); /* * We want to skip pages that don't require vacuuming according to the *************** *** 500,506 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, bool tupgone, hastup; int prev_dead_count; - OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; --- 502,507 ---- *************** *** 893,901 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ ! if (heap_freeze_tuple(tuple.t_data, FreezeLimit, ! MultiXactCutoff)) ! frozen[nfrozen++] = offnum; } } /* scan along page */ --- 894,902 ---- * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ ! if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, ! MultiXactCutoff, &frozen[nfrozen])) ! frozen[nfrozen++].offset = offnum; } } /* scan along page */ *************** *** 906,920 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, ! 
MultiXactCutoff, frozen, nfrozen); PageSetLSN(page, recptr); } } /* --- 907,939 ---- */ if (nfrozen > 0) { + START_CRIT_SECTION(); + MarkBufferDirty(buf); + + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + ItemId itemid; + HeapTupleHeader htup; + + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + heap_execute_freeze_tuple(htup, &frozen[i]); + } + + /* Now WAL-log freezing if neccessary */ if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, ! frozen, nfrozen); PageSetLSN(page, recptr); } + + END_CRIT_SECTION(); } /* *************** *** 1015,1020 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 1034,1041 ---- RecordPageWithFreeSpace(onerel, blkno, freespace); } + pfree(frozen); + /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; *** a/src/include/access/heapam_xlog.h --- b/src/include/access/heapam_xlog.h *************** *** 50,56 **** */ #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 ! /* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */ #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 --- 50,56 ---- */ #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 ! #define XLOG_HEAP2_FREEZE_PAGE 0x20 #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 *************** *** 239,245 **** typedef struct xl_heap_inplace #define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid) ! /* This is what we need to know about tuple freezing during vacuum */ typedef struct xl_heap_freeze { RelFileNode node; --- 239,245 ---- #define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid) ! 
/* This is what we need to know about tuple freezing during vacuum (legacy) */ typedef struct xl_heap_freeze { RelFileNode node; *************** *** 251,256 **** typedef struct xl_heap_freeze --- 251,289 ---- #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId)) + /* + * a 'freeze plan' struct that represents what we need to know about a single + * tuple being frozen during vacuum + */ + #define XLH_FREEZE_XMIN 0x01 + #define XLH_FREEZE_XVAC 0x02 + #define XLH_INVALID_XVAC 0x04 + + typedef struct xl_heap_freeze_tuple + { + TransactionId xmax; + OffsetNumber offset; + uint16 t_infomask2; + uint16 t_infomask; + uint8 frzflags; + } xl_heap_freeze_tuple; + + /* XXX we could define size as offsetof(struct, frzflags) and save some + * padding, but then the array below wouldn't work properly ... */ + #define SizeOfHeapFreezeTuple sizeof(xl_heap_freeze_tuple) + + /* + * This is what we need to know about a block being frozen during vacuum + */ + typedef struct xl_heap_freeze_block + { + RelFileNode node; + BlockNumber block; + TransactionId cutoff_xid; + uint16 ntuples; + xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER]; + } xl_heap_freeze_page; + /* This is what we need to know about setting a visibility map bit */ typedef struct xl_heap_visible { *************** *** 277,284 **** extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, ! TransactionId cutoff_xid, MultiXactId cutoff_multi, ! OffsetNumber *offsets, int offcnt); extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, TransactionId cutoff_xid); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, --- 310,321 ---- OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, ! 
TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples, int ntuples); ! extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, ! TransactionId cutoff_xid, TransactionId cutoff_multi, ! xl_heap_freeze_tuple *frz); ! extern void heap_execute_freeze_tuple(HeapTupleHeader tuple, ! xl_heap_freeze_tuple *xlrec_tp); extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, TransactionId cutoff_xid); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, *** a/src/include/access/multixact.h --- b/src/include/access/multixact.h *************** *** 81,86 **** extern MultiXactId MultiXactIdCreate(TransactionId xid1, --- 81,87 ---- MultiXactStatus status2); extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status); + extern MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); extern MultiXactId ReadNextMultiXactId(void); extern bool MultiXactIdIsRunning(MultiXactId multi); extern void MultiXactIdSetOldestMember(void);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers