On Fri, 2010-03-26 at 16:16 -0400, Tom Lane wrote:
> Simon Riggs <[email protected]> writes:
> > On Sun, 2010-01-31 at 23:43 +0200, Heikki Linnakangas wrote:
> >> When replaying the deletion record, the standby could look at all the
> >> heap tuples whose index pointers are being removed, to see which one
> >> was newest.
>
> > Long after coding this, I now realise this really is a dumb-ass idea.
>
> > There is no spoon. The index tuples did once point at valid heap tuples.
> > 1. heap tuples are deleted
> > 2. heap tuples become dead
> > 3. index tuples can now be marked killed
> > 4. index tuples removed
> > Heap tuples can be removed at step 2, index tuples can't be removed
> > until step 4.
>
> Uh, no, heap tuples can't be removed until after all index entries that
> are pointing at them have been removed. Please tell me you have not
> broken this.
Nothing broken.
It appears that in practice many of the index items point to heap items
that are LP_DEAD. So for the purposes of accessing a heap tuple's xmin,
then we're both right. To the current purpose the tuple has been
removed, though you are also right: its stub remains.
So how do I calculate xmin and xmax for an LP_DEAD tuple? Clearly
nothing can be done directly. Is there another way?
A conjecture: if the index items point to a heap tuple that is LP_NORMAL
then we can get the xmin/xmax from there. The xmin/xmax of LP_DEAD items
will always be *earlier* than the latest LP_NORMAL tuple that is being
removed. So as long as I have at least 1 LP_NORMAL heap tuple, then I
can use the latestRemovedXid from that and simply discard the LP_DEAD
items (for the purposes of this calculation). The idea is that whatever
marked those heap tuples LP_DEAD would also have marked the others, if
they were the same or earlier than the LP_DEAD ones.
Do you agree with this conjecture? If you do, then attached patch is
complete.
--
Simon Riggs www.2ndQuadrant.com
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 57,63 **** static void _bt_findinsertloc(Relation rel,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
! IndexTuple newtup);
static void _bt_insertonpg(Relation rel, Buffer buf,
BTStack stack,
IndexTuple itup,
--- 57,64 ----
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
! IndexTuple newtup,
! Relation heapRel);
static void _bt_insertonpg(Relation rel, Buffer buf,
BTStack stack,
IndexTuple itup,
***************
*** 78,84 **** static void _bt_pgaddtup(Relation rel, Page page,
OffsetNumber itup_off, const char *where);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
! static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
/*
--- 79,85 ----
OffsetNumber itup_off, const char *where);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
! static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
***************
*** 175,181 **** top:
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
/* do the insertion */
! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
_bt_insertonpg(rel, buf, stack, itup, offset, false);
}
else
--- 176,182 ----
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
/* do the insertion */
! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
_bt_insertonpg(rel, buf, stack, itup, offset, false);
}
else
***************
*** 491,497 **** _bt_findinsertloc(Relation rel,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
! IndexTuple newtup)
{
Buffer buf = *bufptr;
Page page = BufferGetPage(buf);
--- 492,499 ----
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
! IndexTuple newtup,
! Relation heapRel)
{
Buffer buf = *bufptr;
Page page = BufferGetPage(buf);
***************
*** 556,562 **** _bt_findinsertloc(Relation rel,
*/
if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
{
! _bt_vacuum_one_page(rel, buf);
/*
* remember that we vacuumed this page, because that makes the
--- 558,564 ----
*/
if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
{
! _bt_vacuum_one_page(rel, buf, heapRel);
/*
* remember that we vacuumed this page, because that makes the
***************
*** 1998,2004 **** _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
* super-exclusive "cleanup" lock (see nbtree/README).
*/
static void
! _bt_vacuum_one_page(Relation rel, Buffer buffer)
{
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
--- 2000,2006 ----
* super-exclusive "cleanup" lock (see nbtree/README).
*/
static void
! _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
***************
*** 2025,2031 **** _bt_vacuum_one_page(Relation rel, Buffer buffer)
}
if (ndeletable > 0)
! _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
/*
* Note: if we didn't find any LP_DEAD items, then the page's
--- 2027,2033 ----
}
if (ndeletable > 0)
! _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
/*
* Note: if we didn't find any LP_DEAD items, then the page's
*** a/src/backend/access/nbtree/nbtpage.c
--- b/src/backend/access/nbtree/nbtpage.c
***************
*** 719,733 **** _bt_page_recyclable(Page page)
* ensure correct locking.
*/
void
! _bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, bool isVacuum,
! BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
- Assert(isVacuum || lastBlockVacuumed == 0);
-
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
--- 719,730 ----
* ensure correct locking.
*/
void
! _bt_delitems_vacuum(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
***************
*** 759,793 **** _bt_delitems(Relation rel, Buffer buf,
XLogRecPtr recptr;
XLogRecData rdata[2];
! if (isVacuum)
! {
! xl_btree_vacuum xlrec_vacuum;
!
! xlrec_vacuum.node = rel->rd_node;
! xlrec_vacuum.block = BufferGetBlockNumber(buf);
!
! xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
! rdata[0].data = (char *) &xlrec_vacuum;
! rdata[0].len = SizeOfBtreeVacuum;
! }
! else
! {
! xl_btree_delete xlrec_delete;
!
! xlrec_delete.node = rel->rd_node;
! xlrec_delete.block = BufferGetBlockNumber(buf);
! /*
! * XXX: We would like to set an accurate latestRemovedXid, but
! * there is no easy way of obtaining a useful value. So we punt
! * and store InvalidTransactionId, which forces the standby to
! * wait for/cancel all currently running transactions.
! */
! xlrec_delete.latestRemovedXid = InvalidTransactionId;
! rdata[0].data = (char *) &xlrec_delete;
! rdata[0].len = SizeOfBtreeDelete;
! }
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
--- 756,769 ----
XLogRecPtr recptr;
XLogRecData rdata[2];
! xl_btree_vacuum xlrec_vacuum;
! xlrec_vacuum.node = rel->rd_node;
! xlrec_vacuum.block = BufferGetBlockNumber(buf);
+ xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+ rdata[0].data = (char *) &xlrec_vacuum;
+ rdata[0].len = SizeOfBtreeVacuum;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
***************
*** 810,819 **** _bt_delitems(Relation rel, Buffer buf,
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! if (isVacuum)
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
! else
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
--- 786,867 ----
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
!
! PageSetLSN(page, recptr);
! PageSetTLI(page, ThisTimeLineID);
! }
!
! END_CRIT_SECTION();
! }
!
! void
! _bt_delitems_delete(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, Relation heapRel)
! {
! Page page = BufferGetPage(buf);
! BTPageOpaque opaque;
!
! Assert(nitems > 0);
!
! /* No ereport(ERROR) until changes are logged */
! START_CRIT_SECTION();
!
! /* Fix the page */
! PageIndexMultiDelete(page, itemnos, nitems);
!
! /*
! * We can clear the vacuum cycle ID since this page has certainly been
! * processed by the current vacuum scan.
! */
! opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! opaque->btpo_cycleid = 0;
!
! /*
! * Mark the page as not containing any LP_DEAD items. This is not
! * certainly true (there might be some that have recently been marked, but
! * weren't included in our target-item list), but it will almost always be
! * true and it doesn't seem worth an additional page scan to check it.
! * Remember that BTP_HAS_GARBAGE is only a hint anyway.
! */
! opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
!
! MarkBufferDirty(buf);
!
! /* XLOG stuff */
! if (!rel->rd_istemp)
! {
! XLogRecPtr recptr;
! XLogRecData rdata[3];
!
! xl_btree_delete xlrec_delete;
!
! xlrec_delete.node = rel->rd_node;
! xlrec_delete.hnode = heapRel->rd_node;
! xlrec_delete.block = BufferGetBlockNumber(buf);
! xlrec_delete.nitems = nitems;
!
! rdata[0].data = (char *) &xlrec_delete;
! rdata[0].len = SizeOfBtreeDelete;
! rdata[0].buffer = InvalidBuffer;
! rdata[0].next = &(rdata[1]);
!
! 		/*
! 		 * We need the target-offsets array whether or not we store the whole
! 		 * buffer, to allow us to find the latestRemovedXid on a standby server.
! 		 */
! rdata[1].data = (char *) itemnos;
! rdata[1].len = nitems * sizeof(OffsetNumber);
! rdata[1].buffer = InvalidBuffer;
! rdata[1].next = &(rdata[2]);
!
! rdata[2].data = NULL;
! rdata[2].len = 0;
! rdata[2].buffer = buf;
! rdata[2].buffer_std = true;
! rdata[2].next = NULL;
!
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***************
*** 708,714 **** btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
info->strategy);
LockBufferForCleanup(buf);
! _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
--- 708,714 ----
buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
info->strategy);
LockBufferForCleanup(buf);
! _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
***************
*** 889,895 **** restart:
{
BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
! _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
/*
* Keep track of the block number of the lastBlockVacuumed, so we
--- 889,895 ----
{
BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
! _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed);
/*
* Keep track of the block number of the lastBlockVacuumed, so we
*** a/src/backend/access/nbtree/nbtxlog.c
--- b/src/backend/access/nbtree/nbtxlog.c
***************
*** 553,558 **** btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
--- 553,672 ----
UnlockReleaseBuffer(buffer);
}
+ /*
+  * Scan the heap pages referenced by the to-be-deleted index tuples and
+  * compute the latest removed XID, for use in recovery-conflict resolution
+  * on a standby.  Returns InvalidTransactionId if a required buffer cannot
+  * be read (caller then conservatively cancels all running transactions).
+  *
+  * LP_DEAD heap items carry no xmin/xmax; per the conjecture in the
+  * accompanying mail, their xids are always <= those of the LP_NORMAL
+  * tuples being removed, so they are simply counted and skipped.
+  */
+ static TransactionId
+ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
+ {
+ 	OffsetNumber *unused;
+ 	Buffer		ibuffer,
+ 				hbuffer;
+ 	Page		ipage,
+ 				hpage;
+ 	ItemId		iitemid,
+ 				hitemid;
+ 	IndexTuple	itup;
+ 	HeapTupleHeader htuphdr;
+ 	BlockNumber hblkno;
+ 	OffsetNumber hoffnum;
+ 	TransactionId latestRemovedXid = InvalidTransactionId;
+ 	TransactionId htupxid = InvalidTransactionId;
+ 	int			i;
+ 	/* counters must start at zero: they are only ever incremented below */
+ 	int			num_unused = 0;
+ 	int			num_redirect = 0;
+ 	int			num_dead = 0;
+
+ 	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ 	/*
+ 	 * Get index page
+ 	 */
+ 	ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ 	if (!BufferIsValid(ibuffer))
+ 		return InvalidTransactionId;
+ 	ipage = (Page) BufferGetPage(ibuffer);
+
+ 	/*
+ 	 * Loop through the deleted index items to obtain the TransactionId
+ 	 * from the heap items they point to.
+ 	 */
+ 	unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
+
+ 	for (i = 0; i < xlrec->nitems; i++)
+ 	{
+ 		/*
+ 		 * Identify the index tuple about to be deleted
+ 		 */
+ 		iitemid = PageGetItemId(ipage, unused[i]);
+ 		itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ 		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ 		hbuffer = XLogReadBuffer(xlrec->hnode, hblkno, false);
+ 		if (!BufferIsValid(hbuffer))
+ 		{
+ 			UnlockReleaseBuffer(ibuffer);
+ 			return InvalidTransactionId;
+ 		}
+ 		hpage = (Page) BufferGetPage(hbuffer);
+
+ 		/*
+ 		 * Look up the heap tuple header that the index tuple points at
+ 		 * by using the heap node supplied with the xlrec.  We can't use
+ 		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ 		 */
+ 		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ 		hitemid = PageGetItemId(hpage, hoffnum);
+
+ 		/*
+ 		 * Follow any redirections until we find something useful.
+ 		 */
+ 		while (ItemIdIsRedirected(hitemid))
+ 		{
+ 			num_redirect++;
+ 			hoffnum = ItemIdGetRedirect(hitemid);
+ 			hitemid = PageGetItemId(hpage, hoffnum);
+ 			CHECK_FOR_INTERRUPTS();
+ 		}
+
+ 		if (ItemIdHasStorage(hitemid))
+ 		{
+ 			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ 			/*
+ 			 * Get the heap tuple's xmin/xmax and ratchet up the latestRemovedXid.
+ 			 */
+ 			htupxid = HeapTupleHeaderGetXmin(htuphdr);
+ 			if (TransactionIdFollows(htupxid, latestRemovedXid))
+ 				latestRemovedXid = htupxid;
+
+ 			htupxid = HeapTupleHeaderGetXmax(htuphdr);
+ 			if (TransactionIdFollows(htupxid, latestRemovedXid))
+ 				latestRemovedXid = htupxid;
+ 		}
+ 		else if (ItemIdIsDead(hitemid))
+ 		{
+ 			/*
+ 			 * Conjecture: if hitemid is dead then it had xids before the xids
+ 			 * marked on LP_NORMAL items.  So we just ignore this item and move
+ 			 * onto the next, for the purposes of calculating latestRemovedXid.
+ 			 */
+ 			num_dead++;
+ 		}
+ 		else
+ 		{
+ 			Assert(!ItemIdIsUsed(hitemid));
+ 			num_unused++;
+ 		}
+
+ 		UnlockReleaseBuffer(hbuffer);
+ 	}
+
+ 	UnlockReleaseBuffer(ibuffer);
+
+ 	/* debug */
+ 	elog(LOG, "nbtree latestRemovedXid %u nitems %u normal %u dead %u unused %u redirected %u",
+ 		 latestRemovedXid,
+ 		 xlrec->nitems,
+ 		 xlrec->nitems - num_dead - num_redirect - num_unused,
+ 		 num_dead, num_unused, num_redirect);
+ 	Assert(num_unused == 0);
+
+ 	return latestRemovedXid;
+ }
+
static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
***************
*** 584,595 **** btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
if (record->xl_len > SizeOfBtreeDelete)
{
OffsetNumber *unused;
- OffsetNumber *unend;
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
- unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
! PageIndexMultiDelete(page, unused, unend - unused);
}
/*
--- 698,707 ----
if (record->xl_len > SizeOfBtreeDelete)
{
OffsetNumber *unused;
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
! PageIndexMultiDelete(page, unused, xlrec->nitems);
}
/*
***************
*** 830,835 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
--- 942,948 ----
* from individual btree vacuum records on that index.
*/
{
+ TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
/*
***************
*** 839,845 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
* here is worth some thought and possibly some effort to
* improve.
*/
! ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
}
break;
--- 952,958 ----
* here is worth some thought and possibly some effort to
* improve.
*/
! ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
}
break;
***************
*** 1012,1021 **** btree_desc(StringInfo buf, uint8 xl_info, char *rec)
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
! appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
! xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block,
! xlrec->latestRemovedXid);
break;
}
case XLOG_BTREE_DELETE_PAGE:
--- 1125,1134 ----
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
! appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;",
! xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode,
! xlrec->block,
! xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
break;
}
case XLOG_BTREE_DELETE_PAGE:
*** a/src/include/access/nbtree.h
--- b/src/include/access/nbtree.h
***************
*** 314,327 **** typedef struct xl_btree_split
*/
typedef struct xl_btree_delete
{
! RelFileNode node;
BlockNumber block;
! TransactionId latestRemovedXid;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
/*
* This is what we need to know about page reuse within btree.
--- 314,328 ----
*/
typedef struct xl_btree_delete
{
! RelFileNode node; /* RelFileNode of the index */
BlockNumber block;
! RelFileNode hnode; /* RelFileNode of the heap the index currently points at */
! int nitems;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))
/*
* This is what we need to know about page reuse within btree.
***************
*** 349,361 **** typedef struct xl_btree_reuse_page
* heap tuples.
*
* Any changes to any one block are registered on just one WAL record. All
! * blocks that we need to run EnsureBlockUnpinned() before we touch the changed
! * block are also given on this record as a variable length array. The array
! * is compressed by way of storing an array of block ranges, rather than an
! * actual array of blockids.
*
* Note that the *last* WAL record in any vacuum of an index is allowed to
! * have numItems == 0. All other WAL records must have numItems > 0.
*/
typedef struct xl_btree_vacuum
{
--- 350,361 ----
* heap tuples.
*
* Any changes to any one block are registered on just one WAL record. All
! * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
! * starting from the last block vacuumed through until this one. Individual
! * block numbers aren't given.
*
* Note that the *last* WAL record in any vacuum of an index is allowed to
! * have a zero length array of offsets. Earlier records must have at least one.
*/
typedef struct xl_btree_vacuum
{
***************
*** 588,596 **** extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
! extern void _bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, bool isVacuum,
! BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack);
/*
--- 588,597 ----
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
! extern void _bt_delitems_delete(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, Relation heapRel);
! extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack);
/*
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers