On Mon, 2013-04-08 at 09:19 +0100, Simon Riggs wrote:
> Applied, with this as the only code change.
>
>
> Thanks everybody for good research and coding and fast testing.
>
>
> We're in good shape now.
Thank you.
I have attached two more patches:
1. I believe the issue I brought up at the end of this email:
http://www.postgresql.org/message-id/1365035537.7580.380.camel@sussancws0025
is real. In lazy_vacuum_page(), the following sequence can happen when
checksums are on:
a. PageSetAllVisible
b. Pass heap page to visibilitymap_set
c. visibilitymap_set logs the heap page and sets the LSN
d. MarkBufferDirty
If a checkpoint happens between (c) and (d), we have a problem: the heap
buffer is still clean, so the checkpoint can skip writing it while its redo
pointer advances past the WAL record, and a crash afterward loses the change
(sketched below). The fix is easy: just mark the heap buffer dirty first.
There's another call site that looks like a potential problem, but I don't
think it is; I simplified the code there to make it (hopefully) more clearly
correct.
2. A cleanup patch to pass the buffer_std flag down through
MarkBufferDirtyHint. This is a matter of preference and purely cosmetic, so
it might not be wanted. I thought it was useful because a future caller that
sets a hint on a non-standard buffer could easily miss the hidden assumption
that the buffer uses the standard page layout (see the second sketch below).
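To make the ordering in (1) concrete, here is a minimal sketch of the unsafe
and fixed sequences, condensed from lazy_vacuum_page() (locking and the
all-visible recheck are omitted):

    /* Unsafe with checksums on: the LSN is set on a still-clean buffer. */
    PageSetAllVisible(page);
    visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
                      vmbuffer, visibility_cutoff_xid); /* may log heap page */
    /* <-- a checkpoint here skips the clean heap buffer --> */
    MarkBufferDirty(buffer);

    /* Fixed: dirty the heap buffer before anything can log it. */
    PageSetAllVisible(page);
    MarkBufferDirty(buffer);
    visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
                      vmbuffer, visibility_cutoff_xid);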
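And a quick sketch of what (2) means at call sites: every existing caller in
the patch below passes true, because all of those pages use the standard
layout; a hypothetical future caller with a non-standard page would pass
false so that a full-page image keeps the whole page instead of eliding the
pd_lower/pd_upper hole:

    /* Standard page layout: the hole may be elided from a full-page image. */
    MarkBufferDirtyHint(buf, true);

    /* Hypothetical non-standard page: log the entire page. */
    MarkBufferDirtyHint(buf, false);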
Regards,
Jeff Davis
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
***************
*** 287,293 **** hashgettuple(PG_FUNCTION_ARGS)
/*
* Since this can be redone later if needed, mark as a hint.
*/
! MarkBufferDirtyHint(buf);
}
/*
--- 287,293 ----
/*
* Since this can be redone later if needed, mark as a hint.
*/
! MarkBufferDirtyHint(buf, true);
}
/*
*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***************
*** 262,268 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
! MarkBufferDirtyHint(buffer);
}
}
--- 262,268 ----
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
! MarkBufferDirtyHint(buffer, true);
}
}
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 413,421 **** _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* crucial. Be sure to mark the proper buffer dirty.
*/
if (nbuf != InvalidBuffer)
! MarkBufferDirtyHint(nbuf);
else
! MarkBufferDirtyHint(buf);
}
}
}
--- 413,421 ----
* crucial. Be sure to mark the proper buffer dirty.
*/
if (nbuf != InvalidBuffer)
! MarkBufferDirtyHint(nbuf, true);
else
! MarkBufferDirtyHint(buf, true);
}
}
}
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***************
*** 1052,1058 **** restart:
opaque->btpo_cycleid == vstate->cycleid)
{
opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf);
}
}
--- 1052,1058 ----
opaque->btpo_cycleid == vstate->cycleid)
{
opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf, true);
}
}
*** a/src/backend/access/nbtree/nbtutils.c
--- b/src/backend/access/nbtree/nbtutils.c
***************
*** 1789,1795 **** _bt_killitems(IndexScanDesc scan, bool haveLock)
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
! MarkBufferDirtyHint(so->currPos.buf);
}
if (!haveLock)
--- 1789,1795 ----
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
! MarkBufferDirtyHint(so->currPos.buf, true);
}
if (!haveLock)
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 7661,7667 **** XLogRestorePoint(const char *rpName)
* i.e. those for which buffer_std == true
*/
XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer)
{
XLogRecPtr recptr = InvalidXLogRecPtr;
XLogRecPtr lsn;
--- 7661,7667 ----
* i.e. those for which buffer_std == true
*/
XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
{
XLogRecPtr recptr = InvalidXLogRecPtr;
XLogRecPtr lsn;
***************
*** 7683,7689 **** XLogSaveBufferForHint(Buffer buffer)
* We reuse and reset rdata for any actual WAL record insert.
*/
rdata[0].buffer = buffer;
! rdata[0].buffer_std = true;
/*
* Check buffer while not holding an exclusive lock.
--- 7683,7689 ----
* We reuse and reset rdata for any actual WAL record insert.
*/
rdata[0].buffer = buffer;
! rdata[0].buffer_std = buffer_std;
/*
* Check buffer while not holding an exclusive lock.
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
***************
*** 1118,1124 **** read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! MarkBufferDirtyHint(*buf);
}
seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
--- 1118,1124 ----
HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! MarkBufferDirtyHint(*buf, true);
}
seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 2582,2588 **** IncrBufferRefCount(Buffer buffer)
* (due to a race condition), so it cannot be used for important changes.
*/
void
! MarkBufferDirtyHint(Buffer buffer)
{
volatile BufferDesc *bufHdr;
Page page = BufferGetPage(buffer);
--- 2582,2588 ----
* (due to a race condition), so it cannot be used for important changes.
*/
void
! MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
{
volatile BufferDesc *bufHdr;
Page page = BufferGetPage(buffer);
***************
*** 2667,2673 **** MarkBufferDirtyHint(Buffer buffer)
* rather than full transactionids.
*/
MyPgXact->delayChkpt = delayChkpt = true;
! lsn = XLogSaveBufferForHint(buffer);
}
LockBufHdr(bufHdr);
--- 2667,2673 ----
* rather than full transactionids.
*/
MyPgXact->delayChkpt = delayChkpt = true;
! lsn = XLogSaveBufferForHint(buffer, buffer_std);
}
LockBufHdr(bufHdr);
*** a/src/backend/storage/freespace/freespace.c
--- b/src/backend/storage/freespace/freespace.c
***************
*** 216,222 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
PageInit(page, BLCKSZ, 0);
if (fsm_set_avail(page, slot, new_cat))
! MarkBufferDirtyHint(buf);
UnlockReleaseBuffer(buf);
}
--- 216,222 ----
PageInit(page, BLCKSZ, 0);
if (fsm_set_avail(page, slot, new_cat))
! MarkBufferDirtyHint(buf, true);
UnlockReleaseBuffer(buf);
}
***************
*** 286,292 **** FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
return; /* nothing to do; the FSM was already smaller */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! MarkBufferDirtyHint(buf);
UnlockReleaseBuffer(buf);
new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
--- 286,292 ----
return; /* nothing to do; the FSM was already smaller */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! MarkBufferDirtyHint(buf, true);
UnlockReleaseBuffer(buf);
new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
***************
*** 619,625 **** fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
page = BufferGetPage(buf);
if (fsm_set_avail(page, slot, newValue))
! MarkBufferDirtyHint(buf);
if (minValue != 0)
{
--- 619,625 ----
page = BufferGetPage(buf);
if (fsm_set_avail(page, slot, newValue))
! MarkBufferDirtyHint(buf, true);
if (minValue != 0)
{
***************
*** 770,776 **** fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! MarkBufferDirtyHint(buf);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
}
--- 770,776 ----
{
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! MarkBufferDirtyHint(buf, true);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
}
*** a/src/backend/storage/freespace/fsmpage.c
--- b/src/backend/storage/freespace/fsmpage.c
***************
*** 284,290 **** restart:
exclusive_lock_held = true;
}
fsm_rebuild_page(page);
! MarkBufferDirtyHint(buf);
goto restart;
}
}
--- 284,290 ----
exclusive_lock_held = true;
}
fsm_rebuild_page(page);
! MarkBufferDirtyHint(buf, true);
goto restart;
}
}
*** a/src/backend/utils/time/tqual.c
--- b/src/backend/utils/time/tqual.c
***************
*** 121,127 **** SetHintBits(HeapTupleHeader tuple, Buffer buffer,
}
tuple->t_infomask |= infomask;
! MarkBufferDirtyHint(buffer);
}
/*
--- 121,127 ----
}
tuple->t_infomask |= infomask;
! MarkBufferDirtyHint(buffer, true);
}
/*
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 267,273 **** extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
extern int XLogFileOpen(XLogSegNo segno);
! extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer);
extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
--- 267,273 ----
extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
extern int XLogFileOpen(XLogSegNo segno);
! extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 204,210 **** extern Size BufferShmemSize(void);
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
! extern void MarkBufferDirtyHint(Buffer buffer);
extern void UnlockBuffers(void);
extern void LockBuffer(Buffer buffer, int mode);
--- 204,210 ----
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
! extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
extern void UnlockBuffers(void);
extern void LockBuffer(Buffer buffer, int mode);
*** a/src/backend/commands/vacuumlazy.c
--- b/src/backend/commands/vacuumlazy.c
***************
*** 901,926 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
freespace = PageGetHeapFreeSpace(page);
/* mark page all-visible, if appropriate */
! if (all_visible)
{
! if (!PageIsAllVisible(page))
! {
! PageSetAllVisible(page);
! MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, visibility_cutoff_xid);
! }
! else if (!all_visible_according_to_vm)
! {
! /*
! * It should never be the case that the visibility map page is
! * set while the page-level bit is clear, but the reverse is
! * allowed. Set the visibility map bit as well so that we get
! * back in sync.
! */
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, visibility_cutoff_xid);
! }
}
/*
--- 901,925 ----
freespace = PageGetHeapFreeSpace(page);
/* mark page all-visible, if appropriate */
! if (all_visible && !all_visible_according_to_vm)
{
! /*
! * It should never be the case that the visibility map page is set
! * while the page-level bit is clear, but the reverse is allowed
! * (if checksums are not enabled). Regardless, set both bits
! * so that we get back in sync.
! *
! * NB: If the heap page is all-visible but the VM bit is not set,
! * we don't need to dirty the heap page. However, if checksums are
! * enabled, we do need to make sure that the heap page is dirtied
! * before passing it to visibilitymap_set(), because it may be
! * logged. Given that this situation should only happen in rare
! * cases after a crash, it is not worth optimizing.
! */
! PageSetAllVisible(page);
! MarkBufferDirty(buf);
! visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! vmbuffer, visibility_cutoff_xid);
}
/*
***************
*** 1146,1151 **** lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
--- 1145,1158 ----
PageRepairFragmentation(page);
/*
+ * If checksums are enabled, visibilitymap_set() may log the heap page, so
+ * we mark the heap buffer dirty before calling visibilitymap_set(). We
+ * need to mark the heap page dirty regardless, but the order only
+ * matters when checksums are enabled.
+ */
+ MarkBufferDirty(buffer);
+
+ /*
* Now that we have removed the dead tuples from the page, once again check
* if the page has become all-visible.
*/
***************
*** 1158,1165 **** lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
visibility_cutoff_xid);
}
- MarkBufferDirty(buffer);
-
/* XLOG stuff */
if (RelationNeedsWAL(onerel))
{
--- 1165,1170 ----