Here's a patch that WAL-logs tuple freezes in vacuum, per the discussion
on pgsql-bugs.
This patch is against CVS head. Should this be backported to stable
branches? I think it should.
After writing the patch, I realized that it needs some thought if
backported, because the WAL records for removing tuples and for freezing
tuples share the same heapam opcode, XLOG_HEAP_CLEAN, and are
differentiated only by a flag bit. If we applied the patch as it is, and
for some reason someone used an older version (without the patch) to
replay WAL generated by a newer version (with the patch), the older
version would interpret the freeze WAL records as dead-tuple removals
and remove live tuples. I would have liked to give freezing a new
opcode, but we've run out of them (see htup.h).
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/access/heap/heapam.c
===================================================================
RCS file:
/home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.220
diff -c -r1.220 heapam.c
*** src/backend/access/heap/heapam.c 4 Oct 2006 00:29:48 -0000 1.220
--- src/backend/access/heap/heapam.c 23 Oct 2006 18:17:17 -0000
***************
*** 2877,2889 ****
/*
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
*/
XLogRecPtr
! log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
{
xl_heap_clean xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
/* Caller should not call me on a temp relation */
Assert(!reln->rd_istemp);
--- 2877,2895 ----
/*
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
+ *
+ * If freeze is true, the tuples specified in offsets array were frozen,
+ * otherwise they were dead and removed.
*/
XLogRecPtr
! log_heap_clean(Relation reln, Buffer buffer,
! OffsetNumber *offsets, int noffsets, bool freeze)
{
xl_heap_clean xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
+ uint8 info = freeze ?
+ (XLOG_HEAP_CLEAN | XLOG_HEAP_FREEZE) : XLOG_HEAP_CLEAN;
/* Caller should not call me on a temp relation */
Assert(!reln->rd_istemp);
***************
*** 2901,2910 ****
* that it is. When XLogInsert stores the whole buffer, the offsets array
* need not be stored too.
*/
! if (uncnt > 0)
{
! rdata[1].data = (char *) unused;
! rdata[1].len = uncnt * sizeof(OffsetNumber);
}
else
{
--- 2907,2916 ----
* that it is. When XLogInsert stores the whole buffer, the offsets array
* need not be stored too.
*/
! if (noffsets > 0)
{
! rdata[1].data = (char *) offsets;
! rdata[1].len = noffsets * sizeof(OffsetNumber);
}
else
{
***************
*** 2915,2921 ****
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata);
return recptr;
}
--- 2921,2927 ----
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! recptr = XLogInsert(RM_HEAP_ID, info, rdata);
return recptr;
}
***************
*** 3030,3039 ****
--- 3036,3048 ----
Relation reln;
Buffer buffer;
Page page;
+ bool freeze;
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
+ freeze = record->xl_info & XLOG_HEAP_FREEZE;
+
reln = XLogOpenRelation(xlrec->node);
buffer = XLogReadBuffer(reln, xlrec->block, false);
if (!BufferIsValid(buffer))
***************
*** 3048,3069 ****
if (record->xl_len > SizeOfHeapClean)
{
! OffsetNumber *unused;
! OffsetNumber *unend;
ItemId lp;
! unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
! unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
! while (unused < unend)
{
! lp = PageGetItemId(page, *unused + 1);
! lp->lp_flags &= ~LP_USED;
! unused++;
}
}
! PageRepairFragmentation(page, NULL);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
--- 3057,3089 ----
if (record->xl_len > SizeOfHeapClean)
{
! OffsetNumber *offsets;
! OffsetNumber *offend;
ItemId lp;
! offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
! offend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
! while (offsets < offend)
{
! lp = PageGetItemId(page, *offsets + 1);
!
! if(freeze)
! {
! HeapTupleHeader htup = (HeapTupleHeader)
PageGetItem(page, lp);
!
! Assert(!(htup->t_infomask & HEAP_XMIN_INVALID));
!
! htup->t_infomask |= HEAP_XMIN_COMMITTED;
! HeapTupleHeaderSetXmin(htup,
FrozenTransactionId);
! } else
! lp->lp_flags &= ~LP_USED;
! offsets++;
}
}
! if(!freeze)
! PageRepairFragmentation(page, NULL);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
Index: src/backend/commands/vacuum.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/vacuum.c,v
retrieving revision 1.341
diff -c -r1.341 vacuum.c
*** src/backend/commands/vacuum.c 4 Oct 2006 00:29:51 -0000 1.341
--- src/backend/commands/vacuum.c 23 Oct 2006 18:36:07 -0000
***************
*** 1357,1364 ****
Buffer buf;
OffsetNumber offnum,
maxoff;
! bool pgchanged,
! notup;
vacuum_delay_point();
--- 1357,1365 ----
Buffer buf;
OffsetNumber offnum,
maxoff;
! bool notup;
! OffsetNumber frozen[MaxOffsetNumber];
! int nfrozen;
vacuum_delay_point();
***************
*** 1414,1420 ****
continue;
}
! pgchanged = false;
notup = true;
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
--- 1415,1421 ----
continue;
}
! nfrozen = 0;
notup = true;
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
***************
*** 1458,1464 ****
HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
/* infomask should be okay
already */
Assert(tuple.t_data->t_infomask
& HEAP_XMIN_COMMITTED);
! pgchanged = true;
}
/*
--- 1459,1465 ----
HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
/* infomask should be okay
already */
Assert(tuple.t_data->t_infomask
& HEAP_XMIN_COMMITTED);
! frozen[nfrozen++] = offnum;
}
/*
***************
*** 1627,1634 ****
else
empty_end_pages = 0;
! if (pgchanged)
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
}
--- 1628,1650 ----
else
empty_end_pages = 0;
! /*
! * If we froze any tuples, write a WAL record. We used to treat
! * freezing the same as hint bit updates, because it was thought that
! * losing a tuple freeze doesn't matter since the tuple is marked as
! * committed anyway. But that's not safe: if we later truncate the
! * clog and crash, we might end up with xids on the disk that belonged
! * to a truncated clog segment.
! */
! if (nfrozen > 0)
! {
! XLogRecPtr recptr;
!
MarkBufferDirty(buf);
+ recptr = log_heap_clean(onerel, buf, frozen, nfrozen,
true);
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
UnlockReleaseBuffer(buf);
}
***************
*** 2603,2609 ****
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buf, unused,
uncnt);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 2619,2625 ----
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buf, unused,
uncnt, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
***************
*** 3074,3080 ****
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buffer, unused, uncnt);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 3090,3096 ----
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buffer, unused, uncnt, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
Index: src/backend/commands/vacuumlazy.c
===================================================================
RCS file:
/home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/vacuumlazy.c,v
retrieving revision 1.80
diff -c -r1.80 vacuumlazy.c
*** src/backend/commands/vacuumlazy.c 4 Oct 2006 00:29:52 -0000 1.80
--- src/backend/commands/vacuumlazy.c 23 Oct 2006 18:35:52 -0000
***************
*** 266,275 ****
Page page;
OffsetNumber offnum,
maxoff;
! bool pgchanged,
! tupgone,
hastup;
int prev_dead_count;
vacuum_delay_point();
--- 266,276 ----
Page page;
OffsetNumber offnum,
maxoff;
! bool tupgone,
hastup;
int prev_dead_count;
+ OffsetNumber frozen[MaxOffsetNumber];
+ int nfrozen;
vacuum_delay_point();
***************
*** 349,355 ****
continue;
}
! pgchanged = false;
hastup = false;
prev_dead_count = vacrelstats->num_dead_tuples;
maxoff = PageGetMaxOffsetNumber(page);
--- 350,356 ----
continue;
}
! nfrozen = 0;
hastup = false;
prev_dead_count = vacrelstats->num_dead_tuples;
maxoff = PageGetMaxOffsetNumber(page);
***************
*** 398,404 ****
HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
/* infomask should be okay
already */
Assert(tuple.t_data->t_infomask
& HEAP_XMIN_COMMITTED);
! pgchanged = true;
}
/*
--- 399,405 ----
HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
/* infomask should be okay
already */
Assert(tuple.t_data->t_infomask
& HEAP_XMIN_COMMITTED);
! frozen[nfrozen++] = offnum;
}
/*
***************
*** 485,492 ****
if (hastup)
vacrelstats->nonempty_pages = blkno + 1;
! if (pgchanged)
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
}
--- 486,508 ----
if (hastup)
vacrelstats->nonempty_pages = blkno + 1;
! /*
! * If we froze any tuples, write a WAL record. We used to treat
! * freezing the same as hint bit updates, because it was thought that
! * losing a tuple freeze doesn't matter since the tuple is marked as
! * committed anyway. But that's not safe: if we later truncate the
! * clog and crash, we might end up with xids on the disk that belonged
! * to a truncated clog segment.
! */
! if (nfrozen > 0)
! {
! XLogRecPtr recptr;
!
MarkBufferDirty(buf);
+ recptr = log_heap_clean(onerel, buf, frozen, nfrozen,
true);
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
UnlockReleaseBuffer(buf);
}
***************
*** 635,641 ****
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buffer, unused, uncnt);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 651,657 ----
{
XLogRecPtr recptr;
! recptr = log_heap_clean(onerel, buffer, unused, uncnt, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
Index: src/include/access/heapam.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/heapam.h,v
retrieving revision 1.116
diff -c -r1.116 heapam.h
*** src/include/access/heapam.h 4 Oct 2006 00:30:07 -0000 1.116
--- src/include/access/heapam.h 23 Oct 2006 17:52:27 -0000
***************
*** 182,188 ****
extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
! OffsetNumber *unused, int uncnt);
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
Buffer newbuf, HeapTuple newtup);
--- 182,188 ----
extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
! OffsetNumber *offsets, int noffsets, bool freeze);
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
Buffer newbuf, HeapTuple newtup);
Index: src/include/access/htup.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/htup.h,v
retrieving revision 1.86
diff -c -r1.86 htup.h
*** src/include/access/htup.h 4 Oct 2006 00:30:07 -0000 1.86
--- src/include/access/htup.h 23 Oct 2006 17:14:44 -0000
***************
*** 510,515 ****
--- 510,521 ----
* we can (and we do) restore entire page in redo
*/
#define XLOG_HEAP_INIT_PAGE 0x80
+ /*
+ * XLOG_HEAP_CLEAN | XLOG_HEAP_FREEZE means that tuples on this page
+ * should be frozen. We can share the bit with XLOG_HEAP_INIT_PAGE,
+ * because it's not used when cleaning.
+ */
+ #define XLOG_HEAP_FREEZE 0x80
/*
* All what we need to find changed tuple
---------------------------(end of broadcast)---------------------------
TIP 5: don't forget to increase your free space map settings