Gregory Stark wrote:
> I think we're talking past each other. Martin and I are talking about doing
> something like:
>
> for (...)
> ...
> crc(word including hint bits)
> ...
> for (each line pointer)
> crc-negated(word & LP_DEAD<<15)
>
> Because CRC is a cyclic checksum it's possible to add or remove bits
> incrementally.
I see.
Since our CRC implementation is a simple byte loop, and since ItemIdData
fits in a uint32, the attached patch should do mostly the same by
copying the line pointer into a uint32, turning off the lp_flags, and
summing the modified copy.
This patch is also skipping pd_special and the unused area of the page.
I'm still testing this; please beware that this likely has an even
higher bug density than my regular patches (and some debugging printouts
as well).
While reading the pg_filedump code I noticed that there's a way to tell
the different index pages apart, so perhaps we can use that to be able
to checksum the special space as well.
--
Alvaro Herrera http://www.CommandPrompt.com/
PostgreSQL Replication, Consulting, Custom Development, 24x7 support
Index: src/backend/access/heap/heapam.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.269
diff -c -p -r1.269 heapam.c
*** src/backend/access/heap/heapam.c 6 Nov 2008 20:51:14 -0000 1.269
--- src/backend/access/heap/heapam.c 13 Nov 2008 17:44:23 -0000
***************
*** 4036,4041 ****
--- 4036,4128 ----
}
/*
+ * Perform XLogInsert for hint bits changes in a page. This handles hint
+ * bits set in HeapTupleHeaderData (t_infomask and t_infomask2).
+ *
+ * This is intended to be called right before writing a page from shared
+ * buffers to disk.
+ *
+ * The approach used here, instead of WAL-logging every change, is to produce
+ * a complete record of the current state of hint bits in a page just before
+ * flushing it. There are two downsides to this approach: first, it stores
+ * all hint bits in the page, not only those that changed; and second, that
+ * the flusher of the page needs to flush a lot more of the WAL (namely up
+ * to this new record's LSN) than the original LSN marked on the page.
+ */
+ XLogRecPtr
+ log_hintbits(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+ 			 Page page)
+ {
+ 	xl_heap_hintbits xlrec;
+ 	OffsetNumber i;
+ 	XLogRecPtr	recptr;
+ 	XLogRecData rdata[2];
+ 	char	   *bits;
+ 	int			pos = 0;
+ 	StringInfoData buf;
+
+ 	/*
+ 	 * Each tuple with storage contributes 4 bytes (infomask, infomask2); the
+ 	 * fifth byte per tuple is headroom for line pointer bits, not yet stored.
+ 	 */
+ 	bits = palloc(MaxHeapTuplesPerPage * 5);
+
+ 	/* debugging printout: one LOG message describing every tuple we record */
+ 	initStringInfo(&buf);
+ 	appendStringInfo(&buf, "page %u: ", blkno);
+
+ 	for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page);
+ 		 i = OffsetNumberNext(i))
+ 	{
+ 		HeapTupleHeader htup;
+ 		ItemId		lp = PageGetItemId(page, i);
+
+ 		/* only items with storage get an entry; replay skips the same items */
+ 		if (!ItemIdHasStorage(lp))
+ 			continue;
+
+ 		appendStringInfo(&buf, "offset %d: ", i);
+ 		htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+ 		*((uint16 *) (bits + pos)) = htup->t_infomask & HEAP_XACT_MASK;
+ 		appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask,
+ 						 htup->t_infomask & HEAP_XACT_MASK);
+ 		pos += 2;
+ 		*((uint16 *) (bits + pos)) = htup->t_infomask2 & HEAP2_XACT_MASK;
+ 		appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2,
+ 						 htup->t_infomask2 & HEAP2_XACT_MASK);
+ 		pos += 2;
+ 	}
+
+ 	elog(LOG, "%s", buf.data);
+ 	pfree(buf.data);
+
+ 	/* NO ELOG(ERROR) from here till hint bits are logged */
+ 	START_CRIT_SECTION();
+ 	xlrec.node = *rnode;
+ 	xlrec.block = blkno;
+
+ 	rdata[0].data = (char *) &xlrec;
+ 	rdata[0].len = SizeOfHeapHintbits;
+ 	rdata[0].buffer = InvalidBuffer;
+ 	rdata[0].next = &(rdata[1]);
+
+ 	rdata[1].data = (char *) bits;
+ 	rdata[1].len = pos;
+ 	rdata[1].buffer = InvalidBuffer;
+ 	rdata[1].next = NULL;
+
+ 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_HINTBITS, rdata);
+ 	/* NOTE(review): nothing here clears PD_UNLOGGED_CHANGE -- confirm intended */
+ 	PageSetLSN(page, recptr);
+ 	PageSetTLI(page, ThisTimeLineID);
+ 	END_CRIT_SECTION();
+
+ 	pfree(bits);				/* XLogInsert is done with it; don't leak */
+ 	return recptr;
+ }
+
+ /*
* Handles CLEAN and CLEAN_MOVE record types
*/
static void
***************
*** 4153,4158 ****
--- 4240,4324 ----
}
static void
+ heap_xlog_hintbits(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_heap_hintbits *xlrec = (xl_heap_hintbits *) XLogRecGetData(record);
+ 	Buffer		buffer;
+ 	Page		page;
+
+ 	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ 	if (!BufferIsValid(buffer))
+ 		return;
+ 	page = (Page) BufferGetPage(buffer);
+
+ 	/* page LSN is already at or past this record: nothing to redo */
+ 	if (XLByteLE(lsn, PageGetLSN(page)))
+ 	{
+ 		UnlockReleaseBuffer(buffer);
+ 		return;
+ 	}
+
+ 	if (record->xl_len > SizeOfHeapHintbits)
+ 	{
+ 		char	   *bits;
+ 		char	   *bits_end;
+ 		OffsetNumber offset = FirstOffsetNumber;
+ 		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ 		StringInfoData buf;
+
+ 		/* the hint bit array follows the fixed-size part of the record */
+ 		bits = (char *) xlrec + SizeOfHeapHintbits;
+ 		bits_end = (char *) xlrec + record->xl_len;
+
+ 		initStringInfo(&buf);
+ 		appendStringInfo(&buf, "page %u: ", xlrec->block);
+
+ 		/* entries come in offset order, one per item that has storage */
+ 		while (bits < bits_end)
+ 		{
+ 			HeapTupleHeader htup;
+ 			ItemId		lp;
+
+ 			/* more entries than tuples: never walk past the line pointers */
+ 			if (offset > maxoff)
+ 				elog(PANIC, "heap_xlog_hintbits: no tuple at offset %d", offset);
+
+ 			lp = PageGetItemId(page, offset);
+ 			if (!ItemIdHasStorage(lp))
+ 			{
+ 				offset++;
+ 				continue;
+ 			}
+
+ 			appendStringInfo(&buf, "offset %d: ", offset);
+ 			htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+ 			/* set the right bits in infomask */
+ 			htup->t_infomask = *(uint16 *) bits |
+ 				(htup->t_infomask & ~HEAP_XACT_MASK);
+ 			appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask,
+ 							 *(uint16 *) bits);
+ 			bits += 2;
+
+ 			/* set the right bits in infomask2 */
+ 			htup->t_infomask2 = *(uint16 *) bits |
+ 				(htup->t_infomask2 & ~HEAP2_XACT_MASK);
+ 			appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2,
+ 							 *(uint16 *) bits);
+ 			bits += 2;
+ 			offset++;
+ 		}
+ 		elog(LOG, "%s", buf.data);
+ 		pfree(buf.data);
+ 	}
+
+ 	PageSetLSN(page, lsn);
+ 	PageSetTLI(page, ThisTimeLineID);
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ }
+
+ static void
heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
***************
*** 4692,4697 ****
--- 4858,4866 ----
case XLOG_HEAP2_CLEAN_MOVE:
heap_xlog_clean(lsn, record, true);
break;
+ case XLOG_HEAP2_HINTBITS:
+ heap_xlog_hintbits(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
***************
*** 4833,4838 ****
--- 5002,5015 ----
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block);
}
+ else if (info == XLOG_HEAP2_HINTBITS)
+ {
+ xl_heap_hintbits *xlrec = (xl_heap_hintbits *) rec;
+
+ appendStringInfo(buf, "hintbits: rel %u/%u/%u; blk %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block);
+ }
else
appendStringInfo(buf, "UNKNOWN");
}
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.241
diff -c -p -r1.241 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c 11 Nov 2008 13:19:16 -0000 1.241
--- src/backend/storage/buffer/bufmgr.c 13 Nov 2008 17:44:23 -0000
***************
*** 33,38 ****
--- 33,39 ----
#include <sys/file.h>
#include <unistd.h>
+ #include "access/heapam.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pg_trace.h"
***************
*** 43,48 ****
--- 44,50 ----
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
+ #include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/resowner.h"
***************
*** 1461,1467 ****
* BUF_REUSABLE: buffer is available for replacement, ie, it has
* pin count 0 and usage count 0.
*
! * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
* after locking it, but we don't care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
--- 1463,1469 ----
* BUF_REUSABLE: buffer is available for replacement, ie, it has
* pin count 0 and usage count 0.
*
! * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
* after locking it, but we don't care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
***************
*** 1772,1777 ****
--- 1774,1787 ----
{
XLogRecPtr recptr;
ErrorContextCallback errcontext;
+ static char *dblbuf = NULL;
+ bool done = false;
+
+ if (enable_block_checksums && dblbuf == NULL)
+ {
+ dblbuf = MemoryContextAlloc(TopMemoryContext, BLCKSZ + ALIGNOF_BUFFER);
+ dblbuf = (char *) BUFFERALIGN(dblbuf);
+ }
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
***************
*** 1796,1801 ****
--- 1806,1835 ----
reln->smgr_rnode.relNode);
/*
+ * We make a copy of the buffer to write.
+ */
+ if (enable_block_checksums)
+ memcpy(dblbuf, BufHdrGetBlock(buf), BLCKSZ);
+
+ /*
+ * If the page has been modified by a hint bit setter, ensure we WAL-log
+ * their changes before actually writing the page; otherwise the CRC we're
+ * about to store could be invalid if the page is torn. Note: we check
+ * the flag on the shared-memory copy of the buffer, not the private copy
+ * we just made, to forestall the possibility that hint bits could have
+ * been set in the later parts of the page after we copied the flag in
+ * unset state.
+ */
+ if (enable_block_checksums && PageHasUnloggedChange(BufHdrGetBlock(buf)) &&
+ !InRecovery)
+ {
+ /* XXX cast away the "volatile" qualifier */
+ log_hintbits(&((BufferDesc *) buf)->tag.rnode, buf->tag.forkNum,
+ buf->tag.blockNum, BufHdrGetBlock(buf));
+ done = true;
+ }
+
+ /*
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file changes
* they describe do.
***************
*** 1817,1823 ****
smgrwrite(reln,
buf->tag.forkNum,
buf->tag.blockNum,
! (char *) BufHdrGetBlock(buf),
false);
BufferFlushCount++;
--- 1851,1857 ----
smgrwrite(reln,
buf->tag.forkNum,
buf->tag.blockNum,
! enable_block_checksums ? dblbuf : BufHdrGetBlock(buf),
false);
BufferFlushCount++;
Index: src/backend/storage/page/bufpage.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/page/bufpage.c,v
retrieving revision 1.81
diff -c -p -r1.81 bufpage.c
*** src/backend/storage/page/bufpage.c 3 Nov 2008 20:47:48 -0000 1.81
--- src/backend/storage/page/bufpage.c 13 Nov 2008 17:44:23 -0000
***************
*** 41,46 ****
--- 41,47 ----
MemSet(p, 0, pageSize);
/* p->pd_flags = 0; done by above MemSet */
+ p->pd_checksum = PAGE_INVALID_CHECKSUM;
p->pd_lower = SizeOfPageHeaderData;
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
***************
*** 84,92 ****
page->pd_special == MAXALIGN(page->pd_special))
return true;
! /* Check all-zeroes case */
pagebytes = (char *) page;
! for (i = 0; i < BLCKSZ; i++)
{
if (pagebytes[i] != 0)
return false;
--- 85,93 ----
page->pd_special == MAXALIGN(page->pd_special))
return true;
! /* Check all-zeroes case (skipping the checksum) */
pagebytes = (char *) page;
! for (i = sizeof(PAGE_CHECKSUM_TYPE); i < BLCKSZ; i++)
{
if (pagebytes[i] != 0)
return false;
Index: src/backend/storage/smgr/smgr.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/smgr/smgr.c,v
retrieving revision 1.113
diff -c -p -r1.113 smgr.c
*** src/backend/storage/smgr/smgr.c 11 Nov 2008 13:19:16 -0000 1.113
--- src/backend/storage/smgr/smgr.c 13 Nov 2008 17:44:23 -0000
***************
*** 28,33 ****
--- 28,36 ----
#include "utils/memutils.h"
+ /* Perform block checksumming for corruption detection */
+ bool enable_block_checksums = false;
+
/*
* This struct of function pointers defines the API between smgr.c and
* any individual storage manager module. Note that smgr subfunctions are
***************
*** 504,509 ****
--- 507,518 ----
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool isTemp)
{
+ /* Perform block checksumming for corruption detection */
+ if (enable_block_checksums)
+ WritePageChecksum(buffer);
+ else
+ WriteInvalidPageChecksum(buffer);
+
(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
buffer, isTemp);
}
***************
*** 521,526 ****
--- 530,557 ----
char *buffer)
{
(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
+
+ /* Perform block checksumming for corruption detection */
+ if (enable_block_checksums && !PageIsNew(buffer) && !InRecovery &&
+ PageGetChecksum(buffer) != PAGE_INVALID_CHECKSUM)
+ {
+ PAGE_CHECKSUM_TYPE chksum;
+
+ CalcPageChecksum(buffer, chksum);
+
+ if (chksum != PageGetChecksum(buffer))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid checksum on read of block %u of relation %u/%u/%u",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode),
+ errdetail("Got %08x, expected %08x.",
+ chksum, PageGetChecksum(buffer))));
+ }
+ }
}
/*
***************
*** 542,547 ****
--- 573,584 ----
smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool isTemp)
{
+ /* Perform block checksumming for corruption detection */
+ if (enable_block_checksums)
+ WritePageChecksum(buffer);
+ else
+ WriteInvalidPageChecksum(buffer);
+
(*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
buffer, isTemp);
}
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.477
diff -c -p -r1.477 guc.c
*** src/backend/utils/misc/guc.c 11 Nov 2008 02:42:32 -0000 1.477
--- src/backend/utils/misc/guc.c 13 Nov 2008 17:44:23 -0000
***************
*** 57,62 ****
--- 57,63 ----
#include "regex/regex.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
+ #include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
#include "utils/builtins.h"
***************
*** 770,775 ****
--- 771,786 ----
false, NULL, NULL
},
{
+ {"perform_checksum", PGC_SIGHUP, UNGROUPED,
+ gettext_noop("Forces checksumming of blocks to/from disk."),
+ gettext_noop("The server will perform a checksum on the block "
+ "when read from or written to disk in order to detect storage-related "
+ "corruption.")
+ },
+ &enable_block_checksums,
+ false, NULL, NULL
+ },
+ {
{"log_duration", PGC_SUSET, LOGGING_WHAT,
gettext_noop("Logs the duration of each completed SQL statement."),
NULL
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.247
diff -c -p -r1.247 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample 9 Nov 2008 00:28:35 -0000 1.247
--- src/backend/utils/misc/postgresql.conf.sample 12 Nov 2008 13:14:17 -0000
***************
*** 481,486 ****
--- 481,491 ----
#transform_null_equals = off
+ #------------------------------------------------------------------------------
+ # CORRUPTION DETECTION
+ #------------------------------------------------------------------------------
+
+ #perform_checksum = off # Perform block checksumming to/from disk
#------------------------------------------------------------------------------
# CUSTOMIZED OPTIONS
Index: src/backend/utils/time/tqual.c
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/time/tqual.c,v
retrieving revision 1.110
diff -c -p -r1.110 tqual.c
*** src/backend/utils/time/tqual.c 26 Mar 2008 16:20:47 -0000 1.110
--- src/backend/utils/time/tqual.c 13 Nov 2008 17:44:23 -0000
***************
*** 44,49 ****
--- 44,50 ----
#include "access/xact.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
+ #include "storage/smgr.h"
#include "utils/tqual.h"
***************
*** 96,101 ****
--- 97,104 ----
}
tuple->t_infomask |= infomask;
+ if (enable_block_checksums)
+ PageSetUnloggedChange(BufferGetPage(buffer));
SetBufferCommitInfoNeedsSave(buffer);
}
Index: src/include/pg_config_manual.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/pg_config_manual.h,v
retrieving revision 1.35
diff -c -p -r1.35 pg_config_manual.h
Index: src/include/access/heapam.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/heapam.h,v
retrieving revision 1.140
diff -c -p -r1.140 heapam.h
*** src/include/access/heapam.h 6 Nov 2008 20:51:15 -0000 1.140
--- src/include/access/heapam.h 12 Nov 2008 13:14:17 -0000
***************
*** 140,145 ****
--- 140,147 ----
OffsetNumber *offsets, int offcnt);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
BlockNumber blk, Page page);
+ extern XLogRecPtr log_hintbits(RelFileNode *rnode, ForkNumber forkNum,
+ BlockNumber blk, Page page);
/* in heap/pruneheap.c */
extern void heap_page_prune_opt(Relation relation, Buffer buffer,
Index: src/include/access/htup.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/htup.h,v
retrieving revision 1.103
diff -c -p -r1.103 htup.h
*** src/include/access/htup.h 2 Nov 2008 01:45:28 -0000 1.103
--- src/include/access/htup.h 12 Nov 2008 13:14:17 -0000
***************
*** 580,585 ****
--- 580,586 ----
#define XLOG_HEAP2_FREEZE 0x00
#define XLOG_HEAP2_CLEAN 0x10
#define XLOG_HEAP2_CLEAN_MOVE 0x20
+ #define XLOG_HEAP2_HINTBITS 0x30
/*
* All what we need to find changed tuple
***************
*** 714,719 ****
--- 715,730 ----
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+ /* Fixed part of an XLOG_HEAP2_HINTBITS record: names the page whose hint bits follow */
+ typedef struct xl_heap_hintbits
+ {
+ RelFileNode node; /* relation containing the page */
+ BlockNumber block; /* block number of the page */
+ /* HINT BIT ARRAY FOLLOWS AT THE END */
+ } xl_heap_hintbits;
+ /* size of the fixed part; the hint bit array starts at this offset in the record */
+ #define SizeOfHeapHintbits (offsetof(xl_heap_hintbits, block) + sizeof(BlockNumber))
+
/* HeapTupleHeader functions implemented in utils/time/combocid.c */
extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
Index: src/include/storage/bufpage.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/bufpage.h,v
retrieving revision 1.84
diff -c -p -r1.84 bufpage.h
*** src/include/storage/bufpage.h 3 Nov 2008 20:47:49 -0000 1.84
--- src/include/storage/bufpage.h 13 Nov 2008 14:42:20 -0000
***************
*** 17,22 ****
--- 17,23 ----
#include "access/xlogdefs.h"
#include "storage/item.h"
#include "storage/off.h"
+ #include "utils/pg_crc.h"
/*
* A postgres disk page is an abstraction layered on top of a postgres
***************
*** 87,92 ****
--- 88,94 ----
*
* space management information generic to any page
*
+ * pd_checksum - the checksum of the page
* pd_lsn - identifies xlog record for last change to this page.
* pd_tli - ditto.
* pd_flags - flag bits.
***************
*** 118,136 ****
* the constraint on pagesize mod 256 is not an important restriction.
* On the high end, we can only support pages up to 32KB because lp_off/lp_len
* are 15 bits.
*/
typedef struct PageHeaderData
{
! /* XXX LSN is member of *any* block, not only page-organized ones */
XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
* record for last change to this page */
- uint16 pd_tli; /* least significant bits of the TimeLineID
- * containing the LSN */
- uint16 pd_flags; /* flag bits, see below */
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
uint16 pd_pagesize_version;
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
ItemIdData pd_linp[1]; /* beginning of line pointer array */
} PageHeaderData;
--- 120,143 ----
* the constraint on pagesize mod 256 is not an important restriction.
* On the high end, we can only support pages up to 32KB because lp_off/lp_len
* are 15 bits.
+ *
+ * Note that pd_tli appears in a rather awkward position in the struct;
+ * this is because we moved it to accommodate pd_checksum without changing
+ * pd_pagesize_version's offset.
*/
typedef struct PageHeaderData
{
! /* XXX CRC & LSN are members of *any* block, not only page-organized ones */
! pg_crc32 pd_checksum; /* The block-level checksum */
XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
* record for last change to this page */
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
uint16 pd_pagesize_version;
+ uint16 pd_tli; /* least significant bits of the TimeLineID
+ * containing the LSN */
+ uint16 pd_flags; /* flag bits, see below */
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
ItemIdData pd_linp[1]; /* beginning of line pointer array */
} PageHeaderData;
***************
*** 148,159 ****
* PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
* page for its new tuple version; this suggests that a prune is needed.
* Again, this is just a hint.
*/
#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
#define PD_PAGE_FULL 0x0002 /* not enough free space for new
* tuple? */
! #define PD_VALID_FLAG_BITS 0x0003 /* OR of all valid pd_flags bits */
/*
* Page layout version number 0 is for pre-7.3 Postgres releases.
--- 155,172 ----
* PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
* page for its new tuple version; this suggests that a prune is needed.
* Again, this is just a hint.
+ *
+ * PD_UNLOGGED_CHANGE indicates whether a process has set hint bits on the
+ * page. This is used to determine whether a WAL message needs to be emitted
+ * before writing the page to disk when page checksums are enabled.
*/
#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
#define PD_PAGE_FULL 0x0002 /* not enough free space for new
* tuple? */
+ #define PD_UNLOGGED_CHANGE 0x0004 /* does the page have unlogged hint
+ bits? */
! #define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
/*
* Page layout version number 0 is for pre-7.3 Postgres releases.
***************
*** 163,170 ****
* Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
* added the pd_flags field (by stealing some bits from pd_tli),
* as well as adding the pd_prune_xid field (which enlarges the header).
*/
! #define PG_PAGE_LAYOUT_VERSION 4
/* ----------------------------------------------------------------
--- 176,186 ----
* Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
* added the pd_flags field (by stealing some bits from pd_tli),
* as well as adding the pd_prune_xid field (which enlarges the header).
+ * Release 8.4 uses 5; it added a checksum to the page header, and moved
+ * pd_tli and pd_flags so that the page version would keep the same
+ * offset.
*/
! #define PG_PAGE_LAYOUT_VERSION 5
/* ----------------------------------------------------------------
***************
*** 352,357 ****
--- 368,432 ----
#define PageClearPrunable(page) \
(((PageHeader) (page))->pd_prune_xid = InvalidTransactionId)
+ /* ----------------------------------------------------------------
+ * CRC support
+ * ----------------------------------------------------------------
+ */
+ #define PAGE_CHECKSUM_TYPE pg_crc32 /* type used for page checksum values */
+ #define SIZEOF_PAGE_CHECKSUM sizeof(PAGE_CHECKSUM_TYPE) /* size of one checksum value */
+ #define PAGE_INVALID_CHECKSUM 0xb79a6e9c /* marker stored when no checksum was computed; smgrread skips verification on it */
+
+ /*
+  * Given a page, calculate its checksum into "sum"; both macro arguments are
+  * evaluated more than once.
+  *
+  * We only include: the page header fields between pd_checksum and pd_flags,
+  * the line pointers (except lp_flags), and the area between pd_upper and
+  * pd_special. The unused area and the "special space" are not included.
+  */
+ #define CalcPageChecksum(buffer, sum) \
+ do { \
+ 	int		cpc_off; \
+ 	INIT_CRC32(sum); \
+ 	/* header from pd_lsn up to, but not including, pd_flags */ \
+ 	COMP_CRC32(sum, (char *) (buffer) + offsetof(PageHeaderData, pd_lsn), \
+ 			   offsetof(PageHeaderData, pd_flags) - offsetof(PageHeaderData, pd_lsn)); \
+ 	/* each line pointer, excluding lp_flags */ \
+ 	for (cpc_off = 1; cpc_off <= PageGetMaxOffsetNumber(buffer); cpc_off++) \
+ 	{ \
+ 		uint32	lpval = *(uint32 *) PageGetItemId(buffer, cpc_off); \
+ 		lpval &= ~ITEM_LP_FLAGS_MASK; \
+ 		COMP_CRC32(sum, &lpval, sizeof(ItemIdData)); \
+ 	} \
+ 	/* the space occupied by tuples */ \
+ 	COMP_CRC32(sum, (char *) (buffer) + ((PageHeader) (buffer))->pd_upper, \
+ 			   ((PageHeader) (buffer))->pd_special - ((PageHeader) (buffer))->pd_upper); \
+ 	FIN_CRC32(sum); \
+ } while (0)
+
+
+ /* Compute the page's checksum and store it in pd_checksum; beware multiple evaluation of argument */
+ #define WritePageChecksum(buffer) \
+ do { \
+ PAGE_CHECKSUM_TYPE chksum; \
+ CalcPageChecksum(buffer, chksum); \
+ PageSetChecksum(buffer, chksum); \
+ } while (0)
+ /* Store the PAGE_INVALID_CHECKSUM marker instead of a computed checksum */
+ #define WriteInvalidPageChecksum(buffer) \
+ PageSetChecksum((buffer), PAGE_INVALID_CHECKSUM)
+ /* Read / assign the pd_checksum field of a page header */
+ #define PageGetChecksum(page) \
+ (((PageHeader) (page))->pd_checksum)
+ #define PageSetChecksum(page, checksum) \
+ (((PageHeader) (page))->pd_checksum = (checksum))
+ /* Test / set / clear the PD_UNLOGGED_CHANGE bit in pd_flags */
+ #define PageHasUnloggedChange(page) \
+ (((PageHeader) (page))->pd_flags & PD_UNLOGGED_CHANGE)
+ #define PageSetUnloggedChange(page) \
+ (((PageHeader) (page))->pd_flags |= PD_UNLOGGED_CHANGE)
+ #define PageClearUnloggedChange(page) \
+ (((PageHeader) (page))->pd_flags &= ~PD_UNLOGGED_CHANGE)
/* ----------------------------------------------------------------
* extern declarations
Index: src/include/storage/itemid.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/itemid.h,v
retrieving revision 1.30
diff -c -p -r1.30 itemid.h
*** src/include/storage/itemid.h 1 Jan 2008 19:45:59 -0000 1.30
--- src/include/storage/itemid.h 13 Nov 2008 12:41:07 -0000
***************
*** 39,44 ****
--- 39,47 ----
#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */
#define LP_DEAD 3 /* dead, may or may not have storage */
+ /* the bits used by lp_flags */
+ #define ITEM_LP_FLAGS_MASK 0x00018000L
+
/*
* Item offsets and lengths are represented by these types when
* they're not actually stored in an ItemIdData.
Index: src/include/storage/smgr.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.63
diff -c -p -r1.63 smgr.h
*** src/include/storage/smgr.h 11 Aug 2008 11:05:11 -0000 1.63
--- src/include/storage/smgr.h 12 Nov 2008 13:14:17 -0000
***************
*** 20,25 ****
--- 20,28 ----
#include "storage/relfilenode.h"
+ /* Perform block checksumming for corruption detection (defined in smgr.c) */
+ extern bool enable_block_checksums;
+
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
Index: src/include/utils/pg_crc.h
===================================================================
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/utils/pg_crc.h,v
retrieving revision 1.18
diff -c -p -r1.18 pg_crc.h
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers