Simon Riggs wrote: > But perhaps writing a single WAL record if you scan whole page and set > all bits at once. Then it makes sense in some cases.
So this is what I ended up doing; attached. There are some gotchas in this patch: 1. it does not consider hint bits other than the ones defined in htup.h. Some index AMs use hint bits to "kill" tuples (LP_DEAD mostly, I think). This means that CRCs will be broken for such pages when pages are torn. 2. some parts of the code could be considered modularity violations. For example, tqual.c is setting a bit in a Page structure; bufmgr.c is later checking that bit to determine when to log. 3. the bgwriter is seen writing WAL entries at checkpoint. At shutdown, this might cause an error to be reported on how there was not supposed to be activity on the log. I didn't save the exact error report and I can't find it in the source :-( So it "mostly works" at this time. I very much welcome opinions to improve the weak points. -- Alvaro Herrera http://www.CommandPrompt.com/ The PostgreSQL Company - Command Prompt, Inc.
Index: src/backend/access/heap/heapam.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/heap/heapam.c,v retrieving revision 1.266 diff -c -p -r1.266 heapam.c *** src/backend/access/heap/heapam.c 27 Oct 2008 21:50:12 -0000 1.266 --- src/backend/access/heap/heapam.c 29 Oct 2008 17:56:25 -0000 *************** log_newpage(RelFileNode *rnode, ForkNumb *** 4007,4012 **** --- 4007,4064 ---- return recptr; } + XLogRecPtr + log_hintbits(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page) + { + xl_heap_hintbits xlrec; + OffsetNumber i; + XLogRecPtr recptr; + XLogRecData rdata[2]; + uint16 bits[MaxHeapTuplesPerPage * 4]; + int pos = 0; + + for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); + i = OffsetNumberNext(i)) + { + HeapTupleHeader htup; + ItemId lp = PageGetItemId(page, i); + + if (!ItemIdHasStorage(lp)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + bits[pos++] = htup->t_infomask & HEAP_XACT_MASK; + bits[pos++] = htup->t_infomask2 & HEAP2_XACT_MASK; + } + + /* NO ELOG(ERROR) from here till hint bits are logged */ + START_CRIT_SECTION(); + + xlrec.node = *rnode; + xlrec.block = blkno; + + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfHeapHintbits; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + rdata[1].data = (char *) bits; + rdata[1].len = sizeof(uint16) * pos; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_HINTBITS, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + + END_CRIT_SECTION(); + + return recptr; + } + /* * Handles CLEAN and CLEAN_MOVE record types */ *************** heap_xlog_freeze(XLogRecPtr lsn, XLogRec *** 4113,4118 **** --- 4165,4230 ---- } static void + heap_xlog_hintbits(XLogRecPtr lsn, XLogRecord *record) + { + xl_heap_hintbits *xlrec = (xl_heap_hintbits *) XLogRecGetData(record); + Buffer buffer; + Page page; + + buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + if (!BufferIsValid(buffer)) + return; + page = (Page) BufferGetPage(buffer); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockReleaseBuffer(buffer); + return; + } + + if (record->xl_len > SizeOfHeapHintbits) + { + uint16 *infomask; + uint16 *infomask_end; + OffsetNumber offset = FirstOffsetNumber; + + infomask = (uint16 *) ((char *) xlrec + SizeOfHeapHintbits); + infomask_end = (uint16 *) ((char *) xlrec + record->xl_len); + + while (infomask < infomask_end) + { + for (;;) + { + HeapTupleHeader htup; + ItemId lp = PageGetItemId(page, offset); + + if (!ItemIdHasStorage(lp)) + { + offset++; + continue; + } + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask |= *infomask; + infomask++; + htup->t_infomask2 |= *infomask; + infomask++; + + offset++; + + break; + } + } + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + + static void heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) { xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record); *************** heap2_redo(XLogRecPtr lsn, XLogRecord *r *** 4614,4619 **** --- 4726,4734 ---- case XLOG_HEAP2_CLEAN_MOVE: heap_xlog_clean(lsn, record, true); break; + case XLOG_HEAP2_HINTBITS: + heap_xlog_hintbits(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } *************** heap2_desc(StringInfo buf, uint8 xl_info *** 4755,4760 **** --- 4870,4883 ---- xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, xlrec->block); } + else if (info == XLOG_HEAP2_HINTBITS) + { + xl_heap_hintbits *xlrec = (xl_heap_hintbits *) rec; + + appendStringInfo(buf, "hintbits: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } else appendStringInfo(buf, "UNKNOWN"); } Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.239 diff -c -p -r1.239 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 20 Oct 2008 21:11:15 -0000 1.239 --- src/backend/storage/buffer/bufmgr.c 29 Oct 2008 18:02:34 -0000 *************** *** 33,38 **** --- 33,39 ---- #include <sys/file.h> #include <unistd.h> + #include "access/heapam.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" *************** *** 42,47 **** --- 43,49 ---- #include "storage/ipc.h" #include "storage/proc.h" #include "storage/smgr.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" *************** BgBufferSync(void) *** 1482,1488 **** * BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. * ! * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean * after locking it, but we don't care all that much.) * * Note: caller must have done ResourceOwnerEnlargeBuffers. --- 1484,1490 ---- * BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. * ! * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean * after locking it, but we don't care all that much.) * * Note: caller must have done ResourceOwnerEnlargeBuffers. *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1792,1797 **** --- 1794,1807 ---- { XLogRecPtr recptr; ErrorContextCallback errcontext; + static char *dblbuf = NULL; + bool done = false; + + if (enable_block_checksums && dblbuf == NULL) + { + dblbuf = MemoryContextAlloc(TopMemoryContext, BLCKSZ + ALIGNOF_BUFFER); + dblbuf = (char *) BUFFERALIGN(dblbuf); + } /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1816,1821 **** --- 1826,1857 ---- reln->smgr_rnode.relNode); /* + * We make a copy of the buffer to write. + */ + if (enable_block_checksums) + memcpy(dblbuf, BufHdrGetBlock(buf), BLCKSZ); + + /* + * If the page has been modified by a hint bit setter, ensure we WAL-log + * their changes before actually writing the page; otherwise the CRC we're + * about to store could be invalid if the page is torn. Note: we check + * the flag on the shared-memory copy of the buffer, not the private copy + * we just made, to forestall the possibility that hints bits could have + * been set in the later parts of the page after we copied the flag in + * unset state. + * + * FIXME -- this is missing other hint bits (particularly those used in + * indexes) + */ + if (enable_block_checksums && PageHasUnloggedChange(BufHdrGetBlock(buf)) && !InRecovery) + { + /* XXX cast away the "volatile" qualifier */ + log_hintbits(&((BufferDesc *) buf)->tag.rnode, buf->tag.forkNum, + buf->tag.blockNum, BufHdrGetBlock(buf)); + done = true; + } + + /* * Force XLOG flush up to buffer's LSN. This implements the basic WAL * rule that log updates must hit disk before any of the data-file changes * they describe do. *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1837,1843 **** smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! (char *) BufHdrGetBlock(buf), false); BufferFlushCount++; --- 1873,1879 ---- smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! enable_block_checksums ? dblbuf : BufHdrGetBlock(buf), false); BufferFlushCount++; Index: src/backend/storage/page/bufpage.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/page/bufpage.c,v retrieving revision 1.80 diff -c -p -r1.80 bufpage.c *** src/backend/storage/page/bufpage.c 13 Jul 2008 21:50:04 -0000 1.80 --- src/backend/storage/page/bufpage.c 27 Oct 2008 21:50:58 -0000 *************** PageInit(Page page, Size pageSize, Size *** 41,46 **** --- 41,47 ---- MemSet(p, 0, pageSize); /* p->pd_flags = 0; done by above MemSet */ + p->pd_checksum = PAGE_INVALID_CHECKSUM; p->pd_lower = SizeOfPageHeaderData; p->pd_upper = pageSize - specialSize; p->pd_special = pageSize - specialSize; *************** PageHeaderIsValid(PageHeader page) *** 84,92 **** page->pd_special == MAXALIGN(page->pd_special)) return true; ! /* Check all-zeroes case */ pagebytes = (char *) page; ! for (i = 0; i < BLCKSZ; i++) { if (pagebytes[i] != 0) return false; --- 85,93 ---- page->pd_special == MAXALIGN(page->pd_special)) return true; ! /* Check all-zeroes case (skipping the checksum) */ pagebytes = (char *) page; ! for (i = sizeof(PAGE_CHECKSUM_TYPE); i < BLCKSZ; i++) { if (pagebytes[i] != 0) return false; Index: src/backend/storage/smgr/smgr.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/smgr/smgr.c,v retrieving revision 1.112 diff -c -p -r1.112 smgr.c *** src/backend/storage/smgr/smgr.c 30 Sep 2008 10:52:13 -0000 1.112 --- src/backend/storage/smgr/smgr.c 29 Oct 2008 18:18:07 -0000 *************** *** 27,32 **** --- 27,35 ---- #include "utils/memutils.h" + /* Perform block checksumming for corruption detection */ + bool enable_block_checksums = false; + /* * This struct of function pointers defines the API between smgr.c and * any individual storage manager module. Note that smgr subfunctions are *************** void *** 503,508 **** --- 506,515 ---- smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) { + /* Perform block checksumming for corruption detection */ + if (enable_block_checksums) + WritePageChecksum(buffer); + (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum, buffer, isTemp); } *************** smgrread(SMgrRelation reln, ForkNumber f *** 520,525 **** --- 527,551 ---- char *buffer) { (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer); + + /* Perform block checksumming for corruption detection */ + if (enable_block_checksums && PageGetChecksum(buffer) != PAGE_INVALID_CHECKSUM) + { + PAGE_CHECKSUM_TYPE chksum; + + CalcPageChecksum(buffer, chksum); + + if (chksum != PageGetChecksum(buffer)) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid checksum on read of block %u of relation %u/%u/%u", + blocknum, + reln->smgr_rnode.spcNode, + reln->smgr_rnode.dbNode, + reln->smgr_rnode.relNode))); + } + } } /* *************** void *** 541,546 **** --- 567,578 ---- smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) { + /* + * Perform block checksumming before writing. + */ + if (enable_block_checksums) + WritePageChecksum(buffer); + (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum, buffer, isTemp); } Index: src/backend/utils/misc/guc.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/guc.c,v retrieving revision 1.475 diff -c -p -r1.475 guc.c *** src/backend/utils/misc/guc.c 6 Oct 2008 13:05:36 -0000 1.475 --- src/backend/utils/misc/guc.c 27 Oct 2008 21:50:58 -0000 *************** *** 57,62 **** --- 57,63 ---- #include "regex/regex.h" #include "storage/bufmgr.h" #include "storage/fd.h" + #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" *************** static struct config_bool ConfigureNames *** 762,767 **** --- 763,778 ---- false, NULL, NULL }, { + {"perform_checksum", PGC_SIGHUP, UNGROUPED, + gettext_noop("Forces checksumming of blocks to/from disk."), + gettext_noop("The server will perform a checksum on the block " + "when read from or written to disk in order to detect storage-related " + "corruption.") + }, + &enable_block_checksums, + false, NULL, NULL + }, + { {"log_duration", PGC_SUSET, LOGGING_WHAT, gettext_noop("Logs the duration of each completed SQL statement."), NULL Index: src/backend/utils/misc/postgresql.conf.sample =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/postgresql.conf.sample,v retrieving revision 1.246 diff -c -p -r1.246 postgresql.conf.sample *** src/backend/utils/misc/postgresql.conf.sample 30 Sep 2008 10:52:13 -0000 1.246 --- src/backend/utils/misc/postgresql.conf.sample 27 Oct 2008 21:50:58 -0000 *************** *** 480,485 **** --- 480,490 ---- #transform_null_equals = off + #------------------------------------------------------------------------------ + # CORRUPTION DETECTION + #------------------------------------------------------------------------------ + + #perform_checksum = off # Perform block checksumming to/from disk #------------------------------------------------------------------------------ # CUSTOMIZED OPTIONS Index: src/backend/utils/time/tqual.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/time/tqual.c,v retrieving revision 1.110 diff -c -p -r1.110 tqual.c *** src/backend/utils/time/tqual.c 26 Mar 2008 16:20:47 -0000 1.110 --- src/backend/utils/time/tqual.c 29 Oct 2008 18:05:13 -0000 *************** *** 44,49 **** --- 44,50 ---- #include "access/xact.h" #include "storage/bufmgr.h" #include "storage/procarray.h" + #include "storage/smgr.h" #include "utils/tqual.h" *************** SetHintBits(HeapTupleHeader tuple, Buffe *** 96,101 **** --- 97,104 ---- } tuple->t_infomask |= infomask; + if (enable_block_checksums) + PageSetUnloggedChange(BufferGetPage(buffer)); SetBufferCommitInfoNeedsSave(buffer); } Index: src/include/access/heapam.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/heapam.h,v retrieving revision 1.139 diff -c -p -r1.139 heapam.h *** src/include/access/heapam.h 8 Oct 2008 01:14:44 -0000 1.139 --- src/include/access/heapam.h 28 Oct 2008 20:36:39 -0000 *************** extern XLogRecPtr log_heap_freeze(Relati *** 131,136 **** --- 131,138 ---- OffsetNumber *offsets, int offcnt); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, Page page); + extern XLogRecPtr log_hintbits(RelFileNode *rnode, ForkNumber forkNum, + BlockNumber blk, Page page); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer, Index: src/include/access/htup.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/htup.h,v retrieving revision 1.102 diff -c -p -r1.102 htup.h *** src/include/access/htup.h 28 Oct 2008 15:51:03 -0000 1.102 --- src/include/access/htup.h 28 Oct 2008 16:27:28 -0000 *************** typedef HeapTupleData *HeapTuple; *** 580,585 **** --- 580,586 ---- #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_CLEAN_MOVE 0x20 + #define XLOG_HEAP2_HINTBITS 0x30 /* * All what we need to find changed tuple *************** typedef struct xl_heap_freeze *** 714,719 **** --- 715,730 ---- #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId)) + /* This is what we need to know about hint bits */ + typedef struct xl_heap_hintbits + { + RelFileNode node; + BlockNumber block; + /* HINT BIT ARRAY FOLLOWS AT THE END */ + } xl_heap_hintbits; + + #define SizeOfHeapHintbits (offsetof(xl_heap_hintbits, block) + sizeof(BlockNumber)) + /* HeapTupleHeader functions implemented in utils/time/combocid.c */ extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); Index: src/include/storage/bufpage.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/bufpage.h,v retrieving revision 1.83 diff -c -p -r1.83 bufpage.h *** src/include/storage/bufpage.h 14 Jul 2008 03:22:32 -0000 1.83 --- src/include/storage/bufpage.h 27 Oct 2008 21:50:58 -0000 *************** *** 17,22 **** --- 17,23 ---- #include "access/xlogdefs.h" #include "storage/item.h" #include "storage/off.h" + #include "utils/pg_crc.h" /* * A postgres disk page is an abstraction layered on top of a postgres *************** typedef uint16 LocationIndex; *** 87,92 **** --- 88,94 ---- * * space management information generic to any page * + * pd_checksum - the checksum of the page * pd_lsn - identifies xlog record for last change to this page. * pd_tli - ditto. * pd_flags - flag bits. *************** typedef uint16 LocationIndex; *** 121,127 **** */ typedef struct PageHeaderData { ! /* XXX LSN is member of *any* block, not only page-organized ones */ XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog * record for last change to this page */ uint16 pd_tli; /* least significant bits of the TimeLineID --- 123,130 ---- */ typedef struct PageHeaderData { ! /* XXX CRC & LSN are members of *any* block, not only page-organized ones */ ! pg_crc32 pd_checksum; /* The block-level checksum */ XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog * record for last change to this page */ uint16 pd_tli; /* least significant bits of the TimeLineID *************** typedef PageHeaderData *PageHeader; *** 148,159 **** * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. * Again, this is just a hint. */ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ ! #define PD_VALID_FLAG_BITS 0x0003 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. --- 151,168 ---- * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. * Again, this is just a hint. + * + * PG_UNLOGGED_CHANGE indicates whether a process has set hint bits on the + * page. This is used to determine whether a WAL message needs to be emitted + * before writing the page to disk when page checksums are enabled. */ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ + #define PD_UNLOGGED_CHANGE 0x0004 /* does the page have unlogged hint + bits? */ ! #define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. *************** do { \ *** 352,357 **** --- 361,400 ---- #define PageClearPrunable(page) \ (((PageHeader) (page))->pd_prune_xid = InvalidTransactionId) + /* ---------------------------------------------------------------- + * CRC support + * ---------------------------------------------------------------- + */ + #define PAGE_CHECKSUM_TYPE pg_crc32 + #define SIZEOF_PAGE_CHECKSUM sizeof(PAGE_CHECKSUM_TYPE) + #define PAGE_INVALID_CHECKSUM 0xb79a6e9c + + #define CalcPageChecksum(buffer, sum) \ + do { \ + INIT_CRC32(sum); \ + COMP_CRC32(sum, &buffer[sizeof(pg_crc32)], BLCKSZ - sizeof(pg_crc32)); \ + FIN_CRC32(sum); \ + } while (0) + + /* beware multiple evaluation of argument */ + #define WritePageChecksum(buffer) \ + do { \ + PAGE_CHECKSUM_TYPE chksum; \ + CalcPageChecksum(buffer, chksum); \ + PageSetChecksum(buffer, chksum); \ + } while (0) + + #define PageGetChecksum(page) \ + (((PageHeader) (page))->pd_checksum) + #define PageSetChecksum(page, checksum) \ + (((PageHeader) (page))->pd_checksum = (checksum)) + + #define PageHasUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags & PD_UNLOGGED_CHANGE) + #define PageSetUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags |= PD_UNLOGGED_CHANGE) + #define PageClearUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_UNLOGGED_CHANGE) /* ---------------------------------------------------------------- * extern declarations Index: src/include/storage/smgr.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/smgr.h,v retrieving revision 1.63 diff -c -p -r1.63 smgr.h *** src/include/storage/smgr.h 11 Aug 2008 11:05:11 -0000 1.63 --- src/include/storage/smgr.h 27 Oct 2008 21:50:58 -0000 *************** *** 20,25 **** --- 20,28 ---- #include "storage/relfilenode.h" + /* Perform block checksumming for corruption detection */ + bool enable_block_checksums; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present)
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers