On Wed, Jan 4, 2012 at 1:35 PM, Kevin Grittner <kevin.gritt...@wicourts.gov> wrote: > Simon Riggs wrote: > >> My focus was on getting something working first, then tuning. If >> we're agreed that we have everything apart from the tuning then we >> can proceed with tests to see which works better. > > Sure. I just think you are there already except for what I got into. > > FWIW, moving the modulus application out of the loop is a very > trivial change and has no effect on the results; it's strictly a > performance issue.
New version attached, with your suggested changes included. The page-hole check code is included as well, but it is #ifdef'd out since it isn't a valid check in all cases. -- Simon Riggs http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0cc3296..9b367a3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1701,6 +1701,48 @@ SET ENABLE_SEQSCAN TO OFF; </listitem> </varlistentry> + <varlistentry id="guc-page-checksums" xreflabel="page_checksums"> + <indexterm> + <primary><varname>page_checksums</> configuration parameter</primary> + </indexterm> + <term><varname>page_checksums</varname> (<type>boolean</type>)</term> + <listitem> + <para> + When this parameter is on, the <productname>PostgreSQL</> server + calculates checksums when it writes main database pages to disk, + flagging the page as checksum protected. When this parameter is off, + no checksum is written, only a standard watermark in the page header. + The database may thus contain a mix of pages with checksums and pages + without checksums. + </para> + + <para> + When pages are read into shared buffers any page flagged with a + checksum has the checksum re-calculated and compared against the + stored value to provide greatly improved validation of page contents. + </para> + + <para> + Writes via temp_buffers are not checksummed. + </para> + + <para> + Turning this parameter off speeds normal operation, but + might allow data corruption to go unnoticed. The checksum uses + 16-bit checksums, using the fast Fletcher 16 algorithm. With this + parameter enabled there is still a non-zero probability that an error + could go undetected, as well as a non-zero probability of false + positives. + </para> + + <para> + This parameter can only be set in the <filename>postgresql.conf</> + file or on the server command line. + The default is <literal>off</>. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers"> <term><varname>wal_buffers</varname> (<type>integer</type>)</term> <indexterm> diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 91cc001..a43b7be 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -440,7 +440,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgrread(smgr, forkNum, blockNum, (char *) bufBlock); /* check for garbage data */ - if (!PageHeaderIsValid((PageHeader) bufBlock)) + if (!PageIsVerified((Page) bufBlock)) { if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) { @@ -1860,6 +1860,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) { XLogRecPtr recptr; ErrorContextCallback errcontext; + Block bufBlock; + char *bufCopy; /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns @@ -1907,10 +1909,24 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); + /* + * Set page verification info immediately before we write the buffer to disk. + * Once we have flushed the buffer is marked clean again, meaning it can + * be replaced quickly and silently with another data block, so we must + * write verification info now. For efficiency, the process of cleaning + * and page replacement is asynchronous, so we can't do this *only* when + * we are about to replace the buffer, we need to do this for every flush. + */ + bufBlock = BufHdrGetBlock(buf); + bufCopy = PageSetVerificationInfo((Page) bufBlock); + + /* + * bufToWrite is either the shared buffer or a copy, as appropriate. 
+ */ smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, - (char *) BufHdrGetBlock(buf), + (char *) bufCopy, false); pgBufferUsage.shared_blks_written++; @@ -1921,6 +1937,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) */ TerminateBufferIO(buf, true, 0); + /* XXX Assert(buf is not BM_JUST_DIRTIED) */ + TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum, buf->tag.blockNum, reln->smgr_rnode.node.spcNode, diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 096d36a..a220310 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -200,6 +200,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, /* Find smgr relation for buffer */ oreln = smgropen(bufHdr->tag.rnode, MyBackendId); + /* XXX do we want to write checksums for local buffers? An option? */ + /* And write... */ smgrwrite(oreln, bufHdr->tag.forkNum, diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 90a731c..c49c2e1 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -16,6 +16,12 @@ #include "access/htup.h" +bool page_checksums = false; + +static char pageCopy[BLCKSZ]; /* temporary buffer to allow checksum calculation */ + +static bool PageVerificationInfoOK(Page page); +static uint16 PageCalcChecksum16(Page page); /* ---------------------------------------------------------------- * Page support functions @@ -25,6 +31,10 @@ /* * PageInit * Initializes the contents of a page. + * Note that we don't automatically add a checksum, or flag that the + * page has a checksum field. We start with a normal page layout and defer + * the decision on what page verification will be written just before + * we write the block to disk. */ void PageInit(Page page, Size pageSize, Size specialSize) @@ -67,20 +77,20 @@ PageInit(Page page, Size pageSize, Size specialSize) * will clean up such a page and make it usable. 
*/ bool -PageHeaderIsValid(PageHeader page) +PageIsVerified(Page page) { + PageHeader p = (PageHeader) page; char *pagebytes; int i; /* Check normal case */ - if (PageGetPageSize(page) == BLCKSZ && - PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION && - (page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && - page->pd_lower >= SizeOfPageHeaderData && - page->pd_lower <= page->pd_upper && - page->pd_upper <= page->pd_special && - page->pd_special <= BLCKSZ && - page->pd_special == MAXALIGN(page->pd_special)) + if (PageVerificationInfoOK(page) && + (p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && + p->pd_lower >= SizeOfPageHeaderData && + p->pd_lower <= p->pd_upper && + p->pd_upper <= p->pd_special && + p->pd_special <= BLCKSZ && + p->pd_special == MAXALIGN(p->pd_special)) return true; /* Check all-zeroes case */ @@ -93,7 +103,6 @@ PageHeaderIsValid(PageHeader page) return true; } - /* * PageAddItem * @@ -827,3 +836,239 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) pfree(itemidbase); } + +/* + * Test whether the page verification information is correct or not. + * + * IMPORTANT NOTE - + * Verification info is not valid at all times on a data page. We set + * verification info before we flush page/buffer, and implicitly invalidate + * verification info when we write to the page. A heavily accessed buffer + * might then spend most of its life with invalid page verification info, + * so testing verification info on random pages in the buffer pool will tell + * you nothing. The reason for this is that page verification info protects + * Postgres data from errors on the filesystems on which we rely. We do not + * protect buffers against uncorrectable memory errors, since these have a + * very low measured incidence according to research on large server farms, + * http://www.google.com/research/pubs/archive/35162.pdf, discussed 2010/12/22. 
+ * + * To confirm your understanding that means that WAL-logged changes to a page + * do NOT update the page verification info, so full page images may not have + * correct verification information on them. But those page images have the + * WAL CRC covering them and so are verified separately from this mechanism. + * WAL replay ignores page verification info unless it writes out or reads in + * blocks from disk; restoring full page writes does not check verification + * info via this function. + * + * The best way to understand this is that WAL CRCs protect records entering + * the WAL stream, and page verification protects blocks entering and leaving + * the buffer pool. They are similar in purpose, yet completely separate. + * Together they ensure we are able to detect errors in data leaving and + * re-entering PostgreSQL controlled memory. + * + * Note also that the verification mechanism can vary from page to page. + * All we do here is look at what the page itself says is the verification + * mechanism and then apply that test. This allows us to run without the CPU + * cost of verification if we choose, as well as to provide an upgrade path + * for anyone doing direct upgrades using pg_upgrade. + * + * There is some concern that trusting page data to say how to check page + * data is dangerously self-referential. To ensure no mistakes we set two + * non-adjacent bits to signify that the page has a checksum and + * should be verified when that block is read back into a buffer. + * We use two bits in case a multiple bit error removes one of the checksum + * flags *and* destroys data, which would lead to skipping the checksum check + * and silently accepting bad data. + * + * Note also that this returns a boolean, not a full damage assessment. 
+ */ +static bool +PageVerificationInfoOK(Page page) +{ + PageHeader p = (PageHeader) page; + + /* + * We set two non-adjacent bits to signify that the page has a checksum and + * should be verified against that block is read back into a buffer. + * We use two bits in case a multiple bit error removes one of the checksum + * flags and destroys data, which would lead to skipping the checksum check + * and silently accepting bad data. + */ + if (PageHasChecksumFlag1(p) && PageHasChecksumFlag2(p)) + { + uint16 checksum = PageCalcChecksum16(page); + + if (checksum == p->pd_verify.pd_checksum16) + { +#ifdef CHECK_HOLE + /* Also check page hole is all-zeroes */ + char *pagebytes; + bool empty = true; + int i; + + pagebytes = (char *) page; + for (i = p->pd_lower; i < p->pd_upper; i++) + { + if (pagebytes[i] != 0) + { + empty = false; + break; + } + } + + if (!empty) + elog(LOG, "hole was not empty at byte %d pd_lower %d pd_upper %d", + i, p->pd_lower, p->pd_upper); +#endif + return true; + } + + elog(LOG, "page verification failed - checksum was %u page checksum field is %u", + checksum, p->pd_verify.pd_checksum16); + } + else if (!PageHasChecksumFlag1(p) && !PageHasChecksumFlag2(p)) + { + if (PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION && + PageGetPageSize(p) == BLCKSZ) + return true; + } + else + elog(LOG, "page verification failed - page has one checksum flag set"); + + return false; +} + +/* + * Set verification info for page. + * + * Either we set a new checksum, or we set the standard watermark. We must + * not leave an invalid checksum in place. Note that the verification info is + * not WAL logged, whereas the data changes to pages are, so data is safe + * whether or not we have page_checksums enabled. The purpose of checksums + * is to detect page corruption to allow replacement from backup. + * + * Returns a pointer to the block-sized data that needs to be written. That + * allows us to either copy, or not, depending upon whether we checksum. 
+ */ +char * +PageSetVerificationInfo(Page page) +{ + PageHeader p; + + if (PageIsNew(page)) + return (char *) page; + + if (page_checksums) + { + /* + * We make a copy iff we need to calculate a checksum because other + * backends may set hint bits on this page while we write, which + * would mean the checksum differs from the page contents. It doesn't + * matter if we include or exclude hints during the copy, as long + * as we write a valid page and associated checksum. + */ + memcpy(&pageCopy, page, BLCKSZ); + + p = (PageHeader) &pageCopy; + p->pd_flags |= PD_CHECKSUM; + p->pd_verify.pd_checksum16 = PageCalcChecksum16((Page) &pageCopy); + + return (char *) &pageCopy; + } + + p = (PageHeader) page; + + if (PageHasChecksumFlag1(p) || PageHasChecksumFlag2(p)) + { + /* ensure any older checksum info is overwritten with watermark */ + p->pd_flags &= ~PD_CHECKSUM; + PageSetPageSizeAndVersion(p, BLCKSZ, PG_PAGE_LAYOUT_VERSION); + } + + return (char *) page; +} + +/* + * Calculate checksum for a PostgreSQL Page. We do this in 3 steps, first + * we calculate the checksum for the header, avoiding the verification + * info, which will be added afterwards. Next, we add the line pointers up to + * the hole in the middle of the block at pd_lower. Last, we add the tail + * of the page from pd_upper to the end of page. + */ +static uint16 +PageCalcChecksum16(Page page) +{ +#define PAGE_VERIFICATION_USES_FLETCHER16 (true) +#ifdef PAGE_VERIFICATION_USES_FLETCHER16 + /* + * Following calculation is a Flecther's 16 checksum. The calc is isolated + * here and tuning and/or replacement algorithms are possible. 
+ * + * XXX present implementation is raw, untuned calculation, please tweak + */ + PageHeader p = (PageHeader) page; + uint page_header_stop = (uint)(offsetof(PageHeaderData, pd_special) + sizeof(LocationIndex)); + uint page_lower_start = (uint)(offsetof(PageHeaderData, pd_prune_xid)); + uint page_lower_stop; + uint sum1 = 0; + uint64 sum2 = 0; + int i; + + /* + * Avoid calculating checksum if page is new, just return a value that + * will cause the check to fail. We may still pass the all-zeroes check. + */ + if (PageIsNew(page)) + return 1; + + /* + * Just add in the pd_prune_xid if there are no line pointers yet. + */ + page_lower_stop = p->pd_lower; + if (page_lower_stop == 0) + page_lower_stop = page_lower_start + sizeof(TransactionId); + + Assert(p->pd_upper != 0); + +#ifdef DEBUG_CHECKSUM + elog(LOG, "calculating checksum for %u-%u %u-%u %u-%u", + 0, /* page_header_start */ + page_header_stop, + page_lower_start, + page_lower_stop, + p->pd_upper, + BLCKSZ + ); +#endif + +#define COMP_F16(from, to) \ +do { \ + for (i = from; i < to; i++) \ + { \ + sum1 = sum1 + page[i]; \ + sum2 = sum1 + sum2; \ + } \ + sum1 %= 255; \ + sum2 %= 255; \ +} while (0); \ + + COMP_F16(0, + page_header_stop); + + /* ignore the checksum field since not done yet... */ + + COMP_F16(page_lower_start, + page_lower_stop); + + /* ignore the hole in the middle of the block */ + + COMP_F16(p->pd_upper, + BLCKSZ - 1); + +#ifdef DEBUG_CHECKSUM + elog(LOG, "checksum %u", ((sum2 << 8) | sum1)); +#endif + + return ((sum2 << 8) | sum1); +#endif +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5c910dd..e868280 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -830,6 +830,20 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"page_checksums", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("Marks database blocks with a checksum before writing them to disk. 
"), + gettext_noop("When enabled all database blocks will be marked with a checksums before writing to disk. " + "When we read a database block from disk the checksum is checked, if it exists. " + "If there is no checksum marked yet then no check is performed, though a " + "checksum will be added later when we re-write the database block. " + "When disabled checksums will be ignored, even if the block was marked " + "with checksum. When disabled checksums will not be added to database blocks.") + }, + &page_checksums, + true, + NULL, NULL, NULL + }, + { {"full_page_writes", PGC_SIGHUP, WAL_SETTINGS, gettext_noop("Writes full pages to WAL when first modified after a checkpoint."), gettext_noop("A page write in process during an operating system crash might be " diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 315db46..6f81023 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -150,15 +150,21 @@ #------------------------------------------------------------------------------ -# WRITE AHEAD LOG +# WRITE AHEAD LOG & RELIABILITY #------------------------------------------------------------------------------ -# - Settings - +# - Reliability - -#wal_level = minimal # minimal, archive, or hot_standby - # (change requires restart) +#page_checksums = off # calculate checksum before database I/O +#full_page_writes = on # recover from partial page writes #fsync = on # turns forced synchronization on or off + #synchronous_commit = on # synchronization level; on, off, or local + +# - Write Ahead Log - + +#wal_level = minimal # minimal, archive, or hot_standby + # (change requires restart) #wal_sync_method = fsync # the default is the first option # supported by the operating system: # open_datasync @@ -166,7 +172,6 @@ # fsync # fsync_writethrough # open_sync -#full_page_writes = on # recover from partial page writes #wal_buffers = -1 # min 32kB, -1 sets 
based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 1ab64e0..dab9189 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -18,6 +18,8 @@ #include "storage/item.h" #include "storage/off.h" +extern bool page_checksums; + /* * A postgres disk page is an abstraction layered on top of a postgres * disk block (which is simply a unit of i/o, see block.h). @@ -93,7 +95,7 @@ typedef uint16 LocationIndex; * pd_lower - offset to start of free space. * pd_upper - offset to end of free space. * pd_special - offset to start of special space. - * pd_pagesize_version - size in bytes and page layout version number. + * pd_verify - page verification information of different kinds * pd_prune_xid - oldest XID among potentially prunable tuples on page. * * The LSN is used by the buffer manager to enforce the basic rule of WAL: @@ -106,7 +108,8 @@ typedef uint16 LocationIndex; * pd_prune_xid is a hint field that helps determine whether pruning will be * useful. It is currently unused in index pages. * - * The page version number and page size are packed together into a single + * For verification we store either a 16 bit checksum or a watermark of + * the page version number and page size packed together into a single * uint16 field. This is for historical reasons: before PostgreSQL 7.3, * there was no concept of a page version number, and doing it this way * lets us pretend that pre-7.3 databases have page version number zero. 
@@ -130,7 +133,13 @@ typedef struct PageHeaderData LocationIndex pd_lower; /* offset to start of free space */ LocationIndex pd_upper; /* offset to end of free space */ LocationIndex pd_special; /* offset to start of special space */ - uint16 pd_pagesize_version; + + union + { + uint16 pd_pagesize_version; + uint16 pd_checksum16; + } pd_verify; /* page verification data */ + TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ ItemIdData pd_linp[1]; /* beginning of line pointer array */ } PageHeaderData; @@ -155,7 +164,16 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x800F /* OR of all non-checksum pd_flags bits */ + +#define PD_CHECKSUM1 0x0008 /* First checksum bit */ +#define PD_CHECKSUM2 0x8000 /* Second checksum bit */ +#define PD_CHECKSUM 0x8008 /* OR of both checksum flags */ + +#define PageHasChecksumFlag1(page) \ + ((((PageHeader) (page))->pd_flags & PD_CHECKSUM1) == PD_CHECKSUM1) +#define PageHasChecksumFlag2(page) \ + ((((PageHeader) (page))->pd_flags & PD_CHECKSUM2) == PD_CHECKSUM2) /* * Page layout version number 0 is for pre-7.3 Postgres releases. @@ -165,6 +183,8 @@ typedef PageHeaderData *PageHeader; * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and * added the pd_flags field (by stealing some bits from pd_tli), * as well as adding the pd_prune_xid field (which enlarges the header). + * Release 9.2 uses 4 as well, though with changed meaning of verification bits. + * We deliberately don't bump the page version for that, to allow upgrades. */ #define PG_PAGE_LAYOUT_VERSION 4 @@ -231,19 +251,22 @@ typedef PageHeaderData *PageHeader; * PageGetPageSize * Returns the page size of a page. * - * this can only be called on a formatted page (unlike - * BufferGetPageSize, which can be called on an unformatted page). 
- * however, it can be called on a page that is not stored in a buffer. + * Since PageSizeIsValid() when pagesize == BLCKSZ, just written BLCKSZ. + * This can be called on any page, initialised or not, in or out of buffers. + * You might think this can vary at runtime but you'd be wrong, since pages + * frequently need to occupy buffers and pages are copied from one to another + * so there are many hidden assumptions that this simple definition is true. */ -#define PageGetPageSize(page) \ - ((Size) (((PageHeader) (page))->pd_pagesize_version & (uint16) 0xFF00)) +#define PageGetPageSize(page) (BLCKSZ) /* * PageGetPageLayoutVersion * Returns the page layout version of a page. + * + * Must not be used on a page that is flagged for checksums. */ #define PageGetPageLayoutVersion(page) \ - (((PageHeader) (page))->pd_pagesize_version & 0x00FF) + (((PageHeader) (page))->pd_verify.pd_pagesize_version & 0x00FF) /* * PageSetPageSizeAndVersion @@ -251,12 +274,14 @@ typedef PageHeaderData *PageHeader; * * We could support setting these two values separately, but there's * no real need for it at the moment. + * + * Must not be used on a page that is flagged for checksums. 
*/ #define PageSetPageSizeAndVersion(page, size, version) \ ( \ AssertMacro(((size) & 0xFF00) == (size)), \ AssertMacro(((version) & 0x00FF) == (version)), \ - ((PageHeader) (page))->pd_pagesize_version = (size) | (version) \ + ((PageHeader) (page))->pd_verify.pd_pagesize_version = (size) | (version) \ ) /* ---------------- @@ -368,7 +393,7 @@ do { \ */ extern void PageInit(Page page, Size pageSize, Size specialSize); -extern bool PageHeaderIsValid(PageHeader page); +extern bool PageIsVerified(Page page); extern OffsetNumber PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, bool overwrite, bool is_heap); extern Page PageGetTempPage(Page page); @@ -381,5 +406,6 @@ extern Size PageGetExactFreeSpace(Page page); extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); +extern char *PageSetVerificationInfo(Page page); #endif /* BUFPAGE_H */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers