On Sat, Jan 7, 2012 at 11:09 AM, Simon Riggs <si...@2ndquadrant.com> wrote:
> On Sat, Jan 7, 2012 at 10:55 AM, Simon Riggs <si...@2ndquadrant.com> wrote:
>
>> So there isn't any problem with there being incorrect checksums on
>> blocks and you can turn the parameter on and off as often and as
>> easily as you want. I think it can be USERSET but I wouldn't want to
>> encourage users to see turning it off as a performance tuning feature.
>> If the admin turns it on for the server, it's on, so it's SIGHUP.
>>
>> Any holes in that I haven't noticed?
>
> And of course, as soon as I wrote that I thought of the problem. We
> mustn't make a write that hasn't been covered by a FPW, so we must
> know ahead of time whether to WAL log hints or not. We can't simply
> turn it on/off any longer, now that we have to WAL log hint bits also.
> So thanks for making me think of that.
>
> We *could* make it turn on/off at each checkpoint, but it's easier just
> to say that it can be turned on/off at server start.

Attached patch v6 now handles hint bits and checksums correctly,
following Heikki's comments.

In recovery, setting a hint doesn't dirty a block if it wasn't already
dirty. So we can write some hints, and we can set others but not write
them.

Lots of comments in the code.

-- 
 Simon Riggs                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0cc3296..3cb8d2a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1701,6 +1701,47 @@ SET ENABLE_SEQSCAN TO OFF;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-page-checksums" xreflabel="page_checksums">
+      <indexterm>
+       <primary><varname>page_checksums</> configuration parameter</primary>
+      </indexterm>
+      <term><varname>page_checksums</varname> (<type>boolean</type>)</term>
+      <listitem>
+       <para>
+        When this parameter is on, the <productname>PostgreSQL</> server
+        calculates checksums when it writes main database pages to disk,
+        flagging the page as checksum protected.  When this parameter is off,
+        no checksum is written, only a standard watermark in the page header.
+        The database may thus contain a mix of pages with checksums and pages
+        without checksums.
+       </para>
+
+       <para>
+        When pages are read into shared buffers any page flagged with a
+        checksum has the checksum re-calculated and compared against the
+        stored value to provide greatly improved validation of page contents.
+       </para>
+
+       <para>
+        Writes via temp_buffers are not checksummed.
+       </para>
+
+       <para>
+        Turning this parameter off speeds normal operation, but
+        might allow data corruption to go unnoticed. Each page is protected
+        by a 16-bit checksum computed with the fast Fletcher-16 algorithm.
+        With this parameter enabled there is still a non-zero probability
+        that an error could go undetected, as well as a non-zero probability
+        of false positives.
+       </para>
+
+       <para>
+        This parameter can only be set at server start.
+        The default is <literal>off</>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers">
       <term><varname>wal_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8e65962..c9538d3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -708,6 +708,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+	bool		IsHint = (rmid == RM_SMGR_ID && info == XLOG_SMGR_HINT);
 
 	/* cross-check on whether we should be here or not */
 	if (!XLogInsertAllowed())
@@ -975,6 +976,18 @@ begin:;
 	}
 
 	/*
+	 * If this is a hint record and we don't need a backup block then
+	 * we have no more work to do and can exit quickly without inserting
+	 * a WAL record at all. In that case return InvalidXLogRecPtr.
+	 */
+	if (IsHint && !(info & XLR_BKP_BLOCK_MASK))
+	{
+		LWLockRelease(WALInsertLock);
+		END_CRIT_SECTION();
+		return InvalidXLogRecPtr;
+	}
+
+	/*
 	 * If there isn't enough space on the current XLOG page for a record
 	 * header, advance to the next page (leaving the unused space as zeroes).
 	 */
@@ -3670,6 +3683,13 @@ RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
 				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
 		}
 
+		/*
+		 * Any checksum set on this page will be invalid. We don't need
+		 * to reset it here since it will be reset before being written
+		 * but it seems worth doing this for general sanity and hygiene.
+		 */
+		PageSetPageSizeAndVersion(page, BLCKSZ, PG_PAGE_LAYOUT_VERSION);
+
 		PageSetLSN(page, lsn);
 		PageSetTLI(page, ThisTimeLineID);
 		MarkBufferDirty(buffer);
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index a017101..618c8f9 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -20,6 +20,7 @@
 #include "postgres.h"
 
 #include "access/visibilitymap.h"
+#include "access/transam.h"
 #include "access/xact.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
@@ -70,6 +71,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 /* XLOG gives us high 4 bits */
 #define XLOG_SMGR_CREATE	0x10
 #define XLOG_SMGR_TRUNCATE	0x20
+#define XLOG_SMGR_HINT		0x40
 
 typedef struct xl_smgr_create
 {
@@ -477,19 +479,74 @@ AtSubAbort_smgr(void)
 	smgrDoPendingDeletes(false);
 }
 
+/*
+ * Write a backup block if needed when we are setting a hint on 'buffer'.
+ *
+ * Deciding the "if needed" bit is delicate and requires us to either
+ * grab WALInsertLock or check the info_lck spinlock. If we check the
+ * spinlock and it says Yes then we will need to get WALInsertLock as well,
+ * so the design choice here is to just go straight for the WALInsertLock
+ * and trust that calls to this function are minimised elsewhere.
+ *
+ * Callable while holding share lock on the buffer content.
+ *
+ * Possible that multiple concurrent backends could attempt to write
+ * WAL records. In that case, more than one backup block may be recorded
+ * though that isn't important to the outcome and the backup blocks are
+ * likely to be identical anyway.
+ */
+#define	SMGR_HINT_WATERMARK		13579
+void
+smgr_buffer_hint(Buffer buffer)
+{
+	/* Make an XLOG entry reporting the hint */
+	XLogRecPtr	lsn;
+	XLogRecData rdata[2];
+	int			watermark = SMGR_HINT_WATERMARK;
+
+	/*
+	 * Not allowed to have zero-length records, so use a small watermark
+	 */
+	rdata[0].data = (char *) (&watermark);
+	rdata[0].len = sizeof(watermark);
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].buffer_std = false;
+	rdata[0].next = &(rdata[1]);
+
+	rdata[1].data = NULL;
+	rdata[1].len = 0;
+	rdata[1].buffer = buffer;
+	rdata[1].buffer_std = true;
+	rdata[1].next = NULL;
+
+	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_HINT, rdata);
+
+	/*
+	 * XLogInsert returns InvalidXLogRecPtr for a hint record when no backup
+	 * block was needed, in which case nothing was written to WAL. Only set
+	 * the page LSN if we actually wrote a backup block.
+	 */
+	if (!XLByteEQ(InvalidXLogRecPtr, lsn))
+	{
+		Page 	page = BufferGetPage(buffer);
+		PageSetLSN(page, lsn);
+		elog(DEBUG2, "inserted backup block for hint bit");
+	}
+}
+
 void
 smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 {
 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
 
-	/* Backup blocks are not used in smgr records */
-	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
-
 	if (info == XLOG_SMGR_CREATE)
 	{
 		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
 		SMgrRelation reln;
 
+		/* Backup blocks are not used in smgr create records */
+		Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
 		reln = smgropen(xlrec->rnode, InvalidBackendId);
 		smgrcreate(reln, xlrec->forkNum, true);
 	}
@@ -499,6 +556,9 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 		SMgrRelation reln;
 		Relation	rel;
 
+		/* Backup blocks are not used in smgr truncate records */
+		Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
 		reln = smgropen(xlrec->rnode, InvalidBackendId);
 
 		/*
@@ -524,6 +584,28 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		FreeFakeRelcacheEntry(rel);
 	}
+	else if (info == XLOG_SMGR_HINT)
+	{
+		int	*watermark = (int *) XLogRecGetData(record);
+
+		/* Check the watermark is correct for the hint record */
+		Assert(*watermark == SMGR_HINT_WATERMARK);
+
+		/* Backup blocks must be present for smgr hint records */
+		Assert(record->xl_info & XLR_BKP_BLOCK_MASK);
+
+		/*
+		 * Hint records have no information that needs to be replayed.
+		 * The sole purpose of them is to ensure that a hint bit does
+		 * not cause a checksum invalidation if a hint bit write should
+		 * cause a torn page. So the body of the record is empty but
+		 * there can be one backup block.
+		 *
+		 * Since the only change in the backup block is a hint bit,
+		 * there is no conflict with Hot Standby.
+		 */
+		RestoreBkpBlocks(lsn, record, false);
+	}
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
 }
@@ -550,6 +632,8 @@ smgr_desc(StringInfo buf, uint8 xl_info, char *rec)
 						 xlrec->blkno);
 		pfree(path);
 	}
+	else if (info == XLOG_SMGR_HINT)
+		appendStringInfo(buf, "buffer hint");
 	else
 		appendStringInfo(buf, "UNKNOWN");
 }
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index f7712a9..8a8232b 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -1092,10 +1092,10 @@ read_info(SeqTable elm, Relation rel, Buffer *buf)
 	 */
 	if (HeapTupleHeaderGetXmax(tuple.t_data) != InvalidTransactionId)
 	{
-		HeapTupleHeaderSetXmax(tuple.t_data, InvalidTransactionId);
-		tuple.t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
-		tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(*buf);
+			HeapTupleHeaderSetXmax(tuple.t_data, InvalidTransactionId);
+			tuple.t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
+			tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
+			SetBufferCommitInfoNeedsSave(*buf);
 	}
 
 	seq = (Form_pg_sequence) GETSTRUCT(&tuple);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 91cc001..42d43cd 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -34,6 +34,7 @@
 #include <unistd.h>
 
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "executor/instrument.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
@@ -440,7 +441,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
 
 			/* check for garbage data */
-			if (!PageHeaderIsValid((PageHeader) bufBlock))
+			if (!PageIsVerified((Page) bufBlock))
 			{
 				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 				{
@@ -1860,6 +1861,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 {
 	XLogRecPtr	recptr;
 	ErrorContextCallback errcontext;
+	Block		bufBlock;
+	char		*bufCopy;
 
 	/*
 	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
@@ -1907,10 +1910,24 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	buf->flags &= ~BM_JUST_DIRTIED;
 	UnlockBufHdr(buf);
 
+	/*
+	 * Set page verification info immediately before we write the buffer to disk.
+	 * Once we have flushed the buffer is marked clean again, meaning it can
+	 * be replaced quickly and silently with another data block, so we must
+	 * write verification info now. For efficiency, the process of cleaning
+	 * and page replacement is asynchronous, so we can't do this *only* when
+	 * we are about to replace the buffer, we need to do this for every flush.
+	 */
+	bufBlock = BufHdrGetBlock(buf);
+	bufCopy = PageSetVerificationInfo((Page) bufBlock);
+
+	/*
+	 * bufCopy is either the shared buffer or a copy, as appropriate.
+	 */
 	smgrwrite(reln,
 			  buf->tag.forkNum,
 			  buf->tag.blockNum,
-			  (char *) BufHdrGetBlock(buf),
+			  (char *) bufCopy,
 			  false);
 
 	pgBufferUsage.shared_blks_written++;
@@ -1921,6 +1938,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	 */
 	TerminateBufferIO(buf, true, 0);
 
+	/* XXX Assert(buf is not BM_JUST_DIRTIED) */
+
 	TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
 									   buf->tag.blockNum,
 									   reln->smgr_rnode.node.spcNode,
@@ -2341,6 +2360,41 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 		(BM_DIRTY | BM_JUST_DIRTIED))
 	{
+		/*
+		 * If we're writing checksums and we care about torn pages then we
+		 * cannot dirty a page during recovery as a result of a hint.
+		 * We can set the hint, just not dirty the page as a result.
+		 *
+		 * See long discussion in bufpage.c
+		 */
+		if (HintsMustNotDirtyPage())
+			return;
+
+		/*
+		 * Write a full page into WAL iff this is the first change on the
+		 * block since the last checkpoint. That will never be the case
+		 * if the block is already dirty because we either made a change
+		 * or set a hint already. Note that aggressive cleaning of blocks
+		 * dirtied by hint bit setting would increase the call rate.
+		 * Bulk setting of hint bits would reduce the call rate...
+		 *
+		 * We must issue the WAL record before we mark the buffer dirty.
+		 * Otherwise we might write the page before we write the WAL.
+		 * That causes a race condition, since a checkpoint might
+		 * occur between writing the WAL record and marking the buffer dirty.
+		 * We solve that with a kluge, but one that is already in use
+		 * during transaction commit to prevent race conditions.
+		 * Basically, we simply prevent the checkpoint WAL record from
+		 * being written until we have marked the buffer dirty. We don't
+		 * start the checkpoint flush until we have marked dirty, so our
+		 * checkpoint must flush the change to disk successfully or the
+		 * checkpoint never gets written, so crash recovery will set us right.
+		 *
+		 * XXX rename PGPROC variable later; keep it same now for clarity
+		 */
+		MyPgXact->inCommit = true;
+		smgr_buffer_hint(buffer);
+
 		LockBufHdr(bufHdr);
 		Assert(bufHdr->refcount > 0);
 		if (!(bufHdr->flags & BM_DIRTY))
@@ -2351,6 +2405,7 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
 		}
 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
 		UnlockBufHdr(bufHdr);
+		MyPgXact->inCommit = false;
 	}
 }
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 096d36a..a220310 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -200,6 +200,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		/* Find smgr relation for buffer */
 		oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
 
+		/* XXX do we want to write checksums for local buffers? An option? */
+
 		/* And write... */
 		smgrwrite(oreln,
 				  bufHdr->tag.forkNum,
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 90a731c..e07f133 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -16,6 +16,12 @@
 
 #include "access/htup.h"
 
+bool page_checksums = false;
+
+static char pageCopy[BLCKSZ];	/* temporary buffer to allow checksum calculation */
+
+static bool PageVerificationInfoOK(Page page);
+static uint16 PageCalcChecksum16(Page page);
 
 /* ----------------------------------------------------------------
  *						Page support functions
@@ -25,6 +31,10 @@
 /*
  * PageInit
  *		Initializes the contents of a page.
+ *		Note that we don't automatically add a checksum, or flag that the
+ * 		page has a checksum field. We start with a normal page layout and defer
+ *		the decision on what page verification will be written just before
+ *		we write the block to disk.
  */
 void
 PageInit(Page page, Size pageSize, Size specialSize)
@@ -67,20 +77,20 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * will clean up such a page and make it usable.
  */
 bool
-PageHeaderIsValid(PageHeader page)
+PageIsVerified(Page page)
 {
+	PageHeader	p = (PageHeader) page;
 	char	   *pagebytes;
 	int			i;
 
 	/* Check normal case */
-	if (PageGetPageSize(page) == BLCKSZ &&
-		PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION &&
-		(page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
-		page->pd_lower >= SizeOfPageHeaderData &&
-		page->pd_lower <= page->pd_upper &&
-		page->pd_upper <= page->pd_special &&
-		page->pd_special <= BLCKSZ &&
-		page->pd_special == MAXALIGN(page->pd_special))
+	if (PageVerificationInfoOK(page) &&
+		(p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
+		p->pd_lower >= SizeOfPageHeaderData &&
+		p->pd_lower <= p->pd_upper &&
+		p->pd_upper <= p->pd_special &&
+		p->pd_special <= BLCKSZ &&
+		p->pd_special == MAXALIGN(p->pd_special))
 		return true;
 
 	/* Check all-zeroes case */
@@ -93,7 +103,6 @@ PageHeaderIsValid(PageHeader page)
 	return true;
 }
 
-
 /*
  *	PageAddItem
  *
@@ -827,3 +836,258 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 
 	pfree(itemidbase);
 }
+
+/*
+ * Test whether the page verification information is correct or not.
+ *
+ * IMPORTANT NOTE -
+ * Verification info is not valid at all times on a data page. We set
+ * verification info before we flush page/buffer, and implicitly invalidate
+ * verification info when we write to the page. A heavily accessed buffer
+ * might then spend most of its life with invalid page verification info,
+ * so testing verification info on random pages in the buffer pool will tell
+ * you nothing. The reason for this is that page verification info protects
+ * Postgres data from errors on the filesystems on which we rely. We do not
+ * protect buffers against uncorrectable memory errors, since these have a
+ * very low measured incidence according to research on large server farms,
+ * http://www.google.com/research/pubs/archive/35162.pdf, discussed 2010/12/22.
+ *
+ * To confirm your understanding that means that WAL-logged changes to a page
+ * do NOT update the page verification info, so full page images may not have
+ * correct verification information on them. But those page images have the
+ * WAL CRC covering them and so are verified separately from this mechanism.
+ *
+ * Any write of a data block can cause a torn page if the write is unsuccessful.
+ * Full page writes protect us from that, which are stored in WAL. Setting
+ * hint bits when a page is already dirty is OK because a full page write
+ * must already have been written for that since the last checkpoint.
+ * Setting hint bits on an otherwise clean page can allow torn pages; this
+ * doesn't normally matter since they are just hints. When the page has
+ * checksums, losing a few bits would cause the checksum to be invalid.
+ * So if we have full_page_writes = on and page_checksums = on then we must
+ * write a WAL record specifically so that we record a full page image in WAL.
+ * New WAL records cannot be written during recovery, so hint bits set
+ * during recovery must not dirty the page if the buffer is not already dirty,
+ * when page_checksums = on. Enforced by checking HintsMustNotDirtyPage()
+ *
+ * So we cannot enable/disable page_checksums except at a checkpoint if
+ * full_page_writes is enabled. We choose to only allow changes at server start.
+ *
+ * WAL replay ignores page verification info unless it writes out or reads in
+ * blocks from disk; restoring full page writes does not check verification
+ * info via this function. So when restoring backup blocks we overwrite the
+ * verification field with the standard page size/version watermark.
+ * In recovery, since we only dirty a block when we have a full page image
+ * available if we crash, we are fully OK to use page verification.
+ *
+ * The best way to understand this is that WAL CRCs protect records entering
+ * the WAL stream, and page verification protects blocks entering and leaving
+ * the buffer pool. They are similar in purpose, yet completely separate.
+ * Together they ensure we are able to detect errors in data leaving and
+ * re-entering PostgreSQL controlled memory.
+ *
+ * Note also that the verification mechanism can vary from page to page.
+ * All we do here is look at what the page itself says is the verification
+ * mechanism and then apply that test. This allows us to run without the CPU
+ * cost of verification if we choose, as well as to provide an upgrade path
+ * for anyone doing direct upgrades using pg_upgrade.
+ *
+ * There is some concern that trusting page data to say how to check page
+ * data is dangerously self-referential. To ensure no mistakes we set two
+ * non-adjacent bits to signify that the page has a checksum and
+ * should be verified when that block is read back into a buffer.
+ * We use two bits in case a multiple bit error removes one of the checksum
+ * flags *and* destroys data, which would lead to skipping the checksum check
+ * and silently accepting bad data.
+ *
+ * Note also that this returns a boolean, not a full damage assessment.
+ */
+static bool
+PageVerificationInfoOK(Page page)
+{
+	PageHeader	p = (PageHeader) page;
+
+	/*
+	 * Two non-adjacent flag bits signify that the page has a checksum and
+	 * should be verified when that block is read back into a buffer. Two bits
+	 * are used in case a multiple bit error removes one of the checksum flags
+	 * and destroys data, which would otherwise silently accept bad data.
+	 */
+	if (PageHasChecksumFlag1(p) && PageHasChecksumFlag2(p))
+	{
+		uint16	checksum = PageCalcChecksum16(page);
+
+		if (checksum == p->pd_verify.pd_checksum16)
+		{
+#ifdef CHECK_HOLE
+			/* Also check page hole is all-zeroes */
+			char	   *pagebytes;
+			bool		empty = true;
+			int			i;
+
+			pagebytes = (char *) page;
+			for (i = p->pd_lower; i < p->pd_upper; i++)
+			{
+				if (pagebytes[i] != 0)
+				{
+					empty = false;
+					break;
+				}
+			}
+
+			if (!empty)
+				elog(LOG, "hole was not empty at byte %d pd_lower %d pd_upper %d",
+								i, p->pd_lower, p->pd_upper);
+#endif
+			return true;
+		}
+
+		elog(LOG, "page verification failed - checksum was %u page checksum field is %u",
+						checksum, p->pd_verify.pd_checksum16);
+	}
+	else if (!PageHasChecksumFlag1(p) && !PageHasChecksumFlag2(p))
+	{
+		if (PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION &&
+			PageGetPageSize(p) == BLCKSZ)
+			return true;
+	}
+	else
+		elog(LOG, "page verification failed - page has one checksum flag set");
+
+	return false;
+}
+
+/*
+ * Set verification info for page.
+ *
+ * Either we set a new checksum, or we set the standard watermark. We must
+ * not leave an invalid checksum in place. The verification info is not WAL
+ * logged, whereas data changes to pages are, so data is safe whether or not
+ * page_checksums is enabled. The purpose of checksums is to detect page
+ * corruption to allow replacement from backup.
+ *
+ * Returns the block to write: the page itself if it is new or page_checksums
+ * is off, else the static pageCopy buffer (overwritten by the next call).
+ */
+char *
+PageSetVerificationInfo(Page page)
+{
+	PageHeader	p;
+
+	if (PageIsNew(page))
+		return (char *) page;
+
+	if (page_checksums)
+	{
+		/*
+		 * We make a copy iff we need to calculate a checksum because other
+		 * backends may set hint bits on this page while we write, which
+		 * would mean the checksum differs from the page contents. It doesn't
+		 * matter if we include or exclude hints during the copy, as long
+		 * as we write a valid page and associated checksum.
+		 */
+		memcpy(&pageCopy, page, BLCKSZ);
+
+		p = (PageHeader) &pageCopy;
+		p->pd_flags |= PD_CHECKSUM;
+		p->pd_verify.pd_checksum16 = PageCalcChecksum16((Page) &pageCopy);
+
+		return (char *) &pageCopy;
+	}
+
+	p = (PageHeader) page;
+
+	if (PageHasChecksumFlag1(p) || PageHasChecksumFlag2(p))
+	{
+		/* ensure any older checksum info is overwritten with watermark */
+		p->pd_flags &= ~PD_CHECKSUM;
+		PageSetPageSizeAndVersion(p, BLCKSZ, PG_PAGE_LAYOUT_VERSION);
+	}
+
+	return (char *) page;
+}
+
+/*
+ * Calculate checksum for a PostgreSQL Page. We do this in 3 steps, first
+ * we calculate the checksum for the header, avoiding the verification
+ * info, which will be added afterwards. Next, we add the line pointers up to
+ * the hole in the middle of the block at pd_lower. Last, we add the tail
+ * of the page from pd_upper to the end of page, including the final byte.
+ */
+static uint16
+PageCalcChecksum16(Page page)
+{
+#define PAGE_VERIFICATION_USES_FLETCHER16 (true)
+#ifdef PAGE_VERIFICATION_USES_FLETCHER16
+	/*
+	 * Following calculation is a Fletcher's 16 checksum. The calc is isolated
+	 * here and tuning and/or replacement algorithms are possible. Bytes are
+	 * summed as unsigned so the result does not depend on char signedness.
+	 * XXX present implementation is raw, untuned calculation, please tweak
+	 */
+	PageHeader	p = (PageHeader) page;
+	uint32	page_header_stop = (uint32) (offsetof(PageHeaderData, pd_special) + sizeof(LocationIndex));
+	uint32	page_lower_start = (uint32) (offsetof(PageHeaderData, pd_prune_xid));
+	uint32	page_lower_stop;
+	uint32	page_upper_start;
+	uint32 	sum1 = 0;
+	uint64	sum2 = 0;
+	uint32	i;
+
+	/*
+	 * Avoid calculating checksum if page is new, just return a value that
+	 * will cause the check to fail. We may still pass the all-zeroes check.
+	 */
+	if (PageIsNew(page))
+		return 1;
+
+	/*
+	 * Just add in the pd_prune_xid if there are no line pointers yet.
+	 * The header has not been sanity checked when we verify a page read from
+	 * disk, so clamp both offsets to keep every access inside the block.
+	 */
+	page_lower_stop = Min(p->pd_lower, BLCKSZ);
+	if (page_lower_stop == 0)
+		page_lower_stop = page_lower_start + sizeof(TransactionId);
+	page_upper_start = Min(p->pd_upper, BLCKSZ);
+
+	Assert(p->pd_upper != 0);
+
+#ifdef DEBUG_CHECKSUM
+	elog(LOG, "calculating checksum for %u-%u %u-%u %u-%u",
+			0,	/* page_header_start */
+			page_header_stop,
+			page_lower_start,
+			page_lower_stop,
+			page_upper_start,
+			BLCKSZ
+			);
+#endif
+
+#define	COMP_F16(from, to) \
+do { \
+	for (i = (from); i < (to); i++) \
+	{ \
+			sum1 = sum1 + (unsigned char) page[i]; \
+			sum2 = sum1 + sum2; \
+	} \
+	sum1 %= 255; \
+	sum2 %= 255; \
+} while (0)
+
+	COMP_F16(0, page_header_stop);
+
+	/* ignore the checksum field since not done yet... */
+	COMP_F16(page_lower_start, page_lower_stop);
+
+	/* ignore the hole in the middle of the block */
+	COMP_F16(page_upper_start, BLCKSZ);
+
+#ifdef DEBUG_CHECKSUM
+	elog(LOG, "checksum %u", (unsigned int) ((sum2 << 8) | sum1));
+#endif
+
+	return (uint16) ((sum2 << 8) | sum1);
+#endif
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5c910dd..1c3f485 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -830,6 +830,20 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 	{
+		{"page_checksums", PGC_POSTMASTER, WAL_SETTINGS,
+			gettext_noop("Marks database blocks with a checksum before writing them to disk. "),
+			gettext_noop("When enabled all database blocks will be marked with a checksum before writing to disk. "
+						 "When we read a database block from disk the checksum is checked, if it exists. "
+						 "If there is no checksum marked yet then no check is performed, though a "
+						 "checksum will be added later when we re-write the database block. "
+						 "When disabled checksums will not be added to database blocks, but blocks "
+						 "already marked with a checksum are still checked when read.")
+		},
+		&page_checksums,
+		false,
+		NULL, NULL, NULL
+	},
+	{
 		{"full_page_writes", PGC_SIGHUP, WAL_SETTINGS,
 			gettext_noop("Writes full pages to WAL when first modified after a checkpoint."),
 			gettext_noop("A page write in process during an operating system crash might be "
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 315db46..6f81023 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -150,15 +150,21 @@
 
 
 #------------------------------------------------------------------------------
-# WRITE AHEAD LOG
+# WRITE AHEAD LOG & RELIABILITY
 #------------------------------------------------------------------------------
 
-# - Settings -
+# - Reliability -
 
-#wal_level = minimal			# minimal, archive, or hot_standby
-					# (change requires restart)
+#page_checksums = off			# calculate checksum before database I/O
+#full_page_writes = on			# recover from partial page writes
 #fsync = on				# turns forced synchronization on or off
+
 #synchronous_commit = on		# synchronization level; on, off, or local
+
+# - Write Ahead Log -
+
+#wal_level = minimal			# minimal, archive, or hot_standby
+					# (change requires restart)
 #wal_sync_method = fsync		# the default is the first option
 					# supported by the operating system:
 					#   open_datasync
@@ -166,7 +172,6 @@
 					#   fsync
 					#   fsync_writethrough
 					#   open_sync
-#full_page_writes = on			# recover from partial page writes
 #wal_buffers = -1			# min 32kB, -1 sets based on shared_buffers
 					# (change requires restart)
 #wal_writer_delay = 200ms		# 1-10000 milliseconds
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index db6380f..eb32856 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -114,6 +114,8 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
 #define XLogPageHeaderSize(hdr)		\
 	(((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)
 
+#define XLOG_SMGR_HINT		0x40
+
 /*
  * We break each logical log file (xlogid value) into segment files of the
  * size indicated by XLOG_SEG_SIZE.  One possible segment at the end of each
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index d5103a8..48a728c 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -36,6 +36,7 @@ extern void PostPrepare_smgr(void);
 
 extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);
 
+extern void smgr_buffer_hint(Buffer buffer);
 extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec);
 
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 1ab64e0..38708c0 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -18,6 +18,8 @@
 #include "storage/item.h"
 #include "storage/off.h"
 
+extern bool page_checksums;
+
 /*
  * A postgres disk page is an abstraction layered on top of a postgres
  * disk block (which is simply a unit of i/o, see block.h).
@@ -93,7 +95,7 @@ typedef uint16 LocationIndex;
  *		pd_lower	- offset to start of free space.
  *		pd_upper	- offset to end of free space.
  *		pd_special	- offset to start of special space.
- *		pd_pagesize_version - size in bytes and page layout version number.
+ *		pd_verify	- page verification information of different kinds
  *		pd_prune_xid - oldest XID among potentially prunable tuples on page.
  *
  * The LSN is used by the buffer manager to enforce the basic rule of WAL:
@@ -106,7 +108,8 @@ typedef uint16 LocationIndex;
  * pd_prune_xid is a hint field that helps determine whether pruning will be
  * useful.	It is currently unused in index pages.
  *
- * The page version number and page size are packed together into a single
+ * For verification we store either a 16 bit checksum or a watermark of
+ * the page version number and page size packed together into a single
  * uint16 field.  This is for historical reasons: before PostgreSQL 7.3,
  * there was no concept of a page version number, and doing it this way
  * lets us pretend that pre-7.3 databases have page version number zero.
@@ -130,7 +133,13 @@ typedef struct PageHeaderData
 	LocationIndex pd_lower;		/* offset to start of free space */
 	LocationIndex pd_upper;		/* offset to end of free space */
 	LocationIndex pd_special;	/* offset to start of special space */
-	uint16		pd_pagesize_version;
+
+	union
+	{
+		uint16		pd_pagesize_version;
+		uint16		pd_checksum16;
+	} pd_verify;				/* page verification data */
+
 	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
 	ItemIdData	pd_linp[1];		/* beginning of line pointer array */
 } PageHeaderData;
@@ -155,7 +164,16 @@ typedef PageHeaderData *PageHeader;
 #define PD_ALL_VISIBLE		0x0004		/* all tuples on page are visible to
 										 * everyone */
 
-#define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
+#define PD_VALID_FLAG_BITS	0x800F		/* OR of all valid pd_flags bits, i.e. 0x0007 | PD_CHECKSUM; NOTE(review): confirm checksum bits belong here */
+
+#define PD_CHECKSUM1		0x0008		/* First checksum bit */
+#define PD_CHECKSUM2		0x8000		/* Second checksum bit */
+#define PD_CHECKSUM 		0x8008		/* OR of both checksum flags */
+
+#define PageHasChecksumFlag1(page) \
+	((((PageHeader) (page))->pd_flags & PD_CHECKSUM1) == PD_CHECKSUM1)
+#define PageHasChecksumFlag2(page) \
+	((((PageHeader) (page))->pd_flags & PD_CHECKSUM2) == PD_CHECKSUM2)
 
 /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -165,6 +183,8 @@ typedef PageHeaderData *PageHeader;
  * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
  *		added the pd_flags field (by stealing some bits from pd_tli),
  *		as well as adding the pd_prune_xid field (which enlarges the header).
+ * Release 9.2 uses 4 as well, though with changed meaning of verification bits.
+ * We deliberately don't bump the page version for that, to allow upgrades.
  */
 #define PG_PAGE_LAYOUT_VERSION		4
 
@@ -231,19 +251,22 @@ typedef PageHeaderData *PageHeader;
  * PageGetPageSize
  *		Returns the page size of a page.
  *
- * this can only be called on a formatted page (unlike
- * BufferGetPageSize, which can be called on an unformatted page).
- * however, it can be called on a page that is not stored in a buffer.
+ * Since PageSizeIsValid() is true only when pagesize == BLCKSZ, this is simply
+ * BLCKSZ.  It can be called on any page, initialised or not, in or out of
+ * buffers.  You might think the page size can vary at runtime, but it cannot:
+ * pages frequently need to occupy buffers and are copied from one to another,
+ * so there are many hidden assumptions that this simple definition is true.
  */
-#define PageGetPageSize(page) \
-	((Size) (((PageHeader) (page))->pd_pagesize_version & (uint16) 0xFF00))
+#define PageGetPageSize(page) (BLCKSZ)
 
 /*
  * PageGetPageLayoutVersion
  *		Returns the page layout version of a page.
+ *
+ * Must not be used on a page that is flagged for checksums.
  */
 #define PageGetPageLayoutVersion(page) \
-	(((PageHeader) (page))->pd_pagesize_version & 0x00FF)
+	(((PageHeader) (page))->pd_verify.pd_pagesize_version & 0x00FF)
 
 /*
  * PageSetPageSizeAndVersion
@@ -251,14 +274,24 @@ typedef PageHeaderData *PageHeader;
  *
  * We could support setting these two values separately, but there's
  * no real need for it at the moment.
+ *
+ * Must not be used on a page that is flagged for checksums.
  */
 #define PageSetPageSizeAndVersion(page, size, version) \
 ( \
 	AssertMacro(((size) & 0xFF00) == (size)), \
 	AssertMacro(((version) & 0x00FF) == (version)), \
-	((PageHeader) (page))->pd_pagesize_version = (size) | (version) \
+	((PageHeader) (page))->pd_verify.pd_pagesize_version = (size) | (version) \
 )
 
+/*
+ * HintsMustNotDirtyPage
+ *		See discussion for PageVerificationInfoOK()
+ */
+#define	HintsMustNotDirtyPage()	\
+	(page_checksums && fullPageWrites && RecoveryInProgress())
+extern bool fullPageWrites;
+
 /* ----------------
  *		page special data macros
  * ----------------
@@ -368,7 +401,7 @@ do { \
  */
 
 extern void PageInit(Page page, Size pageSize, Size specialSize);
-extern bool PageHeaderIsValid(PageHeader page);
+extern bool PageIsVerified(Page page);
 extern OffsetNumber PageAddItem(Page page, Item item, Size size,
 			OffsetNumber offsetNumber, bool overwrite, bool is_heap);
 extern Page PageGetTempPage(Page page);
@@ -381,5 +414,6 @@ extern Size PageGetExactFreeSpace(Page page);
 extern Size PageGetHeapFreeSpace(Page page);
 extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
 extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+extern char *PageSetVerificationInfo(Page page);
 
 #endif   /* BUFPAGE_H */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to