Hi,

I have rebased this patch now.

I also fixed the two issues Andres reported, namely a zeroed-out
pageheader and a random LSN. The first is caught by checking for an all-
zero-page in the way PageIsVerified() does. The second is caught by
comparing the upper 32 bits of the LSN as well and demanding that they
are equal. If the LSN is corrupted, the upper 32 bits should be wildly
different to the current checkpoint LSN.

Well, at least that is a stab at a fix; there is a window where the
upper 32 bits could legitimately be different. In order to make that as
small as possible, I update the checkpoint LSN every once in a while.

Am Montag, den 18.03.2019, 21:15 +0100 schrieb Michael Banck:
> I have also added a paragraph to the documentation about possilby
> skipping new or recently updated pages:
> 
> +   If the cluster is online, pages that have been (re-)written since the last
> +   checkpoint will not count as checksum failures if they cannot be read or
> +   verified correctly.

I have removed that for now as it seems to be more confusing than
helpful.


Michael

-- 
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax:  +49 2166 9901-100
Email: michael.ba...@credativ.de

credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer

Unser Umgang mit personenbezogenen Daten unterliegt
folgenden Bestimmungen: https://www.credativ.de/datenschutz
diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml
index 1f4d4ab8b4..5027809a59 100644
--- a/doc/src/sgml/ref/pg_checksums.sgml
+++ b/doc/src/sgml/ref/pg_checksums.sgml
@@ -37,12 +37,10 @@ PostgreSQL documentation
   <title>Description</title>
   <para>
    <application>pg_checksums</application> checks, enables or disables data
-   checksums in a <productname>PostgreSQL</productname> cluster.  The server
-   must be shut down cleanly before running
-   <application>pg_checksums</application>. The exit status is zero if there
-   are no checksum errors when checking them, and nonzero if at least one
-   checksum failure is detected. If enabling or disabling checksums, the
-   exit status is nonzero if the operation failed.
+   checksums in a <productname>PostgreSQL</productname> cluster. The exit
+   status is zero if there are no checksum errors when checking them, and
+   nonzero if at least one checksum failure is detected. If enabling or
+   disabling checksums, the exit status is nonzero if the operation failed.
   </para>
 
   <para>
@@ -50,6 +48,12 @@ PostgreSQL documentation
    the cluster, disabling checksums will only update the file
    <filename>pg_control</filename>.
   </para>
+
+  <para>
+   If the cluster is online, pages that have been (re-)written since the last
+   checkpoint will not count as checksum failures if they cannot be verified
+   correctly.
+  </para>
  </refsect1>
 
  <refsect1>
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 5c185ebed8..f62fdc90a4 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -1,8 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_checksums.c
- *	  Checks, enables or disables page level checksums for an offline
- *	  cluster
+ *	  Checks, enables or disables page level checksums for a cluster
  *
  * Copyright (c) 2010-2019, PostgreSQL Global Development Group
  *
@@ -30,13 +29,20 @@
 
 
 static int64 files = 0;
+static int64 skippedfiles = 0;
 static int64 blocks = 0;
 static int64 badblocks = 0;
+static int64 skippedblocks = 0;
 static ControlFileData *ControlFile;
+static XLogRecPtr checkpointLSN;
+static int64 blocks_last_lsn_update = 0;
 
 static char *only_relfilenode = NULL;
 static bool do_sync = true;
 static bool verbose = false;
+static bool online = false;
+
+static char *DataDir = NULL;
 
 typedef enum
 {
@@ -97,6 +103,23 @@ static const char *const skip[] = {
 	NULL,
 };
 
+static void
+update_checkpoint_lsn(void)
+{
+	bool		crc_ok;
+
+	ControlFile = get_controlfile(DataDir, progname, &crc_ok);
+	if (!crc_ok)
+	{
+		fprintf(stderr, _("%s: pg_control CRC value is incorrect\n"), progname);
+		exit(1);
+	}
+
+	/* Update checkpointLSN and blocks_last_lsn_update with the current value */
+	checkpointLSN = ControlFile->checkPoint;
+	blocks_last_lsn_update = blocks;
+}
+
 static bool
 skipfile(const char *fn)
 {
@@ -116,6 +139,10 @@ scan_file(const char *fn, BlockNumber segmentno)
 	PageHeader	header = (PageHeader) buf.data;
 	int			f;
 	BlockNumber blockno;
+	int			block_loc = 0;
+	bool		block_retry = false;
+	bool		all_zeroes;
+	size_t	   *pagebytes;
 	int			flags;
 
 	Assert(mode == PG_MODE_ENABLE ||
@@ -126,6 +153,12 @@ scan_file(const char *fn, BlockNumber segmentno)
 
 	if (f < 0)
 	{
+		if (online && errno == ENOENT)
+		{
+			/* File was removed in the meantime */
+			return;
+		}
+
 		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
 				progname, fn, strerror(errno));
 		exit(1);
@@ -136,27 +169,129 @@ scan_file(const char *fn, BlockNumber segmentno)
 	for (blockno = 0;; blockno++)
 	{
 		uint16		csum;
-		int			r = read(f, buf.data, BLCKSZ);
+		int			r = read(f, buf.data + block_loc, BLCKSZ - block_loc);
 
 		if (r == 0)
+		{
+			/*
+			 * We had a short read and got an EOF before we could read the
+			 * whole block, so count this as a skipped block.
+			 */
+			if (online && block_loc != 0)
+				skippedblocks++;
 			break;
-		if (r != BLCKSZ)
+		}
+		if (r < 0)
 		{
-			fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
-					progname, blockno, fn, r, BLCKSZ);
-			exit(1);
+			skippedfiles++;
+			fprintf(stderr, _("%s: could not read block %u in file \"%s\": %s\n"),
+					progname, blockno, fn, strerror(errno));
+			return;
+		}
+		if (r < (BLCKSZ - block_loc))
+		{
+			if (online)
+			{
+				/*
+				 * We read only parts of the block, possibly due to a
+				 * concurrent extension of the relation file.  Increment
+				 * block_loc by the amount that we read and try again.
+				 */
+				block_loc += r;
+				continue;
+			}
+			else
+			{
+				skippedfiles++;
+				fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
+						progname, blockno, fn, r, BLCKSZ);
+				return;
+			}
 		}
+
 		blocks++;
 
 		/* New pages have no checksum yet */
 		if (PageIsNew(header))
+		{
+			/* Check for an all-zeroes page */
+			all_zeroes = true;
+			pagebytes = (size_t *) buf.data;
+			for (int i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
+			{
+				if (pagebytes[i] != 0)
+				{
+					all_zeroes = false;
+					break;
+				}
+			}
+			if (!all_zeroes)
+			{
+				fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"),
+						progname, fn, blockno);
+				badblocks++;
+			}
+			else
+			{
+				skippedblocks++;
+			}
 			continue;
+		}
 
 		csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
 		if (mode == PG_MODE_CHECK)
 		{
 			if (csum != header->pd_checksum)
 			{
+				if (online)
+				{
+					/*
+					 * Retry the block on the first failure if online.  If the
+					 * verification is done while the instance is online, it is
+					 * possible that we read the first 4K page of the block just
+					 * before postgres updated the entire block so it ends up
+					 * looking torn to us.  We only need to retry once because the
+					 * LSN should be updated to something we can ignore on the next
+					 * pass.  If the error happens again then it is a true
+					 * validation failure.
+					 */
+					if (!block_retry)
+					{
+						/* Seek to the beginning of the failed block */
+						if (lseek(f, -BLCKSZ, SEEK_CUR) == -1)
+						{
+							skippedfiles++;
+							fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"),
+									progname, fn);
+							return;
+						}
+
+						/* Set flag so we know a retry was attempted */
+						block_retry = true;
+
+						/* Reset loop to validate the block again */
+						blockno--;
+						blocks--;
+
+						continue;
+					}
+
+					/*
+					 * The checksum verification failed on retry as well.  Check if
+					 * the page has been modified since the checkpoint and skip it
+					 * in this case.  As a sanity check, demand that the upper
+					 * 32 bits of the LSN are identical in order to skip as a
+					 * guard against a corrupted LSN in the pageheader.
+					 */
+					if ((PageGetLSN(buf.data) > checkpointLSN) &&
+					   (PageGetLSN(buf.data) >> 32 == checkpointLSN >> 32))
+					{
+						block_retry = false;
+						skippedblocks++;
+						continue;
+					}
+				}
+
 				if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
 					fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X\n"),
 							progname, fn, blockno, csum, header->pd_checksum);
@@ -183,6 +318,9 @@ scan_file(const char *fn, BlockNumber segmentno)
 				exit(1);
 			}
 		}
+
+		block_retry = false;
+		block_loc = 0;
 	}
 
 	if (verbose)
@@ -237,6 +375,12 @@ scan_directory(const char *basedir, const char *subdir)
 		snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
 		if (lstat(fn, &st) < 0)
 		{
+			if (online && errno == ENOENT)
+			{
+				/* File was removed in the meantime */
+				continue;
+			}
+
 			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
 					progname, fn, strerror(errno));
 			exit(1);
@@ -280,6 +424,10 @@ scan_directory(const char *basedir, const char *subdir)
 				continue;
 
 			scan_file(fn, segmentno);
+
+			/* Update checkpointLSN every 1024 * 1024 blocks */
+			if (online && ((blocks - blocks_last_lsn_update) > (1024 * 1024)))
+				update_checkpoint_lsn();
 		}
 #ifndef WIN32
 		else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
@@ -304,7 +452,6 @@ main(int argc, char *argv[])
 		{NULL, 0, NULL, 0}
 	};
 
-	char	   *DataDir = NULL;
 	int			c;
 	int			option_index;
 	bool		crc_ok;
@@ -398,7 +545,7 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
-	/* Check if cluster is running */
+	/* Check if checksums are enabled */
 	ControlFile = get_controlfile(DataDir, progname, &crc_ok);
 	if (!crc_ok)
 	{
@@ -422,12 +569,10 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	/* Check if cluster is running */
 	if (ControlFile->state != DB_SHUTDOWNED &&
 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
-	{
-		fprintf(stderr, _("%s: cluster must be shut down\n"), progname);
-		exit(1);
-	}
+		online = true;
 
 	if (ControlFile->data_checksum_version == 0 &&
 		mode == PG_MODE_CHECK)
@@ -450,6 +595,9 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	/* Get checkpoint LSN */
+	checkpointLSN = ControlFile->checkPoint;
+
 	/* Operate on all files if checking or enabling checksums */
 	if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE)
 	{
@@ -459,7 +607,11 @@ main(int argc, char *argv[])
 
 		printf(_("Checksum operation completed\n"));
 		printf(_("Files scanned:  %s\n"), psprintf(INT64_FORMAT, files));
+		if (skippedfiles > 0)
+			printf(_("Files skipped: %s\n"), psprintf(INT64_FORMAT, skippedfiles));
 		printf(_("Blocks scanned: %s\n"), psprintf(INT64_FORMAT, blocks));
+		if (skippedblocks > 0)
+			printf(_("Blocks skipped: %s\n"), psprintf(INT64_FORMAT, skippedblocks));
 		if (mode == PG_MODE_CHECK)
 		{
 			printf(_("Bad checksums:  %s\n"), psprintf(INT64_FORMAT, badblocks));
@@ -497,5 +649,10 @@ main(int argc, char *argv[])
 			printf(_("Checksums disabled in cluster\n"));
 	}
 
+	/* skipped blocks or files are considered an error if offline */
+	if (!online)
+		if (skippedblocks > 0 || skippedfiles > 0)
+			return 1;
+
 	return 0;
 }
diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl
index 41575c5245..b6b6092da2 100644
--- a/src/bin/pg_checksums/t/002_actions.pl
+++ b/src/bin/pg_checksums/t/002_actions.pl
@@ -5,8 +5,7 @@ use strict;
 use warnings;
 use PostgresNode;
 use TestLib;
-use Test::More tests => 62;
-
+use Test::More tests => 126;
 
 # Utility routine to create and check a table with corrupted checksums
 # on a wanted tablespace.  Note that this stops and starts the node
@@ -17,6 +16,9 @@ sub check_relation_corruption
 	my $node = shift;
 	my $table = shift;
 	my $tablespace = shift;
+	my $offset = shift;
+	my $corrupted_data = shift;
+	my $description = shift;
 	my $pgdata = $node->data_dir;
 
 	$node->safe_psql('postgres',
@@ -31,21 +33,18 @@ sub check_relation_corruption
 	my $relfilenode_corrupted =  $node->safe_psql('postgres',
 		"SELECT relfilenode FROM pg_class WHERE relname = '$table';");
 
-	# Set page header and block size
-	my $pageheader_size = 24;
-	my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
 	$node->stop;
 
 	# Checksums are correct for single relfilenode as the table is not
 	# corrupted yet.
 	command_ok(['pg_checksums',  '--check', '-D', $pgdata, '-r',
 			   $relfilenode_corrupted],
-		"succeeds for single relfilenode on tablespace $tablespace with offline cluster");
+		"succeeds for single relfilenode $description with offline cluster");
 
 	# Time to create some corruption
 	open my $file, '+<', "$pgdata/$file_corrupted";
-	seek($file, $pageheader_size, 0);
-	syswrite($file, "\0\0\0\0\0\0\0\0\0");
+	seek($file, $offset, 0);
+	syswrite($file, $corrupted_data);
 	close $file;
 
 	# Checksum checks on single relfilenode fail
@@ -54,23 +53,38 @@ sub check_relation_corruption
 							  1,
 							  [qr/Bad checksums:.*1/],
 							  [qr/checksum verification failed/],
-							  "fails with corrupted data for single relfilenode on tablespace $tablespace");
+							  "fails with corrupted data for single relfilenode $description");
 
 	# Global checksum checks fail as well
 	$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
 							  1,
 							  [qr/Bad checksums:.*1/],
 							  [qr/checksum verification failed/],
-							  "fails with corrupted data on tablespace $tablespace");
+							  "fails with corrupted data $description");
 
-	# Drop corrupted table again and make sure there is no more corruption.
+	# Now check online as well
 	$node->start;
+
+	# Checksum checks on single relfilenode fail
+	$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata,
+							  '-r', $relfilenode_corrupted],
+							  1,
+							  [qr/Bad checksums:.*1/],
+							  [qr/checksum verification failed/],
+							  "fails with corrupted data for single relfilenode $description with online cluster");
+
+	# Global checksum checks fail as well
+	$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
+							  1,
+							  [qr/Bad checksums:.*1/],
+							  [qr/checksum verification failed/],
+							  "fails with corrupted data $description with online cluster");
+
+
+	# Drop corrupted table again and make sure there is no more corruption.
 	$node->safe_psql('postgres', "DROP TABLE $table;");
-	$node->stop;
 	$node->command_ok(['pg_checksums', '--check', '-D', $pgdata],
 	        "succeeds again after table drop on tablespace $tablespace");
-
-	$node->start;
 	return;
 }
 
@@ -143,6 +157,11 @@ command_ok(['pg_checksums', '--check', '-D', $pgdata],
 command_ok(['pg_checksums', '-D', $pgdata],
 		   "verifies checksums as default action");
 
+# Checksums pass on an online cluster
+$node->start;
+command_ok(['pg_checksums', '--check', '-D', $pgdata],
+		   "succeeds with online cluster");
+
 # Specific relation files cannot be requested when action is --disable
 # or --enable.
 command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata],
@@ -150,13 +169,12 @@ command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata],
 command_fails(['pg_checksums', '--enable', '-r', '1234', '-D', $pgdata],
 	      "fails when relfilenodes are requested and action is --enable");
 
-# Checks cannot happen with an online cluster
-$node->start;
-command_fails(['pg_checksums', '--check', '-D', $pgdata],
-			  "fails with online cluster");
+# Set page header and block size
+my $pageheader_size = 24;
+my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
 
 # Check corruption of table on default tablespace.
-check_relation_corruption($node, 'corrupt1', 'pg_default');
+check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
 
 # Create tablespace to check corruptions in a non-default tablespace.
 my $basedir = $node->basedir;
@@ -165,7 +183,15 @@ mkdir ($tablespace_dir);
 $tablespace_dir = TestLib::real_dir($tablespace_dir);
 $node->safe_psql('postgres',
 	"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
-check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
+check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
+
+# Check corruption in the pageheader with random data in it
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data in pageheader");
+
+# Check corruption when the pageheader has been zeroed-out completely
+my $zero_data = "\0"x$pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader");
 
 # Utility routine to check that pg_checksums is able to detect
 # correctly-named relation files filled with some corrupted data.
@@ -179,23 +205,32 @@ sub fail_corrupt
 	my $file_name = "$pgdata/global/$file";
 	append_to_file $file_name, "foo";
 
+	$node->stop;
+	# If the instance is offline, the whole file is skipped and this is
+	# considered to be an error.
 	$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
 						  1,
-						  [qr/^$/],
+						  [qr/Files skipped:.*1/],
 						  [qr/could not read block 0 in file.*$file\":/],
-						  "fails for corrupted data in $file");
+						  "skips corrupted data in $file");
+
+	$node->start;
+	# If the instance is online, the block is skipped and this is not
+	# considered to be an error
+	$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
+						  0,
+						  [qr/Blocks skipped:.*1/],
+						  [qr/^$/],
+						  "skips corrupted data in $file with online cluster");
 
 	# Remove file to prevent future lookup errors on conflicts.
 	unlink $file_name;
 	return;
 }
 
-# Stop instance for the follow-up checks.
-$node->stop;
-
-# Authorized relation files filled with corrupted data cause the
-# checksum checks to fail.  Make sure to use file names different
-# than the previous ones.
+# Authorized relation files filled with corrupted data cause the files to be
+# skipped and, if the instance is offline, a non-zero exit status.  Make sure
+# to use file names different than the previous ones.
 fail_corrupt($node, "99990");
 fail_corrupt($node, "99990.123");
 fail_corrupt($node, "99990_fsm");
@@ -204,3 +239,6 @@ fail_corrupt($node, "99990_vm");
 fail_corrupt($node, "99990_init.123");
 fail_corrupt($node, "99990_fsm.123");
 fail_corrupt($node, "99990_vm.123");
+
+# Stop node again at the end of tests
+$node->stop;

Reply via email to