Hi,
Am Dienstag, den 19.03.2019, 13:00 -0700 schrieb Andres Freund:
> CREATE TABLE corruptme AS SELECT g.i::text AS data FROM generate_series(1,
> 1000000) g(i);
> SELECT pg_relation_size('corruptme');
> postgres[22890][1]=# SELECT current_setting('data_directory') || '/' ||
> pg_relation_filepath('corruptme');
> ┌─────────────────────────────────────┐
> │ ?column? │
> ├─────────────────────────────────────┤
> │ /srv/dev/pgdev-dev/base/13390/16384 │
> └─────────────────────────────────────┘
> (1 row)
> dd if=/dev/urandom of=/srv/dev/pgdev-dev/base/13390/16384 bs=8192 count=1
> conv=notrunc
>
> Try a basebackup and see how many times it'll detect the corrupt
> data. In the vast majority of cases you're going to see checksum
> failures when reading the data for normal operation, but not when using
> basebackup (or this new tool).
>
> At the very very least this would need to do
>
> a) checks that the page is all zeroes if PageIsNew() (like
> PageIsVerified() does for the backend). That avoids missing cases
> where corruption just zeroed out the header, but not the whole page.
> b) Check that pd_lsn is between startlsn and the insertion pointer. That
> avoids accepting just about all random data.
>
> And that'd *still* be less strenuous than what normal backends
> check. And that's already not great (due to not noticing zeroed out
> data).
I've done the above in the attached patch now. Well, literally about an
hour ago; then I went jogging and came back to see that you had outlined
a different fix in a separate thread. Still, the patch might be helpful
for the TAP test changes at least.
Michael
--
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax: +49 2166 9901-100
Email: michael.ba...@credativ.de
credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer
Unser Umgang mit personenbezogenen Daten unterliegt
folgenden Bestimmungen: https://www.credativ.de/datenschutz
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 537f09e342..70a41b6998 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1369,16 +1369,18 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
bool missing_ok, Oid dboid)
{
FILE *fp;
+ bool all_zeroes = false;
BlockNumber blkno = 0;
bool block_retry = false;
char buf[TAR_SEND_SIZE];
uint16 checksum;
int checksum_failures = 0;
off_t cnt;
- int i;
+ int page_in_buf;
pgoff_t len = 0;
char *page;
size_t pad;
+ size_t *pagebytes;
PageHeader phdr;
int segmentno = 0;
char *segmentpath;
@@ -1449,78 +1451,42 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
if (verify_checksum)
{
- for (i = 0; i < cnt / BLCKSZ; i++)
+ for (page_in_buf = 0; page_in_buf < cnt / BLCKSZ; page_in_buf++)
{
- page = buf + BLCKSZ * i;
+ page = buf + BLCKSZ * page_in_buf;
/*
- * Only check pages which have not been modified since the
- * start of the base backup. Otherwise, they might have been
- * written only halfway and the checksum would not be valid.
- * However, replaying WAL would reinstate the correct page in
- * this case. We also skip completely new pages, since they
- * don't have a checksum yet.
+ * We skip completely new pages after checking they are
+ * all-zero, since they don't have a checksum yet.
*/
- if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+ if (PageIsNew(page))
{
- checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
- phdr = (PageHeader) page;
- if (phdr->pd_checksum != checksum)
+ all_zeroes = true;
+ pagebytes = (size_t *) page;
+ for (int i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
{
- /*
- * Retry the block on the first failure. It's
- * possible that we read the first 4K page of the
- * block just before postgres updated the entire block
- * so it ends up looking torn to us. We only need to
- * retry once because the LSN should be updated to
- * something we can ignore on the next pass. If the
- * error happens again then it is a true validation
- * failure.
- */
- if (block_retry == false)
+ if (pagebytes[i] != 0)
{
- /* Reread the failed block */
- if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fseek in file \"%s\": %m",
- readfilename)));
- }
-
- if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not reread block %d of file \"%s\": %m",
- blkno, readfilename)));
- }
-
- if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fseek in file \"%s\": %m",
- readfilename)));
- }
-
- /* Set flag so we know a retry was attempted */
- block_retry = true;
-
- /* Reset loop to validate the block again */
- i--;
- continue;
+ all_zeroes = false;
+ break;
}
-
+ }
+ if (!all_zeroes)
+ {
+ /*
+ * pd_upper is zero, but the page is not all zero. We
+ * cannot run pg_checksum_page() on the page as it
+ * would throw an assertion failure. Consider this a
+ * checksum failure.
+ */
checksum_failures++;
if (checksum_failures <= 5)
ereport(WARNING,
(errmsg("checksum verification failed in "
- "file \"%s\", block %d: calculated "
- "%X but expected %X",
- readfilename, blkno, checksum,
- phdr->pd_checksum)));
+ "file \"%s\", block %d: pd_upper "
+ "is zero but page is not all-zero",
+ readfilename, blkno)));
if (checksum_failures == 5)
ereport(WARNING,
(errmsg("further checksum verification "
@@ -1528,6 +1494,84 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
"be reported", readfilename)));
}
}
+ else
+ {
+ /*
+ * Only check pages which have not been modified since the
+ * start of the base backup. Otherwise, they might have been
+ * written only halfway and the checksum would not be valid.
+ * However, replaying WAL would reinstate the correct page in
+ * this case. If the page LSN is larger than the current redo
+ * pointer then we assume a bogus LSN due to random page header
+ * corruption and do verify the checksum.
+ */
+ if (PageGetLSN(page) < startptr || PageGetLSN(page) > GetRedoRecPtr())
+ {
+ checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
+ phdr = (PageHeader) page;
+ if (phdr->pd_checksum != checksum)
+ {
+ /*
+ * Retry the block on the first failure. It's
+ * possible that we read the first 4K page of the
+ * block just before postgres updated the entire block
+ * so it ends up looking torn to us. We only need to
+ * retry once because the LSN should be updated to
+ * something we can ignore on the next pass. If the
+ * error happens again then it is a true validation
+ * failure.
+ */
+ if (block_retry == false)
+ {
+ /* Reread the failed block */
+ if (fseek(fp, -(cnt - BLCKSZ * page_in_buf), SEEK_CUR) == -1)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fseek in file \"%s\": %m",
+ readfilename)));
+ }
+
+ if (fread(buf + BLCKSZ * page_in_buf, 1, BLCKSZ, fp) != BLCKSZ)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not reread block %d of file \"%s\": %m",
+ blkno, readfilename)));
+ }
+
+ if (fseek(fp, cnt - BLCKSZ * page_in_buf - BLCKSZ, SEEK_CUR) == -1)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fseek in file \"%s\": %m",
+ readfilename)));
+ }
+
+ /* Set flag so we know a retry was attempted */
+ block_retry = true;
+
+ /* Reset loop to validate the block again */
+ page_in_buf--;
+ continue;
+ }
+ checksum_failures++;
+
+ if (checksum_failures <= 5)
+ ereport(WARNING,
+ (errmsg("checksum verification failed in "
+ "file \"%s\", block %d: calculated "
+ "%X but expected %X",
+ readfilename, blkno, checksum,
+ phdr->pd_checksum)));
+ if (checksum_failures == 5)
+ ereport(WARNING,
+ (errmsg("further checksum verification "
+ "failures in file \"%s\" will not "
+ "be reported", readfilename)));
+ }
+ }
+ }
block_retry = false;
blkno++;
}
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index 33869fecc9..2467750f4d 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
use File::Path qw(rmtree);
use PostgresNode;
use TestLib;
-use Test::More tests => 106;
+use Test::More tests => 109;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -495,21 +495,37 @@ my $file_corrupt2 = $node->safe_psql('postgres',
my $pageheader_size = 24;
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
-# induce corruption
+# induce corruption in the pageheader by writing random data into it
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
open $file, '+<', "$pgdata/$file_corrupt1";
-seek($file, $pageheader_size, 0);
-syswrite($file, "\0\0\0\0\0\0\0\0\0");
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+syswrite($file, $random_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1" ],
+ 1,
+ [qr{^$}],
+ [qr/^WARNING.*checksum verification failed/s],
+ 'pg_basebackup reports checksum mismatch for random pageheader data');
+rmtree("$tempdir/backup_corrupt1");
+
+# zero out the pageheader completely
+open $file, '+<', "$pgdata/$file_corrupt1";
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+my $zero_data = "\0"x$pageheader_size;
+syswrite($file, $zero_data);
close $file;
system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
$node->command_checks_all(
- [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt" ],
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
1,
[qr{^$}],
[qr/^WARNING.*checksum verification failed/s],
- 'pg_basebackup reports checksum mismatch');
-rmtree("$tempdir/backup_corrupt");
+ "pg_basebackup reports checksum mismatch for zeroed pageheader");
+rmtree("$tempdir/backup_corrupt1a");
# induce further corruption in 5 more blocks
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';