Hi,
Am Dienstag, den 26.03.2019, 19:23 +0100 schrieb Michael Banck:
> Am Dienstag, den 26.03.2019, 10:30 -0700 schrieb Andres Freund:
> > On 2019-03-26 18:22:55 +0100, Michael Banck wrote:
> > > /*
> > > - * Only check pages which have not been
> > > modified since the
> > > - * start of the base backup. Otherwise, they
> > > might have been
> > > - * written only halfway and the checksum would
> > > not be valid.
> > > - * However, replaying WAL would reinstate the
> > > correct page in
> > > - * this case. We also skip completely new
> > > pages, since they
> > > - * don't have a checksum yet.
> > > + * We skip completely new pages after checking
> > > they are
> > > + * all-zero, since they don't have a checksum
> > > yet.
> > > */
> > > - if (!PageIsNew(page) && PageGetLSN(page) <
> > > startptr)
> > > + if (PageIsNew(page))
> > > {
> > > - checksum = pg_checksum_page((char *)
> > > page, blkno + segmentno * RELSEG_SIZE);
> > > - phdr = (PageHeader) page;
> > > - if (phdr->pd_checksum != checksum)
> > > + all_zeroes = true;
> > > + pagebytes = (size_t *) page;
> > > + for (int i = 0; i < (BLCKSZ /
> > > sizeof(size_t)); i++)
> >
> > Can we please abstract the zeroeness check into a separate function to
> > be used both by PageIsVerified() and this?
>
> Ok, done so as PageIsZero further down in bufpage.c.
It turns out that pg_checksums (current master and back branches, not
just the online version) needs this treatment as well, since it won't
catch zeroed-out pageheader corruption; see the attached patch to its
TAP tests, which triggers it (I also added a random-data check similar
to pg_basebackup's, which is not a problem for the current codebase).
Any suggestions on how to handle this? Should I duplicate the
PageIsZero() code in pg_checksums? Should I move PageIsZero into
something like bufpage_impl.h for use by external programs, similar to
pg_checksum_page()?
I've done the latter as a POC in the second attached patch.
Michael
--
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax: +49 2166 9901-100
Email: michael.ba...@credativ.de
credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer
Unser Umgang mit personenbezogenen Daten unterliegt
folgenden Bestimmungen: https://www.credativ.de/datenschutz
diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl
index 41575c5245..680c26f2d6 100644
--- a/src/bin/pg_checksums/t/002_actions.pl
+++ b/src/bin/pg_checksums/t/002_actions.pl
@@ -5,7 +5,7 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 62;
+use Test::More tests => 78;
# Utility routine to create and check a table with corrupted checksums
@@ -17,6 +17,9 @@ sub check_relation_corruption
my $node = shift;
my $table = shift;
my $tablespace = shift;
+ my $offset = shift;
+ my $corrupted_data = shift;
+ my $description = shift;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres',
@@ -44,8 +47,8 @@ sub check_relation_corruption
# Time to create some corruption
open my $file, '+<', "$pgdata/$file_corrupted";
- seek($file, $pageheader_size, 0);
- syswrite($file, "\0\0\0\0\0\0\0\0\0");
+ seek($file, $offset, 0);
+ syswrite($file, $corrupted_data);
close $file;
# Checksum checks on single relfilenode fail
@@ -54,14 +57,14 @@ sub check_relation_corruption
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data for single relfilenode on tablespace $tablespace");
+ "fails with corrupted data for single relfilenode $description");
# Global checksum checks fail as well
$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data on tablespace $tablespace");
+ "fails with corrupted data on $description");
# Drop corrupted table again and make sure there is no more corruption.
$node->start;
@@ -155,8 +158,12 @@ $node->start;
command_fails(['pg_checksums', '--check', '-D', $pgdata],
"fails with online cluster");
+# Set page header and block size
+my $pageheader_size = 24;
+my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
+
# Check corruption of table on default tablespace.
-check_relation_corruption($node, 'corrupt1', 'pg_default');
+check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
# Create tablespace to check corruptions in a non-default tablespace.
my $basedir = $node->basedir;
@@ -165,7 +172,15 @@ mkdir ($tablespace_dir);
$tablespace_dir = TestLib::real_dir($tablespace_dir);
$node->safe_psql('postgres',
"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
-check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
+check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
+
+# Check corruption in the pageheader with random data in it
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data $random_data in pageheader");
+
+# Check corruption when the pageheader has been zeroed-out completely
+my $zero_data = "\0"x$pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader");
# Utility routine to check that pg_checksums is able to detect
# correctly-named relation files filled with some corrupted data.
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 537f09e342..0f6c66a126 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1369,16 +1369,18 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
bool missing_ok, Oid dboid)
{
FILE *fp;
+ bool all_zeroes = false;
BlockNumber blkno = 0;
bool block_retry = false;
char buf[TAR_SEND_SIZE];
uint16 checksum;
int checksum_failures = 0;
off_t cnt;
- int i;
+ int page_in_buf;
pgoff_t len = 0;
char *page;
size_t pad;
+ size_t *pagebytes;
PageHeader phdr;
int segmentno = 0;
char *segmentpath;
@@ -1449,78 +1451,32 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
if (verify_checksum)
{
- for (i = 0; i < cnt / BLCKSZ; i++)
+ for (page_in_buf = 0; page_in_buf < cnt / BLCKSZ; page_in_buf++)
{
- page = buf + BLCKSZ * i;
+ page = buf + BLCKSZ * page_in_buf;
/*
- * Only check pages which have not been modified since the
- * start of the base backup. Otherwise, they might have been
- * written only halfway and the checksum would not be valid.
- * However, replaying WAL would reinstate the correct page in
- * this case. We also skip completely new pages, since they
- * don't have a checksum yet.
+ * We skip completely new pages after checking they are
+ * all-zero, since they don't have a checksum yet.
*/
- if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+ if (PageIsNew(page))
{
- checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
- phdr = (PageHeader) page;
- if (phdr->pd_checksum != checksum)
+ if (!PageIsZero(page))
{
/*
- * Retry the block on the first failure. It's
- * possible that we read the first 4K page of the
- * block just before postgres updated the entire block
- * so it ends up looking torn to us. We only need to
- * retry once because the LSN should be updated to
- * something we can ignore on the next pass. If the
- * error happens again then it is a true validation
- * failure.
+ * pd_upper is zero, but the page is not all zero. We
+ * cannot run pg_checksum_page() on the page as it
+ * would throw an assertion failure. Consider this a
+ * checksum failure.
*/
- if (block_retry == false)
- {
- /* Reread the failed block */
- if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fseek in file \"%s\": %m",
- readfilename)));
- }
-
- if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not reread block %d of file \"%s\": %m",
- blkno, readfilename)));
- }
-
- if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
- {
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fseek in file \"%s\": %m",
- readfilename)));
- }
-
- /* Set flag so we know a retry was attempted */
- block_retry = true;
-
- /* Reset loop to validate the block again */
- i--;
- continue;
- }
-
checksum_failures++;
if (checksum_failures <= 5)
ereport(WARNING,
(errmsg("checksum verification failed in "
- "file \"%s\", block %d: calculated "
- "%X but expected %X",
- readfilename, blkno, checksum,
- phdr->pd_checksum)));
+ "file \"%s\", block %d: pd_upper "
+ "is zero but page is not all-zero",
+ readfilename, blkno)));
if (checksum_failures == 5)
ereport(WARNING,
(errmsg("further checksum verification "
@@ -1528,6 +1484,84 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
"be reported", readfilename)));
}
}
+ else
+ {
+ /*
+ * Only check pages which have not been modified since the
+ * start of the base backup. Otherwise, they might have been
+ * written only halfway and the checksum would not be valid.
+ * However, replaying WAL would reinstate the correct page in
+ * this case. If the page LSN is larger than the current redo
+ * pointer then we assume a bogus LSN due to random page header
+ * corruption and do verify the checksum.
+ */
+ if (PageGetLSN(page) < startptr || PageGetLSN(page) > GetInsertRecPtr())
+ {
+ checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
+ phdr = (PageHeader) page;
+ if (phdr->pd_checksum != checksum)
+ {
+ /*
+ * Retry the block on the first failure. It's
+ * possible that we read the first 4K page of the
+ * block just before postgres updated the entire block
+ * so it ends up looking torn to us. We only need to
+ * retry once because the LSN should be updated to
+ * something we can ignore on the next pass. If the
+ * error happens again then it is a true validation
+ * failure.
+ */
+ if (block_retry == false)
+ {
+ /* Reread the failed block */
+ if (fseek(fp, -(cnt - BLCKSZ * page_in_buf), SEEK_CUR) == -1)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fseek in file \"%s\": %m",
+ readfilename)));
+ }
+
+ if (fread(buf + BLCKSZ * page_in_buf, 1, BLCKSZ, fp) != BLCKSZ)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not reread block %d of file \"%s\": %m",
+ blkno, readfilename)));
+ }
+
+ if (fseek(fp, cnt - BLCKSZ * page_in_buf - BLCKSZ, SEEK_CUR) == -1)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fseek in file \"%s\": %m",
+ readfilename)));
+ }
+
+ /* Set flag so we know a retry was attempted */
+ block_retry = true;
+
+ /* Reset loop to validate the block again */
+ page_in_buf--;
+ continue;
+ }
+ checksum_failures++;
+
+ if (checksum_failures <= 5)
+ ereport(WARNING,
+ (errmsg("checksum verification failed in "
+ "file \"%s\", block %d: calculated "
+ "%X but expected %X",
+ readfilename, blkno, checksum,
+ phdr->pd_checksum)));
+ if (checksum_failures == 5)
+ ereport(WARNING,
+ (errmsg("further checksum verification "
+ "failures in file \"%s\" will not "
+ "be reported", readfilename)));
+ }
+ }
+ }
block_retry = false;
blkno++;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 14bc61b8ad..3fea7786ba 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -23,6 +23,13 @@
#include "utils/memutils.h"
+/*
+ * The actual code for PageIsZero() is in storage/bufpage_impl.h. This is done
+ * so that external programs can incorporate it by #include'ing that file from
+ * the exported Postgres headers.
+ */
+#include "storage/bufpage_impl.h"
+
/* GUC variable */
bool ignore_checksum_failure = false;
@@ -82,11 +89,8 @@ bool
PageIsVerified(Page page, BlockNumber blkno)
{
PageHeader p = (PageHeader) page;
- size_t *pagebytes;
- int i;
bool checksum_failure = false;
bool header_sane = false;
- bool all_zeroes = false;
uint16 checksum = 0;
/*
@@ -120,25 +124,9 @@ PageIsVerified(Page page, BlockNumber blkno)
}
/*
- * Check all-zeroes case. Luckily BLCKSZ is guaranteed to always be a
- * multiple of size_t - and it's much faster to compare memory using the
- * native word size.
+ * Check all-zeroes case
*/
- StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
- "BLCKSZ has to be a multiple of sizeof(size_t)");
-
- all_zeroes = true;
- pagebytes = (size_t *) page;
- for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
- {
- if (pagebytes[i] != 0)
- {
- all_zeroes = false;
- break;
- }
- }
-
- if (all_zeroes)
+ if (PageIsZero(page))
return true;
/*
@@ -161,7 +149,6 @@ PageIsVerified(Page page, BlockNumber blkno)
return false;
}
-
/*
* PageAddItemExtended
*
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index 33869fecc9..c623252293 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
use File::Path qw(rmtree);
use PostgresNode;
use TestLib;
-use Test::More tests => 106;
+use Test::More tests => 109;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -495,21 +495,37 @@ my $file_corrupt2 = $node->safe_psql('postgres',
my $pageheader_size = 24;
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
-# induce corruption
+# induce corruption in the pageheader by writing random data into it
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
open $file, '+<', "$pgdata/$file_corrupt1";
-seek($file, $pageheader_size, 0);
-syswrite($file, "\0\0\0\0\0\0\0\0\0");
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+syswrite($file, $random_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1" ],
+ 1,
+ [qr{^$}],
+ [qr/^WARNING.*checksum verification failed/s],
+ "pg_basebackup reports checksum mismatch for random pageheader data");
+rmtree("$tempdir/backup_corrupt1");
+
+# zero out the pageheader completely
+open $file, '+<', "$pgdata/$file_corrupt1";
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+my $zero_data = "\0"x$pageheader_size;
+syswrite($file, $zero_data);
close $file;
system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
$node->command_checks_all(
- [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt" ],
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
1,
[qr{^$}],
[qr/^WARNING.*checksum verification failed/s],
- 'pg_basebackup reports checksum mismatch');
-rmtree("$tempdir/backup_corrupt");
+ "pg_basebackup reports checksum mismatch for zeroed pageheader");
+rmtree("$tempdir/backup_corrupt1a");
# induce further corruption in 5 more blocks
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 5c185ebed8..7776c0dfc9 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -25,6 +25,7 @@
#include "getopt_long.h"
#include "pg_getopt.h"
#include "storage/bufpage.h"
+#include "storage/bufpage_impl.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"
@@ -150,7 +151,16 @@ scan_file(const char *fn, BlockNumber segmentno)
/* New pages have no checksum yet */
if (PageIsNew(header))
+ {
+ /* Check for an all-zeroes page */
+ if (!PageIsZero(buf.data))
+ {
+ fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"),
+ progname, fn, blockno);
+ badblocks++;
+ }
continue;
+ }
csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
if (mode == PG_MODE_CHECK)
diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl
index 41575c5245..680c26f2d6 100644
--- a/src/bin/pg_checksums/t/002_actions.pl
+++ b/src/bin/pg_checksums/t/002_actions.pl
@@ -5,7 +5,7 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 62;
+use Test::More tests => 78;
# Utility routine to create and check a table with corrupted checksums
@@ -17,6 +17,9 @@ sub check_relation_corruption
my $node = shift;
my $table = shift;
my $tablespace = shift;
+ my $offset = shift;
+ my $corrupted_data = shift;
+ my $description = shift;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres',
@@ -44,8 +47,8 @@ sub check_relation_corruption
# Time to create some corruption
open my $file, '+<', "$pgdata/$file_corrupted";
- seek($file, $pageheader_size, 0);
- syswrite($file, "\0\0\0\0\0\0\0\0\0");
+ seek($file, $offset, 0);
+ syswrite($file, $corrupted_data);
close $file;
# Checksum checks on single relfilenode fail
@@ -54,14 +57,14 @@ sub check_relation_corruption
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data for single relfilenode on tablespace $tablespace");
+ "fails with corrupted data for single relfilenode $description");
# Global checksum checks fail as well
$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data on tablespace $tablespace");
+ "fails with corrupted data on $description");
# Drop corrupted table again and make sure there is no more corruption.
$node->start;
@@ -155,8 +158,12 @@ $node->start;
command_fails(['pg_checksums', '--check', '-D', $pgdata],
"fails with online cluster");
+# Set page header and block size
+my $pageheader_size = 24;
+my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
+
# Check corruption of table on default tablespace.
-check_relation_corruption($node, 'corrupt1', 'pg_default');
+check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
# Create tablespace to check corruptions in a non-default tablespace.
my $basedir = $node->basedir;
@@ -165,7 +172,15 @@ mkdir ($tablespace_dir);
$tablespace_dir = TestLib::real_dir($tablespace_dir);
$node->safe_psql('postgres',
"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
-check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
+check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
+
+# Check corruption in the pageheader with random data in it
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data $random_data in pageheader");
+
+# Check corruption when the pageheader has been zeroed-out completely
+my $zero_data = "\0"x$pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader");
# Utility routine to check that pg_checksums is able to detect
# correctly-named relation files filled with some corrupted data.
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 85cd0abb97..1e4419cf4d 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -416,6 +416,7 @@ do { \
extern void PageInit(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool PageIsZero(Page page);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
extern Page PageGetTempPage(Page page);
diff --git a/src/include/storage/bufpage_impl.h b/src/include/storage/bufpage_impl.h
new file mode 100644
index 0000000000..9c161e9544
--- /dev/null
+++ b/src/include/storage/bufpage_impl.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage_impl.h
+ * Implementation for buffer page checks.
+ *
+ * This file exists for the benefit of external programs that may wish to
+ * check Postgres buffer pages. They can #include this to get the code
+ * referenced by storage/bufpage.h.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufpage_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * PageIsZero
+ * Check that the page consists only of zero bytes.
+ *
+ */
+bool
+PageIsZero(Page page)
+{
+ bool all_zeroes = true;
+ int i;
+ size_t *pagebytes = (size_t *) page;
+
+ /*
+ * Luckily BLCKSZ is guaranteed to always be a multiple of size_t - and
+ * it's much faster to compare memory using the native word size.
+ */
+ StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
+ "BLCKSZ has to be a multiple of sizeof(size_t)");
+
+ for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
+ {
+ if (pagebytes[i] != 0)
+ {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes)
+ return true;
+ else
+ return false;
+}