Bug#928378: unblock: pg-checksums/0.8-3

Michael Banck Fri, 03 May 2019 03:15:23 -0700

Package: release.debian.org
Severity: normal
User: release.debian....@packages.debian.org
Usertags: unblock


Please unblock package pg-checksums

It fixes #928232 which is a rather serious problem for a program that is
supposed to find data corruption.

unblock pg-checksums/0.8-3

-- System Information:
Debian Release: 8.10
  APT prefers oldstable
  APT policy: (500, 'oldstable')
Architecture: i386 (i686)

Kernel: Linux 3.16.0-4-686-pae (SMP w/1 CPU core)
Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)

diff -Nru pg-checksums-0.8/debian/changelog pg-checksums-0.8/debian/changelog
--- pg-checksums-0.8/debian/changelog   2019-03-09 16:14:40.000000000 +0100
+++ pg-checksums-0.8/debian/changelog   2019-04-30 14:55:21.000000000 +0200
@@ -1,3 +1,16 @@
+pg-checksums (0.8-3) unstable; urgency=medium
+
+  [ Christoph Berg ]
+  * Remove myself from uploaders.
+
+  [ Michael Banck ]
+  * debian/patches/zero_random_pageheader.patch: New patch, fixes false
+    negatives for all-zero or random pageheaders which were not detected as
+    checksum failures previously. Adapted from upstream commit 043f003f
+    (Closes: #928232).
+
+ -- Michael Banck <michael.ba...@credativ.de>  Tue, 30 Apr 2019 14:55:21 +0200
+
 pg-checksums (0.8-2) unstable; urgency=medium
 
   * debian/patches/fsync_pgdata.patch: New patch, backports the fsync_pgdata()
diff -Nru pg-checksums-0.8/debian/control pg-checksums-0.8/debian/control
--- pg-checksums-0.8/debian/control     2019-02-26 21:39:57.000000000 +0100
+++ pg-checksums-0.8/debian/control     2019-04-30 14:44:14.000000000 +0200
@@ -3,7 +3,6 @@
 Priority: optional
 Maintainer: Debian PostgreSQL Maintainers <team+postgre...@tracker.debian.org>
 Uploaders: Michael Banck <michael.ba...@credativ.de>,
- Christoph Berg <m...@debian.org>,
 Build-Depends:
  debhelper (>= 9),
  postgresql-server-dev-all (>= 153~),
diff -Nru pg-checksums-0.8/debian/control.in pg-checksums-0.8/debian/control.in
--- pg-checksums-0.8/debian/control.in  2018-10-12 14:04:31.000000000 +0200
+++ pg-checksums-0.8/debian/control.in  2019-04-15 23:18:34.000000000 +0200
@@ -3,7 +3,6 @@
 Priority: optional
 Maintainer: Debian PostgreSQL Maintainers <team+postgre...@tracker.debian.org>
 Uploaders: Michael Banck <michael.ba...@credativ.de>,
- Christoph Berg <m...@debian.org>,
 Build-Depends:
  debhelper (>= 9),
  postgresql-server-dev-all (>= 153~),
diff -Nru pg-checksums-0.8/debian/patches/series 
pg-checksums-0.8/debian/patches/series
--- pg-checksums-0.8/debian/patches/series      2019-02-26 21:19:49.000000000 
+0100
+++ pg-checksums-0.8/debian/patches/series      2019-04-23 17:14:59.000000000 
+0200
@@ -1,2 +1,3 @@
 postgresnode.patch
 fsync_pgdata.patch
+zero_random_pageheader.patch
diff -Nru pg-checksums-0.8/debian/patches/zero_random_pageheader.patch 
pg-checksums-0.8/debian/patches/zero_random_pageheader.patch
--- pg-checksums-0.8/debian/patches/zero_random_pageheader.patch        
1970-01-01 01:00:00.000000000 +0100
+++ pg-checksums-0.8/debian/patches/zero_random_pageheader.patch        
2019-04-30 13:40:05.000000000 +0200
@@ -0,0 +1,355 @@
+commit 043f003fc1fc1cdf8a370971edf6e9323034b15d
+Author: Michael Banck <mba...@debian.org>
+Date:   Tue Apr 23 16:03:16 2019 +0200
+
+    Fix checksum failure detection for all-zero or random pageheaders.
+    
+    Previously, a zeroed-out pageheader would lead to that page getting 
skipped as
+    being new, even if other parts of the page have (corrupted) data in it.  
Add an
+    additional check that the whole page shall be zero and report a checksum
+    failure if that is not the case.
+    
+    Also, random data in the pageheader would very likely make the page's LSN 
so
+    large that it would be considered newer than the checkpoint LSN.  As we 
are not
+    connected to the server, we cannot inquire the current insert record 
pointer to
+    check against or lock the page, so we (i) advance the checkpoint LSN to the
+    current pg_control value and (ii) demand that the upper 32 bits of the LSN 
are
+    identical between the checkpoint LSN and the page's LSN in order to skip 
the
+    page rather than report a checksum failure.
+    
+    Additional TAP tests are added for both cases as well.
+
+Index: pg-checksums/pg_checksums.c
+===================================================================
+--- pg-checksums.orig/pg_checksums.c
++++ pg-checksums/pg_checksums.c
+@@ -77,6 +77,8 @@ static bool deactivate = false;
+ static bool show_progress = false;
+ static bool online = false;
+ 
++static char *DataDir = NULL;
++
+ static const char *progname;
+ 
+ /*
+@@ -181,6 +183,28 @@ isRelFileName(const char *fn)
+       return true;
+ }
+ 
++ static void
++update_checkpoint_lsn(void)
++{
++      bool    crc_ok;
++
++#if PG_VERSION_NUM >= 100000
++      ControlFile = get_controlfile(DataDir, progname, &crc_ok);
++      if (!crc_ok)
++      {
++              fprintf(stderr, _("%s: pg_control CRC value is incorrect\n"), 
progname);
++              exit(1);
++      }
++#elif PG_VERSION_NUM >= 90600
++      ControlFile = get_controlfile(DataDir, progname);
++#else
++      ControlFile = getControlFile(DataDir);
++#endif
++
++      /* Update checkpointLSN with the current value */
++      checkpointLSN = ControlFile->checkPoint;
++}
++
+ static void
+ toggle_progress_report(int signo,
+                                          siginfo_t * siginfo,
+@@ -252,6 +276,8 @@ scan_file(const char *fn, BlockNumber se
+       int                     f;
+       BlockNumber     blockno;
+       bool            block_retry = false;
++      bool            all_zeroes;
++      size_t     *pagebytes;
+ 
+       f = open(fn, O_RDWR | PG_BINARY, 0);
+       if (f < 0)
+@@ -273,6 +299,7 @@ scan_file(const char *fn, BlockNumber se
+       {
+               uint16          csum;
+               int                     r = read(f, buf.data, BLCKSZ);
++              int                     i;
+ 
+               if (debug && block_retry)
+                       fprintf(stderr, _("%s: retrying block %d in file 
\"%s\"\n"),
+@@ -321,18 +348,38 @@ scan_file(const char *fn, BlockNumber se
+                       continue;
+               }
+ 
++              blocks++;
++
+               /* New pages have no checksum yet */
+               if (PageIsNew(header))
+               {
+-                      if (debug && block_retry)
+-                              fprintf(stderr, _("%s: block %d in file \"%s\" 
is new, ignoring\n"),
+-                                              progname, blockno, fn);
+-                      skippedblocks++;
++                      /* Check for an all-zeroes page */
++                      all_zeroes = true;
++                      pagebytes = (size_t *) buf.data;
++                      for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
++                      {
++                              if (pagebytes[i] != 0)
++                              {
++                                      all_zeroes = false;
++                                      break;
++                              }
++                      }
++                      if (!all_zeroes)
++                      {
++                              fprintf(stderr, _("%s: checksum verification 
failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"),
++                                              progname, fn, blockno);
++                              badblocks++;
++                      }
++                      else
++                      {
++                              if (debug && block_retry)
++                                      fprintf(stderr, _("%s: block %d in file 
\"%s\" is new, ignoring\n"),
++                                                      progname, blockno, fn);
++                              skippedblocks++;
++                      }
+                       continue;
+               }
+ 
+-              blocks++;
+-
+               csum = pg_checksum_page(buf.data, blockno + segmentno * 
RELSEG_SIZE);
+               current_size += r;
+ 
+@@ -367,23 +414,34 @@ scan_file(const char *fn, BlockNumber se
+ 
+                                       /* Reset loop to validate the block 
again */
+                                       blockno--;
++                                      blocks--;
+                                       current_size -= r;
+ 
++                                      /*
++                                       * Update the checkpoint LSN now. If we 
get a failure on
++                                       * re-read, we would need to do this 
anyway, and doing it
++                                       * now lowers the probability that we 
see the same torn
++                                       * page on re-read.
++                                       */
++                                      update_checkpoint_lsn();
++
+                                       continue;
+                               }
+ 
+                               /*
+                                * The checksum verification failed on retry as 
well.  Check if
+                                * the page has been modified since the 
checkpoint and skip it
+-                               * in this case.
++                               * in this case. As a sanity check, demand that 
the upper 32
++                               * bits of the LSN are identical in order to 
skip as a guard
++                               * against a corrupted LSN in the pageheader.
+                                */
+-                              if (PageGetLSN(buf.data) > checkpointLSN)
++                              if ((PageGetLSN(buf.data) > checkpointLSN) &&
++                                      (PageGetLSN(buf.data) >> 32 == 
checkpointLSN >> 32))
+                               {
+                                       if (debug)
+                                               fprintf(stderr, _("%s: block %d 
in file \"%s\" with LSN %X/%X is newer than checkpoint LSN %X/%X, ignoring\n"),
+                                                               progname, 
blockno, fn, (uint32) (PageGetLSN(buf.data) >> 32), (uint32) 
PageGetLSN(buf.data), (uint32) (checkpointLSN >> 32), (uint32) checkpointLSN);
+                                       block_retry = false;
+-                                      blocks--;
+                                       skippedblocks++;
+                                       continue;
+                               }
+@@ -897,7 +955,6 @@ main(int argc, char *argv[])
+               {NULL, 0, NULL, 0}
+       };
+ 
+-      char       *DataDir = NULL;
+       int                     c;
+       int                     option_index;
+ #if PG_VERSION_NUM >= 100000
+Index: pg-checksums/t/001_checksums.pl
+===================================================================
+--- pg-checksums.orig/t/001_checksums.pl
++++ pg-checksums/t/001_checksums.pl
+@@ -6,7 +6,7 @@ use Cwd;
+ use Config;
+ use PostgresNode;
+ use TestLib;
+-use Test::More tests => 49;
++use Test::More tests => 97;
+ 
+ program_help_ok('pg_checksums');
+ program_version_ok('pg_checksums');
+@@ -14,8 +14,8 @@ program_options_handling_ok('pg_checksum
+ 
+ my $tempdir = TestLib::tempdir;
+ 
+-# Initialize node
+-my $node = get_new_node('main');
++# Initialize node with checksums disabled.
++my $node = get_new_node('node_checksum');
+ $node->init;
+ 
+ $node->start;
+@@ -80,70 +80,112 @@ $node->stop;
+ $node->command_ok(['pg_checksums', '-a', '-D', $pgdata],
+         'pg_checksums are again activated in offline cluster');
+ 
+-#exit 0;
+ $node->start;
+ 
+ $node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+         'pg_checksums can be verified in online cluster');
+ 
+-# create table to corrupt and get their relfilenode
+-create_corruption($node, 'corrupt1', 'pg_default');
+-
+-$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
+-        1,
+-        [qr/Bad checksums:  1/s],
+-        [qr/checksum verification failed/s],
+-        'pg_checksums reports checksum mismatch'
+-);
+-
+-# drop corrupt table again and make sure there is no more corruption
+-$node->safe_psql('postgres', 'DROP TABLE corrupt1;');
+-$node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+-        'pg_checksums can be verified in online cluster: '.getcwd());
+-
+-
+-# create table to corrupt in a non-default tablespace and get their 
relfilenode
+-my $tablespace_dir = getcwd()."/tmp_check/ts_corrupt_dir";
++# Set page header and block size
++my $pageheader_size = 24;
++my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
++
++# Check corruption of table on default tablespace.
++check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, 
"\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
++
++# Create tablespace to check corruptions in a non-default tablespace.
++my $basedir = $node->basedir;
++my $tablespace_dir = "$basedir/ts_corrupt_dir";
+ mkdir ($tablespace_dir);
+-$node->safe_psql('postgres', "CREATE TABLESPACE ts_corrupt LOCATION 
'".$tablespace_dir."';");
+-create_corruption($node, 'corrupt2', 'ts_corrupt');
+-
+-$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
+-        1,
+-        [qr/Bad checksums:  1/s],
+-        [qr/checksum verification failed/s],
+-        'pg_checksums reports checksum mismatch on non-default tablespace'
+-);
+-
+-# drop corrupt table again and make sure there is no more corruption
+-$node->safe_psql('postgres', 'DROP TABLE corrupt2;');
+-$node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+-        'pg_checksums can be verified in online cluster');
+-
+-# Utility routine to create a table with corrupted checksums.
+-# It stops the node (if running), and starts it again.
+-sub create_corruption
++$tablespace_dir = TestLib::real_dir($tablespace_dir);
++$node->safe_psql('postgres',
++    "CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
++check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, 
"\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
++
++# Check corruption in the pageheader with random data in it
++my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
++check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, 
"with random data in pageheader");
++
++# Check corruption when the pageheader has been zeroed-out completely
++my $zero_data = "\0"x$pageheader_size;
++check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, 
"with zeroed-out pageheader");
++
++# Utility routine to create and check a table with corrupted checksums
++# on a wanted tablespace.  Note that this stops and starts the node
++# multiple times to perform the checks, leaving the node started
++# at the end.
++sub check_relation_corruption
+ {
+       my $node = shift;
+       my $table = shift;
+       my $tablespace = shift;
+-
+-      my $query = "SELECT a INTO ".$table." FROM generate_series(1,10000) AS 
a; ALTER TABLE ".$table." SET (autovacuum_enabled=false), SET TABLESPACE 
".$tablespace."; SELECT pg_relation_filepath('".$table."')";
+-      my $file_name = $node->safe_psql('postgres', $query);
+-
+-      # set page header and block sizes
+-      my $pageheader_size = 24;
+-      my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
++      my $offset = shift;
++      my $corrupted_data = shift;
++      my $description = shift;
++      my $pgdata = $node->data_dir;
++
++      $node->safe_psql('postgres',
++              "SELECT a INTO $table FROM generate_series(1,10000) AS a;
++              ALTER TABLE $table SET (autovacuum_enabled=false);");
++
++      $node->safe_psql('postgres',
++              "ALTER TABLE ".$table." SET TABLESPACE ".$tablespace.";");
++
++      my $file_corrupted = $node->safe_psql('postgres',
++              "SELECT pg_relation_filepath('$table');");
++      my $relfilenode_corrupted =  $node->safe_psql('postgres',
++              "SELECT relfilenode FROM pg_class WHERE relname = '$table';");
+ 
+       $node->stop;
+ 
+-      open my $file, '+<', "$pgdata/$file_name";
+-      seek($file, $pageheader_size, 0);
+-      syswrite($file, '\0\0\0\0\0\0\0\0\0');
++      # Checksums are correct for single relfilenode as the table is not
++      # corrupted yet.
++      command_ok(['pg_checksums',  '-c', '-D', $pgdata, '-r',
++                         $relfilenode_corrupted],
++              "succeeds for single relfilenode $description with offline 
cluster");
++
++      # Time to create some corruption
++      open my $file, '+<', "$pgdata/$file_corrupted";
++      seek($file, $offset, 0);
++      syswrite($file, $corrupted_data);
+       close $file;
+ 
++      # Checksum checks on single relfilenode fail
++      $node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata,
++                                                        '-r', 
$relfilenode_corrupted],
++                                                        1,
++                                                        [qr/Bad 
checksums:.*1/],
++                                                        [qr/checksum 
verification failed/],
++                                                        "fails with corrupted 
data $description");
++
++      # Global checksum checks fail as well
++      $node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
++                                                        1,
++                                                        [qr/Bad 
checksums:.*1/],
++                                                        [qr/checksum 
verification failed/],
++                                                        "fails with corrupted 
data for single relfilenode on tablespace $tablespace");
++
++      # Now check online as well
+       $node->start;
+ 
++      # Checksum checks on single relfilenode fail
++      $node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata,
++                                                        '-r', 
$relfilenode_corrupted],
++                                                        1,
++                                                        [qr/Bad 
checksums:.*1/],
++                                                        [qr/checksum 
verification failed/],
++                                                        "fails with corrupted 
data $description");
++
++      # Global checksum checks fail as well
++      $node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
++                                                        1,
++                                                        [qr/Bad 
checksums:.*1/],
++                                                        [qr/checksum 
verification failed/],
++                                                        "fails with corrupted 
data for single relfilenode on tablespace $tablespace");
++
++      # Drop corrupted table again and make sure there is no more corruption.
++      $node->safe_psql('postgres', "DROP TABLE $table;");
++      $node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
++              "succeeds again after table drop on tablespace $tablespace");
+       return;
+ }
+ 
diff -Nru pg-checksums-0.8/debian/source/lintian-overrides 
pg-checksums-0.8/debian/source/lintian-overrides
--- pg-checksums-0.8/debian/source/lintian-overrides    2018-10-12 
14:04:31.000000000 +0200
+++ pg-checksums-0.8/debian/source/lintian-overrides    1970-01-01 
01:00:00.000000000 +0100
@@ -1,3 +0,0 @@
-# don't bug people uploading from @work
-source: changelog-should-mention-nmu
-source: source-nmu-has-incorrect-version-number

Bug#928378: unblock: pg-checksums/0.8-3

Reply via email to