From ce4aba74f52c78f4e4b4b4580aac37a5ef2002ce Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Fri, 2 Aug 2024 09:23:17 -0400
Subject: [PATCH v2] pg_combinebackup: Detect missing files when possible.

You need a backup_manifest to take an incremental backup, but it's
optional when running pg_combinebackup. In practice, we'll very often
have one, because backup manifests are the default and most people
don't turn them off. And, if we do have one, we can check that no
files mentioned in the manifeste are absent from the corresponding
backup directory.

This falls a long way short of comprehensive verification, because
it only checks against the last backup in the chain, and even then
only for totally missing files. If you want to verify your backups,
it's a much better idea to use pg_verifybackup, which can detect a
much wider variety of problems and which you can run against every
backup you have. But, this check doesn't cost much and has a chance
of alerting you to a very serious problem, so we may as well include
it.

Per discussion with David Steele.

Discussion: http://postgr.es/m/CA+Tgmoa6McCGpaA66dbrGA6onGUytc5Jds_3Y=y7tMaPy1p9mQ@mail.gmail.com
---
 src/bin/pg_combinebackup/load_manifest.h      |   1 +
 src/bin/pg_combinebackup/meson.build          |   1 +
 src/bin/pg_combinebackup/pg_combinebackup.c   | 142 +++++++++++++-----
 .../pg_combinebackup/t/009_missing_files.pl   |  67 +++++++++
 4 files changed, 174 insertions(+), 37 deletions(-)
 create mode 100644 src/bin/pg_combinebackup/t/009_missing_files.pl

diff --git a/src/bin/pg_combinebackup/load_manifest.h b/src/bin/pg_combinebackup/load_manifest.h
index a96ae12eb8..df9e63f50c 100644
--- a/src/bin/pg_combinebackup/load_manifest.h
+++ b/src/bin/pg_combinebackup/load_manifest.h
@@ -27,6 +27,7 @@ typedef struct manifest_file
 	pg_checksum_type checksum_type;
 	int			checksum_length;
 	uint8	   *checksum_payload;
+	bool		matched;		/* for caller use, initially false */
 } manifest_file;
 
 #define SH_PREFIX		manifest_files
diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build
index d142608e94..4bb7175a33 100644
--- a/src/bin/pg_combinebackup/meson.build
+++ b/src/bin/pg_combinebackup/meson.build
@@ -36,6 +36,7 @@ tests += {
       't/006_db_file_copy.pl',
       't/007_wal_level_minimal.pl',
       't/008_promote.pl',
+      't/009_missing_files.pl',
     ],
   }
 }
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 9ded5a2140..29eee1cf0d 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -118,6 +118,7 @@ static void process_directory_recursively(Oid tsoid,
 										  cb_options *opt);
 static int	read_pg_version_file(char *directory);
 static void remember_to_cleanup_directory(char *target_path, bool rmtopdir);
+static void report_missing_files(manifest_data *manifest);
 static void reset_directory_cleanup_list(void);
 static cb_tablespace *scan_for_existing_tablespaces(char *pathname,
 													cb_options *opt);
@@ -407,6 +408,13 @@ main(int argc, char *argv[])
 									  manifests, mwriter, &opt);
 	}
 
+	/*
+	 * If we have a manifest for the latest backup, we can complain if any
+	 * files seem to be missing.
+	 */
+	if (manifests[n_prior_backups] != NULL)
+		report_missing_files(manifests[n_prior_backups]);
+
 	/* Finalize the backup_manifest, if we're generating one. */
 	if (mwriter != NULL)
 		finalize_manifest(mwriter,
@@ -999,12 +1007,11 @@ process_directory_recursively(Oid tsoid,
 		}
 
 		/*
-		 * Skip the backup_label and backup_manifest files; they require
-		 * special handling and are handled elsewhere.
+		 * Skip the backup_manifest file; it requires special handling and is
+		 * handled elsewhere.
 		 */
 		if (relative_path == NULL &&
-			(strcmp(de->d_name, "backup_label") == 0 ||
-			 strcmp(de->d_name, "backup_manifest") == 0))
+			strcmp(de->d_name, "backup_manifest") == 0)
 			continue;
 
 		/*
@@ -1019,11 +1026,37 @@ process_directory_recursively(Oid tsoid,
 			snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir,
 					 de->d_name + INCREMENTAL_PREFIX_LENGTH);
 
-
-			/* Manifest path likewise omits incremental prefix. */
+			/*
+			 * The path that gets used in any new backup manifest that we
+			 * generate likewise omits incremental prefix, but it includes a
+			 * tablespace prefix if required.
+			 */
 			snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
 					 de->d_name + INCREMENTAL_PREFIX_LENGTH);
 
+			/*
+			 * If we have a backup_manifest for the final input directory,
+			 * check whether this file is present. If so, mark it as a file
+			 * we've seen already; if not, emit a warning, so that the user
+			 * knows that the directory is out of sync with the
+			 * backup_manifest.
+			 */
+			if (latest_manifest != NULL)
+			{
+				char		old_manifest_path[MAXPGPATH];
+				manifest_file *mfile;
+
+				snprintf(old_manifest_path, MAXPGPATH, "%s%s",
+						 manifest_prefix, de->d_name);
+				mfile = manifest_files_lookup(latest_manifest->files,
+											  old_manifest_path);
+				if (mfile != NULL)
+					mfile->matched = true;
+				else
+					pg_log_warning("\"%s\" is present on disk but not in the manifest",
+								   manifest_path);
+			}
+
 			/* Reconstruction logic will do the rest. */
 			reconstruct_from_incremental_file(ifullpath, ofullpath,
 											  manifest_prefix,
@@ -1041,44 +1074,54 @@ process_directory_recursively(Oid tsoid,
 		}
 		else
 		{
-			/* Construct the path that the backup_manifest will use. */
+			manifest_file *mfile;
+
+			/*
+			 * We need to copy the entire file to the output directory.
+			 *
+			 * Compute the path used for this file in the backup manifest.
+			 */
 			snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
 					 de->d_name);
 
 			/*
-			 * It's not an incremental file, so we need to copy the entire
-			 * file to the output directory.
-			 *
-			 * If a checksum of the required type already exists in the
-			 * backup_manifest for the final input directory, we can save some
-			 * work by reusing that checksum instead of computing a new one.
+			 * If we have a backup_manifest for the final input directory,
+			 * check whether this file is present. If so, mark it as a file
+			 * we've seen already; if not, emit a warning, so that the user
+			 * knows that the directory is out of sync with the
+			 * backup_manifest.
 			 */
-			if (checksum_type != CHECKSUM_TYPE_NONE &&
-				latest_manifest != NULL)
+			if (latest_manifest != NULL)
 			{
-				manifest_file *mfile;
+				mfile =
+					manifest_files_lookup(latest_manifest->files, manifest_path);
+				if (mfile != NULL)
+					mfile->matched = true;
+				else
+					pg_log_warning("\"%s\" is present on disk but not in the manifest",
+								   manifest_path);
+			}
 
-				mfile = manifest_files_lookup(latest_manifest->files,
-											  manifest_path);
-				if (mfile == NULL)
-				{
-					char	   *bmpath;
-
-					/*
-					 * The directory is out of sync with the backup_manifest,
-					 * so emit a warning.
-					 */
-					bmpath = psprintf("%s/%s", input_directory,
-									  "backup_manifest");
-					pg_log_warning("\"%s\" contains no entry for \"%s\"",
-								   bmpath, manifest_path);
-					pfree(bmpath);
-				}
-				else if (mfile->checksum_type == checksum_type)
-				{
-					checksum_length = mfile->checksum_length;
-					checksum_payload = mfile->checksum_payload;
-				}
+			/*
+			 * The backup_label file is generated elsewhere by separate code,
+			 * so skip it here, but make sure that we do this after marking it
+			 * as matched in the manifest, so that report_missing_files
+			 * doesn't complain about it being missing.
+			 */
+			if (relative_path == NULL &&
+				strcmp(de->d_name, "backup_label") == 0)
+				continue;
+
+			/*
+			 * If the backup manifest entry we just looked up happens to
+			 * contain a checksum of the required type, we can save some work
+			 * by reusing that checksum instead of computing a new one.
+			 */
+			if (checksum_type != CHECKSUM_TYPE_NONE && mfile != NULL &&
+				mfile->checksum_type == checksum_type)
+			{
+				checksum_length = mfile->checksum_length;
+				checksum_payload = mfile->checksum_payload;
 			}
 
 			/*
@@ -1212,6 +1255,31 @@ remember_to_cleanup_directory(char *target_path, bool rmtopdir)
 	cleanup_dir_list = dir;
 }
 
+/*
+ * Complain if any files are missing on disk.
+ */
+static void
+report_missing_files(manifest_data *manifest)
+{
+	manifest_files_iterator it;
+	manifest_file *mfile;
+	bool		die = false;
+
+	manifest_files_start_iterate(manifest->files, &it);
+	while ((mfile = manifest_files_iterate(manifest->files, &it)) != NULL)
+	{
+		if (!mfile->matched)
+		{
+			pg_log_error("\"%s\" is present in the manifest but not on disk",
+						 mfile->pathname);
+			die = true;
+		}
+	}
+
+	if (die)
+		exit(1);
+}
+
 /*
  * Empty out the list of directories scheduled for cleanup at exit.
  *
diff --git a/src/bin/pg_combinebackup/t/009_missing_files.pl b/src/bin/pg_combinebackup/t/009_missing_files.pl
new file mode 100644
index 0000000000..55cc19e354
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/009_missing_files.pl
@@ -0,0 +1,67 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+#
+# Verify that, if we have a backup manifest for the final backup directory,
+# we can detect missing files.
+
+use strict;
+use warnings FATAL => 'all';
+use File::Compare;
+use File::Path qw(rmtree);
+use File::Copy;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Set up a new database instance.
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+$node1->init(has_archiving => 1, allows_streaming => 1);
+$node1->append_conf('postgresql.conf', 'summarize_wal = on');
+$node1->start;
+
+# Set up another new database instance.  force_initdb is used because
+# we want it to be a separate cluster with a different system ID.
+my $node2 = PostgreSQL::Test::Cluster->new('node2');
+$node2->init(force_initdb => 1, has_archiving => 1, allows_streaming => 1);
+$node2->append_conf('postgresql.conf', 'summarize_wal = on');
+$node2->start;
+
+# Take a full backup from node1.
+my $backup1path = $node1->backup_dir . '/backup1';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
+	"full backup from node1");
+
+# Now take an incremental backup.
+my $backup2path = $node1->backup_dir . '/backup2';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
+	  '--incremental', $backup1path . '/backup_manifest' ],
+	"incremental backup from node1");
+
+# Result directory.
+my $resultpath = $node1->backup_dir . '/result';
+
+# Cause one file to be missing.
+unlink("$backup2path/postgresql.conf")
+	|| die "unlink $backup2path/postgresql.conf: $!";
+
+# Test that it's detected.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup1path, $backup2path, '-o', $resultpath ],
+	qr/error: "postgresql.conf" is present in the manifest but not on disk/,
+	"can detect missing file");
+
+# Now remove the backup_manifest so that we can't detect missing files.
+unlink("$backup2path/backup_manifest")
+	|| die "unlink $backup2path/backup_manifest: $!";
+
+# Test that it's OK now. We have to use --no-manifest here because it's not
+# possible to generate a manifest for the output directory without a manifest
+# for the final input directory.
+$node1->command_ok(
+	[ 'pg_combinebackup', '--no-manifest', $backup1path, $backup2path,
+		'-o', $resultpath ],
+	"removing backup_manifest suppresses missing file detection");
+
+# OK, that's all.
+done_testing();
-- 
2.39.3 (Apple Git-145)

