Hi,

Am Montag, den 05.03.2018, 06:36 -0500 schrieb Stephen Frost:
> Michael,
> 
> * Michael Banck (michael.ba...@credativ.de) wrote:
> > On Sun, Mar 04, 2018 at 06:19:00PM +0100, Magnus Hagander wrote:
> > > So sure, if we go with WARNING + exit with an errorcode, that is perhaps
> > > the best combination of the two.
> > 
> > I had a look at how to go about this, but it appears to be a bit
> > complicated; the first problem is that sendFile() and sendDir() don't
> > have status return codes that could be set on checksum verifcation
> > failure. So I added a global variable and threw an ereport(ERROR) at the
> > end of perform_base_backup(), but then I realized that `pg_basebackup'
> > the client program purges the datadir it created if it gets an error:
> > 
> > > pg_basebackup: final receive failed: ERROR:  Checksum mismatch during
> > > basebackup
> > > 
> > > pg_basebackup: removing data directory "data2"
> 
> Oh, ugh.

I came up with the attached patch, which sets a checksum_failure
variable in both basebackup.c and pg_basebackup.c, and emits an ereport
with (for now) ERRCODE_DATA_CORRUPTED at the end of
perform_base_backup(), which gets caught in pg_basebackup and then used
to not cleanup the datadir, but exit with a non-zero exit code.

Does that seem feasible?


Michael

-- 
Michael Banck
Projektleiter / Senior Berater
Tel.: +49 2166 9901-171
Fax:  +49 2166 9901-100
Email: michael.ba...@credativ.de

credativ GmbH, HRB Mönchengladbach 12080
USt-ID-Nummer: DE204566209
Trompeterallee 108, 41189 Mönchengladbach
Geschäftsführung: Dr. Michael Meskes, Jörg Folz, Sascha Heuer
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 361cde3291..11fb009b59 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -100,6 +100,9 @@ static TimeOffset elapsed_min_unit;
 /* The last check of the transfer rate. */
 static TimestampTz throttled_last;
 
+/* Whether a checksum failure occured. */
+static bool checksum_failure;
+
 /*
  * The contents of these directories are removed or recreated during server
  * start so they are not included in backups.  The directories themselves are
@@ -208,6 +211,8 @@ perform_base_backup(basebackup_options *opt)
 	labelfile = makeStringInfo();
 	tblspc_map_file = makeStringInfo();
 
+	checksum_failure = false;
+
 	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
 								  labelfile, &tablespaces,
 								  tblspc_map_file,
@@ -566,6 +571,12 @@ perform_base_backup(basebackup_options *opt)
 		pq_putemptymessage('c');
 	}
 	SendXlogRecPtrResult(endptr, endtli);
+
+	if (checksum_failure == true)
+		ereport(ERROR,
+			(errcode(ERRCODE_DATA_CORRUPTED),
+			errmsg("Checksum mismatch during basebackup\n")));
+
 }
 
 /*
@@ -1295,10 +1306,13 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 				checksum = pg_checksum_page((char *) page, blkno + segmentno*RELSEG_SIZE);
 				phdr = (PageHeader) page;
 				if (phdr->pd_checksum != checksum)
+				{
 					ereport(WARNING,
-							(errmsg("checksum mismatch in file \"%s\", block %d: "
-							"expected %x, found %x", readfilename, blkno,
+							(errmsg("checksum mismatch in file \"%s\", segment %d, block %d: "
+							"expected %x, found %x", readfilename, segmentno, blkno,
 							phdr->pd_checksum, checksum)));
+					checksum_failure = true;
+				}
 				blkno++;
 			}
 		}
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1b32592063..e1b364b4ef 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -39,6 +39,7 @@
 #include "replication/basebackup.h"
 #include "streamutil.h"
 
+#define ERRCODE_DATA_CORRUPTED	"XX001"
 
 typedef struct TablespaceListCell
 {
@@ -81,6 +82,7 @@ static char *xlog_dir = NULL;
 static char format = 'p';		/* p(lain)/t(ar) */
 static char *label = "pg_basebackup base backup";
 static bool noclean = false;
+static bool checksum_failure = false;
 static bool showprogress = false;
 static int	verbose = 0;
 static int	compresslevel = 0;
@@ -155,7 +157,7 @@ cleanup_directories_atexit(void)
 	if (success || in_log_streamer)
 		return;
 
-	if (!noclean)
+	if (!noclean && !checksum_failure)
 	{
 		if (made_new_pgdata)
 		{
@@ -195,7 +197,7 @@ cleanup_directories_atexit(void)
 	}
 	else
 	{
-		if (made_new_pgdata || found_existing_pgdata)
+		if ((made_new_pgdata || found_existing_pgdata) && !checksum_failure)
 			fprintf(stderr,
 					_("%s: data directory \"%s\" not removed at user's request\n"),
 					progname, basedir);
@@ -1970,8 +1972,17 @@ BaseBackup(void)
 	res = PQgetResult(conn);
 	if (PQresultStatus(res) != PGRES_COMMAND_OK)
 	{
-		fprintf(stderr, _("%s: final receive failed: %s"),
-				progname, PQerrorMessage(conn));
+		const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+		if (sqlstate &&
+			strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0)
+		{
+			fprintf(stderr, _("%s: checksum error occured\n"),
+					progname);
+			checksum_failure = true;
+		} else {
+			fprintf(stderr, _("%s: final receive failed: %s"),
+					progname, PQerrorMessage(conn));
+		}
 		disconnect_and_exit(1);
 	}
 

Reply via email to