Currently, if you take a backup with "pg_basebackup -x" (which means it will include all the WAL to required restore within the backup dump), and hit Ctrl-C while the WAL is being streamed, you end up with a data directory that you can start postmaster from, even though the backup was not complete. So what appears to be a valid backup - it starts up fine - can actually be corrupt.

I put in a check against that back in March, but it had to be reverted because it broke crash recovery when the system crashed while a pg_start_backup() based backup was in progress:

http://archives.postgresql.org/message-id/4da58686.1050...@enterprisedb.com

Here's a patch to add it back in a more fine-grained fashion. The patch adds an extra line to backup_label, indicating whether the backup was taken with pg_start/stop_backup(), or by streaming (= pg_basebackup). For a backup taken with pg_start_backup(), the behavior is kept the same as it has been - if the end-of-backup record is not reached during crash recovery, the database starts up anyway. But for a streamed backup, you get an error at startup.

I think this is a nice additional safeguard to have, making streamed backups more robust. I'd like to add this to 9.1, but it required an extra field to be added to the control file, so it would force an initdb. It's probably not worth that. Or, we could sneak in the extra boolean field to some currently unused pad space in the ControlFile struct.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6a6959f..d057e66 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -662,7 +662,8 @@ static bool CheckForStandbyTrigger(void);
 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 #endif
 static void pg_start_backup_callback(int code, Datum arg);
-static bool read_backup_label(XLogRecPtr *checkPointLoc);
+static bool read_backup_label(XLogRecPtr *checkPointLoc,
+				  bool *backupEndRequired);
 static void rm_redo_error_callback(void *arg);
 static int	get_sync_bit(int method);
 
@@ -6016,6 +6017,7 @@ StartupXLOG(void)
 	XLogRecord *record;
 	uint32		freespace;
 	TransactionId oldestActiveXID;
+	bool		backupEndRequired = false;
 
 	/*
 	 * Read control file and check XLOG status looks valid.
@@ -6149,7 +6151,7 @@ StartupXLOG(void)
 	if (StandbyMode)
 		OwnLatch(&XLogCtl->recoveryWakeupLatch);
 
-	if (read_backup_label(&checkPointLoc))
+	if (read_backup_label(&checkPointLoc, &backupEndRequired))
 	{
 		/*
 		 * When a backup_label file is present, we want to roll forward from
@@ -6328,7 +6330,10 @@ StartupXLOG(void)
 		 * set backupStartPoint if we're starting recovery from a base backup
 		 */
 		if (haveBackupLabel)
+		{
 			ControlFile->backupStartPoint = checkPoint.redo;
+			ControlFile->backupEndRequired = backupEndRequired;
+		}
 		ControlFile->time = (pg_time_t) time(NULL);
 		/* No need to hold ControlFileLock yet, we aren't up far enough */
 		UpdateControlFile();
@@ -6698,9 +6703,13 @@ StartupXLOG(void)
 		 * crashes while an online backup is in progress. We must not treat
 		 * that as an error, or the database will refuse to start up.
 		 */
-		if (InArchiveRecovery)
+		if (InArchiveRecovery || ControlFile->backupEndRequired)
 		{
-			if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+			if (ControlFile->backupEndRequired)
+				ereport(FATAL,
+						(errmsg("WAL ends before end of online backup"),
+						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
+			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
 				ereport(FATAL,
 						(errmsg("WAL ends before end of online backup"),
 						 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
@@ -8531,6 +8540,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
 				ControlFile->minRecoveryPoint = lsn;
 			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
+			ControlFile->backupEndRequired = false;
 			UpdateControlFile();
 
 			LWLockRelease(ControlFileLock);
@@ -9013,6 +9023,8 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
 						 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
 		appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
 						 checkpointloc.xlogid, checkpointloc.xrecoff);
+		appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
+						 exclusive ? "pg_start_backup" : "streamed");
 		appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
 		appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
 
@@ -9768,15 +9780,19 @@ pg_xlogfile_name(PG_FUNCTION_ARGS)
  *
  * Returns TRUE if a backup_label was found (and fills the checkpoint
  * location and its REDO location into *checkPointLoc and RedoStartLSN,
- * respectively); returns FALSE if not.
+ * respectively); returns FALSE if not. If this backup_label came from a
+ * streamed backup, *backupEndRequired is set to TRUE.
  */
 static bool
-read_backup_label(XLogRecPtr *checkPointLoc)
+read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired)
 {
 	char		startxlogfilename[MAXFNAMELEN];
 	TimeLineID	tli;
 	FILE	   *lfp;
 	char		ch;
+	char		backuptype[16];
+
+	*backupEndRequired = false;
 
 	/*
 	 * See if label file is present
@@ -9809,6 +9825,17 @@ read_backup_label(XLogRecPtr *checkPointLoc)
 		ereport(FATAL,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+	/*
+	 * BACKUP METHOD line is new in 9.0. Don't complain if it doesn't exist,
+	 * in case you're restoring from a backup taken with an 9.0 beta version
+	 * that didn't emit it.
+	 */
+	if (fscanf(lfp, "BACKUP METHOD: %16s", backuptype) == 1)
+	{
+		if (strcmp(backuptype, "streamed") == 0)
+			*backupEndRequired = true;
+	}
+
 	if (ferror(lfp) || FreeFile(lfp))
 		ereport(FATAL,
 				(errcode_for_file_access(),
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index d543ef6..1648d7d 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	903
+#define PG_CONTROL_VERSION	911
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -137,9 +137,16 @@ typedef struct ControlFileData
 	 * we use the redo pointer as a cross-check when we see an end-of-backup
 	 * record, to make sure the end-of-backup record corresponds the base
 	 * backup we're recovering from.
+	 *
+	 * If backupEndRequired is true, we know for sure that we're restoring
+	 * from a backup, and must see a backup-end record before we can safely
+	 * start up. If it's false, but backupStartPoint is set, a backup_label
+	 * file was found at startup but it may have been a leftover from a stray
+	 * pg_start_backup() call, not accompanied with pg_stop_backup().
 	 */
 	XLogRecPtr	minRecoveryPoint;
 	XLogRecPtr	backupStartPoint;
+	bool		backupEndRequired;
 
 	/*
 	 * Parameter settings that determine if the WAL can be used for archival
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to