On 2015-01-26 22:03:03 +0100, Andres Freund wrote:
> Attached is a patch trying to this. Doesn't look too bad and lead me to
> discover missing recovery conflicts during a AD ST.
> 
> But: It doesn't actually work on standbys, because lock.c prevents any
> stronger lock than RowExclusive from being acquired. And we need need a
> lock that can conflict with WAL replay of DBASE_CREATE, to handle base
> backups that are executed on the primary. Those obviously can't detect
> whether any standby is currently doing a base backup...
> 
> I currently don't have a good idea how to mangle lock.c to allow
> this. I've played with doing it like in the second patch, but that
> doesn't actually work because of some asserts around ProcSleep - leading
> to locks on database objects not working in the startup process (despite
> already being used).
> 
> The easiest thing would be to just use a lwlock instead of a heavyweight
> lock - but those aren't canceleable...

As Stephen noticed on irc I forgot to attach those. Caused by the
generic HS issue mentioned in 20150126212458.ga29...@awork2.anarazel.de.

Attached now.

As mentioned above, this isn't really isn't ready (e.g. it'll Assert() on a
standby due to the HS issue).

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From e5d46c73955e7647912c2625e31027a6fef7c880 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 26 Jan 2015 21:03:35 +0100
Subject: [PATCH] Prevent ALTER DATABASE SET TABLESPACE from running
 concurrently with a base backup.

The combination can cause a corrupt base backup.
---
 src/backend/access/transam/xlog.c    | 33 +++++++++++++++++++++++++++++
 src/backend/commands/dbcommands.c    | 41 ++++++++++++++++++++++++++++++++++++
 src/backend/replication/basebackup.c | 11 ++++++++++
 src/include/access/xlog.h            |  1 +
 4 files changed, 86 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 629a457..1daa10d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -53,6 +53,7 @@
 #include "storage/ipc.h"
 #include "storage/large_object.h"
 #include "storage/latch.h"
+#include "storage/lmgr.h"
 #include "storage/pmsignal.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -9329,6 +9330,14 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 	else
 		XLogCtl->Insert.nonExclusiveBackups++;
 	XLogCtl->Insert.forcePageWrites = true;
+
+	/*
+	 * Acquire lock on pg datababase just before releasing the WAL insert lock
+	 * and entering the ENSURE_ERROR_CLEANUP block. That way it's sufficient
+	 * to release it in the error cleanup callback.
+	 */
+	LockRelationOid(DatabaseRelationId, ShareLock);
+
 	WALInsertLockRelease();
 
 	/* Ensure we release forcePageWrites if fail below */
@@ -9523,6 +9532,8 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 
+	UnlockRelationOid(DatabaseRelationId, ShareLock);
+
 	/*
 	 * We're done.  As a convenience, return the starting WAL location.
 	 */
@@ -9537,6 +9548,8 @@ pg_start_backup_callback(int code, Datum arg)
 {
 	bool		exclusive = DatumGetBool(arg);
 
+	UnlockRelationOid(DatabaseRelationId, ShareLock);
+
 	/* Update backup counters and forcePageWrites on failure */
 	WALInsertLockAcquireExclusive();
 	if (exclusive)
@@ -9937,6 +9950,26 @@ do_pg_abort_backup(void)
 }
 
 /*
+ * Is a (exclusive or nonexclusive) base backup running?
+ *
+ * Note that this does not check whether any standby of this node has a
+ * basebackup running, or whether any upstream master (if this is a standby)
+ * has one in progress
+ */
+bool
+LocalBaseBackupInProgress(void)
+{
+	bool ret;
+
+	WALInsertLockAcquire();
+	ret = XLogCtl->Insert.exclusiveBackup ||
+		XLogCtl->Insert.nonExclusiveBackups > 0;
+	WALInsertLockRelease();
+
+	return ret;
+}
+
+/*
  * Get latest redo apply position.
  *
  * Exported to allow WALReceiver to read the pointer directly.
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 5e66961..6184c83 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1087,6 +1087,30 @@ movedb(const char *dbname, const char *tblspcname)
 				 errmsg("cannot change the tablespace of the currently open database")));
 
 	/*
+	 * Prevent SET TABLESPACE from running concurrently with a base
+	 * backup. Without that check a base backup would potentially copy a
+	 * partially removed source database; which WAL replay then would copy
+	 * over the new database...
+	 *
+	 * Starting a base backup takes a SHARE lock on pg_database. In addition a
+	 * streaming basebackup takes the same lock for the entirety of the copy
+	 * of the data directory.  That, combined with this check, prevents base
+	 * backups from being taken at the same time a SET TABLESPACE is in
+	 * progress.
+	 *
+	 * Note that this check here will not trigger if a standby currently has a
+	 * base backup ongoing; instead WAL replay will have a recovery conflict
+	 * when replaying the DBASE_CREATE record. That is only safe because
+	 * standbys can only do nonexclusive base backups which hold the lock over
+	 * the entire runtime - otherwise no lock would prevent replay after
+	 * pg_start_backup().
+	 */
+	if (LocalBaseBackupInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot change a database's tablespace while a base backup is in progress")));
+
+	/*
 	 * Get tablespace's oid
 	 */
 	dst_tblspcoid = get_tablespace_oid(tblspcname, false);
@@ -2052,6 +2076,17 @@ dbase_redo(XLogReaderState *record)
 		src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
 		dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
 
+		if (InHotStandby)
+		{
+			/* Lock source database, do avoid concurrent hint bit writes et al. */
+			LockSharedObjectForSession(DatabaseRelationId, xlrec->src_db_id, 0, AccessExclusiveLock);
+			ResolveRecoveryConflictWithDatabase(xlrec->src_db_id);
+
+			/* Lock target database, it'll be overwritten in a second */
+			LockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
+			ResolveRecoveryConflictWithDatabase(xlrec->db_id);
+		}
+
 		/*
 		 * Our theory for replaying a CREATE is to forcibly drop the target
 		 * subdirectory if present, then re-copy the source data. This may be
@@ -2078,6 +2113,12 @@ dbase_redo(XLogReaderState *record)
 		 * We don't need to copy subdirectories
 		 */
 		copydir(src_path, dst_path, false);
+
+		if (InHotStandby)
+		{
+			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->src_db_id, 0, AccessExclusiveLock);
+			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
+		}
 	}
 	else if (info == XLOG_DBASE_DROP)
 	{
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 3058ce9..0e76497 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -19,6 +19,7 @@
 
 #include "access/xlog_internal.h"		/* for pg_start/stop_backup */
 #include "catalog/catalog.h"
+#include "catalog/pg_database.h"
 #include "catalog/pg_type.h"
 #include "lib/stringinfo.h"
 #include "libpq/libpq.h"
@@ -32,6 +33,8 @@
 #include "replication/walsender_private.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/lock.h"
 #include "utils/builtins.h"
 #include "utils/elog.h"
 #include "utils/ps_status.h"
@@ -110,6 +113,8 @@ static void
 base_backup_cleanup(int code, Datum arg)
 {
 	do_pg_abort_backup();
+
+	UnlockRelationOid(DatabaseRelationId, ShareLock);
 }
 
 /*
@@ -134,6 +139,9 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 
 	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
 								  &labelfile);
+
+	LockRelationOid(DatabaseRelationId, ShareLock);
+
 	/*
 	 * Once do_pg_start_backup has been called, ensure that any failure causes
 	 * us to abort the backup so we don't "leak" a backup counter. For this reason,
@@ -304,6 +312,9 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
+	/* release lock again, before stop_backup, as that can error out */
+	UnlockRelationOid(DatabaseRelationId, ShareLock);
+
 	endptr = do_pg_stop_backup(labelfile, !opt->nowait, &endtli);
 
 	if (opt->includewal)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 138deaf..f3893f3 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -253,6 +253,7 @@ extern XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast,
 extern XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive,
 				  TimeLineID *stoptli_p);
 extern void do_pg_abort_backup(void);
+extern bool LocalBaseBackupInProgress(void);
 
 /* File path names (all relative to $PGDATA) */
 #define BACKUP_LABEL_FILE		"backup_label"
-- 
2.0.0.rc2.4.g1dc51c6.dirty

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1daa10d..5c70567 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9336,7 +9336,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 	 * and entering the ENSURE_ERROR_CLEANUP block. That way it's sufficient
 	 * to release it in the error cleanup callback.
 	 */
-	LockRelationOid(DatabaseRelationId, ShareLock);
+	LockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	WALInsertLockRelease();
 
@@ -9532,7 +9532,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 
-	UnlockRelationOid(DatabaseRelationId, ShareLock);
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	/*
 	 * We're done.  As a convenience, return the starting WAL location.
@@ -9548,7 +9548,7 @@ pg_start_backup_callback(int code, Datum arg)
 {
 	bool		exclusive = DatumGetBool(arg);
 
-	UnlockRelationOid(DatabaseRelationId, ShareLock);
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	/* Update backup counters and forcePageWrites on failure */
 	WALInsertLockAcquireExclusive();
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 6184c83..d312e23 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -2085,6 +2085,9 @@ dbase_redo(XLogReaderState *record)
 			/* Lock target database, it'll be overwritten in a second */
 			LockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
 			ResolveRecoveryConflictWithDatabase(xlrec->db_id);
+
+			/* Lock pg_database, to conflict with base backups */
+			LockRelationOidForSession(DatabaseRelationId, RowExclusiveLock);
 		}
 
 		/*
@@ -2116,8 +2119,9 @@ dbase_redo(XLogReaderState *record)
 
 		if (InHotStandby)
 		{
-			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->src_db_id, 0, AccessExclusiveLock);
+			UnlockRelationOidForSession(DatabaseRelationId, RowExclusiveLock);
 			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
+			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->src_db_id, 0, AccessExclusiveLock);
 		}
 	}
 	else if (info == XLOG_DBASE_DROP)
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 0e76497..481ba18 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -114,7 +114,7 @@ base_backup_cleanup(int code, Datum arg)
 {
 	do_pg_abort_backup();
 
-	UnlockRelationOid(DatabaseRelationId, ShareLock);
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
 }
 
 /*
@@ -140,7 +140,7 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
 								  &labelfile);
 
-	LockRelationOid(DatabaseRelationId, ShareLock);
+	LockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	/*
 	 * Once do_pg_start_backup has been called, ensure that any failure causes
@@ -313,7 +313,7 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
 	/* release lock again, before stop_backup, as that can error out */
-	UnlockRelationOid(DatabaseRelationId, ShareLock);
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	endptr = do_pg_stop_backup(labelfile, !opt->nowait, &endtli);
 
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index d13a167..c428b38 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -248,6 +248,37 @@ UnlockRelation(Relation relation, LOCKMODE lockmode)
 }
 
 /*
+ *		LockRelationOidForSession
+ *
+ * Lock a relation in session mode, without requiring having it opened it in
+ * transaction local mode. That's likely only useful during WAL replay.
+ */
+void
+LockRelationOidForSession(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	SetLocktagRelationOid(&tag, relid);
+
+	(void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ *		UnlockRelationOidForSession
+ *
+ * Unlock lock acquired by LockRelationOidForSession.
+ */
+void
+UnlockRelationOidForSession(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	SetLocktagRelationOid(&tag, relid);
+
+	LockRelease(&tag, lockmode, true);
+}
+
+/*
  *		LockHasWaitersRelation
  *
  * This is a functiion to check if someone else is waiting on a
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 61c8d21..3ebbe9c 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -710,13 +710,16 @@ LockAcquireExtended(const LOCKTAG *locktag,
 	if (RecoveryInProgress() && !InRecovery &&
 		(locktag->locktag_type == LOCKTAG_OBJECT ||
 		 locktag->locktag_type == LOCKTAG_RELATION) &&
-		lockmode > RowExclusiveLock)
+		lockmode > RowExclusiveLock &&
+		!sessionLock)
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("cannot acquire lock mode %s on database objects while recovery is in progress",
 						lockMethodTable->lockModeNames[lockmode]),
 				 errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
 
+	Assert(!InRecovery || (sessionLock && dontWait));
+
 #ifdef LOCK_DEBUG
 	if (LOCK_DEBUG_ENABLED(locktag))
 		elog(LOG, "LockAcquire: lock [%u,%u] %s",
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index f5d70e5..2397f11 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -50,6 +50,9 @@ extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode);
 extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
 extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
 
+extern void LockRelationOidForSession(Oid relid, LOCKMODE lockmode);
+extern void UnlockRelationOidForSession(Oid relid, LOCKMODE lockmode);
+
 /* Lock a relation for extension */
 extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode);
 extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to