On Wed, Feb 20, 2019 at 11:50:42AM +0100, Oleksii Kliukin wrote:
> RecoverPreparedTransactions calls ProcessRecords with
> twophase_recover_callbacks right after releasing the TwoPhaseStateLock, so I
> think lock_held should be false here (matching the similar call of
> TwoPhaseGetDummyProc at lock_twophase_recover).

Indeed.  That would be a bad idea.  We don't actually stress this
routine in 009_twophase.pl or the assertion I added would have blown
up immediately.  So I propose to close the gap, and the updated patch
attached adds dedicated tests causing the problem you spotted to be
triggered (soft and hard restarts with 2PC transactions including
DDLs).  The previous version of the patch and those tests cause the
assertion to blow up, failing the tests.

> Since the patch touches TwoPhaseGetDummyBackendId, let’s fix the comment
> header to mention it instead of TwoPhaseGetDummyProc

Not sure I understand the issue you are pointing out here.  The patch
adds an extra argument to both TwoPhaseGetDummyProc() and
TwoPhaseGetDummyBackendId(), and the headers of both functions
document the new argument.

One extra trick I have been using for testing this patch is the
following:
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -816,13 +816,6 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)

    Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));

-   /*
-    * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
-    * repeatedly for the same XID.  We can save work with a simple cache.
-    */
-   if (xid == cached_xid)
-       return cached_gxact;

This has the advantage to check more aggressively for lock conflicts,
causing the tests to deadlock if the flag is not correctly set in the
worst case scenario.
--
Michael
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 07ae7a31db..c399871940 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1713,7 +1713,7 @@ PostPrepare_MultiXact(TransactionId xid)
 	myOldestMember = OldestMemberMXactId[MyBackendId];
 	if (MultiXactIdIsValid(myOldestMember))
 	{
-		BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
+		BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
 
 		/*
 		 * Even though storing MultiXactId is atomic, acquire lock to make
@@ -1755,7 +1755,7 @@ void
 multixact_twophase_recover(TransactionId xid, uint16 info,
 						   void *recdata, uint32 len)
 {
-	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
+	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
 	MultiXactId oldestMember;
 
 	/*
@@ -1776,7 +1776,7 @@ void
 multixact_twophase_postcommit(TransactionId xid, uint16 info,
 							  void *recdata, uint32 len)
 {
-	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
+	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid, true);
 
 	Assert(len == sizeof(MultiXactId));
 
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 9a8a6bb119..ace71230e4 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -801,9 +801,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS)
  * TwoPhaseGetGXact
  *		Get the GlobalTransaction struct for a prepared transaction
  *		specified by XID
+ *
+ * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
+ * caller had better hold it.
  */
 static GlobalTransaction
-TwoPhaseGetGXact(TransactionId xid)
+TwoPhaseGetGXact(TransactionId xid, bool lock_held)
 {
 	GlobalTransaction result = NULL;
 	int			i;
@@ -811,6 +814,8 @@ TwoPhaseGetGXact(TransactionId xid)
 	static TransactionId cached_xid = InvalidTransactionId;
 	static GlobalTransaction cached_gxact = NULL;
 
+	Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
+
 	/*
 	 * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
 	 * repeatedly for the same XID.  We can save work with a simple cache.
@@ -818,7 +823,8 @@ TwoPhaseGetGXact(TransactionId xid)
 	if (xid == cached_xid)
 		return cached_gxact;
 
-	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+	if (!lock_held)
+		LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
 
 	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
 	{
@@ -832,7 +838,8 @@ TwoPhaseGetGXact(TransactionId xid)
 		}
 	}
 
-	LWLockRelease(TwoPhaseStateLock);
+	if (!lock_held)
+		LWLockRelease(TwoPhaseStateLock);
 
 	if (result == NULL)			/* should not happen */
 		elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
@@ -849,12 +856,13 @@ TwoPhaseGetGXact(TransactionId xid)
  *
  * Dummy backend IDs are similar to real backend IDs of real backends.
  * They start at MaxBackends + 1, and are unique across all currently active
- * real backends and prepared transactions.
+ * real backends and prepared transactions.  If lock_held is set to true,
+ * TwoPhaseStateLock will not be taken, so the caller had better hold it.
  */
 BackendId
-TwoPhaseGetDummyBackendId(TransactionId xid)
+TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held)
 {
-	GlobalTransaction gxact = TwoPhaseGetGXact(xid);
+	GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
 
 	return gxact->dummyBackendId;
 }
@@ -862,11 +870,14 @@ TwoPhaseGetDummyBackendId(TransactionId xid)
 /*
  * TwoPhaseGetDummyProc
  *		Get the PGPROC that represents a prepared transaction specified by XID
+ *
+ * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
+ * caller had better hold it.
  */
 PGPROC *
-TwoPhaseGetDummyProc(TransactionId xid)
+TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
 {
-	GlobalTransaction gxact = TwoPhaseGetGXact(xid);
+	GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
 
 	return &ProcGlobal->allProcs[gxact->pgprocno];
 }
@@ -1560,6 +1571,13 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	if (hdr->initfileinval)
 		RelationCacheInitFilePostInvalidate();
 
+	/*
+	 * Acquire the two-phase lock, we want to work on the two-phase
+	 * callbacks while holding it to avoid potential conflicts with
+	 * other transactions, until the shared memory state is cleared.
+	 */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+
 	/* And now do the callbacks */
 	if (isCommit)
 		ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
@@ -1568,6 +1586,15 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 
 	PredicateLockTwoPhaseFinish(xid, isCommit);
 
+	/* Clear shared memory state */
+	RemoveGXact(gxact);
+
+	/*
+	 * Release the lock as all callbacks are called and shared memory
+	 * cleanup is done.
+	 */
+	LWLockRelease(TwoPhaseStateLock);
+
 	/* Count the prepared xact as committed or aborted */
 	AtEOXact_PgStat(isCommit);
 
@@ -1577,9 +1604,6 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	if (gxact->ondisk)
 		RemoveTwoPhaseFile(xid, true);
 
-	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
-	RemoveGXact(gxact);
-	LWLockRelease(TwoPhaseStateLock);
 	MyLockedGxact = NULL;
 
 	RESUME_INTERRUPTS();
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 3bb5ce350a..78fdbd6ff8 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -3243,7 +3243,7 @@ AtPrepare_Locks(void)
 void
 PostPrepare_Locks(TransactionId xid)
 {
-	PGPROC	   *newproc = TwoPhaseGetDummyProc(xid);
+	PGPROC	   *newproc = TwoPhaseGetDummyProc(xid, false);
 	HASH_SEQ_STATUS status;
 	LOCALLOCK  *locallock;
 	LOCK	   *lock;
@@ -4034,7 +4034,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
 					  void *recdata, uint32 len)
 {
 	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
-	PGPROC	   *proc = TwoPhaseGetDummyProc(xid);
+	PGPROC	   *proc = TwoPhaseGetDummyProc(xid, false);
 	LOCKTAG    *locktag;
 	LOCKMODE	lockmode;
 	LOCKMETHODID lockmethodid;
@@ -4247,7 +4247,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
 						 void *recdata, uint32 len)
 {
 	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
-	PGPROC	   *proc = TwoPhaseGetDummyProc(xid);
+	PGPROC	   *proc = TwoPhaseGetDummyProc(xid, true);
 	LOCKTAG    *locktag;
 	LOCKMETHODID lockmethodid;
 	LockMethod	lockMethodTable;
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index 6228b091d8..fcd1913c43 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -34,8 +34,8 @@ extern void TwoPhaseShmemInit(void);
 extern void AtAbort_Twophase(void);
 extern void PostPrepare_Twophase(void);
 
-extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid);
-extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid);
+extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid, bool lock_held);
+extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held);
 
 extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid,
 				TimestampTz prepared_at,
diff --git a/src/test/recovery/t/009_twophase.pl b/src/test/recovery/t/009_twophase.pl
index 2be1afcd8b..1b748ad857 100644
--- a/src/test/recovery/t/009_twophase.pl
+++ b/src/test/recovery/t/009_twophase.pl
@@ -4,7 +4,7 @@ use warnings;
 
 use PostgresNode;
 use TestLib;
-use Test::More tests => 20;
+use Test::More tests => 24;
 
 my $psql_out = '';
 my $psql_rc  = '';
@@ -334,6 +334,60 @@ $cur_standby->psql(
 	stdout => \$psql_out);
 is($psql_out, '1', "Replay prepared transaction with DDL");
 
+###############################################################################
+# Check recovery of prepared transaction with DDL inside after a hard restart
+# of the master.
+###############################################################################
+
+$cur_master->psql(
+	'postgres', "
+	BEGIN;
+	CREATE TABLE t_009_tbl3 (id int, msg text);
+	SAVEPOINT s1;
+	INSERT INTO t_009_tbl3 VALUES (28, 'issued to ${cur_master_name}');
+	PREPARE TRANSACTION 'xact_009_14';
+	BEGIN;
+	CREATE TABLE t_009_tbl4 (id int, msg text);
+	SAVEPOINT s1;
+	INSERT INTO t_009_tbl4 VALUES (29, 'issued to ${cur_master_name}');
+	PREPARE TRANSACTION 'xact_009_15';");
+
+$cur_master->teardown_node;
+$cur_master->start;
+
+$psql_rc = $cur_master->psql('postgres', "COMMIT PREPARED 'xact_009_14'");
+is($psql_rc, '0', 'Commit prepared transaction after teardown');
+
+$psql_rc = $cur_master->psql('postgres', "ROLLBACK PREPARED 'xact_009_15'");
+is($psql_rc, '0', 'Rollback prepared transaction after teardown');
+
+###############################################################################
+# Check recovery of prepared transaction with DDL inside after a soft restart
+# of the master.
+###############################################################################
+
+$cur_master->psql(
+	'postgres', "
+	BEGIN;
+	CREATE TABLE t_009_tbl5 (id int, msg text);
+	SAVEPOINT s1;
+	INSERT INTO t_009_tbl5 VALUES (30, 'issued to ${cur_master_name}');
+	PREPARE TRANSACTION 'xact_009_16';
+	BEGIN;
+	CREATE TABLE t_009_tbl6 (id int, msg text);
+	SAVEPOINT s1;
+	INSERT INTO t_009_tbl6 VALUES (31, 'issued to ${cur_master_name}');
+	PREPARE TRANSACTION 'xact_009_17';");
+
+$cur_master->stop;
+$cur_master->start;
+
+$psql_rc = $cur_master->psql('postgres', "COMMIT PREPARED 'xact_009_16'");
+is($psql_rc, '0', 'Commit prepared transaction after restart');
+
+$psql_rc = $cur_master->psql('postgres', "ROLLBACK PREPARED 'xact_009_17'");
+is($psql_rc, '0', 'Rollback prepared transaction after restart');
+
 ###############################################################################
 # Verify expected data appears on both servers.
 ###############################################################################

Attachment: signature.asc
Description: PGP signature

Reply via email to