From ddde253b10b87d060d34f8d7250be27a27c18c0a Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Fri, 22 May 2020 14:13:33 +0500
Subject: [PATCH v2 3/4] Add conditional variable to wait for next MultXact
 offset in edge case

---
 src/backend/access/transam/multixact.c | 23 ++++++++++++++++++++++-
 src/backend/postmaster/pgstat.c        |  2 ++
 src/include/pgstat.h                   |  1 +
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 445b6258a4..02de9a7c13 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -82,6 +82,7 @@
 #include "lib/ilist.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
+#include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "storage/lmgr.h"
 #include "storage/pmsignal.h"
@@ -233,6 +234,7 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
+	ConditionVariable nextoff_cv;
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -873,6 +875,14 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	/* Exchange our lock */
 	LWLockRelease(MultiXactOffsetSLRULock);
 
+	/*
+	 *  Let everybody know the offset of this mxid is recorded now. The waiters
+	 *  are waiting for the offset of the mxid next of the target to know the
+	 *  number of members of the target mxid, so we don't need to wait for
+	 *  members of this mxid are recorded.
+	 */
+	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
+
 	LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
 
 	prev_pageno = -1;
@@ -1367,9 +1377,19 @@ retry:
 		if (nextMXOffset == 0)
 		{
 			/* Corner case 2: next multixact is still being filled in */
+
+			/*
+			 * The recorder of the next mxid is just before writing the offset.
+			 * Wait for the offset to be written.
+			 */
+			ConditionVariablePrepareToSleep(&MultiXactState->nextoff_cv);
+
 			LWLockRelease(MultiXactOffsetSLRULock);
 			CHECK_FOR_INTERRUPTS();
-			pg_usleep(1000L);
+
+			ConditionVariableSleep(&MultiXactState->nextoff_cv,
+								   WAIT_EVENT_WAIT_NEXT_MXMEMBERS);
+			ConditionVariableCancelSleep();
 			goto retry;
 		}
 
@@ -1847,6 +1867,7 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
+		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index edfa774ee4..4bce534c7a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3877,6 +3877,8 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 			break;
 		case WAIT_EVENT_XACT_GROUP_UPDATE:
 			event_name = "XactGroupUpdate";
+		case WAIT_EVENT_WAIT_NEXT_MXMEMBERS:
+			event_name = "MultiXact/WaitNextXactMembers";
 			break;
 			/* no default case, so that compiler will warn */
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 1387201382..26f828597b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -887,6 +887,7 @@ typedef enum
 	WAIT_EVENT_REPLICATION_SLOT_DROP,
 	WAIT_EVENT_SAFE_SNAPSHOT,
 	WAIT_EVENT_SYNC_REP,
+	WAIT_EVENT_WAIT_NEXT_MXMEMBERS,
 	WAIT_EVENT_XACT_GROUP_UPDATE
 } WaitEventIPC;
 
-- 
2.24.2 (Apple Git-127)

