From 975eb3448acbec97c14d48f21261e44aca7b3acc Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Fri, 22 May 2020 14:13:33 +0500
Subject: [PATCH 1/3] Add conditional variable to wait for next MultXact offset
 in edge case

---
 src/backend/access/transam/multixact.c        | 23 ++++++++++++++++++-
 .../utils/activity/wait_event_names.txt       |  1 +
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 59523be901..03fcd25d4c 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -82,6 +82,7 @@
 #include "lib/ilist.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
+#include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "storage/lmgr.h"
 #include "storage/pmsignal.h"
@@ -233,6 +234,7 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
+	ConditionVariable nextoff_cv;
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -894,6 +896,14 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	/* Exchange our lock */
 	LWLockRelease(MultiXactOffsetSLRULock);
 
+	/*
+	 *  Let everybody know the offset of this mxid is recorded now. The waiters
+	 *  are waiting for the offset of the mxid next of the target to know the
+	 *  number of members of the target mxid, so we don't need to wait for
+	 *  members of this mxid are recorded.
+	 */
+	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
+
 	LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
 
 	prev_pageno = -1;
@@ -1388,9 +1398,19 @@ retry:
 		if (nextMXOffset == 0)
 		{
 			/* Corner case 2: next multixact is still being filled in */
+
+			/*
+			 * The recorder of the next mxid is just before writing the offset.
+			 * Wait for the offset to be written.
+			 */
+			ConditionVariablePrepareToSleep(&MultiXactState->nextoff_cv);
+
 			LWLockRelease(MultiXactOffsetSLRULock);
 			CHECK_FOR_INTERRUPTS();
-			pg_usleep(1000L);
+
+			ConditionVariableSleep(&MultiXactState->nextoff_cv,
+								   WAIT_EVENT_NEXT_MXMEMBERS);
+			ConditionVariableCancelSleep();
 			goto retry;
 		}
 
@@ -1875,6 +1895,7 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
+		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index a5df835dd4..1bb78d5aad 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -144,6 +144,7 @@ SYNC_REP	"Waiting for confirmation from a remote server during synchronous repli
 WAL_RECEIVER_EXIT	"Waiting for the WAL receiver to exit."
 WAL_RECEIVER_WAIT_START	"Waiting for startup process to send initial data for streaming replication."
 WAL_SUMMARY_READY	"Waiting for a new WAL summary to be generated."
+NEXT_MXMEMBERS	"Waiting for a next multixact member to be filled."
 XACT_GROUP_UPDATE	"Waiting for the group leader to update transaction status at end of a parallel operation."
 
 
-- 
2.37.1 (Apple Git-137.1)

