On Mon, Aug 3, 2015 at 8:39 AM, Amit Kapila <amit.kapil...@gmail.com> wrote:
>> I agree and modified the patch to use 32-bit atomics based on idea
>> suggested by Robert and didn't modify lwlock.c.
>
> While looking at patch, I found that the way it was initialising the list
> to be empty was wrong, it was using pgprocno as 0 to initialise the
> list, as 0 is a valid pgprocno.  I think we should use a number greater
> than PROCARRAY_MAXPROC (the maximum number of procs in the proc
> array).
>
> Apart from above fix, I have modified src/backend/access/transam/README
> to include the information about the improvement this patch brings to
> reduce ProcArrayLock contention.

I spent some time looking at this patch today and found that a good
deal of cleanup work seemed to be needed.  Attached is a cleaned-up
version which makes a number of changes:

1. I got rid of all of the typecasts.  You're supposed to treat
pg_atomic_u32 as a magic data type that is only manipulated via the
primitives provided, not just cast back and forth between that and
u32.

2. I got rid of the memory barriers.  System calls are full barriers,
and so are compare-and-exchange operations.  Between those two facts,
we should be fine without these.

3. I removed the clearXid field you added to PGPROC.  Since that's
just a sentinel, I used nextClearXidElem for that purpose. There
doesn't seem to be a need for two fields.

4. I factored out the actual XID-clearing logic into a new function
ProcArrayEndTransactionInternal instead of repeating it twice. On the
flip side, I merged PushProcAndWaitForXidClear with
PopProcsAndClearXids and renamed the result to ProcArrayGroupClearXid,
since there seemed to be no need to separate them.

5. I renamed PROC_NOT_IN_PGPROCARRAY to INVALID_PGPROCNO, which I
think is more consistent with what we do elsewhere, and made it a
compile-time constant instead of a value that must be computed every
time it's used.

6. I overhauled the comments and README updates.

I'm not entirely happy with the name "nextClearXidElem" but apart from
that I'm fairly happy with this version.  We should probably test it
to make sure I haven't broken anything; on a quick 3-minute pgbench
test on cthulhu (128-way EDB server) with 128 clients I got 2778 tps
with master and 4330 tps with this version of the patch.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index bc68b47..f6db580 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -252,6 +252,9 @@ implementation of this is that GetSnapshotData takes the ProcArrayLock in
 shared mode (so that multiple backends can take snapshots in parallel),
 but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode
 while clearing MyPgXact->xid at transaction end (either commit or abort).
+(To reduce context switching, when multiple transactions commit nearly
+simultaneously, we have one backend take ProcArrayLock and clear the XIDs
+of multiple processes at once.)
 
 ProcArrayEndTransaction also holds the lock while advancing the shared
 latestCompletedXid variable.  This allows GetSnapshotData to use
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 4f3c5c9..14d1219 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -167,6 +167,9 @@ static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
 static TransactionId KnownAssignedXidsGetOldestXmin(void);
 static void KnownAssignedXidsDisplay(int trace_level);
 static void KnownAssignedXidsReset(void);
+static void ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
+								TransactionId latestXid);
+static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
 
 /*
  * Report shared-memory space needed by CreateSharedProcArray.
@@ -370,7 +373,6 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 	elog(LOG, "failed to find proc %p in ProcArray", proc);
 }
 
-
 /*
  * ProcArrayEndTransaction -- mark a transaction as no longer running
  *
@@ -399,26 +401,18 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 		 */
 		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
 
-		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-		pgxact->xid = InvalidTransactionId;
-		proc->lxid = InvalidLocalTransactionId;
-		pgxact->xmin = InvalidTransactionId;
-		/* must be cleared with xid/xmin: */
-		pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
-		pgxact->delayChkpt = false;		/* be sure this is cleared in abort */
-		proc->recoveryConflictPending = false;
-
-		/* Clear the subtransaction-XID cache too while holding the lock */
-		pgxact->nxids = 0;
-		pgxact->overflowed = false;
-
-		/* Also advance global latestCompletedXid while holding the lock */
-		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-								  latestXid))
-			ShmemVariableCache->latestCompletedXid = latestXid;
-
-		LWLockRelease(ProcArrayLock);
+		/*
+		 * If we can immediately acquire ProcArrayLock, we clear our own XID
+		 * and release the lock.  If not, use group XID clearing to improve
+		 * efficiency.
+		 */
+		if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
+		{
+			ProcArrayEndTransactionInternal(proc, pgxact, latestXid);
+			LWLockRelease(ProcArrayLock);
+		}
+		else
+			ProcArrayGroupClearXid(proc, latestXid);
 	}
 	else
 	{
@@ -441,6 +435,137 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 	}
 }
 
+/*
+ * Mark a write transaction as no longer running.
+ *
+ * We don't do any locking here; caller must handle that.
+ */
+static void
+ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
+								TransactionId latestXid)
+{
+	pgxact->xid = InvalidTransactionId;
+	proc->lxid = InvalidLocalTransactionId;
+	pgxact->xmin = InvalidTransactionId;
+	/* must be cleared with xid/xmin: */
+	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
+	pgxact->delayChkpt = false;		/* be sure this is cleared in abort */
+	proc->recoveryConflictPending = false;
+
+	/* Clear the subtransaction-XID cache too while holding the lock */
+	pgxact->nxids = 0;
+	pgxact->overflowed = false;
+
+	/* Also advance global latestCompletedXid while holding the lock */
+	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+							  latestXid))
+		ShmemVariableCache->latestCompletedXid = latestXid;
+}
+
+/*
+ * ProcArrayGroupClearXid -- group XID clearing
+ *
+ * When we cannot immediately acquire ProcArrayLock in exclusive mode at
+ * commit time, add ourselves to a list of processes that need their XIDs
+ * cleared.  The first process to add itself to the list will acquire
+ * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
+ * on behalf of all group members.  This avoids a great deal of context
+ * switching when many processes are trying to commit at once, since the lock
+ * only needs to be handed from the last share-locker to one process waiting
+ * for the exclusive lock, rather than to each one in turn.
+ */
+static void
+ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
+{
+	volatile PROC_HDR *procglobal = ProcGlobal;
+	uint32		nextidx;
+	uint32		wakeidx;
+	int			extraWaits = -1;
+
+	/* We should definitely have an XID to clear. */
+	Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
+
+	/* Add ourselves to the list of processes needing a group XID clear. */
+	proc->backendLatestXid = latestXid;
+	while (true)
+	{
+		nextidx = pg_atomic_read_u32(&procglobal->nextClearXidElem);
+		pg_atomic_write_u32(&proc->nextClearXidElem, nextidx);
+
+		if (pg_atomic_compare_exchange_u32(&procglobal->nextClearXidElem,
+										   &nextidx,
+										   (uint32) proc->pgprocno))
+			break;
+	}
+
+	/* If the list was not empty, the leader will clear our XID. */
+	if (nextidx != INVALID_PGPROCNO)
+	{
+		/* Sleep until the leader clears our XID. */
+		while (pg_atomic_read_u32(&proc->nextClearXidElem) != INVALID_PGPROCNO)
+		{
+			extraWaits++;
+			PGSemaphoreLock(&proc->sem);
+		}
+
+		/* Fix semaphore count for any absorbed wakeups */
+		while (extraWaits-- > 0)
+			PGSemaphoreUnlock(&proc->sem);
+		return;
+	}
+
+	/* We are the leader.  Acquire the lock on behalf of everyone. */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	/*
+	 * Now that we've got the lock, clear the list of processes waiting for
+	 * group XID clearing, saving a pointer to the head of the list.
+	 */
+	while (true)
+	{
+		nextidx = pg_atomic_read_u32(&procglobal->nextClearXidElem);
+		if (pg_atomic_compare_exchange_u32(&procglobal->nextClearXidElem,
+										   &nextidx,
+										   INVALID_PGPROCNO))
+			break;
+	}
+
+	/* Remember head of list so we can perform wakeups after dropping lock. */
+	wakeidx = nextidx;
+
+	/* Walk the list and clear all XIDs. */
+	while (nextidx != INVALID_PGPROCNO)
+	{
+		PGPROC	*proc = &allProcs[nextidx];
+		PGXACT	*pgxact = &allPgXact[nextidx];
+
+		ProcArrayEndTransactionInternal(proc, pgxact, proc->backendLatestXid);
+
+		/* Move to next proc in list. */
+		nextidx = pg_atomic_read_u32(&proc->nextClearXidElem);
+	}
+
+	/* We're done with the lock now. */
+	LWLockRelease(ProcArrayLock);
+
+	/*
+	 * Now that we've released the lock, go back and wake everybody up.  We
+	 * don't do this under the lock so as to keep lock hold times to a
+	 * minimum.  The system calls we need to perform to wake other processes
+	 * up are probably much slower than the simple memory writes we did while
+	 * holding the lock.
+	 */
+	while (wakeidx != INVALID_PGPROCNO)
+	{
+		PGPROC	*proc = &allProcs[wakeidx];
+
+		wakeidx = pg_atomic_read_u32(&proc->nextClearXidElem);
+		pg_atomic_write_u32(&proc->nextClearXidElem, INVALID_PGPROCNO);
+
+		if (proc != MyProc)
+			PGSemaphoreUnlock(&proc->sem);
+	}
+}
 
 /*
  * ProcArrayClearTransaction -- clear the transaction fields
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 884e91b..93f2656 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -181,6 +181,7 @@ InitProcGlobal(void)
 	ProcGlobal->startupBufferPinWaitBufId = -1;
 	ProcGlobal->walwriterLatch = NULL;
 	ProcGlobal->checkpointerLatch = NULL;
+	pg_atomic_init_u32(&ProcGlobal->nextClearXidElem, INVALID_PGPROCNO);
 
 	/*
 	 * Create and initialize all the PGPROC structures we'll need.  There are
@@ -393,6 +394,10 @@ InitProcess(void)
 	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
 	SHMQueueElemInit(&(MyProc->syncRepLinks));
 
+	/* Initialize fields for group XID clearing. */
+	MyProc->backendLatestXid = InvalidTransactionId;
+	pg_atomic_init_u32(&MyProc->nextClearXidElem, INVALID_PGPROCNO);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 202a672..421bb58 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,12 @@ struct XidCache
 #define		FP_LOCK_SLOTS_PER_BACKEND 16
 
 /*
+ * An invalid pgprocno.  Must be larger than the maximum number of PGPROC
+ * structures we could possibly have.  See comments for MAX_BACKENDS.
+ */
+#define INVALID_PGPROCNO		PG_INT32_MAX
+
+/*
  * Each backend has a PGPROC struct in shared memory.  There is also a list of
  * currently-unused PGPROC structs that will be reallocated to new backends.
  *
@@ -135,6 +141,10 @@ struct PGPROC
 
 	struct XidCache subxids;	/* cache for subtransaction XIDs */
 
+	/* Support for group XID clearing. */
+	volatile pg_atomic_uint32	nextClearXidElem;
+	TransactionId	backendLatestXid;
+
 	/* Per-backend LWLock.  Protects fields below. */
 	LWLock	   *backendLock;	/* protects the fields below */
 
@@ -196,6 +206,8 @@ typedef struct PROC_HDR
 	PGPROC	   *autovacFreeProcs;
 	/* Head of list of bgworker free PGPROC structures */
 	PGPROC	   *bgworkerFreeProcs;
+	/* First pgproc waiting for group XID clear */
+	volatile pg_atomic_uint32 nextClearXidElem;
 	/* WALWriter process's latch */
 	Latch	   *walwriterLatch;
 	/* Checkpointer process's latch */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to