I've been thinking about what exactly the important part of this group commit patch is - where does the benefit come from? Keeping the queue sorted isn't all that important: XLogFlush() requests for commits will arrive in almost the correct order anyway.

I also don't much like the division of labour between groupcommit.c and xlog.c. XLogFlush() calls GroupCommitWaitForLSN(), which calls back into DoXLogFlush(), which is a bit hard to follow. (That's in my latest version; the original patch had a similar division, but it also cut across processes, with the WAL writer actually doing the flush.)

It occurs to me that the WALWriteLock already provides much of the same machinery we're trying to build here. It provides FIFO-style queuing, with the capability to wake up the next process or processes in the queue. Perhaps all we need is a new primitive in LWLock to make it do exactly what we need.

Attached is a patch to do that. It adds a new mode to LWLockConditionalAcquire(), LW_EXCLUSIVE_BUT_WAIT. If the lock is free, it is acquired and the function returns true immediately. However, unlike normal LWLockConditionalAcquire(), if the lock is not free, it goes to sleep until the lock is released. But unlike normal LWLockAcquire(), when the lock is released, the function returns false *without* acquiring the lock.
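Just to illustrate the intended calling pattern, here's a rough sketch - work_is_done(), do_the_work() and SomeLock are placeholders, not real names; the real usage is in the XLogFlush() hunk below:

	/*
	 * Sketch of the calling pattern for LW_EXCLUSIVE_BUT_WAIT. The
	 * work_is_done() / do_the_work() calls stand in for the caller's own
	 * logic; in XLogFlush() they correspond to checking LogwrtResult and
	 * doing the write/fsync.
	 */
	for (;;)
	{
		if (work_is_done())
			break;		/* someone else did it while we slept */

		if (!LWLockConditionalAcquire(SomeLock, LW_EXCLUSIVE_BUT_WAIT))
		{
			/*
			 * Lock wasn't free. We slept until the holder released it;
			 * loop back to recheck, because the holder may have done
			 * our work for us.
			 */
			continue;
		}

		/* Got the lock; do the work for ourselves and anyone waiting. */
		do_the_work();
		LWLockRelease(SomeLock);
		break;
	}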

I modified XLogFlush() to use that new function for WALWriteLock. It tries to get WALWriteLock, but if it's not immediately free, it waits until it is released. Then, before trying to acquire the lock again, it rechecks LogwrtResult to see if the record was already flushed by the process that released the lock.

This is a much smaller patch than the group commit patch, and in my pgbench-tools tests on my laptop, it has the same effect on performance. The downside is that it touches lwlock.c, which is a lower-level change. Robert's flexlocks patch probably would've been useful here.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..d726a98 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2066,23 +2066,42 @@ XLogFlush(XLogRecPtr record)
 	/* initialize to given target; may increase below */
 	WriteRqstPtr = record;
 
-	/* read LogwrtResult and update local state */
+	/*
+	 * Now wait until we get the write lock, or someone else does the
+	 * flush for us.
+	 */
+	for (;;)
 	{
 		/* use volatile pointer to prevent code rearrangement */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
+		/* read LogwrtResult and update local state */
 		SpinLockAcquire(&xlogctl->info_lck);
 		if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
 			WriteRqstPtr = xlogctl->LogwrtRqst.Write;
 		LogwrtResult = xlogctl->LogwrtResult;
 		SpinLockRelease(&xlogctl->info_lck);
-	}
 
-	/* done already? */
-	if (!XLByteLE(record, LogwrtResult.Flush))
-	{
-		/* now wait for the write lock */
-		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+		/* done already? */
+		if (XLByteLE(record, LogwrtResult.Flush))
+			break;
+
+		/*
+		 * Try to get the write lock. If we can't get it immediately, wait
+		 * until it's released, but before actually acquiring it, loop back
+		 * to check if the backend holding the lock did the flush for us
+		 * already. This helps to maintain a good rate of group committing
+		 * when the system is bottlenecked by the speed of fsyncing.
+		 */
+		if (!LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE_BUT_WAIT))
+		{
+			/*
+			 * Didn't get the lock straight away, but we might be done
+			 * already
+			 */
+			continue;
+		}
+		/* Got the lock */
 		LogwrtResult = XLogCtl->Write.LogwrtResult;
 		if (!XLByteLE(record, LogwrtResult.Flush))
 		{
@@ -2111,6 +2130,8 @@ XLogFlush(XLogRecPtr record)
 			XLogWrite(WriteRqst, false, false);
 		}
 		LWLockRelease(WALWriteLock);
+		/* done */
+		break;
 	}
 
 	END_CRIT_SECTION();
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index cc41568..488f573 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -430,6 +430,7 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 			elog(PANIC, "cannot wait without a PGPROC structure");
 
 		proc->lwWaiting = true;
+		proc->lwWaitOnly = false;
 		proc->lwExclusive = (mode == LW_EXCLUSIVE);
 		proc->lwWaitLink = NULL;
 		if (lock->head == NULL)
@@ -496,7 +497,9 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 /*
  * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
  *
- * If the lock is not available, return FALSE with no side-effects.
+ * If the lock is not available, return FALSE with no side-effects. In
+ * LW_EXCLUSIVE_BUT_WAIT mode, if the lock is not available, waits until it
+ * is available, but then returns false without acquiring it.
  *
  * If successful, cancel/die interrupts are held off until lock release.
  */
@@ -504,7 +507,9 @@ bool
 LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 {
 	volatile LWLock *lock = &(LWLockArray[lockid].lock);
+	PGPROC	   *proc = MyProc;
 	bool		mustwait;
+	int			extraWaits = 0;
 
 	PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);
 
@@ -523,7 +528,7 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 	SpinLockAcquire(&lock->mutex);
 
 	/* If I can get the lock, do so quickly. */
-	if (mode == LW_EXCLUSIVE)
+	if (mode == LW_EXCLUSIVE || mode == LW_EXCLUSIVE_BUT_WAIT)
 	{
 		if (lock->exclusive == 0 && lock->shared == 0)
 		{
@@ -544,8 +549,77 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 			mustwait = true;
 	}
 
-	/* We are done updating shared state of the lock itself. */
-	SpinLockRelease(&lock->mutex);
+	if (mustwait && mode == LW_EXCLUSIVE_BUT_WAIT)
+	{
+		/*
+		 * Add myself to wait queue.
+		 *
+		 * If we don't have a PGPROC structure, there's no way to wait. This
+		 * should never occur, since MyProc should only be null during shared
+		 * memory initialization.
+		 */
+		if (proc == NULL)
+			elog(PANIC, "cannot wait without a PGPROC structure");
+
+		proc->lwWaiting = true;
+		proc->lwWaitOnly = true;
+		proc->lwExclusive = (mode == LW_EXCLUSIVE);
+		proc->lwWaitLink = NULL;
+		if (lock->head == NULL)
+			lock->head = proc;
+		else
+			lock->tail->lwWaitLink = proc;
+		lock->tail = proc;
+
+		/* Can release the mutex now */
+		SpinLockRelease(&lock->mutex);
+
+		/*
+		 * Wait until awakened.
+		 *
+		 * Since we share the process wait semaphore with the regular lock
+		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
+		 * while one of those is pending, it is possible that we get awakened
+		 * for a reason other than being signaled by LWLockRelease. If so,
+		 * loop back and wait again.  Once we've gotten the LWLock,
+		 * re-increment the sema by the number of additional signals received,
+		 * so that the lock manager or signal manager will see the received
+		 * signal when it next waits.
+		 */
+		LOG_LWDEBUG("LWLockAcquire", lockid, "waiting");
+
+#ifdef LWLOCK_STATS
+		block_counts[lockid]++;
+#endif
+
+		TRACE_POSTGRESQL_LWLOCK_WAIT_START(lockid, mode);
+
+		for (;;)
+		{
+			/* "false" means cannot accept cancel/die interrupt here. */
+			PGSemaphoreLock(&proc->sem, false);
+			if (!proc->lwWaiting)
+				break;
+			extraWaits++;
+		}
+
+		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(lockid, mode);
+
+		LOG_LWDEBUG("LWLockAcquire", lockid, "awakened");
+	}
+	else
+	{
+		/* We are done updating shared state of the lock itself. */
+		SpinLockRelease(&lock->mutex);
+	}
+
+	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode);
+
+	/*
+	 * Fix the process wait semaphore's count for any absorbed wakeups.
+	 */
+	while (extraWaits-- > 0)
+		PGSemaphoreUnlock(&proc->sem);
 
 	if (mustwait)
 	{
@@ -619,19 +693,29 @@ LWLockRelease(LWLockId lockid)
 			 * Remove the to-be-awakened PGPROCs from the queue.  If the front
 			 * waiter wants exclusive lock, awaken him only. Otherwise awaken
 			 * as many waiters as want shared access.
+			 *
+			 * Everyone marked with lwWaitOnly are woken up, but they don't
+			 * affect the setting of releaseOK. They're woken up, but they
+			 * might decide to not acquire the lock after all.
 			 */
 			proc = head;
-			if (!proc->lwExclusive)
+			if (!proc->lwExclusive || proc->lwWaitOnly)
 			{
 				while (proc->lwWaitLink != NULL &&
-					   !proc->lwWaitLink->lwExclusive)
+					   (proc->lwWaitLink->lwWaitOnly ||
+						!proc->lwWaitLink->lwExclusive))
+				{
 					proc = proc->lwWaitLink;
+					if (!proc->lwWaitOnly)
+						lock->releaseOK = false;
+				}
 			}
 			/* proc is now the last PGPROC to be released */
 			lock->head = proc->lwWaitLink;
 			proc->lwWaitLink = NULL;
 			/* prevent additional wakeups until retryer gets to run */
-			lock->releaseOK = false;
+			if (!proc->lwWaitOnly)
+				lock->releaseOK = false;
 		}
 		else
 		{
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..85c9a5c 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -94,7 +94,8 @@ typedef enum LWLockId
 typedef enum LWLockMode
 {
 	LW_EXCLUSIVE,
-	LW_SHARED
+	LW_SHARED,
+	LW_EXCLUSIVE_BUT_WAIT
 } LWLockMode;
 
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 358d1a4..027a793 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -102,6 +102,7 @@ struct PGPROC
 	/* Info about LWLock the process is currently waiting for, if any. */
 	bool		lwWaiting;		/* true if waiting for an LW lock */
 	bool		lwExclusive;	/* true if waiting for exclusive access */
+	bool		lwWaitOnly;		/* true if just want to wake up on release */
 	struct PGPROC *lwWaitLink;	/* next waiter for same LW lock */
 
 	/* Info about lock the process is currently waiting for, if any. */