Hi,

I've previously posted a patch at
http://archives.postgresql.org/message-id/20141010160020.GG6670%40alap3.anarazel.de
that reduces contention in StrategyGetBuffer() by making the clock sweep
lockless.  Robert asked me to post it to a new thread; I originally
wrote it to see some other contention in more detail, that's why it
ended up in that thread...

The performance numbers I've quoted over there are:
> Test:
> pgbench  -M prepared -P 5 -S -c 496 -j 496 -T 5000
> on a scale=1000 database, with 4GB of shared buffers.
> 
> Before:
> progress: 40.0 s, 136252.3 tps, lat 3.628 ms stddev 4.547
> progress: 45.0 s, 135049.0 tps, lat 3.660 ms stddev 4.515
> progress: 50.0 s, 135788.9 tps, lat 3.640 ms stddev 4.398
> progress: 55.0 s, 135268.4 tps, lat 3.654 ms stddev 4.469
> progress: 60.0 s, 134991.6 tps, lat 3.661 ms stddev 4.739
> 
> after:
> progress: 40.0 s, 207701.1 tps, lat 2.382 ms stddev 3.018
> progress: 45.0 s, 208022.4 tps, lat 2.377 ms stddev 2.902
> progress: 50.0 s, 209187.1 tps, lat 2.364 ms stddev 2.970
> progress: 55.0 s, 206462.7 tps, lat 2.396 ms stddev 2.871
> progress: 60.0 s, 210263.8 tps, lat 2.351 ms stddev 2.914

Imo the patch doesn't complicate the logic noticeably...

I do wonder if we could make the freelist accesses lockless as well -
but I think that's harder. So I don't want to mix that with this.

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From 6b486e5b467e94ab9297d7656a5b39b816c5c55a Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 10 Oct 2014 17:36:46 +0200
Subject: [PATCH] WIP: lockless StrategyGetBuffer hotpath

---
 src/backend/storage/buffer/freelist.c | 154 ++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 64 deletions(-)

diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 5966beb..0c634a0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -18,6 +18,12 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
+#include "port/atomics.h"
+
+
+#define LATCHPTR_ACCESS_ONCE(var)	((Latch *)(*((volatile Latch **)&(var))))
+#define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))
+
 
 /*
  * The shared freelist control information.
@@ -27,8 +33,12 @@ typedef struct
 	/* Spinlock: protects the values below */
 	slock_t		buffer_strategy_lock;
 
-	/* Clock sweep hand: index of next buffer to consider grabbing */
-	int			nextVictimBuffer;
+	/*
+	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
+	 * this isn't a concrete buffer - we only ever increase the value. So, to
+	 * get an actual buffer, it needs to be used modulo NBuffers.
+	 */
+	pg_atomic_uint32 nextVictimBuffer;
 
 	int			firstFreeBuffer;	/* Head of list of unused buffers */
 	int			lastFreeBuffer; /* Tail of list of unused buffers */
@@ -42,8 +52,8 @@ typedef struct
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
 	 */
-	uint32		completePasses; /* Complete cycles of the clock sweep */
-	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
+	pg_atomic_uint32 completePasses; /* Complete cycles of the clock sweep */
+	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */
 
 	/*
 	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
@@ -124,87 +134,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			return buf;
 	}
 
-	/* Nope, so lock the freelist */
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
 	/*
 	 * We count buffer allocation requests so that the bgwriter can estimate
 	 * the rate of buffer consumption.  Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
 	 */
-	StrategyControl->numBufferAllocs++;
+	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
 	/*
-	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-	 * not do so while holding buffer_strategy_lock; so release and re-grab.
-	 * This is annoyingly tedious, but it happens at most once per bgwriter
-	 * cycle, so the performance hit is minimal.
+	 * If bgwriterLatch is set, we need to waken the bgwriter. We don't want
+	 * to check this while holding the spinlock, so we read the latch from
+	 * memory once to see whether it's set. There's no need to do so with a
+	 * lock present - we'll just set the latch next call if we missed it once.
+	 *
+	 * Since we're not guaranteed atomic 8 byte reads we need to acquire the
+	 * spinlock if not null to be sure we get a correct pointer. Because we
+	 * don't want to set the latch while holding the buffer_strategy_lock we
+	 * just grab the lock to read and reset the pointer.
 	 */
-	bgwriterLatch = StrategyControl->bgwriterLatch;
+	bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 	if (bgwriterLatch)
 	{
+		/* we don't have guaranteed atomic 64bit reads */
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 		StrategyControl->bgwriterLatch = NULL;
 		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-		SetLatch(bgwriterLatch);
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+		/* recheck */
+		if (bgwriterLatch)
+			SetLatch(bgwriterLatch);
 	}
 
 	/*
-	 * Try to get a buffer from the freelist.  Note that the freeNext fields
-	 * are considered to be protected by the buffer_strategy_lock not the
-	 * individual buffer spinlocks, so it's OK to manipulate them without
-	 * holding the spinlock.
+	 * First check, without acquiring the lock, whether there are buffers in
+	 * the freelist. Since we otherwise don't require the spinlock in every
+	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+	 * uselessly in most cases.
+	 *
+	 * If there are buffers on the freelist, acquire the spinlock and look
+	 * into them.
+	 *
+	 * Note that the freeNext fields are considered to be protected by
+	 * the buffer_strategy_lock not the individual buffer spinlocks, so it's
+	 * OK to manipulate them without holding the spinlock.
 	 */
-	while (StrategyControl->firstFreeBuffer >= 0)
+	if (INT_ACCESS_ONCE(StrategyControl->firstFreeBuffer) >= 0)
 	{
-		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-		/* Unconditionally remove buffer from freelist */
-		StrategyControl->firstFreeBuffer = buf->freeNext;
-		buf->freeNext = FREENEXT_NOT_IN_LIST;
+		while (StrategyControl->firstFreeBuffer >= 0)
+		{
+			buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
-		/*
-		 * Release the lock so someone else can access the freelist (or run
-		 * the clocksweep) while we check out this buffer.
-		 */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+			/* Unconditionally remove buffer from freelist */
+			StrategyControl->firstFreeBuffer = buf->freeNext;
+			buf->freeNext = FREENEXT_NOT_IN_LIST;
 
-		/*
-		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
-		 * it; discard it and retry.  (This can only happen if VACUUM put a
-		 * valid buffer in the freelist and then someone else used it before
-		 * we got to it.  It's probably impossible altogether as of 8.3, but
-		 * we'd better check anyway.)
-		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0 && buf->usage_count == 0)
-		{
-			if (strategy != NULL)
-				AddBufferToRing(strategy, buf);
-			return buf;
-		}
-		UnlockBufHdr(buf);
+			/*
+			 * Release the lock so someone else can access the freelist (or run
+			 * the clocksweep) while we check out this buffer.
+			 */
+			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 
-		/* Reacquire the lock and go around for another pass. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot use
+			 * it; discard it and retry.  (This can only happen if VACUUM put a
+			 * valid buffer in the freelist and then someone else used it before
+			 * we got to it.  It's probably impossible altogether as of 8.3, but
+			 * we'd better check anyway.)
+			 */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			UnlockBufHdr(buf);
+
+			/* Reacquire the lock and go around for another pass. */
+			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		}
+		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	}
 
 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
 	for (;;)
 	{
-		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
+		int victim;
+
+		victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
 
-		if (++StrategyControl->nextVictimBuffer >= NBuffers)
+		buf = &BufferDescriptors[victim % NBuffers];
+
+		if (victim % NBuffers == 0)
 		{
-			StrategyControl->nextVictimBuffer = 0;
-			StrategyControl->completePasses++;
+			pg_atomic_add_fetch_u32(&StrategyControl->completePasses, 1);
 		}
 
-		/* Release the lock before manipulating the candidate buffer. */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
@@ -238,9 +268,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			elog(ERROR, "no unpinned buffers available");
 		}
 		UnlockBufHdr(buf);
-
-		/* Reacquire the lock and get a new candidate buffer. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 	}
 }
 
@@ -284,13 +311,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	int			result;
 
 	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	result = StrategyControl->nextVictimBuffer;
+	result = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers;
 	if (complete_passes)
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses);
 	if (num_buf_alloc)
 	{
-		*num_buf_alloc = StrategyControl->numBufferAllocs;
-		StrategyControl->numBufferAllocs = 0;
+		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
 	}
 	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	return result;
@@ -389,11 +415,11 @@ StrategyInitialize(bool init)
 		StrategyControl->lastFreeBuffer = NBuffers - 1;
 
 		/* Initialize the clock sweep pointer */
-		StrategyControl->nextVictimBuffer = 0;
+		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
 		/* Clear statistics */
-		StrategyControl->completePasses = 0;
-		StrategyControl->numBufferAllocs = 0;
+		pg_atomic_init_u32(&StrategyControl->completePasses, 0);
+		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
 		/* No pending notification */
 		StrategyControl->bgwriterLatch = NULL;
-- 
1.8.3.251.g1462b67

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to