On 2017-02-02 11:41:44 -0800, Jim Nasby wrote:
> On 2/1/17 4:28 PM, Andres Freund wrote:
> > On 2016-11-28 11:40:53 -0800, Jim Nasby wrote:
> > > With current limits, the most bgwriter can do (with 8k pages) is 1000 
> > > pages
> > > * 100 times/sec = 780MB/s. It's not hard to exceed that with modern
> > > hardware. Should we increase the limit on bgwriter_lru_maxpages?
> >
> > FWIW, I think working on replacing bgwriter (e.g. by working on the
> > patch I send with a POC replacement) wholesale is a better approach than
> > spending time increasing limits.
>
> Do you have a link to that? I'm not seeing anything in the archives.

Not at hand, but I can just give you the patches.  These are very likely
to conflict, but it shouldn't be too hard to resolve...

What it basically does is move as much of the clock-sweep as possible
into bgwriter, which keeps a certain number of clean pages available.
There's a lock-free ringbuffer that backends can just pick pages off.

The approach with the ringbuffer has the advantage that with relatively
small changes we can scale to having multiple bgwriters (needs some
changes in the ringbuf implementation, but not that many).

Andres
>From c008aad0445d2d8b1247597940ce096666dbd60e Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 19 Feb 2016 12:07:51 -0800
Subject: [PATCH 1/2] Basic lockless single producer, multiple consumer
 ringbuffer

---
 src/backend/lib/Makefile  |   2 +-
 src/backend/lib/ringbuf.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++
 src/include/lib/ringbuf.h |  60 +++++++++++++++++
 3 files changed, 223 insertions(+), 1 deletion(-)
 create mode 100644 src/backend/lib/ringbuf.c
 create mode 100644 src/include/lib/ringbuf.h

diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 2d2ba84fe9..03a4795706 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = binaryheap.o bipartite_match.o hyperloglog.o ilist.o pairingheap.o \
-       rbtree.o stringinfo.o
+       rbtree.o ringbuf.o stringinfo.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/ringbuf.c b/src/backend/lib/ringbuf.c
new file mode 100644
index 0000000000..62761cc447
--- /dev/null
+++ b/src/backend/lib/ringbuf.c
@@ -0,0 +1,162 @@
+/*-------------------------------------------------------------------------
+ *
+ * ringbuf.c
+ *	  Single writer, multiple reader, lock-free ringbuffer.
+ *
+ * ...
+ *
+ * Copyright (c) 2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/ringbuf.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ringbuf.h"
+#include "storage/proc.h"
+
+static inline uint32
+ringbuf_backendid(ringbuf *rb, uint32 pos)
+{
+	return (pos & 0xffff0000) >> 16; /* id is stored in the high 16 bits */
+}
+
+/*
+ * Compute the new offset, slightly complicated by the fact that we only want
+ * to modify the lower 16 bits.
+ */
+static inline uint32
+ringbuf_advance_pos(uint32 pos)
+{
+	return ((pos + 1) & 0x0000FFFF) | (pos & 0xFFFF0000);
+}
+
+/*
+ * Return the number of elements currently in the ringbuffer.
+ *
+ * Only a snapshot: read_state is loaded once, so concurrent pops (and a
+ * concurrent push, if the caller isn't the writer) can make the result
+ * stale immediately.  Suitable for heuristics, not for exact accounting.
+ */
+uint32
+ringbuf_elements(ringbuf *rb)
+{
+	uint32 read_off = ringbuf_pos(rb, pg_atomic_read_u32(&rb->read_state));
+	uint32 write_off = ringbuf_pos(rb, rb->write_off);
+
+	/* not wrapped around */
+	if (read_off <= write_off)
+	{
+		return write_off - read_off;
+	}
+
+	/* wrapped around */
+	return (rb->size - read_off) + write_off;
+}
+
+size_t
+ringbuf_size(size_t nelems)
+{
+	Assert(nelems <= 0x0000FFFF);
+	return sizeof(ringbuf) + sizeof(void *) * nelems;
+}
+
+/*
+ * Memory needs to be externally allocated and be at least
+ * ringbuf_size(nelems) large.
+ */
+ringbuf *
+ringbuf_create(void *target, size_t nelems)
+{
+	ringbuf *rb = (ringbuf *) target;
+
+	Assert(nelems <= 0x0000FFFF);
+
+	memset(target, 0, ringbuf_size(nelems));
+
+	rb->size = nelems;
+	pg_atomic_init_u32(&rb->read_state, 0);
+	rb->write_off = 0;
+
+	return rb;
+}
+
+/*
+ * Append "data" to the ringbuffer; returns false if the buffer is full.
+ * May only be called by the single designated writer.
+ *
+ * NOTE(review): write_off is a plain uint32 that concurrent readers load
+ * without a lock; this relies on aligned 32-bit loads/stores being atomic
+ * -- presumably true on all supported platforms, but worth confirming, or
+ * worth making it pg_atomic_uint32 for documentation's sake.
+ */
+bool
+ringbuf_push(ringbuf *rb, void *data)
+{
+	uint32 read_off = pg_atomic_read_u32(&rb->read_state);
+
+	/*
+	 * Check if full - can be outdated, but that's ok. New readers are just
+	 * going to further consume elements, never cause the buffer to become
+	 * full.
+	 */
+	if (ringbuf_pos(rb, read_off) == ringbuf_pos(rb, rb->write_off + 1))
+	{
+		return false;
+	}
+
+	rb->elements[ringbuf_pos(rb, rb->write_off)] = data;
+
+	/*
+	 * The write adding the data needs to be visible before the corresponding
+	 * increase of write_off is visible.
+	 */
+	pg_write_barrier();
+
+	rb->write_off = ringbuf_advance_pos(rb->write_off);
+
+	return true;
+}
+
+
+/*
+ * Pop one element into *data; returns false if the buffer appears empty.
+ *
+ * Safe for multiple concurrent readers: each reader mixes its backend id
+ * into the high 16 bits of read_state before reading the slot, so a CAS
+ * armed with a stale position (e.g. after the offset wrapped back around
+ * to the same value) fails because the reader id no longer matches --
+ * i.e. the backend id serves as ABA protection for the second CAS.
+ */
+bool
+ringbuf_pop(ringbuf *rb, void **data)
+{
+	void *ret;
+	uint32 mybackend = MyProc->backendId;
+
+	/* backend id must fit in 16 bits, it shares a word with the offset */
+	Assert((mybackend & 0x0000ffff) == mybackend);
+
+	while (true)
+	{
+		uint32 read_state = pg_atomic_read_u32(&rb->read_state);
+		uint32 read_off = ringbuf_pos(rb, read_state);
+		uint32 old_read_state = read_state;
+
+		/* check if empty - can be outdated, but that's ok */
+		if (read_off == ringbuf_pos(rb, rb->write_off))
+			return false;
+
+		/*
+		 * Add our backend id to the position, to detect wrap around.
+		 * XXX
+		 *
+		 * XXX: Skip if the ID already is ours. That's probably likely enough
+		 * to warrant the additional branch.
+		 */
+		read_state = (read_state & 0x0000ffff) | mybackend << 16;
+
+		/*
+		 * Mix the reader position into the current read_off, otherwise
+		 * unchanged. If the offset changed since, retry from start.
+		 *
+		 * NB: This also serves as the read barrier pairing with the write
+		 * barrier in ringbuf_push().
+		 */
+		if (!pg_atomic_compare_exchange_u32(&rb->read_state, &old_read_state,
+											read_state))
+			continue;
+		old_read_state = read_state; /* with backend id mixed in */
+
+		/* finally read the data */
+		ret = rb->elements[read_off];
+
+		/* compute next offset */
+		read_state = ringbuf_advance_pos(read_state);
+
+		if (pg_atomic_compare_exchange_u32(&rb->read_state, &old_read_state,
+										   read_state))
+			break;
+	}
+
+	*data = ret;
+
+	return true;
+}
diff --git a/src/include/lib/ringbuf.h b/src/include/lib/ringbuf.h
new file mode 100644
index 0000000000..8d686e91f4
--- /dev/null
+++ b/src/include/lib/ringbuf.h
@@ -0,0 +1,60 @@
+/*
+ * ringbuf.h
+ *
+ * Single writer, multiple reader, lockless & obstruction-free ringbuffer.
+ *
+ * Copyright (c) 2015, PostgreSQL Global Development Group
+ *
+ * src/include/lib/ringbuf.h
+ */
+#ifndef RINGBUF_H
+#define RINGBUF_H
+
+#include "port/atomics.h"
+
+typedef struct ringbuf
+{
+	uint32 size;
+
+	/* 16 bit reader id, 16 bit offset */
+	pg_atomic_uint32 read_state;
+	uint32 write_off;			/* PG's uint32, consistent with the rest */
+
+	void *elements[FLEXIBLE_ARRAY_MEMBER];
+} ringbuf;
+
+size_t ringbuf_size(size_t nelems);
+
+ringbuf *ringbuf_create(void *target, size_t nelems);
+
+static inline uint32
+ringbuf_pos(ringbuf *rb, uint32 pos)
+{
+	/*
+	 * XXX: replacing rb->size with a bitmask op would avoid expensive
+	 * divisions. Requiring a pow2 size seems ok.
+	 */
+	return (pos & 0x0000ffff) % rb->size;
+}
+
+static inline bool
+ringbuf_empty(ringbuf *rb)
+{
+	uint32 read_state = pg_atomic_read_u32(&rb->read_state);
+
+	return ringbuf_pos(rb, read_state) == ringbuf_pos(rb, rb->write_off);
+}
+
+static inline bool
+ringbuf_full(ringbuf *rb)
+{
+	uint32 read_state = pg_atomic_read_u32(&rb->read_state);
+
+	return ringbuf_pos(rb, read_state) == ringbuf_pos(rb, rb->write_off + 1);
+}
+
+uint32 ringbuf_elements(ringbuf *rb);
+bool ringbuf_push(ringbuf *rb, void *data);
+bool ringbuf_pop(ringbuf *rb, void **data);
+
+#endif
-- 
2.11.0.22.g8d7a455.dirty

>From 39208e12c947183f5ed565b94bc006eed0138136 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 19 Feb 2016 12:07:51 -0800
Subject: [PATCH 2/2] WIP: bgwriter rewrite

---
 src/backend/postmaster/bgwriter.c     |   9 +-
 src/backend/storage/buffer/buf_init.c |  22 ++-
 src/backend/storage/buffer/bufmgr.c   |  79 +++++++-
 src/backend/storage/buffer/freelist.c | 336 ++++++++++++++++++++--------------
 src/backend/utils/misc/guc.c          |  11 ++
 src/include/postmaster/bgwriter.h     |   1 +
 src/include/storage/buf_internals.h   |  16 +-
 src/include/storage/bufmgr.h          |   4 +-
 8 files changed, 329 insertions(+), 149 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 7d0371d807..cdc7144859 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -64,6 +64,7 @@
  * GUC parameters
  */
 int			BgWriterDelay = 200;
+bool		BgWriterLegacy = true;
 
 /*
  * Multiplier to apply to BgWriterDelay when we decide to hibernate.
@@ -275,7 +276,10 @@ BackgroundWriterMain(void)
 		/*
 		 * Do one cycle of dirty-buffer writing.
 		 */
-		can_hibernate = BgBufferSync(&wb_context);
+		if (BgWriterLegacy)
+			can_hibernate = BgBufferSyncLegacy(&wb_context);
+		else
+			can_hibernate = BgBufferSyncNew(&wb_context);
 
 		/*
 		 * Send off activity statistics to the stats collector
@@ -373,7 +377,8 @@ BackgroundWriterMain(void)
 						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
 						   BgWriterDelay * HIBERNATE_FACTOR);
 			/* Reset the notification request in case we timed out */
-			StrategyNotifyBgWriter(-1);
+			if (BgWriterLegacy)
+				StrategyNotifyBgWriter(-1);
 		}
 
 		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index bfa37f1c66..4b4d560f3c 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "lib/ringbuf.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
 
@@ -25,6 +26,7 @@ LWLockTranche BufferIOLWLockTranche;
 LWLockTranche BufferContentLWLockTranche;
 WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
+ringbuf *FreeBuffers = NULL;
 
 
 /*
@@ -72,7 +74,8 @@ InitBufferPool(void)
 	bool		foundBufs,
 				foundDescs,
 				foundIOLocks,
-				foundBufCkpt;
+				foundBufCkpt,
+				foundFreeBufs;
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *)
@@ -106,6 +109,10 @@ InitBufferPool(void)
 	LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
 						  &BufferContentLWLockTranche);
 
+	FreeBuffers = ShmemInitStruct("Free Buffers",
+								  ringbuf_size(FREE_BUFFER_SIZE),
+								  &foundFreeBufs);
+
 	/*
 	 * The array used to sort to-be-checkpointed buffer ids is located in
 	 * shared memory, to avoid having to allocate significant amounts of
@@ -117,10 +124,11 @@ InitBufferPool(void)
 		ShmemInitStruct("Checkpoint BufferIds",
 						NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
 
-	if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+	if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt || foundFreeBufs)
 	{
 		/* should find all of these, or none of them */
-		Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+		Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt && foundFreeBufs);
+
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
@@ -147,6 +155,7 @@ InitBufferPool(void)
 			/*
 			 * Initially link all the buffers together as unused. Subsequent
 			 * management of this list is done by freelist.c.
+			 * FIXME: remove once legacy bgwriter is removed
 			 */
 			buf->freeNext = i + 1;
 
@@ -157,8 +166,10 @@ InitBufferPool(void)
 							 LWTRANCHE_BUFFER_IO_IN_PROGRESS);
 		}
 
-		/* Correct last entry of linked list */
+		/* Correct last entry of linked list: FIXME: remove */
 		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+		/* FIXME: could fill the first few free buffers? */
+		FreeBuffers = ringbuf_create(FreeBuffers, FREE_BUFFER_SIZE);
 	}
 
 	/* Init other shared buffer-management stuff */
@@ -206,5 +217,8 @@ BufferShmemSize(void)
 	/* size of checkpoint sort array in bufmgr.c */
 	size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
 
+	/* FIXME: better ringbuffer size */
+	size = add_size(size, ringbuf_size(FREE_BUFFER_SIZE));
+
 	return size;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7a1399708e..a6c32a9045 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -38,6 +38,7 @@
 #include "catalog/storage.h"
 #include "executor/instrument.h"
 #include "lib/binaryheap.h"
+#include "lib/ringbuf.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
@@ -1328,6 +1329,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  * trying to write it out.  We have to let them finish before we can
  * reclaim the buffer.
  *
+ * FIXME: ^^^
+ *
  * The buffer could get reclaimed by someone else while we are waiting
  * to acquire the necessary locks; if so, don't mess it up.
  */
@@ -1950,7 +1953,79 @@ BufferSync(int flags)
 }
 
 /*
- * BgBufferSync -- Write out some dirty buffers in the pool.
+ * BgBufferSyncNew -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode.
+ */
+bool
+BgBufferSyncNew(WritebackContext *wb_context)
+{
+	uint32      recent_alloc;
+	uint32      strategy_passes;
+
+	/* Make sure we can handle the pin inside SyncOneBuffer */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* Report buffer alloc counts to pgstat */
+	StrategySyncStart(&strategy_passes, &recent_alloc);
+	BgWriterStats.m_buf_alloc += recent_alloc;
+
+	/* go and populate freelist */
+	while (!ringbuf_full(FreeBuffers))
+	{
+		BufferDesc *buf;
+		bool pushed;
+		bool dirty;
+
+		ReservePrivateRefCountEntry();
+
+		/* returns with the buffer header spinlock held */
+		buf = ClockSweep(NULL);
+
+		dirty = buf->flags & BM_DIRTY;
+
+		if (dirty)
+		{
+			SMgrRelation reln;
+			BufferTag tag;
+
+			/* pins the buffer and releases the header spinlock */
+			PinBuffer_Locked(buf);
+
+			reln = smgropen(buf->tag.rnode, InvalidBackendId);
+
+			LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+			FlushBuffer(buf, reln);
+			LWLockRelease(BufferDescriptorGetContentLock(buf));
+
+			BgWriterStats.m_buf_written_clean++;
+
+			tag = buf->tag; /* save tag for writeback scheduling */
+
+			UnpinBuffer(buf, 1);
+
+			/*
+			 * NOTE(review): the buffer is unpinned before being pushed, so
+			 * it can be re-pinned before a backend pops it.  That's safe
+			 * because StrategyGetBuffer re-checks refcount/usage_count
+			 * under the header lock, but it can waste a ring slot.
+			 */
+			pushed = ringbuf_push(FreeBuffers, buf);
+
+			if (wb_context)
+				ScheduleBufferTagForWriteback(wb_context, &tag);
+		}
+		else
+		{
+			UnlockBufHdr(buf);
+			pushed = ringbuf_push(FreeBuffers, buf);
+		}
+
+		/* full, shouldn't normally happen, we're the only writer  */
+		if (!pushed)
+			break;
+	}
+
+	return true;
+}
+
+/*
+ * BgBufferSyncLegacy -- Write out some dirty buffers in the pool.
  *
  * This is called periodically by the background writer process.
  *
@@ -1961,7 +2036,7 @@ BufferSync(int flags)
  * bgwriter_lru_maxpages to 0.)
  */
 bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSyncLegacy(WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
 	int			strategy_buf_id;
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 551d15205c..6f403dba58 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,7 +15,9 @@
  */
 #include "postgres.h"
 
+#include "lib/ringbuf.h"
 #include "port/atomics.h"
+#include "postmaster/bgwriter.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
@@ -167,131 +169,13 @@ ClockSweepTick(void)
 	return victim;
 }
 
-/*
- * StrategyGetBuffer
- *
- *	Called by the bufmgr to get the next candidate buffer to use in
- *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
- *	the selected buffer must not currently be pinned by anyone.
- *
- *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
- *
- *	To ensure that no one else can pin the buffer before we do, we must
- *	return the buffer with the buffer header spinlock still held.
- */
+
 BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy)
+ClockSweep(BufferAccessStrategy strategy)
 {
 	BufferDesc *buf;
-	int			bgwprocno;
 	int			trycounter;
 
-	/*
-	 * If given a strategy object, see whether it can select a buffer. We
-	 * assume strategy objects don't need buffer_strategy_lock.
-	 */
-	if (strategy != NULL)
-	{
-		buf = GetBufferFromRing(strategy);
-		if (buf != NULL)
-			return buf;
-	}
-
-	/*
-	 * If asked, we need to waken the bgwriter. Since we don't want to rely on
-	 * a spinlock for this we force a read from shared memory once, and then
-	 * set the latch based on that value. We need to go through that length
-	 * because otherwise bgprocno might be reset while/after we check because
-	 * the compiler might just reread from memory.
-	 *
-	 * This can possibly set the latch of the wrong process if the bgwriter
-	 * dies in the wrong moment. But since PGPROC->procLatch is never
-	 * deallocated the worst consequence of that is that we set the latch of
-	 * some arbitrary process.
-	 */
-	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
-	if (bgwprocno != -1)
-	{
-		/* reset bgwprocno first, before setting the latch */
-		StrategyControl->bgwprocno = -1;
-
-		/*
-		 * Not acquiring ProcArrayLock here which is slightly icky. It's
-		 * actually fine because procLatch isn't ever freed, so we just can
-		 * potentially set the wrong process' (or no process') latch.
-		 */
-		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
-	}
-
-	/*
-	 * We count buffer allocation requests so that the bgwriter can estimate
-	 * the rate of buffer consumption.  Note that buffers recycled by a
-	 * strategy object are intentionally not counted here.
-	 */
-	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
-
-	/*
-	 * First check, without acquiring the lock, whether there's buffers in the
-	 * freelist. Since we otherwise don't require the spinlock in every
-	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
-	 * uselessly in most cases. That obviously leaves a race where a buffer is
-	 * put on the freelist but we don't see the store yet - but that's pretty
-	 * harmless, it'll just get used during the next buffer acquisition.
-	 *
-	 * If there's buffers on the freelist, acquire the spinlock to pop one
-	 * buffer of the freelist. Then check whether that buffer is usable and
-	 * repeat if not.
-	 *
-	 * Note that the freeNext fields are considered to be protected by the
-	 * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to
-	 * manipulate them without holding the spinlock.
-	 */
-	if (StrategyControl->firstFreeBuffer >= 0)
-	{
-		while (true)
-		{
-			/* Acquire the spinlock to remove element from the freelist */
-			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
-			if (StrategyControl->firstFreeBuffer < 0)
-			{
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-				break;
-			}
-
-			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
-			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
-
-			/* Unconditionally remove buffer from freelist */
-			StrategyControl->firstFreeBuffer = buf->freeNext;
-			buf->freeNext = FREENEXT_NOT_IN_LIST;
-
-			/*
-			 * Release the lock so someone else can access the freelist while
-			 * we check out this buffer.
-			 */
-			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-
-			/*
-			 * If the buffer is pinned or has a nonzero usage_count, we cannot
-			 * use it; discard it and retry.  (This can only happen if VACUUM
-			 * put a valid buffer in the freelist and then someone else used
-			 * it before we got to it.  It's probably impossible altogether as
-			 * of 8.3, but we'd better check anyway.)
-			 */
-			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
-			{
-				if (strategy != NULL)
-					AddBufferToRing(strategy, buf);
-				return buf;
-			}
-			UnlockBufHdr(buf);
-
-		}
-	}
-
-	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
 	for (;;)
 	{
@@ -335,33 +219,213 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 }
 
 /*
+ * StrategyGetBuffer
+ *
+ *	Called by the bufmgr to get the next candidate buffer to use in
+ *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
+ *	the selected buffer must not currently be pinned by anyone.
+ *
+ *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
+ *
+ *	To ensure that no one else can pin the buffer before we do, we must
+ *	return the buffer with the buffer header spinlock still held.
+ */
+BufferDesc *
+StrategyGetBuffer(BufferAccessStrategy strategy)
+{
+	BufferDesc *buf;
+	int			bgwprocno;
+
+	/*
+	 * If given a strategy object, see whether it can select a buffer. We
+	 * assume strategy objects don't need buffer_strategy_lock.
+	 */
+	if (strategy != NULL)
+	{
+		buf = GetBufferFromRing(strategy);
+		if (buf != NULL)
+			return buf;
+	}
+
+	/*
+	 * If asked, we need to waken the bgwriter. Since we don't want to rely on
+	 * a spinlock for this we force a read from shared memory once, and then
+	 * set the latch based on that value. We need to go through that length
+	 * because otherwise bgprocno might be reset while/after we check because
+	 * the compiler might just reread from memory.
+	 *
+	 * This can possibly set the latch of the wrong process if the bgwriter
+	 * dies in the wrong moment. But since PGPROC->procLatch is never
+	 * deallocated the worst consequence of that is that we set the latch of
+	 * some arbitrary process.
+	 */
+	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
+	if (BgWriterLegacy)
+	{
+		if (bgwprocno != -1)
+		{
+			/* reset bgwprocno first, before setting the latch */
+			StrategyControl->bgwprocno = -1;
+
+			/*
+			 * Not acquiring ProcArrayLock here which is slightly icky. It's
+			 * actually fine because procLatch isn't ever freed, so we just can
+			 * potentially set the wrong process' (or no process') latch.
+			 */
+			SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+		}
+	}
+
+	/*
+	 * We count buffer allocation requests so that the bgwriter can estimate
+	 * the rate of buffer consumption.  Note that buffers recycled by a
+	 * strategy object are intentionally not counted here.
+	 */
+	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+
+	if (BgWriterLegacy)
+	{
+		/*
+		 * First check, without acquiring the lock, whether there's buffers in the
+		 * freelist. Since we otherwise don't require the spinlock in every
+		 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+		 * uselessly in most cases. That obviously leaves a race where a buffer is
+		 * put on the freelist but we don't see the store yet - but that's pretty
+		 * harmless, it'll just get used during the next buffer acquisition.
+		 *
+		 * If there's buffers on the freelist, acquire the spinlock to pop one
+		 * buffer of the freelist. Then check whether that buffer is usable and
+		 * repeat if not.
+		 *
+		 * Note that the freeNext fields are considered to be protected by the
+		 * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to
+		 * manipulate them without holding the spinlock.
+		 */
+		if (StrategyControl->firstFreeBuffer >= 0)
+		{
+			while (true)
+			{
+				/* Acquire the spinlock to remove element from the freelist */
+				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+				if (StrategyControl->firstFreeBuffer < 0)
+				{
+					SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+					break;
+				}
+
+				buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
+				Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+				/* Unconditionally remove buffer from freelist */
+				StrategyControl->firstFreeBuffer = buf->freeNext;
+				buf->freeNext = FREENEXT_NOT_IN_LIST;
+
+				/*
+				 * Release the lock so someone else can access the freelist while
+				 * we check out this buffer.
+				 */
+				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
+				/*
+				 * If the buffer is pinned or has a nonzero usage_count, we cannot
+				 * use it; discard it and retry.  (This can only happen if VACUUM
+				 * put a valid buffer in the freelist and then someone else used
+				 * it before we got to it.  It's probably impossible altogether as
+				 * of 8.3, but we'd better check anyway.)
+				 */
+				LockBufHdr(buf);
+				if (buf->refcount == 0 && buf->usage_count == 0)
+				{
+					if (strategy != NULL)
+						AddBufferToRing(strategy, buf);
+					return buf;
+				}
+				UnlockBufHdr(buf);
+
+			}
+		}
+	}
+	else
+	{
+		/*
+		 * Try to get a buffer from the free list.
+		 */
+		while (!ringbuf_empty(FreeBuffers))
+		{
+			BufferDesc *buf;
+			bool found;
+
+			found = ringbuf_pop(FreeBuffers, (void *)&buf);
+
+			/* If the ringbuffer is sufficiently depleted, wakeup the bgwriter. */
+			if (bgwprocno != -1 && (
+					!found ||
+					ringbuf_elements(FreeBuffers) < FREE_BUFFER_SIZE / 4))
+			{
+				SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+			}
+
+			if (!found)
+				break;
+
+			/* check if the buffer is still unused, done if so */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			else
+			{
+				UnlockBufHdr(buf);
+				//ereport(LOG, (errmsg("buffer since reused"),errhidestmt(true)));
+			}
+		}
+	}
+
+	/* Nothing on the freelist, so run the "clock sweep" algorithm */
+	return ClockSweep(strategy);
+}
+
+
+/*
  * StrategyFreeBuffer: put a buffer on the freelist
  */
 void
 StrategyFreeBuffer(BufferDesc *buf)
 {
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
-	/*
-	 * It is possible that we are told to put something in the freelist that
-	 * is already in it; don't screw up the list if so.
-	 */
-	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+	if (BgWriterLegacy)
 	{
-		buf->freeNext = StrategyControl->firstFreeBuffer;
-		if (buf->freeNext < 0)
-			StrategyControl->lastFreeBuffer = buf->buf_id;
-		StrategyControl->firstFreeBuffer = buf->buf_id;
-	}
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+		/*
+		 * It is possible that we are told to put something in the freelist that
+		 * is already in it; don't screw up the list if so.
+		 */
+		if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+		{
+			buf->freeNext = StrategyControl->firstFreeBuffer;
+			if (buf->freeNext < 0)
+				StrategyControl->lastFreeBuffer = buf->buf_id;
+			StrategyControl->firstFreeBuffer = buf->buf_id;
+		}
+
+		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+	}
+	else
+	{
+		/* FIXME: It's not correct to do this outside bgwriter */
+		ringbuf_push(FreeBuffers, buf);
+	}
 }
 
 /*
- * StrategySyncStart -- tell BufferSync where to start syncing
+ * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
- * The result is the buffer index of the best buffer to sync first.
- * BufferSync() will proceed circularly around the buffer array from there.
+ * The result is the buffer index below the current clock-hand. BgBufferSync()
+ * will proceed circularly around the buffer array from there.
  *
  * In addition, we return the completed-pass count (which is effectively
  * the higher-order bits of nextVictimBuffer) and the count of recent buffer
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 789efbc8c7..005f231eb0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1224,6 +1224,17 @@ static struct config_bool ConfigureNamesBool[] =
 	},
 
 	{
+		{"bgwriter_legacy", PGC_POSTMASTER, RESOURCES_BGWRITER,
+			gettext_noop("Use legacy bgwriter algorithm."),
+			NULL,
+			0					/* unit flags like GUC_UNIT_MS are only valid for numeric GUCs */
+		},
+		&BgWriterLegacy,
+		true,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"trace_notify", PGC_USERSET, DEVELOPER_OPTIONS,
 			gettext_noop("Generates debugging output for LISTEN and NOTIFY."),
 			NULL,
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index b162b0dea9..1aeb2dce6c 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -24,6 +24,7 @@ extern int	BgWriterDelay;
 extern int	CheckPointTimeout;
 extern int	CheckPointWarning;
 extern double CheckPointCompletionTarget;
+extern bool	BgWriterLegacy;
 
 extern void BackgroundWriterMain(void) pg_attribute_noreturn();
 extern void CheckpointerMain(void) pg_attribute_noreturn();
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index de84bc4f57..cbb267a7f1 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -116,9 +116,9 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
- * take care of itself.  The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * protected by the buffer_strategy_lock not buf_hdr_lock (XXX: remove).  The
+ * LWLock can take care of itself.  The buf_hdr_lock is *not* used to control
+ * access to the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
  * underneath us, so we can examine the tag without locking the spinlock.
@@ -149,7 +149,8 @@ typedef struct BufferDesc
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
 	int			buf_id;			/* buffer's index number (from 0) */
-	int			freeNext;		/* link in freelist chain */
+	/* link in freelist chain: only used with legacy bgwriter */
+	int			freeNext;
 
 	LWLock		content_lock;	/* to lock access to buffer contents */
 } BufferDesc;
@@ -197,10 +198,14 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
+ * XXX: Remove when removing legacy bgwriter
  */
 #define FREENEXT_END_OF_LIST	(-1)
 #define FREENEXT_NOT_IN_LIST	(-2)
 
+/* size of buffer free list */
+#define FREE_BUFFER_SIZE 4096
+
 /*
  * Macros for acquiring/releasing a shared buffer header's spinlock.
  * Do not apply these to local buffers!
@@ -235,6 +240,7 @@ typedef struct WritebackContext
 /* in buf_init.c */
 extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
 extern PGDLLIMPORT WritebackContext BackendWritebackContext;
+extern PGDLLIMPORT struct ringbuf *FreeBuffers;
 
 /* in localbuf.c */
 extern BufferDesc *LocalBufferDescriptors;
@@ -268,6 +274,8 @@ extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *
 
 /* freelist.c */
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
+extern BufferDesc *ClockSweep(BufferAccessStrategy strategy);
+
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 BufferDesc *buf);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index a4b1b370ce..6c45a85000 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -229,7 +229,9 @@ extern bool HoldingBufferPinThatDelaysRecovery(void);
 extern void AbortBufferIO(void);
 
 extern void BufmgrCommit(void);
-extern bool BgBufferSync(struct WritebackContext *wb_context);
+
+extern bool BgBufferSyncNew(struct WritebackContext *wb_context);
+extern bool BgBufferSyncLegacy(struct WritebackContext *wb_context);
 
 extern void AtProcExit_LocalBuffers(void);
 
-- 
2.11.0.22.g8d7a455.dirty

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to