Hi,

I've been annoyed at the amount of memory used by the backend local
PrivateRefCount array for a couple of reasons:

a) The performance impact of AtEOXact_Buffers() on Assert() enabled
   builds is really, really annoying.
b) On larger nodes, the L1/2/3 cache impact of randomly accessing
   several megabyte big array at a high frequency is noticeable. I've
   seen the access to that to be the primary (yes, really) source of
   pipeline stalls.
c) On nodes with significant shared_memory the sum of the per-backend
   arrays is a significant amount of memory, that could very well be
   used more beneficially.

So what I have done in the attached proof of concept is to have a small
(8 currently) array of (buffer, pincount) that's searched linearly when
the refcount of a buffer is needed. When more than 8 buffers are pinned
a hashtable is used to lookup the values.

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test. While saving a fair amount of memory.

Alternatively we could just get rid of the idea of tracking this per
backend, relying on tracking via resource managers...

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From 433f248c0f4c3e3d43d1cc955354e5dd5cddfcea Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 20 Mar 2014 21:46:34 +0100
Subject: [PATCH] Make backend local tracking of buffer pins more efficient.

---
 src/backend/storage/buffer/buf_init.c |  25 ---
 src/backend/storage/buffer/bufmgr.c   | 346 ++++++++++++++++++++++++++++------
 src/include/storage/bufmgr.h          |  19 --
 3 files changed, 290 insertions(+), 100 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e187242..3b2432d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -130,31 +130,6 @@ InitBufferPool(void)
 }
 
 /*
- * Initialize access to shared buffer pool
- *
- * This is called during backend startup (whether standalone or under the
- * postmaster).  It sets up for this backend's access to the already-existing
- * buffer pool.
- *
- * NB: this is called before InitProcess(), so we do not have a PGPROC and
- * cannot do LWLockAcquire; hence we can't actually access stuff in
- * shared memory yet.  We are only initializing local data here.
- * (See also InitBufferPoolBackend, over in bufmgr.c.)
- */
-void
-InitBufferPoolAccess(void)
-{
-	/*
-	 * Allocate and zero local arrays of per-buffer info.
-	 */
-	PrivateRefCount = (int32 *) calloc(NBuffers, sizeof(int32));
-	if (!PrivateRefCount)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-}
-
-/*
  * BufferShmemSize
  *
  * compute the size of shared memory for the buffer pool including
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 19eecab..113b7ed 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -86,6 +86,175 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+typedef struct PrivateRefCount
+{
+	Buffer buffer;
+	int32 refcount;
+} PrivateRefCount;
+
+/* one full cacheline */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
+/*
+ * Backend-Private refcount management.
+ */
+static struct PrivateRefCount PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+
+static PrivateRefCount* GetPrivateRefCountEntry(Buffer buffer, bool create);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCount *ref);
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if create = false is passed and the buffer doesn't have a
+ * PrivateRefCount entry; allocates a new PrivateRefCount entry if currently
+ * none exists and create = true is passed.
+ *
+ * When a returned refcount entry isn't used anymore it has to be forgotten,
+ * using ForgetPrivateRefCountEntry().
+ *
+ * Only works for shared buffers.
+ */
+static PrivateRefCount*
+GetPrivateRefCountEntry(Buffer buffer, bool create)
+{
+	PrivateRefCount *res;
+	PrivateRefCount *free = NULL;
+
+	int i;
+	bool found = false;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * First search for references in the array, that'll be sufficient in the
+	 * majority of cases.
+	 */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer == buffer)
+			return res;
+
+		/* Remember where to put a new refcount, should it become necessary. */
+		if (create && free == NULL && res->buffer == InvalidBuffer)
+			free = res;
+	}
+
+	/*
+	 * By here we know that the buffer, if already pinned, isn't residing in
+	 * the array.
+	 */
+
+	res = NULL;
+	found = false;
+
+	/*
+	 * Search in the hashtable if either we haven't found the target buffer
+	 * yet and it already has entries, or if there wasn't a free entry in the
+	 * local array.
+	 */
+	if (PrivateRefCountOverflowed > 0 || (create && free == NULL))
+	{
+		res = hash_search(PrivateRefCountHash,
+						  (void *) &buffer,
+						  (create && free == NULL)  ? HASH_ENTER : HASH_FIND,
+						  &found);
+	}
+
+	if (found)
+	{
+		/* TODO: could relocate to `free' here */
+	}
+	else if (res)
+	{
+		/* a new refcount entry, initialize */
+		PrivateRefCountOverflowed++;
+		res->refcount = 0;
+	}
+	else if (create)
+	{
+		/* buffer wasn't in the hash and we have a free array entry */
+		Assert(free != NULL);
+		free->buffer = buffer;
+		free->refcount = 0;
+		res = free;
+	}
+
+	return res;
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+	PrivateRefCount *ref;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	ref = GetPrivateRefCountEntry(buffer, false);
+
+	if (ref == NULL)
+		return 0;
+	return ref->refcount;
+}
+
+/*
+ * Stop tracking the refcount of the buffer ref is tracking the refcount
+ * for. Nono, there's no circularity here.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCount *ref)
+{
+	Assert(ref->refcount == 0);
+
+	if (ref >= &PrivateRefCountArray[0] &&
+		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+	{
+		ref->buffer = InvalidBuffer;
+	}
+	else
+	{
+		bool found;
+		Buffer buffer = ref->buffer;
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+	}
+}
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	!BufferIsValid(bufnum) ? \
+		false \
+	: \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+	(GetPrivateRefCount(bufnum) > 0) \
+)
+
 
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
@@ -940,7 +1109,7 @@ retry:
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
-		if (PrivateRefCount[buf->buf_id] != 0)
+		if (GetPrivateRefCount(buf->buf_id) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
 		goto retry;
@@ -999,7 +1168,7 @@ MarkBufferDirty(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -1046,9 +1215,9 @@ ReleaseAndReadBuffer(Buffer buffer,
 
 	if (BufferIsValid(buffer))
 	{
+		Assert(BufferIsPinned(buffer));
 		if (BufferIsLocal(buffer))
 		{
-			Assert(LocalRefCount[-buffer - 1] > 0);
 			bufHdr = &LocalBufferDescriptors[-buffer - 1];
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
@@ -1059,7 +1228,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 		}
 		else
 		{
-			Assert(PrivateRefCount[buffer - 1] > 0);
 			bufHdr = &BufferDescriptors[buffer - 1];
 			/* we have pin, so it's ok to examine tag without spinlock */
 			if (bufHdr->tag.blockNum == blockNum &&
@@ -1089,15 +1257,18 @@ ReleaseAndReadBuffer(Buffer buffer,
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns TRUE if buffer is BM_VALID, else FALSE.	This provision allows
- * some callers to avoid an extra spinlock cycle.
+ * some callradix_tree_inserters to avoid an extra spinlock cycle.
  */
 static bool
 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
 	int			b = buf->buf_id;
 	bool		result;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 	{
 		LockBufHdr(buf);
 		buf->refcount++;
@@ -1119,8 +1290,9 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		/* If we previously pinned the buffer, it must surely be valid */
 		result = true;
 	}
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 	return result;
@@ -1143,12 +1315,15 @@ static void
 PinBuffer_Locked(volatile BufferDesc *buf)
 {
 	int			b = buf->buf_id;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 		buf->refcount++;
 	UnlockBufHdr(buf);
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 }
@@ -1165,14 +1340,18 @@ static void
 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
 	int			b = buf->buf_id;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, false);
+	Assert(ref != NULL);
 
 	if (fixOwner)
 		ResourceOwnerForgetBuffer(CurrentResourceOwner,
 								  BufferDescriptorGetBuffer(buf));
 
-	Assert(PrivateRefCount[b] > 0);
-	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0)
+	Assert(ref->refcount > 0);
+	ref->refcount--;
+	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
@@ -1197,6 +1376,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		}
 		else
 			UnlockBufHdr(buf);
+
+		ForgetPrivateRefCountEntry(ref);
 	}
 }
 
@@ -1700,36 +1881,95 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	return result | BUF_WRITTEN;
 }
 
-
 /*
- *		AtEOXact_Buffers - clean up at end of transaction.
+ * Helper Routine for AtProcExit_Buffers() and AtEOXact_Buffers().
  *
- *		As of PostgreSQL 8.0, buffer pins should get released by the
- *		ResourceOwner mechanism.  This routine is just a debugging
- *		cross-check that no pins remain.
+ * Check whether any buffer pins are still held by this backend, but only if
+ * assertions are enabled.
  */
-void
-AtEOXact_Buffers(bool isCommit)
+static void
+AssertNoPinnedBuffers(void)
 {
 #ifdef USE_ASSERT_CHECKING
 	if (assert_enabled)
 	{
 		int			RefCountErrors = 0;
-		Buffer		b;
+		PrivateRefCount *res;
+		int			i;
 
-		for (b = 1; b <= NBuffers; b++)
+		/* check the array */
+		for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
 		{
-			if (PrivateRefCount[b - 1] != 0)
+			res = &PrivateRefCountArray[i];
+
+			if (res->buffer != InvalidBuffer)
+			{
+				PrintBufferLeakWarning(res->buffer);
+				RefCountErrors++;
+			}
+		}
+
+		/* if neccessary search the hash */
+		if (PrivateRefCountOverflowed)
+		{
+			HASH_SEQ_STATUS hstat;
+			hash_seq_init(&hstat, PrivateRefCountHash);
+			while ((res = (PrivateRefCount *) hash_seq_search(&hstat)) != NULL)
 			{
-				PrintBufferLeakWarning(b);
+				PrintBufferLeakWarning(res->buffer);
 				RefCountErrors++;
 			}
+
 		}
+
 		Assert(RefCountErrors == 0);
 	}
 #endif
+}
+
+/*
+ *		AtEOXact_Buffers - clean up at end of transaction.
+ *
+ *		As of PostgreSQL 8.0, buffer pins should get released by the
+ *		ResourceOwner mechanism.  This routine is just a debugging
+ *		cross-check that no pins remain.
+ */
+void
+AtEOXact_Buffers(bool isCommit)
+{
+	AssertNoPinnedBuffers();
 
 	AtEOXact_LocalBuffers(isCommit);
+
+	Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster).  It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet.  We are only initializing local data here.
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+	HASHCTL		hash_ctl;
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(int32);
+	hash_ctl.entrysize = sizeof(PrivateRefCountArray);
+	hash_ctl.hash = oid_hash; /* a bit more efficient than tag_hash */
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION);
 }
 
 /*
@@ -1757,26 +1997,12 @@ AtProcExit_Buffers(int code, Datum arg)
 	AbortBufferIO();
 	UnlockBuffers();
 
-#ifdef USE_ASSERT_CHECKING
-	if (assert_enabled)
-	{
-		int			RefCountErrors = 0;
-		Buffer		b;
-
-		for (b = 1; b <= NBuffers; b++)
-		{
-			if (PrivateRefCount[b - 1] != 0)
-			{
-				PrintBufferLeakWarning(b);
-				RefCountErrors++;
-			}
-		}
-		Assert(RefCountErrors == 0);
-	}
-#endif
+	AssertNoPinnedBuffers();
 
 	/* localbuf.c needs a chance too */
 	AtProcExit_LocalBuffers();
+
+	Assert(PrivateRefCountOverflowed == 0);
 }
 
 /*
@@ -1800,7 +2026,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	else
 	{
 		buf = &BufferDescriptors[buffer - 1];
-		loccount = PrivateRefCount[buffer - 1];
+		loccount = GetPrivateRefCount(buffer);
 		backend = InvalidBackendId;
 	}
 
@@ -2340,7 +2566,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, PrivateRefCount[i]);
+			 buf->refcount, GetPrivateRefCount(i));
 	}
 }
 #endif
@@ -2354,7 +2580,7 @@ PrintPinnedBufs(void)
 
 	for (i = 0; i < NBuffers; ++i, ++buf)
 	{
-		if (PrivateRefCount[i] > 0)
+		if (GetPrivateRefCount(i + 1) > 0)
 		{
 			/* theoretically we should lock the bufhdr here */
 			elog(LOG,
@@ -2363,7 +2589,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpath(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
+				 buf->refcount, GetPrivateRefCount(i + 1));
 		}
 	}
 }
@@ -2520,6 +2746,7 @@ void
 ReleaseBuffer(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	PrivateRefCount *ref;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2535,10 +2762,12 @@ ReleaseBuffer(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	ref = GetPrivateRefCountEntry(buffer, false);
+	Assert(ref != NULL);
+	Assert(ref->refcount > 0);
 
-	if (PrivateRefCount[buffer - 1] > 1)
-		PrivateRefCount[buffer - 1]--;
+	if (ref->refcount > 1)
+		ref->refcount--;
 	else
 		UnpinBuffer(bufHdr, false);
 }
@@ -2572,7 +2801,12 @@ IncrBufferRefCount(Buffer buffer)
 	if (BufferIsLocal(buffer))
 		LocalRefCount[-buffer - 1]++;
 	else
-		PrivateRefCount[buffer - 1]++;
+	{
+		PrivateRefCount *ref;
+		ref = GetPrivateRefCountEntry(buffer, false);
+		Assert(ref != NULL);
+		ref->refcount++;
+	}
 }
 
 /*
@@ -2606,7 +2840,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -2823,9 +3057,9 @@ LockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	if (PrivateRefCount[buffer - 1] != 1)
+	if (GetPrivateRefCount(buffer) != 1)
 		elog(ERROR, "incorrect local pin count: %d",
-			 PrivateRefCount[buffer - 1]);
+			 GetPrivateRefCount(buffer));
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
@@ -2890,7 +3124,7 @@ HoldingBufferPinThatDelaysRecovery(void)
 	if (bufid < 0)
 		return false;
 
-	if (PrivateRefCount[bufid] > 0)
+	if (GetPrivateRefCount(bufid + 1) > 0)
 		return true;
 
 	return false;
@@ -2920,8 +3154,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	Assert(PrivateRefCount[buffer - 1] > 0);
-	if (PrivateRefCount[buffer - 1] != 1)
+	Assert(GetPrivateRefCount(buffer) > 0);
+	if (GetPrivateRefCount(buffer) != 1)
 		return false;
 
 	/* Try to acquire lock */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..42d9120 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -55,7 +55,6 @@ extern int	target_prefetch_pages;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
-extern PGDLLIMPORT int32 *PrivateRefCount;
 
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
@@ -102,24 +101,6 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 )
 
 /*
- * BufferIsPinned
- *		True iff the buffer is pinned (also checks for valid buffer number).
- *
- *		NOTE: what we check here is that *this* backend holds a pin on
- *		the buffer.  We do not care whether some other backend does.
- */
-#define BufferIsPinned(bufnum) \
-( \
-	!BufferIsValid(bufnum) ? \
-		false \
-	: \
-		BufferIsLocal(bufnum) ? \
-			(LocalRefCount[-(bufnum) - 1] > 0) \
-		: \
-			(PrivateRefCount[(bufnum) - 1] > 0) \
-)
-
-/*
  * BufferGetBlock
  *		Returns a reference to a disk page image associated with a buffer.
  *
-- 
1.8.3.251.g1462b67

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to