On 2015-09-06 15:28:40 +0200, Andres Freund wrote:
> Hm. I found that the buffer content lwlocks can actually also be a
> significant source of contention - I'm not sure reducing padding for
> those is going to be particularly nice. I think we should rather move
> the *content* lock inline into the buffer descriptor. The io lock
> doesn't matter and can be as small as possible.

POC patch along those lines attached. This way the content locks get
full 64-byte alignment *without* any additional memory usage, because
buffer descriptors are already padded to 64 bytes.  I had to reorder
the BufferDesc contents a bit and reduce the width of usage_count to
8 bits (which is fine given that 5 is our highest value) to make
enough room.
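
To illustrate the layout trick, here's a minimal, self-contained sketch
(stand-in types and simplified fields, not the patch's actual
definitions below):

#define PG_CACHE_LINE_SIZE 64		/* assumed cacheline size */

typedef struct LWLock { int state; } LWLock;	/* stand-in for the real struct */

typedef struct BufferDesc
{
	/* ... tag, flags, usage_count (now uint8), spinlock, etc. ... */
	int			buf_id;
	LWLock		content_lock;	/* embedded, no longer a pointer */
} BufferDesc;

/* Pad each descriptor out to a full cacheline. */
typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[PG_CACHE_LINE_SIZE];
} BufferDescPadded;

/*
 * Once the array base is cacheline-aligned, every descriptor - and thus
 * the content lock embedded in it - owns a whole cacheline, for free.
 */
_Static_assert(sizeof(BufferDescPadded) == PG_CACHE_LINE_SIZE,
			   "BufferDesc must fit in one cacheline");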

I've experimented with reducing the padding of the IO locks to nothing,
since they're not often contended at the CPU level. But even on my
laptop that led to a noticeable regression for a read-only pgbench
workload where the dataset fit into the OS page cache but not into s_b.
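
To put rough numbers on why the IO locks stay at the minimal 32-byte
stride rather than getting a full cacheline each (back-of-the-envelope
figures of mine, not measurements from this thread):

#include <stdio.h>

int main(void)
{
	/* s_b = 8GB at the default BLCKSZ of 8192 => ~1M buffers */
	unsigned long long nbuffers = 8ULL * 1024 * 1024 * 1024 / 8192;
	unsigned long long minimal = 32;	/* LWLOCK_MINIMAL_SIZE, 64-bit build */
	unsigned long long cacheline = 64;	/* PG_CACHE_LINE_SIZE */

	printf("IO locks, minimal stride: %llu MB\n",
		   nbuffers * minimal / (1024 * 1024));		/* 32 MB */
	printf("IO locks, full padding:   %llu MB\n",
		   nbuffers * cacheline / (1024 * 1024));	/* 64 MB */
	return 0;
}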

> Additionally I think we should increase the lwlock padding to 64 bytes
> (i.e. by far the most common cacheline size). In the past I've seen
> that to be rather beneficial.

You'd already done that...


Benchmarking this on my 4-core/8-thread laptop I see a very slight
performance increase - which is about what we'd expect, since this
should really only affect multi-socket machines.

Greetings,

Andres Freund
From 55bc6e368206be230a690394ab541105f22a9827 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 7 Sep 2015 19:57:40 +0200
Subject: [PATCH] lwlock-andres-v3

---
 src/backend/storage/buffer/buf_init.c | 54 +++++++++++++++++++++++++++++++----
 src/backend/storage/buffer/bufmgr.c   | 52 +++++++++++++++++------------------
 src/backend/storage/lmgr/lwlock.c     | 12 +++++---
 src/include/storage/buf_internals.h   | 15 ++++++----
 src/include/storage/lwlock.h          | 17 ++++++++----
 5 files changed, 103 insertions(+), 47 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..14a2707 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -21,6 +21,16 @@
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
 
+/*
+ * This points to the array of buffer IO LWLocks in shared memory.  Backends
+ * inherit the pointer by fork from the postmaster (except in the
+ * EXEC_BACKEND case, where we have special measures to pass it down).
+ */
+const int BufferIOLWLockTrancheId = 1;
+LWLockTranche BufferIOLWLockTranche;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+const int BufferContentLWLockTrancheId = 2;
+LWLockTranche BufferContentLWLockTranche;
 
 /*
  * Data Structures:
@@ -65,7 +75,8 @@ void
 InitBufferPool(void)
 {
 	bool		foundBufs,
-				foundDescs;
+				foundDescs,
+				foundIOLocks;
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
@@ -77,10 +88,25 @@ InitBufferPool(void)
 		ShmemInitStruct("Buffer Blocks",
 						NBuffers * (Size) BLCKSZ, &foundBufs);
 
-	if (foundDescs || foundBufs)
+	BufferIOLWLockArray = (LWLockMinimallyPadded *)
+		ShmemInitStruct("Buffer IO Locks",
+						NBuffers * (Size) sizeof(LWLockMinimallyPadded), &foundIOLocks);
+
+	BufferIOLWLockTranche.name = "buffer-io";
+	BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+	BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+	LWLockRegisterTranche(BufferIOLWLockTrancheId, &BufferIOLWLockTranche);
+
+	BufferContentLWLockTranche.name = "buffer-content";
+	BufferContentLWLockTranche.array_base =
+		((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+	BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+	LWLockRegisterTranche(BufferContentLWLockTrancheId, &BufferContentLWLockTranche);
+
+	if (foundDescs || foundBufs || foundIOLocks)
 	{
-		/* both should be present or neither */
-		Assert(foundDescs && foundBufs);
+		/* all should be present or neither */
+		Assert(foundDescs && foundBufs && foundIOLocks);
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
@@ -110,12 +136,21 @@ InitBufferPool(void)
 			 */
 			buf->freeNext = i + 1;
 
-			buf->io_in_progress_lock = LWLockAssign();
-			buf->content_lock = LWLockAssign();
+			LWLockInitialize(BufferDescriptorGetContentLock(buf),
+							 BufferContentLWLockTrancheId);
 		}
 
 		/* Correct last entry of linked list */
 		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+
+		/* Initialize all buffer IO LWLocks */
+		for (i = 0; i < NBuffers; i++)
+		{
+			BufferDesc *buf = GetBufferDescriptor(i);
+
+			LWLockInitialize(BufferDescriptorGetIOLock(buf),
+							 BufferIOLWLockTrancheId);
+		}
 	}
 
 	/* Init other shared buffer-management stuff */
@@ -144,5 +179,12 @@ BufferShmemSize(void)
 	/* size of stuff controlled by freelist.c */
 	size = add_size(size, StrategyShmemSize());
 
+	/*
+	 * Space for the IO-in-progress LWLock array.  These locks are kept in
+	 * a separate tranche, with a smaller stride, so padding the buffer
+	 * descriptors out to full cachelines doesn't also bloat the IO locks.
+	 */
+	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+
 	return size;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cd3aaad..8f7acea 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -677,7 +677,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			if (!isLocalBuf)
 			{
 				if (mode == RBM_ZERO_AND_LOCK)
-					LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
 					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
 			}
@@ -818,7 +818,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
 		!isLocalBuf)
 	{
-		LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 	}
 
 	if (isLocalBuf)
@@ -984,7 +984,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 * happens to be trying to split the page the first one got from
 			 * StrategyGetBuffer.)
 			 */
-			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+			if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED))
 			{
 				/*
 				 * If using a nondefault strategy, and writing the buffer
@@ -1006,7 +1006,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 						StrategyRejectBuffer(strategy, buf))
 					{
 						/* Drop lock/pin and loop around for another buffer */
-						LWLockRelease(buf->content_lock);
+						LWLockRelease(BufferDescriptorGetContentLock(buf));
 						UnpinBuffer(buf, true);
 						continue;
 					}
@@ -1019,7 +1019,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 											  smgr->smgr_rnode.node.relNode);
 
 				FlushBuffer(buf, NULL);
-				LWLockRelease(buf->content_lock);
+				LWLockRelease(BufferDescriptorGetContentLock(buf));
 
 				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
 											   smgr->smgr_rnode.node.spcNode,
@@ -1334,7 +1334,7 @@ MarkBufferDirty(Buffer buffer)
 
 	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
-	Assert(LWLockHeldByMe(bufHdr->content_lock));
+	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
 	LockBufHdr(bufHdr);
 
@@ -1534,8 +1534,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
-		Assert(!LWLockHeldByMe(buf->content_lock));
-		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
 		LockBufHdr(buf);
 
@@ -2055,11 +2055,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 * buffer is clean by the time we've locked it.)
 	 */
 	PinBuffer_Locked(bufHdr);
-	LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
 	FlushBuffer(bufHdr, NULL);
 
-	LWLockRelease(bufHdr->content_lock);
+	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 	UnpinBuffer(bufHdr, true);
 
 	return result | BUF_WRITTEN;
@@ -2865,9 +2865,9 @@ FlushRelationBuffers(Relation rel)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 			FlushBuffer(bufHdr, rel->rd_smgr);
-			LWLockRelease(bufHdr->content_lock);
+			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -2917,9 +2917,9 @@ FlushDatabaseBuffers(Oid dbid)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 			FlushBuffer(bufHdr, NULL);
-			LWLockRelease(bufHdr->content_lock);
+			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -3019,7 +3019,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
-	Assert(LWLockHeldByMe(bufHdr->content_lock));
+	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
 	/*
 	 * This routine might get called many times on the same page, if we are
@@ -3172,11 +3172,11 @@ LockBuffer(Buffer buffer, int mode)
 	buf = GetBufferDescriptor(buffer - 1);
 
 	if (mode == BUFFER_LOCK_UNLOCK)
-		LWLockRelease(buf->content_lock);
+		LWLockRelease(BufferDescriptorGetContentLock(buf));
 	else if (mode == BUFFER_LOCK_SHARE)
-		LWLockAcquire(buf->content_lock, LW_SHARED);
+		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
-		LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3197,7 +3197,7 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = GetBufferDescriptor(buffer - 1);
 
-	return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
 }
 
 /*
@@ -3407,8 +3407,8 @@ WaitIO(volatile BufferDesc *buf)
 		UnlockBufHdr(buf);
 		if (!(sv_flags & BM_IO_IN_PROGRESS))
 			break;
-		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 	}
 }
 
@@ -3441,7 +3441,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 		 * Grab the io_in_progress lock so that other processes can wait for
 		 * me to finish the I/O.
 		 */
-		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
 		LockBufHdr(buf);
 
@@ -3455,7 +3455,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 		 * him to get unwedged.
 		 */
 		UnlockBufHdr(buf);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		WaitIO(buf);
 	}
 
@@ -3465,7 +3465,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		return false;
 	}
 
@@ -3514,7 +3514,7 @@ TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
 
 	InProgressBuf = NULL;
 
-	LWLockRelease(buf->io_in_progress_lock);
+	LWLockRelease(BufferDescriptorGetIOLock(buf));
 }
 
 /*
@@ -3539,7 +3539,7 @@ AbortBufferIO(void)
 		 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
 		 * buffer will be in a busy spin until we succeed in doing this.
 		 */
-		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
 		LockBufHdr(buf);
 		Assert(buf->flags & BM_IO_IN_PROGRESS);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 687ed63..8b14b63 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -335,9 +335,6 @@ NumLWLocks(void)
 	/* Predefined LWLocks */
 	numLocks = NUM_FIXED_LWLOCKS;
 
-	/* bufmgr.c needs two for each shared buffer */
-	numLocks += 2 * NBuffers;
-
 	/* proc.c needs one for each backend or auxiliary process */
 	numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
 
@@ -443,6 +440,8 @@ CreateLWLocks(void)
 
 		MainLWLockArray = (LWLockPadded *) ptr;
 
+		ptr += numLocks * sizeof(LWLockPadded);
+
 		/* Initialize all LWLocks in main array */
 		for (id = 0, lock = MainLWLockArray; id < numLocks; id++, lock++)
 			LWLockInitialize(&lock->lock, 0);
@@ -457,7 +456,12 @@ CreateLWLocks(void)
 		LWLockCounter = (int *) ((char *) MainLWLockArray - 3 * sizeof(int));
 		LWLockCounter[0] = NUM_FIXED_LWLOCKS;
 		LWLockCounter[1] = numLocks;
-		LWLockCounter[2] = 1;	/* 0 is the main array */
+		/*
+		 * 0 is the main array,
+		 * 1 is for buffer IO locks,
+		 * 2 is for buffer content locks
+		 */
+		LWLockCounter[2] = 3;
 	}
 
 	if (LWLockTrancheArray == NULL)
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..429b734 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -138,17 +138,15 @@ typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
 	BufFlags	flags;			/* see bit definitions above */
-	uint16		usage_count;	/* usage counter for clock sweep code */
+	uint8		usage_count;	/* usage counter for clock sweep code */
+	slock_t		buf_hdr_lock;	/* protects the above fields */
 	unsigned	refcount;		/* # of backends holding pins on buffer */
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
-	slock_t		buf_hdr_lock;	/* protects the above fields */
-
 	int			buf_id;			/* buffer's index number (from 0) */
 	int			freeNext;		/* link in freelist chain */
 
-	LWLock	   *io_in_progress_lock;	/* to wait for I/O to complete */
-	LWLock	   *content_lock;	/* to lock access to buffer contents */
+	LWLock		content_lock;	/* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -184,6 +182,13 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
+#define BufferDescriptorGetIOLock(bdesc) \
+	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+	((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 84a6fc7..35f5a35 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -76,13 +76,12 @@ typedef struct LWLock
  * (Of course, we have to also ensure that the array start address is suitably
  * aligned.)
  *
- * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there. So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big.  We allow for that just in case.
+ * We pad out the main LWLocks so we have one per cache line to minimize
+ * contention. Other tranches, with differing usage patterns, are allocated
+ * differently.
  */
-#define LWLOCK_PADDED_SIZE	(sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE	PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE	(sizeof(LWLock) <= 32 ? 32 : 64)
 
 typedef union LWLockPadded
 {
@@ -91,6 +90,12 @@ typedef union LWLockPadded
 } LWLockPadded;
 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
 
+typedef union LWLockMinimallyPadded
+{
+	LWLock		lock;
+	char		pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
+
 /*
  * Some commonly-used locks have predefined positions within MainLWLockArray;
  * defining macros here makes it much easier to keep track of these.  If you
-- 
2.5.0.400.gff86faf
