On 2015-11-15 16:24:24 -0500, Robert Haas wrote:
> I think what we should do about the buffer locks is polish up this
> patch and get it applied:
> 
> http://www.postgresql.org/message-id/20150907175909.gd5...@alap3.anarazel.de
> 
> I think it needs to be adapted to use predefined constants for the
> tranche IDs instead of hardcoded values, maybe based on the attached
> tranche-constants.patch.

Here are two patches doing that. The first is an adaptation of your
constants patch, using an enum and also converting xlog.c's locks. The
second is the separation into distinct tranches.
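
To make the first one concrete, the xlog.c usage goes from handing a
runtime-allocated tranche ID around in shared memory to just using the
enum value (this only restates what the first patch does in
XLOGShmemInit()):

    /* before: ID allocated at runtime, stored in XLogCtl for other backends */
    XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
    LWLockInitialize(&WALInsertLocks[i].l.lock,
                     XLogCtl->Insert.WALInsertLockTrancheId);

    /* after: compile-time constant from the new BuiltinTrancheIds enum */
    LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);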

One thing to call out is that an "oversized" slock_t can now make
BufferDesc exceed 64 bytes; right now that's only the case when slock_t
is larger than 4 bytes.  I'm not sure if that's cause for real concern,
given that the platforms where that happens are either not very
concurrent or ancient.
http://archives.postgresql.org/message-id/20150915020625.GI9666%40alap3.anarazel.de
would alleviate that concern again, as it collapses flags, usage_count,
buf_hdr_lock and refcount into one 32-bit int...
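
(For reference, whether that happens on a given platform could be
checked with a one-off compile-time assertion along these lines - just
a sketch, not part of the patches, dropped e.g. into InitBufferPool():

    /* sketch only: would fail where an oversized slock_t pushes
     * BufferDesc past one cache line */
    StaticAssertExpr(sizeof(BufferDesc) <= 64,
                     "BufferDesc larger than a cache line");
)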

Greetings,

Andres Freund
>From 5f10d977f285109df16bfef6b79456564251aa54 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sat, 12 Dec 2015 18:41:59 +0100
Subject: [PATCH 1/2] Define enum of builtin LWLock tranches.

It's a bit cumbersome having to allocate tranche IDs with
LWLockNewTrancheId() because the returned value needs to be shared
between backends, all of which then somehow need to call
LWLockRegisterTranche().  For builtin tranches we can easily do better,
and simply pre-define tranche IDs for those.

This is motivated by an upcoming patch adding further builtin tranches.

Author: Robert Haas and Andres Freund
Discussion:
    CA+TgmoYciHS4FuU2rYNt8bX0+7ZcNRBGeO-L20apcXTeo7=4...@mail.gmail.com
---
 src/backend/access/transam/xlog.c | 10 +++-------
 src/backend/storage/lmgr/lwlock.c |  7 ++++---
 src/include/storage/lwlock.h      | 11 +++++++++++
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 71fc8ff..147fd53 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -512,7 +512,6 @@ typedef struct XLogCtlInsert
 	 */
 	WALInsertLockPadded *WALInsertLocks;
 	LWLockTranche WALInsertLockTranche;
-	int			WALInsertLockTrancheId;
 } XLogCtlInsert;
 
 /*
@@ -4653,7 +4652,7 @@ XLOGShmemInit(void)
 
 		/* Initialize local copy of WALInsertLocks and register the tranche */
 		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
-		LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
+		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
 							  &XLogCtl->Insert.WALInsertLockTranche);
 		return;
 	}
@@ -4677,17 +4676,14 @@ XLOGShmemInit(void)
 		(WALInsertLockPadded *) allocptr;
 	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
 
-	XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
-
 	XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
 	XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
 	XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
 
-	LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
+	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, &XLogCtl->Insert.WALInsertLockTranche);
 	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
 	{
-		LWLockInitialize(&WALInsertLocks[i].l.lock,
-						 XLogCtl->Insert.WALInsertLockTrancheId);
+		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
 		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
 	}
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index b13ebc6..84691df 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -445,7 +445,7 @@ CreateLWLocks(void)
 
 		/* Initialize all LWLocks in main array */
 		for (id = 0, lock = MainLWLockArray; id < numLocks; id++, lock++)
-			LWLockInitialize(&lock->lock, 0);
+			LWLockInitialize(&lock->lock, LWTRANCHE_MAIN);
 
 		/*
 		 * Initialize the dynamic-allocation counters, which are stored just
@@ -457,7 +457,7 @@ CreateLWLocks(void)
 		LWLockCounter = (int *) ((char *) MainLWLockArray - 3 * sizeof(int));
 		LWLockCounter[0] = NUM_FIXED_LWLOCKS;
 		LWLockCounter[1] = numLocks;
-		LWLockCounter[2] = 1;	/* 0 is the main array */
+		LWLockCounter[2] = LWTRANCHE_FIRST_USER_DEFINED;
 	}
 
 	if (LWLockTrancheArray == NULL)
@@ -466,12 +466,13 @@ CreateLWLocks(void)
 		LWLockTrancheArray = (LWLockTranche **)
 			MemoryContextAlloc(TopMemoryContext,
 						  LWLockTranchesAllocated * sizeof(LWLockTranche *));
+		Assert(LWLockTranchesAllocated >= LWTRANCHE_FIRST_USER_DEFINED);
 	}
 
 	MainLWLockTranche.name = "main";
 	MainLWLockTranche.array_base = MainLWLockArray;
 	MainLWLockTranche.array_stride = sizeof(LWLockPadded);
-	LWLockRegisterTranche(0, &MainLWLockTranche);
+	LWLockRegisterTranche(LWTRANCHE_MAIN, &MainLWLockTranche);
 }
 
 /*
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 4653e09..9d22273 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -177,6 +177,17 @@ extern void LWLockRegisterTranche(int tranche_id, LWLockTranche *tranche);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
 /*
+ * We reserve a few predefined tranche IDs.  These values will never be
+ * returned by LWLockNewTrancheId.
+ */
+typedef enum BuiltinTrancheIds
+{
+	LWTRANCHE_MAIN,
+	LWTRANCHE_WAL_INSERT,
+	LWTRANCHE_FIRST_USER_DEFINED
+}	BuiltinTrancheIds;
+
+/*
  * Prior to PostgreSQL 9.4, we used an enum type called LWLockId to refer
  * to LWLocks.  New code should instead use LWLock *.  However, for the
  * convenience of third-party code, we include the following typedef.
-- 
2.6.0.rc3

>From 592750b3160e765d85eefa289f66e3108b114c3f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sat, 12 Dec 2015 18:51:19 +0100
Subject: [PATCH 2/2] Move buffer LWLocks out of the main array and increase
 padding of locks.

So far LWLocks allocated in the main tranche have been padded to 32
bytes; on the most common platforms that means two locks share one
cache line.  We previously refrained from increasing the padding
because of the large number of LWLocks needed for buffers.  The tranche
infrastructure allows moving the buffer lwlocks out, leaving only
relatively few locks in the main array.

Move the content lock directly into struct BufferDesc: in many cases
pinning and locking go hand in hand, making it desirable to have them
in one cache line.  With some care it is possible to do so without
increasing the size of BufferDesc above one cache line (on platforms
with a 1- or 2-byte spinlock).

As there's not enough space to also store the IO locks directly in the
struct, move those into a separately allocated array.

With the buffer LWLocks moved out, the padding for the LWLocks in the
main array can be increased to a full cache line.

Author: Andres Freund, Simon Riggs, Robert Haas
Discussion:
     anp8+jldj+rh6pw_3dh1jzrkcy-5jbffmvh1h6pc96_1hbd...@mail.gmail.com
     CA+TgmoYciHS4FuU2rYNt8bX0+7ZcNRBGeO-L20apcXTeo7=4...@mail.gmail.com
---
 src/backend/storage/buffer/buf_init.c | 58 +++++++++++++++++++++++++++++------
 src/backend/storage/buffer/bufmgr.c   | 57 ++++++++++++++++++----------------
 src/backend/storage/lmgr/lwlock.c     |  7 +++--
 src/include/storage/buf_internals.h   | 23 +++++++++-----
 src/include/storage/lwlock.h          | 29 ++++++++++++++----
 src/tools/pgindent/typedefs.list      |  1 +
 6 files changed, 123 insertions(+), 52 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..8a4a383 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -20,6 +20,9 @@
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;
 
 
 /*
@@ -65,22 +68,46 @@ void
 InitBufferPool(void)
 {
 	bool		foundBufs,
-				foundDescs;
+				foundDescs,
+				foundIOLocks;
 
 	/* Align descriptors to a cacheline boundary. */
-	BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
-										ShmemInitStruct("Buffer Descriptors",
-					NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
-														&foundDescs));
+	BufferDescriptors = (BufferDescPadded *)
+		CACHELINEALIGN(
+					   ShmemInitStruct("Buffer Descriptors",
+									   NBuffers * sizeof(BufferDescPadded)
+									   + PG_CACHE_LINE_SIZE,
+									   &foundDescs));
 
 	BufferBlocks = (char *)
 		ShmemInitStruct("Buffer Blocks",
 						NBuffers * (Size) BLCKSZ, &foundBufs);
 
-	if (foundDescs || foundBufs)
+	/* Align lwlocks to cacheline boundary */
+	BufferIOLWLockArray = (LWLockMinimallyPadded *)
+		CACHELINEALIGN(
+					   ShmemInitStruct("Buffer IO Locks",
+							  NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+									   + PG_CACHE_LINE_SIZE,
+									   &foundIOLocks));
+
+	BufferIOLWLockTranche.name = "Buffer IO Locks";
+	BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+	BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+	LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+						  &BufferIOLWLockTranche);
+
+	BufferContentLWLockTranche.name = "Buffer Content Locks";
+	BufferContentLWLockTranche.array_base =
+		((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+	BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+	LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+						  &BufferContentLWLockTranche);
+
+	if (foundDescs || foundBufs || foundIOLocks)
 	{
-		/* both should be present or neither */
-		Assert(foundDescs && foundBufs);
+		/* should find all of these, or none of them */
+		Assert(foundDescs && foundBufs && foundIOLocks);
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
@@ -110,8 +137,11 @@ InitBufferPool(void)
 			 */
 			buf->freeNext = i + 1;
 
-			buf->io_in_progress_lock = LWLockAssign();
-			buf->content_lock = LWLockAssign();
+			LWLockInitialize(BufferDescriptorGetContentLock(buf),
+							 LWTRANCHE_BUFFER_CONTENT);
+
+			LWLockInitialize(BufferDescriptorGetIOLock(buf),
+							 LWTRANCHE_BUFFER_IO_IN_PROGRESS);
 		}
 
 		/* Correct last entry of linked list */
@@ -144,5 +174,15 @@ BufferShmemSize(void)
 	/* size of stuff controlled by freelist.c */
 	size = add_size(size, StrategyShmemSize());
 
+	/*
+	 * Space for the IO LWLock array, allocated in a separate tranche, with a
+	 * different stride. This allows us to pad out the main array to reduce
+	 * contention, without wasting space for buffers (of which there are many
+	 * more than locks in the main lwlock array).
+	 */
+	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+	/* to allow aligning the above */
+	size = add_size(size, PG_CACHE_LINE_SIZE);
+
 	return size;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b32543b..c50c6b0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			if (!isLocalBuf)
 			{
 				if (mode == RBM_ZERO_AND_LOCK)
-					LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+								  LW_EXCLUSIVE);
 				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
 					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
 			}
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
 		!isLocalBuf)
 	{
-		LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 	}
 
 	if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 * happens to be trying to split the page the first one got from
 			 * StrategyGetBuffer.)
 			 */
-			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+			if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+										 LW_SHARED))
 			{
 				/*
 				 * If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 						StrategyRejectBuffer(strategy, buf))
 					{
 						/* Drop lock/pin and loop around for another buffer */
-						LWLockRelease(buf->content_lock);
+						LWLockRelease(BufferDescriptorGetContentLock(buf));
 						UnpinBuffer(buf, true);
 						continue;
 					}
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 											  smgr->smgr_rnode.node.relNode);
 
 				FlushBuffer(buf, NULL);
-				LWLockRelease(buf->content_lock);
+				LWLockRelease(BufferDescriptorGetContentLock(buf));
 
 				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
 											   smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
 
 	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
-	Assert(LWLockHeldByMe(bufHdr->content_lock));
+	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
 	LockBufHdr(bufHdr);
 
@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
-		Assert(!LWLockHeldByMe(buf->content_lock));
-		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
 		LockBufHdr(buf);
 
@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 * buffer is clean by the time we've locked it.)
 	 */
 	PinBuffer_Locked(bufHdr);
-	LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
 	FlushBuffer(bufHdr, NULL);
 
-	LWLockRelease(bufHdr->content_lock);
+	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 	UnpinBuffer(bufHdr, true);
 
 	return result | BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 			FlushBuffer(bufHdr, rel->rd_smgr);
-			LWLockRelease(bufHdr->content_lock);
+			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 			FlushBuffer(bufHdr, NULL);
-			LWLockRelease(bufHdr->content_lock);
+			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 
-	LWLockHeldByMe(bufHdr->content_lock);
+	LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
 
 	FlushBuffer(bufHdr, NULL);
 }
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
-	Assert(LWLockHeldByMe(bufHdr->content_lock));
+	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
 	/*
 	 * This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
 	buf = GetBufferDescriptor(buffer - 1);
 
 	if (mode == BUFFER_LOCK_UNLOCK)
-		LWLockRelease(buf->content_lock);
+		LWLockRelease(BufferDescriptorGetContentLock(buf));
 	else if (mode == BUFFER_LOCK_SHARE)
-		LWLockAcquire(buf->content_lock, LW_SHARED);
+		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
-		LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = GetBufferDescriptor(buffer - 1);
 
-	return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+									LW_EXCLUSIVE);
 }
 
 /*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
 		UnlockBufHdr(buf);
 		if (!(sv_flags & BM_IO_IN_PROGRESS))
 			break;
-		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 	}
 }
 
@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		 * Grab the io_in_progress lock so that other processes can wait for
 		 * me to finish the I/O.
 		 */
-		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
 		LockBufHdr(buf);
 
@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		 * him to get unwedged.
 		 */
 		UnlockBufHdr(buf);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		WaitIO(buf);
 	}
 
@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf);
-		LWLockRelease(buf->io_in_progress_lock);
+		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		return false;
 	}
 
@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
 
 	InProgressBuf = NULL;
 
-	LWLockRelease(buf->io_in_progress_lock);
+	LWLockRelease(BufferDescriptorGetIOLock(buf));
 }
 
 /*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
 		 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
 		 * buffer will be in a busy spin until we succeed in doing this.
 		 */
-		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
 		LockBufHdr(buf);
 		Assert(buf->flags & BM_IO_IN_PROGRESS);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 84691df..fa811d0 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -353,9 +353,6 @@ NumLWLocks(void)
 	/* Predefined LWLocks */
 	numLocks = NUM_FIXED_LWLOCKS;
 
-	/* bufmgr.c needs two for each shared buffer */
-	numLocks += 2 * NBuffers;
-
 	/* proc.c needs one for each backend or auxiliary process */
 	numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
 
@@ -423,6 +420,10 @@ CreateLWLocks(void)
 	StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
 					 "MAX_BACKENDS too big for lwlock.c");
 
+	StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+					 sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+					 "Miscalculated LWLock padding");
+
 	if (!IsUnderPostmaster)
 	{
 		int			numLocks = NumLWLocks();
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 19247c4..374c141 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -115,8 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
+ * take care of itself.  The buf_hdr_lock is *not* used to control access to
  * the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
  *
  * We use this same struct for local buffer headers, but the lock fields
  * are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when
+ * adding/reorganizing members; keeping it below 64 bytes (the most common
+ * cache line size) is fairly important for performance.
  */
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
 	BufFlags	flags;			/* see bit definitions above */
-	uint16		usage_count;	/* usage counter for clock sweep code */
+	uint8		usage_count;	/* usage counter for clock sweep code */
+	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
 	unsigned	refcount;		/* # of backends holding pins on buffer */
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
-	slock_t		buf_hdr_lock;	/* protects the above fields */
-
 	int			buf_id;			/* buffer's index number (from 0) */
 	int			freeNext;		/* link in freelist chain */
 
-	LWLock	   *io_in_progress_lock;	/* to wait for I/O to complete */
-	LWLock	   *content_lock;	/* to lock access to buffer contents */
+	LWLock		content_lock;	/* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
+#define BufferDescriptorGetIOLock(bdesc) \
+	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+	((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 9d22273..0993f2d 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -76,19 +76,34 @@ typedef struct LWLock
  * (Of course, we have to also ensure that the array start address is suitably
  * aligned.)
  *
+ * We pad LWLocks in the main array so we have one per cache line (struct
+ * LWLockPadded), to minimize contention. Other tranches, with differing usage
+ * patterns, are allocated differently (e.g. using LWLockMinimallyPadded).
+ *
  * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there. So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big.  We allow for that just in case.
+ * that didn't use to be the case, and since cramming more lwlocks into a
+ * cache line might be detrimental performance-wise, we still use 32 byte
+ * alignment for "minimally" padded lwlocks. So, both on 32 and 64 bit
+ * platforms, it should fit into 32 bytes unless slock_t is really big.
+ * We allow for that just in case.
  */
-#define LWLOCK_PADDED_SIZE	(sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE	PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
 
+/* LWLock, padded to a full cache line size */
 typedef union LWLockPadded
 {
 	LWLock		lock;
-	char		pad[LWLOCK_PADDED_SIZE];
+	char		pad[LWLOCK_PADDED_SIZE];
 } LWLockPadded;
+
+/* LWLock, minimally padded */
+typedef union LWLockMinimallyPadded
+{
+	LWLock		lock;
+	char		pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
+
 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
 extern char *MainLWLockNames[];
 
@@ -184,6 +199,8 @@ typedef enum BuiltinTrancheIds
 {
 	LWTRANCHE_MAIN,
 	LWTRANCHE_WAL_INSERT,
+	LWTRANCHE_BUFFER_CONTENT,
+	LWTRANCHE_BUFFER_IO_IN_PROGRESS,
 	LWTRANCHE_FIRST_USER_DEFINED
 }	BuiltinTrancheIds;
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 03e1d2c..963d865 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1008,6 +1008,7 @@ LSEG
 LVRelStats
 LWLock
 LWLockHandle
+LWLockMinimallyPadded
 LWLockMode
 LWLockPadded
 LWLockTranche
-- 
2.6.0.rc3
