Thank you for the review. I've made changes according to your comments.
I'm not attached to the current names in the patch.
I've removed all unnecessary indirections and added cache-line alignment
to the LWLock arrays.
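
Just to make the alignment explicit: the new arrays follow the usual
pattern of over-allocating by one cache line and rounding the start
address up with CACHELINEALIGN, roughly like this (the variable name
below is only for illustration, not the one used in the patch):

	/*
	 * Sketch of the alignment pattern used for the new LWLock arrays:
	 * over-allocate by PG_CACHE_LINE_SIZE and round the start address
	 * up, so every element begins on a cache line boundary.
	 */
	LWLockMinimallyPadded *locks;

	locks = (LWLockMinimallyPadded *)
		CACHELINEALIGN(ShmemAlloc(sizeof(LWLockMinimallyPadded) * NBuffers +
								  PG_CACHE_LINE_SIZE));

BufferShmemSize() accounts for the extra PG_CACHE_LINE_SIZE per array,
so ShmemAlloc() cannot run past the reserved space.
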
On Tue, 17 Nov 2015 19:36:13 +0100
"[email protected]" <[email protected]> wrote:
> On 2015-11-17 14:14:50 +0300, Ildus Kurbangaliev wrote:
> > 1) We can avoid constants, and use the standard steps for tranche
> > creation.
>
> The constants are actually a bit useful, to easily determine
> builtin/non-builtin stuff.
Maybe I'm missing something here, but I don't see much difference from
the other tranches created during Postgres startup. In my opinion they
are also pretty much builtin.
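
To be concrete, by "standard steps" I mean the sequence any
startup-time tranche goes through. A rough sketch (the array, count and
tranche names here are purely illustrative, not taken from the patch):

	/*
	 * Illustrative sketch of startup-time tranche creation: allocate the
	 * lock array, get a tranche id, describe and register the tranche,
	 * then initialize each lock with that id.  Names are invented here.
	 */
	static LWLockTranche MyTranche;
	LWLockMinimallyPadded *my_lock_array;
	int			num_locks = NBuffers;	/* illustrative count */
	int			tranche_id;
	int			i;

	my_lock_array = (LWLockMinimallyPadded *)
		ShmemAlloc(sizeof(LWLockMinimallyPadded) * num_locks);

	tranche_id = LWLockNewTrancheId();
	MyTranche.name = "my_tranche";
	MyTranche.array_base = my_lock_array;
	MyTranche.array_stride = sizeof(LWLockMinimallyPadded);
	LWLockRegisterTranche(tranche_id, &MyTranche);

	for (i = 0; i < num_locks; i++)
		LWLockInitialize(&my_lock_array[i].lock, tranche_id);

In the patch the tranche structs and names live in the BufferCtl shmem
struct instead, so every backend can re-register them after attaching.
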
>
> > 2) We have only one global variable (BufferCtl)
>
> Don't see the advantage. This adds another, albeit predictable,
> indirection in frequent callsites. There's no architectural advantage
> in avoiding these.
>
> > 3) Tranches moved to shared memory, so we won't need to do
> > any additional work here.
>
> I can see some benefit, but it also doesn't seem like a huge benefit.
Moving the base tranches to shared memory has been discussed many times.
The point is to use them later in pg_stat_activity and other monitoring
views.
>
>
> > 4) Also we can keep buffer locks from MainLWLockArray there (that
> > was done in the attached patch).
>
> That's the 'blockLocks' thing? That's a pretty bad name. These are
> buffer mapping locks. And this seems like a separate patch, separately
> debated.
Changed to BufMappingPartitionLWLocks. Since this patch is all about
separating the buffer manager's LWLocks, why not handle that in a
single patch?
>
> > + if (!foundCtl)
> > {
> > - int i;
> > + /* Align descriptors to a cacheline boundary. */
> > +		ctl->base.bufferDescriptors = (void *) CACHELINEALIGN(ShmemAlloc(
> > +			NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE));
> > +
> > +		ctl->base.bufferBlocks = (char *) ShmemAlloc(NBuffers * (Size) BLCKSZ);
>
> I'm going to entirely veto this.
>
> > + strncpy(ctl->IOLWLockTrancheName, "buffer_io",
> > + BUFMGR_MAX_NAME_LENGTH);
> > +		strncpy(ctl->contentLWLockTrancheName, "buffer_content",
> > +			BUFMGR_MAX_NAME_LENGTH);
> > +		strncpy(ctl->blockLWLockTrancheName, "buffer_blocks",
> > +			BUFMGR_MAX_NAME_LENGTH);
> > +
> > +		ctl->IOLocks = (LWLockMinimallyPadded *) ShmemAlloc(
> > +			sizeof(LWLockMinimallyPadded) * NBuffers);
>
> This should be cacheline aligned.
Fixed
>
> > - buf->io_in_progress_lock = LWLockAssign();
> > - buf->content_lock = LWLockAssign();
> > +
> > +		LWLockInitialize(BufferDescriptorGetContentLock(buf),
> > +			ctl->contentLWLockTrancheId);
> > + LWLockInitialize(&ctl->IOLocks[i].lock,
> > + ctl->IOLWLockTrancheId);
>
> Seems weird to use the macro accessing content locks, but not IO
> locks.
Fixed
>
> >  /* Note: these two macros only work on shared buffers, not local ones! */
> > -#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
> > +#define BufHdrGetBlock(bufHdr)	((Block) (BufferCtl->bufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
>
> That's the additional indirection I'm talking about.
>
> > @@ -353,9 +353,6 @@ NumLWLocks(void)
> > /* Predefined LWLocks */
> > numLocks = NUM_FIXED_LWLOCKS;
> >
> > - /* bufmgr.c needs two for each shared buffer */
> > - numLocks += 2 * NBuffers;
> > -
> > 	/* proc.c needs one for each backend or auxiliary process */
> > 	numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
>
> Didn't you also move the buffer mapping locks... ?
>
> > diff --git a/src/include/storage/buf_internals.h
> > b/src/include/storage/buf_internals.h index 521ee1c..e1795dc 100644
> > --- a/src/include/storage/buf_internals.h
> > +++ b/src/include/storage/buf_internals.h
> > @@ -95,6 +95,7 @@ typedef struct buftag
> > (a).forkNum == (b).forkNum \
> > )
> >
> > +
> > /*
>
> unrelated change.
>
> >   * The shared buffer mapping table is partitioned to reduce contention.
> >   * To determine which partition lock a given tag requires, compute the tag's
> > @@ -104,10 +105,9 @@ typedef struct buftag
> > #define BufTableHashPartition(hashcode) \
> > ((hashcode) % NUM_BUFFER_PARTITIONS)
> > #define BufMappingPartitionLock(hashcode) \
> > - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
> > - BufTableHashPartition(hashcode)].lock)
> > +	(&((BufferCtlData *)BufferCtl)->blockLocks[BufTableHashPartition(hashcode)].lock)
> > #define BufMappingPartitionLockByIndex(i) \
> > - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
> > + (&((BufferCtlData *)BufferCtl)->blockLocks[i].lock)
>
> Again, unnecessary indirections.
Fixed
>
> > +/* Maximum length of a bufmgr lock name */
> > +#define BUFMGR_MAX_NAME_LENGTH 32
>
> If we were to do this I'd just use NAMEDATALEN.
Fixed
>
> > +/*
> > + * Base control struct for the buffer manager data
> > + * Located in shared memory.
> > + *
> > + * The BufferCtl variable points to BufferCtlBase because only
> > + * the backend code knows about BufferDescPadded, LWLock and
> > + * other structures, while at the same time it must be usable in
> > + * the frontend code.
> > + */
> > +typedef struct BufferCtlData
> > +{
> > + BufferCtlBase base;
>
> Eeek. What's the point here?
The point was to use BufferCtlBase in both backend and frontend code;
the frontend code doesn't know about LWLock and the other structures.
Anyway, I've removed this code.
--
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..7ae1491 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -18,8 +18,10 @@
#include "storage/buf_internals.h"
-BufferDescPadded *BufferDescriptors;
-char *BufferBlocks;
+BufferDescPadded *BufferDescriptors;
+char *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLocks;
+LWLockMinimallyPadded *BufMappingPartitionLWLocks;
/*
@@ -54,6 +56,80 @@ char *BufferBlocks;
* multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
*/
+static void
+InitBufferLWLockTranches(void)
+{
+ int i;
+ bool foundCtl;
+
+ BufferCtlData *ctl = (BufferCtlData *) ShmemInitStruct("BufferCtl",
+ sizeof(BufferCtlData), &foundCtl);
+
+ if (!foundCtl)
+ {
+ BufferIOLWLocks = (LWLockMinimallyPadded *) CACHELINEALIGN(ShmemAlloc(
+ sizeof(LWLockMinimallyPadded) * NBuffers + PG_CACHE_LINE_SIZE));
+ BufMappingPartitionLWLocks = (LWLockMinimallyPadded *) CACHELINEALIGN(
+ ShmemAlloc(sizeof(LWLockMinimallyPadded) * NUM_BUFFER_PARTITIONS +
+ PG_CACHE_LINE_SIZE));
+
+ strncpy(ctl->IOLWLockTrancheName, "buffer_io",
+ NAMEDATALEN);
+ strncpy(ctl->contentLWLockTrancheName, "buffer_content",
+ NAMEDATALEN);
+ strncpy(ctl->partitionLWLockTrancheName, "buffer_partition",
+ NAMEDATALEN);
+
+ /* Initialize tranche fields */
+ ctl->IOLWLockTranche.array_base = BufferIOLWLocks;
+ ctl->IOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+ ctl->IOLWLockTranche.name = ctl->IOLWLockTrancheName;
+
+ ctl->contentLWLockTranche.array_base =
+ ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+ ctl->contentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+ ctl->contentLWLockTranche.name = ctl->contentLWLockTrancheName;
+
+ ctl->partitionLWLockTranche.array_base = BufMappingPartitionLWLocks;
+ ctl->partitionLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+ ctl->partitionLWLockTranche.name = ctl->partitionLWLockTrancheName;
+
+ ctl->partitionLWLockTrancheId = LWLockNewTrancheId();
+ ctl->contentLWLockTrancheId = LWLockNewTrancheId();
+ ctl->IOLWLockTrancheId = LWLockNewTrancheId();
+
+ /* Initialize LWLocks */
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ ctl->contentLWLockTrancheId);
+ LWLockInitialize(BufferDescriptorGetIOLock(buf),
+ ctl->IOLWLockTrancheId);
+ }
+
+ for (i = 0; i < NUM_BUFFER_PARTITIONS; i++)
+ LWLockInitialize(&BufMappingPartitionLWLocks[i].lock,
+ ctl->partitionLWLockTrancheId);
+ }
+ else
+ {
+ BufferIOLWLocks = (LWLockMinimallyPadded *)
+ (ctl->IOLWLockTranche.array_base);
+ BufMappingPartitionLWLocks = (LWLockMinimallyPadded *)
+ (ctl->partitionLWLockTranche.array_base);
+ }
+
+ /* Register tranches in main tranches array */
+ LWLockRegisterTranche(ctl->IOLWLockTrancheId,
+ &ctl->IOLWLockTranche);
+ LWLockRegisterTranche(ctl->contentLWLockTrancheId,
+ &ctl->contentLWLockTranche);
+ LWLockRegisterTranche(ctl->partitionLWLockTrancheId,
+ &ctl->partitionLWLockTranche);
+}
+
/*
* Initialize shared buffer pool
@@ -109,15 +185,14 @@ InitBufferPool(void)
* management of this list is done by freelist.c.
*/
buf->freeNext = i + 1;
-
- buf->io_in_progress_lock = LWLockAssign();
- buf->content_lock = LWLockAssign();
}
/* Correct last entry of linked list */
GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
}
+ InitBufferLWLockTranches();
+
/* Init other shared buffer-management stuff */
StrategyInitialize(!foundDescs);
}
@@ -144,5 +219,19 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
+ /* size of a control struct */
+ size = add_size(size, sizeof(BufferCtlData));
+
+ /* size of IO LWLocks */
+ size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+ /* to allow aligning IO LWLocks */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
+ /* size of buffer partition lwlocks */
+ size = add_size(size, mul_size(NUM_BUFFER_PARTITIONS,
+ sizeof(LWLockMinimallyPadded)));
+ /* to allow aligning buffer partition LWLocks */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
return size;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 63188a3..a3f52ff 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -738,7 +738,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (!isLocalBuf)
{
if (mode == RBM_ZERO_AND_LOCK)
- LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
}
@@ -879,7 +879,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
!isLocalBuf)
{
- LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
}
if (isLocalBuf)
@@ -1045,7 +1045,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
- if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+ if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED))
{
/*
* If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1067,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
StrategyRejectBuffer(strategy, buf))
{
/* Drop lock/pin and loop around for another buffer */
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
UnpinBuffer(buf, true);
continue;
}
@@ -1080,7 +1080,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
smgr->smgr_rnode.node.relNode);
FlushBuffer(buf, NULL);
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1395,7 @@ MarkBufferDirty(Buffer buffer)
Assert(BufferIsPinned(buffer));
/* unfortunately we can't check if the lock is held exclusively */
- Assert(LWLockHeldByMe(bufHdr->content_lock));
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
LockBufHdr(bufHdr);
@@ -1595,8 +1595,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
if (ref->refcount == 0)
{
/* I'd better not still hold any locks on the buffer */
- Assert(!LWLockHeldByMe(buf->content_lock));
- Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+ Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
LockBufHdr(buf);
@@ -2116,11 +2116,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
* buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
return result | BUF_WRITTEN;
@@ -2926,9 +2926,9 @@ FlushRelationBuffers(Relation rel)
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, rel->rd_smgr);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
@@ -2978,9 +2978,9 @@ FlushDatabaseBuffers(Oid dbid)
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
- LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
- LWLockRelease(bufHdr->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
@@ -3080,7 +3080,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Assert(GetPrivateRefCount(buffer) > 0);
/* here, either share or exclusive lock is OK */
- Assert(LWLockHeldByMe(bufHdr->content_lock));
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
/*
* This routine might get called many times on the same page, if we are
@@ -3233,11 +3233,11 @@ LockBuffer(Buffer buffer, int mode)
buf = GetBufferDescriptor(buffer - 1);
if (mode == BUFFER_LOCK_UNLOCK)
- LWLockRelease(buf->content_lock);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
else if (mode == BUFFER_LOCK_SHARE)
- LWLockAcquire(buf->content_lock, LW_SHARED);
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
- LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
@@ -3258,7 +3258,7 @@ ConditionalLockBuffer(Buffer buffer)
buf = GetBufferDescriptor(buffer - 1);
- return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+ return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
}
/*
@@ -3468,8 +3468,8 @@ WaitIO(BufferDesc *buf)
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
- LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
}
}
@@ -3502,7 +3502,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
@@ -3516,7 +3516,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* him to get unwedged.
*/
UnlockBufHdr(buf);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
@@ -3526,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
{
/* someone else already did the I/O */
UnlockBufHdr(buf);
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
@@ -3574,7 +3574,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
InProgressBuf = NULL;
- LWLockRelease(buf->io_in_progress_lock);
+ LWLockRelease(BufferDescriptorGetIOLock(buf));
}
/*
@@ -3599,7 +3599,7 @@ AbortBufferIO(void)
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+ LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index b13ebc6..aa24d2c 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -353,9 +353,6 @@ NumLWLocks(void)
/* Predefined LWLocks */
numLocks = NUM_FIXED_LWLOCKS;
- /* bufmgr.c needs two for each shared buffer */
- numLocks += 2 * NBuffers;
-
/* proc.c needs one for each backend or auxiliary process */
numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 19247c4..cdfed92 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -95,6 +95,9 @@ typedef struct buftag
(a).forkNum == (b).forkNum \
)
+/* in buf_init.c */
+extern PGDLLIMPORT LWLockMinimallyPadded *BufMappingPartitionLWLocks;
+
/*
* The shared buffer mapping table is partitioned to reduce contention.
* To determine which partition lock a given tag requires, compute the tag's
@@ -104,10 +107,9 @@ typedef struct buftag
#define BufTableHashPartition(hashcode) \
((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
+ (&BufMappingPartitionLWLocks[BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
+ (&BufMappingPartitionLWLocks[i].lock)
/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
@@ -138,17 +140,15 @@ typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
BufFlags flags; /* see bit definitions above */
- uint16 usage_count; /* usage counter for clock sweep code */
+ uint8 usage_count; /* usage counter for clock sweep code */
+ slock_t buf_hdr_lock; /* protects the above fields */
unsigned refcount; /* # of backends holding pins on buffer */
int wait_backend_pid; /* backend PID of pin-count waiter */
- slock_t buf_hdr_lock; /* protects the above fields */
-
int buf_id; /* buffer's index number (from 0) */
int freeNext; /* link in freelist chain */
- LWLock *io_in_progress_lock; /* to wait for I/O to complete */
- LWLock *content_lock; /* to lock access to buffer contents */
+ LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
/*
@@ -179,10 +179,37 @@ typedef union BufferDescPadded
char pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;
+/* Number of partitions of the shared buffer mapping hashtable */
+#define NUM_BUFFER_PARTITIONS 128
+
+/* Control struct for the buffer manager data */
+typedef struct BufferCtlData
+{
+ int IOLWLockTrancheId;
+ LWLockTranche IOLWLockTranche;
+
+ int partitionLWLockTrancheId;
+ LWLockTranche partitionLWLockTranche;
+
+ int contentLWLockTrancheId;
+ LWLockTranche contentLWLockTranche;
+
+ char IOLWLockTrancheName[NAMEDATALEN];
+ char partitionLWLockTrancheName[NAMEDATALEN];
+ char contentLWLockTrancheName[NAMEDATALEN];
+} BufferCtlData;
+
+/* in buf_init.c */
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLocks;
+
#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
+#define BufferDescriptorGetIOLock(bdesc) \
+ (&(BufferIOLWLocks[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+ ((LWLock*) (&(bdesc)->content_lock))
/*
* The freeNext field is either the index of the next freelist entry,
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 4653e09..494e14e 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -76,22 +76,28 @@ typedef struct LWLock
* (Of course, we have to also ensure that the array start address is suitably
* aligned.)
*
- * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there. So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big. We allow for that just in case.
+ * We pad out the main LWLocks so we have one per cache line to minimize
+ * contention. Other tranches, with differing usage patterns, are allocated
+ * differently.
*/
-#define LWLOCK_PADDED_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
typedef union LWLockPadded
{
LWLock lock;
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;
+
extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
extern char *MainLWLockNames[];
+typedef union LWLockMinimallyPadded
+{
+ LWLock lock;
+ char pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
+
/* Names for fixed lwlocks */
#include "storage/lwlocknames.h"
@@ -101,9 +107,6 @@ extern char *MainLWLockNames[];
* having this file include lock.h or bufmgr.h would be backwards.
*/
-/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
-
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
#define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS)
@@ -113,9 +116,7 @@ extern char *MainLWLockNames[];
#define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
/* Offsets for various chunks of preallocated lwlocks. */
-#define BUFFER_MAPPING_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS
-#define LOCK_MANAGER_LWLOCK_OFFSET \
- (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS)
+#define LOCK_MANAGER_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS
#define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \
(LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS)
#define NUM_FIXED_LWLOCKS \
--