Good day.

I found an opportunity for improvement in the Buffer Manager code, in the
BufferAlloc function:
- When a valid buffer is evicted, BufferAlloc acquires two partition
lwlocks: one for the partition the evicted block is in, and one for the
partition where the new block will be placed.

This doesn't matter when there is a small number of concurrent
replacements. But when a lot of concurrent backends are replacing buffers,
a complex dependency net quickly arises.

It can easily be seen with select-only pgbench at scale 100 and 128MB
shared buffers: scale 100 produces a 1.5GB table, which certainly doesn't
fit into shared buffers. In this setup performance starts to degrade at
~100 connections. Even with 1GB shared buffers it slowly degrades after
150 connections.

But strictly speaking, there is no need to hold both locks
simultaneously. The buffer is pinned, so other processes cannot select it
for eviction. If the tag is cleared and the buffer is removed from the old
partition, other processes will not find it. Therefore it is safe to
release the old partition lock before acquiring the new partition lock.

If another process concurrently inserts an entry for the same new tag,
the old buffer is instead placed on the buffer manager's freelist.
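
In pseudocode, the reordered tail of BufferAlloc looks roughly like this
(a condensed sketch of the attached patch, not the literal hunks):

    /* victim is pinned; its tag was cleared under the buffer header lock */
    if (oldFlags & BM_TAG_VALID)
        oldElem = BufTableDelete(&oldTag, oldHash); /* keep entry aside */

    if (oldPartitionLock != newPartitionLock)
    {
        if (oldPartitionLock != NULL)
            LWLockRelease(oldPartitionLock);    /* drop the old lock... */
        LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); /* ...then the new */
    }

    buf_id = BufTableLookup(&newTag, newHash);
    if (buf_id >= 0)
    {
        /* collision: somebody else already inserted the new tag */
        UnpinBuffer(buf, true);
        StrategyFreeBuffer(buf);        /* victim goes to the freelist */
        if (oldElem != NULL)
            BufTableFreeDeleted(oldElem, oldHash);
        /* ...continue as if the buffer were found in the pool... */
    }
    else
        BufTableInsert(&newTag, newHash, buf->buf_id, oldElem); /* reuse */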

Additional optimisation: when the old buffer's hash entry is reused for
the new tag, there is no need to cycle its BufferLookupEnt through
dynahash's freelist. That reduces lock contention a bit more. To
accomplish this, FreeListData.nentries is changed to
pg_atomic_u32/pg_atomic_u64 and atomic increment/decrement is used.
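
With that, for partitioned hash tables the counter update becomes a single
atomic operation instead of a spinlock round-trip; condensed from the
dynahash part of the patch (the overflow Assert is elided):

    static inline void
    free_list_increment_nentries(HASHHDR *hctl, int freelist_idx)
    {
        FreeListData *list = &hctl->freeList[freelist_idx];

        if (IS_PARTITIONED(hctl))
            count_atomic_inc(&list->nentries);  /* lock-free */
        else
            list->nentries.value++;     /* unshared table: plain increment */
    }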

Remark: there was a bug in hash_update_hash_key: the nentries counters
were not kept in sync when the old and new freelist partitions differed.
This bug was never triggered because the single existing caller of
hash_update_hash_key doesn't move entries between partitions.
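
The fix simply moves the count from one freelist to the other when needed
(this is the exact shape of the hunk in the patch below):

    if (old_freelist_idx != new_freelist_idx)
    {
        free_list_decrement_nentries(hctl, old_freelist_idx);
        free_list_increment_nentries(hctl, new_freelist_idx);
    }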

Here are some test results.

- pgbench at scale 100 was run with --select-only (since we want to test
the buffer manager alone); it produces a 1.5GB table. The command shape is
sketched below.
- two shared_buffers values were tested: 128MB and 1GB.
- the second-best result of five runs was taken.
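
The runs had roughly this shape (the database name, duration and -j values
here are placeholders; only scale 100 and --select-only are essential):

    pgbench -i -s 100 bench                  # produces ~1.5GB of tables
    pgbench --select-only -c $CONNS -j $CONNS -T 30 bench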

Tests were made in three system configurations:
- a notebook with an i7-1165G7 (limited to 2.8GHz to avoid overheating)
- a 2-socket NUMA system with two 6-core Xeon X5675 (12 cores/24 threads)
- the same Xeon X5675 system restricted to a single socket
  (with numactl -m 0 -N 0)

Results for i7-1165G7 (tps):

  conns |     master |    patched |  master 1G | patched 1G 
--------+------------+------------+------------+------------
      1 |      29667 |      29079 |      29425 |      29411 
      2 |      55577 |      55553 |      57974 |      57223 
      3 |      87393 |      87924 |      87246 |      89210 
      5 |     136222 |     136879 |     133775 |     133949 
      7 |     179865 |     176734 |     178297 |     175559 
     17 |     215953 |     214708 |     222908 |     223651 
     27 |     211162 |     213014 |     220506 |     219752 
     53 |     211620 |     218702 |     220906 |     225218 
     83 |     213488 |     221799 |     219075 |     228096 
    107 |     212018 |     222110 |     222502 |     227825 
    139 |     207068 |     220812 |     218191 |     226712 
    163 |     203716 |     220793 |     213498 |     226493 
    191 |     199248 |     217486 |     210994 |     221026 
    211 |     195887 |     217356 |     209601 |     219397 
    239 |     193133 |     215695 |     209023 |     218773 
    271 |     190686 |     213668 |     207181 |     219137 
    307 |     188066 |     214120 |     205392 |     218782 
    353 |     185449 |     213570 |     202120 |     217786 
    397 |     182173 |     212168 |     201285 |     216489 

Results for 1-socket X5675 (tps):

  conns |     master |    patched |  master 1G | patched 1G 
--------+------------+------------+------------+------------
      1 |      16864 |      16584 |      17419 |      17630 
      2 |      32764 |      32735 |      34593 |      34000 
      3 |      47258 |      46022 |      49570 |      47432 
      5 |      64487 |      64929 |      68369 |      68885 
      7 |      81932 |      82034 |      87543 |      87538 
     17 |     114502 |     114218 |     127347 |     127448 
     27 |     116030 |     115758 |     130003 |     128890 
     53 |     116814 |     117197 |     131142 |     131080 
     83 |     114438 |     116704 |     130198 |     130985 
    107 |     113255 |     116910 |     129932 |     131468 
    139 |     111577 |     116929 |     129012 |     131782 
    163 |     110477 |     116818 |     128628 |     131697 
    191 |     109237 |     116672 |     127833 |     131586 
    211 |     108248 |     116396 |     127474 |     131650 
    239 |     107443 |     116237 |     126731 |     131760 
    271 |     106434 |     115813 |     126009 |     131526 
    307 |     105077 |     115542 |     125279 |     131421 
    353 |     104516 |     115277 |     124491 |     131276 
    397 |     103016 |     114842 |     123624 |     131019 

Results for 2-socket X5675 (tps):

  conns |     master |    patched |  master 1G | patched 1G 
--------+------------+------------+------------+------------
      1 |      16323 |      16280 |      16959 |      17598 
      2 |      30510 |      31431 |      33763 |      31690 
      3 |      45051 |      45834 |      48896 |      47991 
      5 |      71800 |      73208 |      78077 |      74714 
      7 |      89792 |      89980 |      95986 |      96662 
     17 |     178319 |     177979 |     195566 |     196143 
     27 |     210475 |     205209 |     226966 |     235249 
     53 |     222857 |     220256 |     252673 |     251041 
     83 |     219652 |     219938 |     250309 |     250464 
    107 |     218468 |     219849 |     251312 |     251425 
    139 |     210486 |     217003 |     250029 |     250695 
    163 |     204068 |     218424 |     248234 |     252940 
    191 |     200014 |     218224 |     246622 |     253331 
    211 |     197608 |     218033 |     245331 |     253055 
    239 |     195036 |     218398 |     243306 |     253394 
    271 |     192780 |     217747 |     241406 |     253148 
    307 |     189490 |     217607 |     239246 |     253373 
    353 |     186104 |     216697 |     236952 |     253034 
    397 |     183507 |     216324 |     234764 |     252872 

As can be seen, the patched version degrades much more slowly than master
(or doesn't even degrade with 1GB shared buffers on the older processor).

PS.

There is room for further improvements:
- the buffer manager's freelist could be partitioned
- dynahash's freelists could be sized/aligned to the CPU cache line
  (see the sketch after this list)
- in fact, there is no need for dynahash at all here: it would be better
  to build a custom hash table that uses BufferDescs as entries, since
  BufferDesc has spare space for a next-link and the hash value.
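
For example, the cache-line sizing could look something like this (an
illustration of the idea only, not part of the patch; Count is the atomic
type introduced above, PG_CACHE_LINE_SIZE is from pg_config_manual.h):

    typedef struct
    {
        slock_t     mutex;          /* spinlock for this freelist */
        Count       nentries;       /* number of entries in buckets */
        HASHELEMENT *freeList;      /* chain of free elements */
        /* pad to a full cache line to avoid false sharing */
        char        pad[PG_CACHE_LINE_SIZE - sizeof(slock_t) -
                        sizeof(Count) - sizeof(HASHELEMENT *)];
    } FreeListData;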

regards,
Yura Sokolov
y.soko...@postgrespro.ru
funny.fal...@gmail.com
From a1606eaa124fc497763ed5e28e22cbc8f6443b33 Mon Sep 17 00:00:00 2001
From: Yura Sokolov <y.soko...@postgrespro.ru>
Date: Wed, 22 Sep 2021 13:10:37 +0300
Subject: [PATCH v0] bufmgr: do not acquire two partition locks.

Acquiring two partition locks leads to a complex dependency chain that
hurts at high concurrency levels.

There is no need to hold both locks simultaneously. The buffer is pinned
so other processes cannot select it for eviction. If the tag is cleared
and the buffer is removed from the old partition, other processes will
not find it. Therefore it is safe to release the old partition lock
before acquiring the new partition lock.

This change requires the BufferDesc to be manually returned to the free
list.

Also, insertion into and deletion from dynahash are optimized by avoiding
unnecessary free list manipulations in the common case (when the buffer's
hash entry is reused).

Also, a small never-triggered bug in hash_update_hash_key is fixed.
---
 src/backend/storage/buffer/buf_table.c |  54 +++--
 src/backend/storage/buffer/bufmgr.c    | 183 ++++++++--------
 src/backend/utils/hash/dynahash.c      | 289 +++++++++++++++++++++++--
 src/include/storage/buf_internals.h    |   6 +-
 src/include/utils/hsearch.h            |  17 ++
 5 files changed, 404 insertions(+), 145 deletions(-)

diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index caa03ae1233..05e1dc9dd29 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -107,36 +107,29 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
 
 /*
  * BufTableInsert
- *		Insert a hashtable entry for given tag and buffer ID,
- *		unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion.  If a conflicting entry exists
- * already, returns the buffer ID in that entry.
+ *		Insert a hashtable entry for given tag and buffer ID.
+ *		Caller should be sure there is no conflicting entry.
  *
  * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ * and call BufTableLookup to check for a conflicting entry.
+ *
+ * If oldelem is passed it is reused.
  */
-int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+void
+BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id, void *oldelem)
 {
 	BufferLookupEnt *result;
-	bool		found;
 
 	Assert(buf_id >= 0);		/* -1 is reserved for not-in-table */
 	Assert(tagPtr->blockNum != P_NEW);	/* invalid tag */
 
 	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									(void *) tagPtr,
-									hashcode,
-									HASH_ENTER,
-									&found);
-
-	if (found)					/* found something already in the table */
-		return result->id;
+		hash_insert_with_hash_nocheck(SharedBufHash,
+									  (void *) tagPtr,
+									  hashcode,
+									  oldelem);
 
 	result->id = buf_id;
-
-	return -1;
 }
 
 /*
@@ -144,19 +137,32 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
  *		Delete the hashtable entry for given tag (which must exist)
  *
  * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ *
+ * Returns pointer to internal hashtable entry that should be passed either
+ * to BufTableInsert or BufTableFreeDeleted.
  */
-void
+void *
 BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
 {
 	BufferLookupEnt *result;
 
 	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									(void *) tagPtr,
-									hashcode,
-									HASH_REMOVE,
-									NULL);
+		hash_delete_skip_freelist(SharedBufHash,
+								  (void *) tagPtr,
+								  hashcode);
 
 	if (!result)				/* shouldn't happen */
 		elog(ERROR, "shared buffer hash table corrupted");
+
+	return result;
+}
+
+/*
+ * BufTableFreeDeleted
+ *		Returns deleted hashtable entry to freelist.
+ */
+void
+BufTableFreeDeleted(void *oldelem, uint32 hashcode)
+{
+	hash_return_to_freelist(SharedBufHash, oldelem, hashcode);
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e88e4e918b0..9c23e54f7c5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1114,6 +1114,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferDesc *buf;
 	bool		valid;
 	uint32		buf_state;
+	void	   *oldElem = NULL;
 
 	/* create a tag so we can lookup the buffer */
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -1288,93 +1289,16 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			oldHash = BufTableHashCode(&oldTag);
 			oldPartitionLock = BufMappingPartitionLock(oldHash);
 
-			/*
-			 * Must lock the lower-numbered partition first to avoid
-			 * deadlocks.
-			 */
-			if (oldPartitionLock < newPartitionLock)
-			{
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-			else if (oldPartitionLock > newPartitionLock)
-			{
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-			}
-			else
-			{
-				/* only one partition, only one lock */
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
+			LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 		}
 		else
 		{
-			/* if it wasn't valid, we need only the new partition */
-			LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 			/* remember we have no old-partition lock or tag */
 			oldPartitionLock = NULL;
 			/* keep the compiler quiet about uninitialized variables */
 			oldHash = 0;
 		}
 
-		/*
-		 * Try to make a hashtable entry for the buffer under its new tag.
-		 * This could fail because while we were writing someone else
-		 * allocated another buffer for the same block we want to read in.
-		 * Note that we have not yet removed the hashtable entry for the old
-		 * tag.
-		 */
-		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
-
-		if (buf_id >= 0)
-		{
-			/*
-			 * Got a collision. Someone has already done what we were about to
-			 * do. We'll just handle this as if it were found in the buffer
-			 * pool in the first place.  First, give up the buffer we were
-			 * planning to use.
-			 */
-			UnpinBuffer(buf, true);
-
-			/* Can give up that buffer's mapping partition lock now */
-			if (oldPartitionLock != NULL &&
-				oldPartitionLock != newPartitionLock)
-				LWLockRelease(oldPartitionLock);
-
-			/* remaining code should match code at top of routine */
-
-			buf = GetBufferDescriptor(buf_id);
-
-			valid = PinBuffer(buf, strategy);
-
-			/* Can release the mapping lock as soon as we've pinned it */
-			LWLockRelease(newPartitionLock);
-
-			*foundPtr = true;
-
-			if (!valid)
-			{
-				/*
-				 * We can only get here if (a) someone else is still reading
-				 * in the page, or (b) a previous read attempt failed.  We
-				 * have to wait for any active read attempt to finish, and
-				 * then set up our own read attempt if the page is still not
-				 * BM_VALID.  StartBufferIO does it all.
-				 */
-				if (StartBufferIO(buf, true))
-				{
-					/*
-					 * If we get here, previous attempts to read the buffer
-					 * must have failed ... but we shall bravely try again.
-					 */
-					*foundPtr = false;
-				}
-			}
-
-			return buf;
-		}
-
 		/*
 		 * Need to lock the buffer header too in order to change its tag.
 		 */
@@ -1391,31 +1315,102 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			break;
 
 		UnlockBufHdr(buf, buf_state);
-		BufTableDelete(&newTag, newHash);
-		if (oldPartitionLock != NULL &&
-			oldPartitionLock != newPartitionLock)
+		if (oldPartitionLock != NULL)
 			LWLockRelease(oldPartitionLock);
-		LWLockRelease(newPartitionLock);
 		UnpinBuffer(buf, true);
 	}
 
 	/*
-	 * Okay, it's finally safe to rename the buffer.
+	 * Clear out the buffer's tag and flags.  We must do this to ensure that
+	 * linear scans of the buffer array don't think the buffer is valid. We
+	 * also reset the usage_count since any recency of use of the old content
+	 * is no longer relevant.
 	 *
-	 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
-	 * paranoia.  We also reset the usage_count since any recency of use of
-	 * the old content is no longer relevant.  (The usage_count starts out at
-	 * 1 so that the buffer can survive one clock-sweep pass.)
+	 * Since we are the single pinner, there should be no BM_PIN_COUNT_WAITER
+	 * or BM_IO_IN_PROGRESS (flags that were not cleared by the previous code).
+	 */
+	Assert((oldFlags & (BM_PIN_COUNT_WAITER | BM_IO_IN_PROGRESS)) == 0);
+	CLEAR_BUFFERTAG(buf->tag);
+	buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+	UnlockBufHdr(buf, buf_state);
+
+	if (oldFlags & BM_TAG_VALID)
+		oldElem = BufTableDelete(&oldTag, oldHash);
+
+	if (oldPartitionLock != newPartitionLock)
+	{
+		if (oldPartitionLock != NULL)
+			LWLockRelease(oldPartitionLock);
+		LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+	}
+
+	/*
+	 * Check whether somebody else already made a hashtable entry for the
+	 * buffer under its new tag: while we were writing, another backend could
+	 * have allocated a buffer for the same block we want to read in. Note
+	 * that the old tag's hashtable entry has already been removed by now.
+	 */
+	buf_id = BufTableLookup(&newTag, newHash);
+
+	if (buf_id >= 0)
+	{
+		/*
+		 * Got a collision. Someone has already done what we were about to do.
+		 * We'll just handle this as if it were found in the buffer pool in
+		 * the first place.  First, give up the buffer we were planning to use
+		 * and put it back on the free lists.
+		 */
+		UnpinBuffer(buf, true);
+		StrategyFreeBuffer(buf);
+		if (oldElem != NULL)
+			BufTableFreeDeleted(oldElem, oldHash);
+
+		/* remaining code should match code at top of routine */
+
+		buf = GetBufferDescriptor(buf_id);
+
+		valid = PinBuffer(buf, strategy);
+
+		/* Can release the mapping lock as soon as we've pinned it */
+		LWLockRelease(newPartitionLock);
+
+		*foundPtr = true;
+
+		if (!valid)
+		{
+			/*
+			 * We can only get here if (a) someone else is still reading in
+			 * the page, or (b) a previous read attempt failed.  We have to
+			 * wait for any active read attempt to finish, and then set up our
+			 * own read attempt if the page is still not BM_VALID.
+			 * StartBufferIO does it all.
+			 */
+			if (StartBufferIO(buf, true))
+			{
+				/*
+				 * If we get here, previous attempts to read the buffer must
+				 * have failed ... but we shall bravely try again.
+				 */
+				*foundPtr = false;
+			}
+		}
+
+		return buf;
+	}
+
+	/*
+	 * Okay, it's finally safe to rename the buffer.
 	 *
 	 * Make sure BM_PERMANENT is set for buffers that must be written at every
 	 * checkpoint.  Unlogged buffers only need to be written at shutdown
 	 * checkpoints, except for their "init" forks, which need to be treated
 	 * just like permanent relations.
+	 *
+	 * The usage_count starts out at 1 so that the buffer can survive one
+	 * clock-sweep pass.
 	 */
+	buf_state = LockBufHdr(buf);
 	buf->tag = newTag;
-	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
-				   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
-				   BUF_USAGECOUNT_MASK);
 	if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
 		buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
 	else
@@ -1423,13 +1418,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	UnlockBufHdr(buf, buf_state);
 
-	if (oldPartitionLock != NULL)
-	{
-		BufTableDelete(&oldTag, oldHash);
-		if (oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
-	}
-
+	BufTableInsert(&newTag, newHash, buf->buf_id, oldElem);
 	LWLockRelease(newPartitionLock);
 
 	/*
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 6546e3c7c79..ce5bba8e975 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -99,6 +99,7 @@
 #include "access/xact.h"
 #include "common/hashfn.h"
 #include "port/pg_bitutils.h"
+#include "port/atomics.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/dynahash.h"
@@ -133,6 +134,18 @@ typedef HASHELEMENT *HASHBUCKET;
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+#if SIZEOF_LONG == 8
+typedef pg_atomic_uint64 Count;
+#define count_atomic_inc(x)	pg_atomic_fetch_add_u64((x), 1)
+#define count_atomic_dec(x)	pg_atomic_fetch_sub_u64((x), 1)
+#define MAX_NENTRIES	((uint64)PG_INT64_MAX)
+#else
+typedef pg_atomic_uint32 Count;
+#define count_atomic_inc(x)	pg_atomic_fetch_add_u32((x), 1)
+#define count_atomic_dec(x)	pg_atomic_fetch_sub_u32((x), 1)
+#define MAX_NENTRIES	((uint32)PG_INT32_MAX)
+#endif
+
 /*
  * Per-freelist data.
  *
@@ -153,7 +166,7 @@ typedef HASHBUCKET *HASHSEGMENT;
 typedef struct
 {
 	slock_t		mutex;			/* spinlock for this freelist */
-	long		nentries;		/* number of entries in associated buckets */
+	Count		nentries;		/* number of entries in associated buckets */
 	HASHELEMENT *freeList;		/* chain of free elements */
 } FreeListData;
 
@@ -306,6 +319,54 @@ string_compare(const char *key1, const char *key2, Size keysize)
 	return strncmp(key1, key2, keysize - 1);
 }
 
+/*
+ * Free list routines
+ */
+static inline void
+free_list_link_entry(HASHHDR *hctl, HASHBUCKET currBucket, int freelist_idx)
+{
+	FreeListData *list = &hctl->freeList[freelist_idx];
+
+	if (IS_PARTITIONED(hctl))
+	{
+		SpinLockAcquire(&list->mutex);
+		currBucket->link = list->freeList;
+		list->freeList = currBucket;
+		SpinLockRelease(&list->mutex);
+	}
+	else
+	{
+		currBucket->link = list->freeList;
+		list->freeList = currBucket;
+	}
+}
+
+static inline void
+free_list_increment_nentries(HASHHDR *hctl, int freelist_idx)
+{
+	FreeListData *list = &hctl->freeList[freelist_idx];
+
+	/* Check for overflow */
+	Assert(hctl->freeList[freelist_idx].nentries.value < MAX_NENTRIES);
+
+	if (IS_PARTITIONED(hctl))
+		count_atomic_inc(&list->nentries);
+	else
+		list->nentries.value++;
+}
+
+static inline void
+free_list_decrement_nentries(HASHHDR *hctl, int freelist_idx)
+{
+	FreeListData *list = &hctl->freeList[freelist_idx];
+
+	if (IS_PARTITIONED(hctl))
+		count_atomic_dec(&list->nentries);
+	else
+		list->nentries.value--;
+	/* Check for underflow (wraparound past zero) */
+	Assert(hctl->freeList[freelist_idx].nentries.value < MAX_NENTRIES);
+}
 
 /************************** CREATE ROUTINES **********************/
 
@@ -1000,7 +1061,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * Can't split if running in partitioned mode, nor if frozen, nor if
 		 * table is the subject of any active hash_seq_search scans.
 		 */
-		if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
+		if (hctl->freeList[0].nentries.value > (long) hctl->max_bucket &&
 			!IS_PARTITIONED(hctl) && !hashp->frozen &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
@@ -1057,23 +1118,14 @@ hash_search_with_hash_value(HTAB *hashp,
 		case HASH_REMOVE:
 			if (currBucket != NULL)
 			{
-				/* if partitioned, must lock to touch nentries and freeList */
-				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
-
 				/* delete the record from the appropriate nentries counter. */
-				Assert(hctl->freeList[freelist_idx].nentries > 0);
-				hctl->freeList[freelist_idx].nentries--;
+				free_list_decrement_nentries(hctl, freelist_idx);
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the appropriate freelist. */
-				currBucket->link = hctl->freeList[freelist_idx].freeList;
-				hctl->freeList[freelist_idx].freeList = currBucket;
-
-				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+				free_list_link_entry(hctl, currBucket, freelist_idx);
 
 				/*
 				 * better hope the caller is synchronizing access to this
@@ -1115,6 +1167,7 @@ hash_search_with_hash_value(HTAB *hashp,
 							(errcode(ERRCODE_OUT_OF_MEMORY),
 							 errmsg("out of memory")));
 			}
+			free_list_increment_nentries(hctl, freelist_idx);
 
 			/* link into hashbucket chain */
 			*prevBucketPtr = currBucket;
@@ -1165,6 +1218,7 @@ hash_update_hash_key(HTAB *hashp,
 {
 	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
 	HASHHDR    *hctl = hashp->hctl;
+	uint32		oldhashvalue;
 	uint32		newhashvalue;
 	Size		keysize;
 	uint32		bucket;
@@ -1218,6 +1272,7 @@ hash_update_hash_key(HTAB *hashp,
 			 hashp->tabname);
 
 	oldPrevPtr = prevBucketPtr;
+	oldhashvalue = existingElement->hashvalue;
 
 	/*
 	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
@@ -1271,12 +1326,21 @@ hash_update_hash_key(HTAB *hashp,
 	 */
 	if (bucket != newbucket)
 	{
+		int			old_freelist_idx = FREELIST_IDX(hctl, oldhashvalue);
+		int			new_freelist_idx = FREELIST_IDX(hctl, newhashvalue);
+
 		/* OK to remove record from old hash bucket's chain. */
 		*oldPrevPtr = currBucket->link;
 
 		/* link into new hashbucket chain */
 		*prevBucketPtr = currBucket;
 		currBucket->link = NULL;
+
+		if (old_freelist_idx != new_freelist_idx)
+		{
+			free_list_decrement_nentries(hctl, old_freelist_idx);
+			free_list_increment_nentries(hctl, new_freelist_idx);
+		}
 	}
 
 	/* copy new key into record */
@@ -1288,6 +1352,193 @@ hash_update_hash_key(HTAB *hashp,
 	return true;
 }
 
+/*
+ * hash_insert_with_hash_nocheck - inserts new entry into bucket without
+ * checking for duplicates.
+ *
+ * Caller should be sure there is no conflicting entry.
+ *
+ * Caller may pass an old entry acquired via hash_delete_skip_freelist.
+ * In this case the entry is reused and returned as the new one.
+ */
+void *
+hash_insert_with_hash_nocheck(HTAB *hashp,
+							  const void *keyPtr,
+							  uint32 hashvalue,
+							  void *oldentry)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
+	uint32		bucket;
+	long		segment_num;
+	long		segment_ndx;
+	HASHSEGMENT segp;
+	HASHBUCKET	currBucket;
+	HASHBUCKET *prevBucketPtr;
+
+#if HASH_STATISTICS
+	hash_accesses++;
+	hctl->accesses++;
+#endif
+
+	/* disallow updates if frozen */
+	if (hashp->frozen)
+		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
+			 hashp->tabname);
+
+	if (!IS_PARTITIONED(hctl) &&
+		hctl->freeList[0].nentries.value > (long) hctl->max_bucket &&
+		!has_seq_scans(hashp))
+		(void) expand_table(hashp);
+
+	/*
+	 * Find the bucket the new entry belongs in.  The caller must already
+	 * have checked (while holding the partition lock) that there is no
+	 * conflicting entry in this bucket.
+	 */
+	bucket = calc_bucket(hctl, hashvalue);
+
+	segment_num = bucket >> hashp->sshift;
+	segment_ndx = MOD(bucket, hashp->ssize);
+
+	segp = hashp->dir[segment_num];
+
+	if (segp == NULL)
+		hash_corrupted(hashp);
+
+	prevBucketPtr = &segp[segment_ndx];
+
+	if (oldentry != NULL)
+		currBucket = ELEMENT_FROM_KEY(oldentry);
+	else
+		currBucket = get_hash_entry(hashp, freelist_idx);
+
+	if (currBucket == NULL)
+	{
+		/* report a generic message */
+		if (hashp->isshared)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+	}
+	free_list_increment_nentries(hctl, freelist_idx);
+
+	/* copy key into record */
+	currBucket->hashvalue = hashvalue;
+	currBucket->link = *prevBucketPtr;
+	hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, hashp->keysize);
+
+	*prevBucketPtr = currBucket;
+
+	return (void *) ELEMENTKEY(currBucket);
+}
+
+/*
+ * hash_delete_skip_freelist - find and delete an entry, but don't put it
+ * on the free list.
+ *
+ * Used in Buffer Manager to reuse entry for evicted buffer.
+ *
+ * The returned entry should either be reused with hash_insert_with_hash_nocheck
+ * or returned to the free list with hash_return_to_freelist.
+ */
+void *
+hash_delete_skip_freelist(HTAB *hashp,
+						  const void *keyPtr,
+						  uint32 hashvalue)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
+	Size		keysize;
+	uint32		bucket;
+	long		segment_num;
+	long		segment_ndx;
+	HASHSEGMENT segp;
+	HASHBUCKET	currBucket;
+	HASHBUCKET *prevBucketPtr;
+	HashCompareFunc match;
+
+#if HASH_STATISTICS
+	hash_accesses++;
+	hctl->accesses++;
+#endif
+
+	/*
+	 * Do the initial lookup
+	 */
+	bucket = calc_bucket(hctl, hashvalue);
+
+	segment_num = bucket >> hashp->sshift;
+	segment_ndx = MOD(bucket, hashp->ssize);
+
+	segp = hashp->dir[segment_num];
+
+	if (segp == NULL)
+		hash_corrupted(hashp);
+
+	prevBucketPtr = &segp[segment_ndx];
+	currBucket = *prevBucketPtr;
+
+	/*
+	 * Follow collision chain looking for matching key
+	 */
+	match = hashp->match;		/* save one fetch in inner loop */
+	keysize = hashp->keysize;	/* ditto */
+
+	while (currBucket != NULL)
+	{
+		if (currBucket->hashvalue == hashvalue &&
+			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
+			break;
+		prevBucketPtr = &(currBucket->link);
+		currBucket = *prevBucketPtr;
+#if HASH_STATISTICS
+		hash_collisions++;
+		hctl->collisions++;
+#endif
+	}
+
+	if (currBucket == NULL)
+		return NULL;
+
+	/* delete the record from the appropriate nentries counter. */
+	free_list_decrement_nentries(hctl, freelist_idx);
+
+	/* remove record from hash bucket's chain. */
+	*prevBucketPtr = currBucket->link;
+
+	return (void *) ELEMENTKEY(currBucket);
+}
+
+/*
+ * hash_return_to_freelist - return entry deleted with
+ * hash_delete_skip_freelist to free list.
+ *
+ * Used in Buffer Manager in case a conflicting entry was inserted by a
+ * concurrent process.
+ */
+void
+hash_return_to_freelist(HTAB *hashp,
+						const void *entry,
+						uint32 hashvalue)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
+	HASHBUCKET	currBucket;
+
+	if (entry == NULL)
+		return;
+
+	currBucket = ELEMENT_FROM_KEY(entry);
+
+	/* add the record to the appropriate freelist. */
+	free_list_link_entry(hctl, currBucket, freelist_idx);
+}
+
 /*
  * Allocate a new hashtable entry if possible; return NULL if out of memory.
  * (Or, if the underlying space allocator throws error for out-of-memory,
@@ -1349,11 +1600,6 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 					hctl->freeList[borrow_from_idx].freeList = newElement->link;
 					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 
-					/* careful: count the new element in its proper freelist */
-					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
-					hctl->freeList[freelist_idx].nentries++;
-					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
-
 					return newElement;
 				}
 
@@ -1365,9 +1611,8 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 		}
 	}
 
-	/* remove entry from freelist, bump nentries */
+	/* remove entry from freelist */
 	hctl->freeList[freelist_idx].freeList = newElement->link;
-	hctl->freeList[freelist_idx].nentries++;
 
 	if (IS_PARTITIONED(hctl))
 		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
@@ -1382,7 +1627,7 @@ long
 hash_get_num_entries(HTAB *hashp)
 {
 	int			i;
-	long		sum = hashp->hctl->freeList[0].nentries;
+	long		sum = hashp->hctl->freeList[0].nentries.value;
 
 	/*
 	 * We currently don't bother with acquiring the mutexes; it's only
@@ -1392,7 +1637,7 @@ hash_get_num_entries(HTAB *hashp)
 	if (IS_PARTITIONED(hashp->hctl))
 	{
 		for (i = 1; i < NUM_FREELISTS; i++)
-			sum += hashp->hctl->freeList[i].nentries;
+			sum += hashp->hctl->freeList[i].nentries.value;
 	}
 
 	return sum;
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 33fcaf5c9a8..4a1d6b37821 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -327,8 +327,10 @@ extern Size BufTableShmemSize(int size);
 extern void InitBufTable(int size);
 extern uint32 BufTableHashCode(BufferTag *tagPtr);
 extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern void BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id,
+						   void *oldelem);
+extern void *BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern void BufTableFreeDeleted(void *oldelem, uint32 hashcode);
 
 /* localbuf.c */
 extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index d7af0239c8c..1d586ef1169 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -150,4 +150,21 @@ extern Size hash_get_shared_size(HASHCTL *info, int flags);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 
+/*
+ * Buffer Manager optimization utilities.
+ * They exist to avoid taking two partition locks simultaneously and to
+ * skip interaction with dynahash's freelist.
+ * Use them carefully and only where they give a meaningful improvement.
+ */
+extern void *hash_insert_with_hash_nocheck(HTAB *hashp,
+										   const void *keyPtr,
+										   uint32 hashvalue,
+										   void *oldentry);
+extern void *hash_delete_skip_freelist(HTAB *hashp,
+									   const void *keyPtr,
+									   uint32 hashvalue);
+extern void hash_return_to_freelist(HTAB *hashp,
+									const void *entry,
+									uint32 hashvalue);
+
 #endif							/* HSEARCH_H */
-- 
2.33.0

<<attachment: res.zip>>
