> This comment certainly requires some changes.

Fixed.

> BTW, could you explain why init_table_size was two times less than 
> max_table_size?

I have no clue. My best guess is that it was a reasonable thing to do in
the past. Then somebody changed the code, and now there is little reason
to use init_table_size for partitioned tables.

> Why did you delete these two lines? I wonder if you should rewrite
> them instead?

```
    MemSet(hctl, 0, sizeof(HASHHDR));
 
-   hctl->nentries = 0;
-   hctl->freeList = NULL;
```

These fields were initialized to zero twice: the MemSet call right above
already zeroes the whole header, so the explicit assignments made little
sense to me.
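To illustrate with a standalone sketch (FakeHeader below is a made-up
stand-in for HASHHDR, and like PostgreSQL this assumes all-zero bytes read
back as 0/NULL):

```
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for HASHHDR, reduced to the two fields in question */
typedef struct
{
	long		nentries;
	void	   *freeList;
} FakeHeader;

int
main(void)
{
	FakeHeader	hdr;

	memset(&hdr, 0, sizeof(FakeHeader));

	/*
	 * Both fields are already zero/NULL at this point; assigning 0 and
	 * NULL again, as the deleted lines did, changes nothing.
	 */
	printf("nentries = %ld, freeList = %p\n", hdr.nentries, hdr.freeList);
	return 0;
}
```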

> As far as I understood, this number was obtained experimentally?
> Maybe you should note that in the comment.

These numbers are very platform-specific and will be outdated very
soon. I recall that my code was criticized for including exact numbers
not long ago. So I suggest keeping this part as is.

> For example, if you have nelem=25 and partitions_number=6.
> 25 / 6 = 4. And then you allocate 24 nelems, while 1 is lost.

Agreed. Fixed.
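For the record, here is the arithmetic of the fix as a standalone sketch
(the variable names mirror the patch, but this is not the actual
hash_create() code):

```
#include <stdio.h>

int
main(void)
{
	int			nelem = 25;
	int			partitions_number = 6;
	int			nelem_alloc = nelem / partitions_number;	/* 25 / 6 = 4 */
	int			nelem_alloc_first;

	if (nelem_alloc == 0)
		nelem_alloc = 1;

	if (nelem_alloc * partitions_number < nelem)
		/* give the remainder to the first partition so nothing is lost */
		nelem_alloc_first = nelem - nelem_alloc * (partitions_number - 1);
	else
		nelem_alloc_first = nelem_alloc;

	/* prints "first: 5, others: 4, total: 25" */
	printf("first: %d, others: %d, total: %d\n",
		   nelem_alloc_first, nelem_alloc,
		   nelem_alloc_first + nelem_alloc * (partitions_number - 1));
	return 0;
}
```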

> Except mentioned notes, I suppose the patch is good enough

I guess I will mark this patch as "Ready for Committer" then.
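For anyone skimming the patch: the partition index is simply the low-order
bits of the hash value. A simplified sketch of the idea (the real code uses
LockHashPartition(), which is roughly equivalent):

```
#include <stdio.h>

#define LOG2_NUM_LOCK_PARTITIONS	7
#define NUM_LOCK_PARTITIONS			(1 << LOG2_NUM_LOCK_PARTITIONS)

/*
 * Since NUM_LOCK_PARTITIONS is a power of two, taking a hash value modulo
 * the partition count is the same as masking off its low-order bits.
 */
static unsigned int
partition_idx(unsigned int hashcode)
{
	return hashcode & (NUM_LOCK_PARTITIONS - 1);
}

int
main(void)
{
	/* two hash values that differ only in the high bits share a partition */
	printf("%u %u\n", partition_idx(0xDEADBEEF), partition_idx(0x00ADBEEF));
	return 0;
}
```

This mapping is what the HASHHDR comment in the patch refers to when it says
partitions are locked on the calling side based on the low-order bits of the
hash value.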
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 9673fe0..0c8e4fb 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -495,7 +495,7 @@ pgss_shmem_startup(void)
 	info.hash = pgss_hash_fn;
 	info.match = pgss_match_fn;
 	pgss_hash = ShmemInitHash("pg_stat_statements hash",
-							  pgss_max, pgss_max,
+							  pgss_max,
 							  &info,
 							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
 
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 39e8baf..dd5acb7 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -62,7 +62,7 @@ InitBufTable(int size)
 	info.num_partitions = NUM_BUFFER_PARTITIONS;
 
 	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
-								  size, size,
+								  size,
 								  &info,
 								  HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 }
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 81506ea..4c18701 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -237,7 +237,7 @@ InitShmemIndex(void)
 	hash_flags = HASH_ELEM;
 
 	ShmemIndex = ShmemInitHash("ShmemIndex",
-							   SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+							   SHMEM_INDEX_SIZE,
 							   &info, hash_flags);
 }
 
@@ -255,17 +255,12 @@ InitShmemIndex(void)
  * exceeded substantially (since it's used to compute directory size and
  * the hash table buckets will get overfull).
  *
- * init_size is the number of hashtable entries to preallocate.  For a table
- * whose maximum size is certain, this should be equal to max_size; that
- * ensures that no run-time out-of-shared-memory failures can occur.
- *
  * Note: before Postgres 9.0, this function returned NULL for some failure
  * cases.  Now, it always throws error instead, so callers need not check
  * for NULL.
  */
 HTAB *
 ShmemInitHash(const char *name, /* table string name for shmem index */
-			  long init_size,	/* initial table size */
 			  long max_size,	/* max size of the table */
 			  HASHCTL *infoP,	/* info about key and bucket size */
 			  int hash_flags)	/* info about infoP */
@@ -299,7 +294,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	/* Pass location of hashtable header to hash_create */
 	infoP->hctl = (HASHHDR *) location;
 
-	return hash_create(name, init_size, infoP, hash_flags);
+	return hash_create(name, max_size, infoP, hash_flags);
 }
 
 /*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 9c2e49c..8585a76 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -373,18 +373,10 @@ void
 InitLocks(void)
 {
 	HASHCTL		info;
-	long		init_table_size,
-				max_table_size;
+	long		max_table_size;
 	bool		found;
 
 	/*
-	 * Compute init/max size to request for lock hashtables.  Note these
-	 * calculations must agree with LockShmemSize!
-	 */
-	max_table_size = NLOCKENTS();
-	init_table_size = max_table_size / 2;
-
-	/*
 	 * Allocate hash table for LOCK structs.  This stores per-locked-object
 	 * information.
 	 */
@@ -392,16 +384,15 @@ InitLocks(void)
 	info.keysize = sizeof(LOCKTAG);
 	info.entrysize = sizeof(LOCK);
 	info.num_partitions = NUM_LOCK_PARTITIONS;
+	max_table_size = NLOCKENTS();
 
 	LockMethodLockHash = ShmemInitHash("LOCK hash",
-									   init_table_size,
 									   max_table_size,
 									   &info,
 									HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 
 	/* Assume an average of 2 holders per lock */
 	max_table_size *= 2;
-	init_table_size *= 2;
 
 	/*
 	 * Allocate hash table for PROCLOCK structs.  This stores
@@ -413,7 +404,6 @@ InitLocks(void)
 	info.num_partitions = NUM_LOCK_PARTITIONS;
 
 	LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
-										   init_table_size,
 										   max_table_size,
 										   &info,
 								 HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d9d4e22..fc72d2d 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -1116,7 +1116,6 @@ InitPredicateLocks(void)
 
 	PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
 											max_table_size,
-											max_table_size,
 											&info,
 											HASH_ELEM | HASH_BLOBS |
 											HASH_PARTITION | HASH_FIXED_SIZE);
@@ -1144,7 +1143,6 @@ InitPredicateLocks(void)
 
 	PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
 									  max_table_size,
-									  max_table_size,
 									  &info,
 									  HASH_ELEM | HASH_FUNCTION |
 									  HASH_PARTITION | HASH_FIXED_SIZE);
@@ -1225,7 +1223,6 @@ InitPredicateLocks(void)
 
 	SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
 										max_table_size,
-										max_table_size,
 										&info,
 										HASH_ELEM | HASH_BLOBS |
 										HASH_FIXED_SIZE);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da..6a2665b 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -87,6 +87,7 @@
 #include "access/xact.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
+#include "storage/lock.h"
 #include "utils/dynahash.h"
 #include "utils/memutils.h"
 
@@ -128,12 +129,26 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-	/* In a partitioned table, take this lock to touch nentries or freeList */
-	slock_t		mutex;			/* unused if not partitioned table */
-
-	/* These fields change during entry addition/deletion */
-	long		nentries;		/* number of entries in hash table */
-	HASHELEMENT *freeList;		/* linked list of free elements */
+	/*
+	 * Two fields are declared below: nentries and freeList. nentries stores
+	 * the current number of entries in the hash table; freeList is a linked
+	 * list of free elements.
+	 *
+	 * To keep these fields consistent in a partitioned table, access to them
+	 * must be synchronized with a spinlock. It turned out that a single
+	 * spinlock can become a bottleneck, so to reduce lock contention an
+	 * array of NUM_LOCK_PARTITIONS spinlocks is used instead. Each spinlock
+	 * corresponds to a single table partition (see the PARTITION_IDX
+	 * definition) and protects one element of the nentries and freeList
+	 * arrays. Since callers lock partitions based on the low-order bits of a
+	 * hash value, this particular number of spinlocks prevents deadlocks.
+	 *
+	 * If the hash table is not partitioned, only nentries[0] and freeList[0]
+	 * are used, and the spinlocks are not used at all.
+	 */
+	slock_t		mutex[NUM_LOCK_PARTITIONS];		/* array of spinlocks */
+	long		nentries[NUM_LOCK_PARTITIONS];	/* number of entries */
+	HASHELEMENT *freeList[NUM_LOCK_PARTITIONS]; /* lists of free elements */
 
 	/* These fields can change, but not in a partitioned table */
 	/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +181,8 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define PARTITION_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? LockHashPartition(hashcode) : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +236,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int partition_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int partition_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -282,6 +299,10 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 {
 	HTAB	   *hashp;
 	HASHHDR    *hctl;
+	int			i,
+				partitions_number,
+				nelem_alloc,
+				nelem_alloc_first;
 
 	/*
 	 * For shared hash tables, we have a local hash header (HTAB struct) that
@@ -482,10 +503,34 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
 	{
-		if (!element_alloc(hashp, (int) nelem))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		/*
+		 * If the hash table is partitioned, all freeLists receive a roughly
+		 * equal number of elements. Otherwise only freeList[0] is used.
+		 */
+		if (IS_PARTITIONED(hashp->hctl))
+			partitions_number = NUM_LOCK_PARTITIONS;
+		else
+			partitions_number = 1;
+
+		nelem_alloc = nelem / partitions_number;
+		if (nelem_alloc == 0)
+			nelem_alloc = 1;
+
+		if (nelem_alloc * partitions_number < nelem)
+			/* Make sure the remainder is not lost due to rounding */
+			nelem_alloc_first = nelem - nelem_alloc * (partitions_number - 1);
+		else
+			nelem_alloc_first = nelem_alloc;
+
+		for (i = 0; i < partitions_number; i++)
+		{
+			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+			if (!element_alloc(hashp, temp, i))
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
 	}
 
 	if (flags & HASH_FIXED_SIZE)
@@ -503,9 +548,6 @@ hdefault(HTAB *hashp)
 
 	MemSet(hctl, 0, sizeof(HASHHDR));
 
-	hctl->nentries = 0;
-	hctl->freeList = NULL;
-
 	hctl->dsize = DEF_DIRSIZE;
 	hctl->nsegs = 0;
 
@@ -572,12 +614,14 @@ init_htab(HTAB *hashp, long nelem)
 	HASHSEGMENT *segp;
 	int			nbuckets;
 	int			nsegs;
+	int			i;
 
 	/*
 	 * initialize mutex if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
-		SpinLockInit(&hctl->mutex);
+		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+			SpinLockInit(&(hctl->mutex[i]));
 
 	/*
 	 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +692,7 @@ init_htab(HTAB *hashp, long nelem)
 			"HIGH MASK       ", hctl->high_mask,
 			"LOW  MASK       ", hctl->low_mask,
 			"NSEGS           ", hctl->nsegs,
-			"NENTRIES        ", hctl->nentries);
+			"NENTRIES        ", hash_get_num_entries(hashp));
 #endif
 	return true;
 }
@@ -769,7 +813,7 @@ hash_stats(const char *where, HTAB *hashp)
 			where, hashp->hctl->accesses, hashp->hctl->collisions);
 
 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, (long) hashp->hctl->keysize,
+			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 			where, hash_accesses, hash_collisions);
@@ -863,6 +907,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
+	int			partition_idx = PARTITION_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -885,7 +930,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * order of these tests is to try to check cheaper conditions first.
 		 */
 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+		hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
 	}
@@ -943,20 +988,20 @@ hash_search_with_hash_value(HTAB *hashp,
 			{
 				/* if partitioned, must lock to touch nentries and freeList */
 				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&hctl->mutex);
+					SpinLockAcquire(&(hctl->mutex[partition_idx]));
 
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				Assert(hctl->nentries[partition_idx] > 0);
+				hctl->nentries[partition_idx]--;
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the freelist for this table.  */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctl->freeList[partition_idx];
+				hctl->freeList[partition_idx] = currBucket;
 
 				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->mutex);
+					SpinLockRelease(&hctl->mutex[partition_idx]);
 
 				/*
 				 * better hope the caller is synchronizing access to this
@@ -982,7 +1027,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 					 hashp->tabname);
 
-			currBucket = get_hash_entry(hashp);
+			currBucket = get_hash_entry(hashp, partition_idx);
 			if (currBucket == NULL)
 			{
 				/* out of memory */
@@ -1175,41 +1220,70 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int partition_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
+	int			borrow_from_idx;
 
 	for (;;)
 	{
 		/* if partitioned, must lock to touch nentries and freeList */
 		if (IS_PARTITIONED(hctl))
-			SpinLockAcquire(&hctl->mutex);
+			SpinLockAcquire(&hctl->mutex[partition_idx]);
 
 		/* try to get an entry from the freelist */
-		newElement = hctl->freeList;
+		newElement = hctl->freeList[partition_idx];
+
 		if (newElement != NULL)
-			break;
+		{
+			/* remove entry from freelist, bump nentries */
+			hctl->freeList[partition_idx] = newElement->link;
+			hctl->nentries[partition_idx]++;
+			if (IS_PARTITIONED(hctl))
+				SpinLockRelease(&hctl->mutex[partition_idx]);
+
+			return newElement;
+		}
 
-		/* no free elements.  allocate another chunk of buckets */
 		if (IS_PARTITIONED(hctl))
-			SpinLockRelease(&hctl->mutex);
+			SpinLockRelease(&hctl->mutex[partition_idx]);
 
-		if (!element_alloc(hashp, hctl->nelem_alloc))
+		/* no free elements.  allocate another chunk of buckets */
+		if (!element_alloc(hashp, hctl->nelem_alloc, partition_idx))
 		{
-			/* out of memory */
-			return NULL;
-		}
-	}
+			if (!IS_PARTITIONED(hctl))
+				return NULL;	/* out of memory */
+
+			/* try to borrow element from another partition */
+			borrow_from_idx = partition_idx;
+			for (;;)
+			{
+				borrow_from_idx = (borrow_from_idx + 1) % NUM_LOCK_PARTITIONS;
+				if (borrow_from_idx == partition_idx)
+					break;
 
-	/* remove entry from freelist, bump nentries */
-	hctl->freeList = newElement->link;
-	hctl->nentries++;
+				SpinLockAcquire(&(hctl->mutex[borrow_from_idx]));
+				newElement = hctl->freeList[borrow_from_idx];
 
-	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+				if (newElement != NULL)
+				{
+					hctl->freeList[borrow_from_idx] = newElement->link;
+					SpinLockRelease(&(hctl->mutex[borrow_from_idx]));
+
+					SpinLockAcquire(&hctl->mutex[partition_idx]);
+					hctl->nentries[partition_idx]++;
+					SpinLockRelease(&hctl->mutex[partition_idx]);
 
-	return newElement;
+					break;
+				}
+
+				SpinLockRelease(&(hctl->mutex[borrow_from_idx]));
+			}
+
+			return newElement;
+		}
+	}
 }
 
 /*
@@ -1218,11 +1292,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+	int			i;
+	long		sum = hashp->hctl->nentries[0];
+
 	/*
 	 * We currently don't bother with the mutex; it's only sensible to call
 	 * this function if you've got lock on all partitions of the table.
 	 */
-	return hashp->hctl->nentries;
+
+	if (!IS_PARTITIONED(hashp->hctl))
+		return sum;
+
+	for (i = 1; i < NUM_LOCK_PARTITIONS; i++)
+		sum += hashp->hctl->nentries[i];
+
+	return sum;
 }
 
 /*
@@ -1530,9 +1614,9 @@ seg_alloc(HTAB *hashp)
  * allocate some new elements and link them into the free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int partition_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
 	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
@@ -1563,14 +1647,14 @@ element_alloc(HTAB *hashp, int nelem)
 
 	/* if partitioned, must lock to touch freeList */
 	if (IS_PARTITIONED(hctl))
-		SpinLockAcquire(&hctl->mutex);
+		SpinLockAcquire(&hctl->mutex[partition_idx]);
 
 	/* freelist could be nonempty if two backends did this concurrently */
-	firstElement->link = hctl->freeList;
-	hctl->freeList = prevElement;
+	firstElement->link = hctl->freeList[partition_idx];
+	hctl->freeList[partition_idx] = prevElement;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&hctl->mutex[partition_idx]);
 
 	return true;
 }
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 5e8825e..177371b 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -128,13 +128,19 @@ extern char *MainLWLockNames[];
  * having this file include lock.h or bufmgr.h would be backwards.
  */
 
-/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS  128
-
-/* Number of partitions the shared lock tables are divided into */
-#define LOG2_NUM_LOCK_PARTITIONS  4
+/*
+ * Number of partitions the shared lock tables are divided into.
+ *
+ * This particular number of partitions significantly reduces lock contention
+ * in partitioned hash tables, almost as if partitioned tables didn't use
+ * any locking at all.
+ */
+#define LOG2_NUM_LOCK_PARTITIONS  7
 #define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
 
+/* Number of partitions of the shared buffer mapping hashtable */
+#define NUM_BUFFER_PARTITIONS NUM_LOCK_PARTITIONS
+
 /* Number of partitions the shared predicate lock tables are divided into */
 #define LOG2_NUM_PREDICATELOCK_PARTITIONS  4
 #define NUM_PREDICATELOCK_PARTITIONS  (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 6468e66..50cf928 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -37,7 +37,7 @@ extern void InitShmemAllocation(void);
 extern void *ShmemAlloc(Size size);
 extern bool ShmemAddrIsValid(const void *addr);
 extern void InitShmemIndex(void);
-extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
+extern HTAB *ShmemInitHash(const char *name, long max_size,
 			  HASHCTL *infoP, int hash_flags);
 extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
 extern Size add_size(Size s1, Size s2);