From 270ddf3e8e1fe99e92c45716f5d57396ee482d11 Mon Sep 17 00:00:00 2001
From: Rahila Syed <rahilasyed.90@gmail.com>
Date: Fri, 4 Apr 2025 14:22:05 +0530
Subject: [PATCH] Improve accounting for memory used by shared hash tables

pg_shmem_allocations tracks the memory allocated by ShmemInitStruct(),
but for shared hash tables that covered only the header and hash
directory.  The remaining parts (segments and buckets) were allocated
later using ShmemAlloc(), which does not update the shmem accounting.
Thus, these allocations were not shown in pg_shmem_allocations.

This commit improves the situation by allocating all the hash table
parts at once, using a single ShmemInitStruct() call. This way the
ShmemIndex entries (and thus pg_shmem_allocations) better reflect the
proper size of the hash table.u

This does not change anything for non-shared hash tables.

This changes the alignment a bit. ShmemAlloc() aligns the chunks using
CACHELINEALIGN(), which means some parts (header, directory, segments)
were aligned this way. Allocating all parts as a single chunk removes
this (implicit) alignment. We've considered adding explicit alignment,
but we've decided not to - it seems to be merely a coincidence due to
using the ShmemAlloc() API, not due to necessity.

Author: Rahila Syed <rahilasyed90@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAH2L28vHzRankszhqz7deXURxKncxfirnuW68zD7+hVAqaS5GQ@mail.gmail.com
---
 src/backend/storage/ipc/shmem.c   |   4 +-
 src/backend/utils/hash/dynahash.c | 242 ++++++++++++++++++++++++------
 src/include/utils/hsearch.h       |   3 +-
 3 files changed, 198 insertions(+), 51 deletions(-)

diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..a1b9180a64b 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -73,6 +73,7 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/dynahash.h"
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 
@@ -346,7 +347,8 @@ ShmemInitHash(const char *name,		/* table string name for shmem index */
 
 	/* look it up in the shmem index */
 	location = ShmemInitStruct(name,
-							   hash_get_shared_size(infoP, hash_flags),
+							   hash_get_shared_size(infoP, hash_flags,
+													init_size),
 							   &found);
 
 	/*
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 3f25929f2d8..53b84db0683 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -260,12 +260,39 @@ static long hash_accesses,
 			hash_expansions;
 #endif
 
+/* access to parts of the hash table, allocated as a single chunk */
+#define	HASH_DIRECTORY_PTR(hashp) \
+	(((char *) (hashp)->hctl) + sizeof(HASHHDR))
+
+#define HASH_SEGMENT_OFFSET(hctl, idx) \
+	(sizeof(HASHHDR) + \
+	 ((hctl)->dsize * sizeof(HASHSEGMENT)) + \
+	 ((hctl)->ssize * (idx) * sizeof(HASHBUCKET)))
+
+#define HASH_SEGMENT_PTR(hashp, idx) \
+	((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, (idx)))
+
+#define HASH_SEGMENT_SIZE(hashp) \
+	((hashp)->ssize * sizeof(HASHBUCKET))
+
+#define HASH_ELEMENTS_PTR(hashp, nsegs) \
+	((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, nsegs))
+
+/* Each element has a HASHELEMENT header plus user data. */
+#define HASH_ELEMENT_SIZE(hctl) \
+	(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN((hctl)->entrysize))
+
+#define HASH_ELEMENT_NEXT(hctl, num, ptr) \
+	((char *) (ptr) + ((num) * HASH_ELEMENT_SIZE(hctl)))
+
 /*
  * Private function prototypes
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
+static HASHELEMENT *element_alloc(HTAB *hashp, int nelem);
+static void element_add(HTAB *hashp, HASHELEMENT *firstElement,
+						int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
@@ -280,6 +307,9 @@ static int	next_pow2_int(long num);
 static void register_seq_scan(HTAB *hashp);
 static void deregister_seq_scan(HTAB *hashp);
 static bool has_seq_scans(HTAB *hashp);
+static void compute_buckets_and_segs(long nelem, long num_partitions,
+									 long ssize,
+									 int *nbuckets, int *nsegments);
 
 
 /*
@@ -568,12 +598,12 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
 
 	/*
+	 * For a private hash table, preallocate the requested number of elements
+	 * if it's less than our chosen nelem_alloc.  This avoids wasting space if
+	 * the caller correctly estimates a small table size.
+	 *
 	 * For a shared hash table, preallocate the requested number of elements.
 	 * This reduces problems with run-time out-of-shared-memory conditions.
-	 *
-	 * For a non-shared hash table, preallocate the requested number of
-	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
-	 * space if the caller correctly estimates a small table size.
 	 */
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
@@ -582,6 +612,7 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 					freelist_partitions,
 					nelem_alloc,
 					nelem_alloc_first;
+		void	   *ptr = NULL;
 
 		/*
 		 * If hash table is partitioned, give each freelist an equal share of
@@ -606,14 +637,42 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		else
 			nelem_alloc_first = nelem_alloc;
 
+		/*
+		 * For a shared hash table, calculate the offset at which to find the
+		 * first partition of elements. We have to skip space for the header,
+		 * segments and buckets.
+		 */
+		if (hashp->isshared)
+			ptr = HASH_ELEMENTS_PTR(hashp, hctl->nsegs);
+
 		for (i = 0; i < freelist_partitions; i++)
 		{
 			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
 
-			if (!element_alloc(hashp, temp, i))
-				ereport(ERROR,
-						(errcode(ERRCODE_OUT_OF_MEMORY),
-						 errmsg("out of memory")));
+			/*
+			 * Assign the correct location of each parition within a
+			 * pre-allocated buffer.
+			 *
+			 * Actual memory allocation happens in ShmemInitHash for shared
+			 * hash tables.
+			 *
+			 * We just need to split that allocation into per-batch freelists.
+			 */
+			if (hashp->isshared)
+			{
+				element_add(hashp, (HASHELEMENT *) ptr, temp, i);
+				ptr = HASH_ELEMENT_NEXT(hctl, temp, ptr);
+			}
+			else
+			{
+				HASHELEMENT *firstElement = element_alloc(hashp, temp);
+
+				if (!firstElement)
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
+				element_add(hashp, firstElement, temp, i);
+			}
 		}
 	}
 
@@ -702,29 +761,16 @@ init_htab(HTAB *hashp, long nelem)
 			SpinLockInit(&(hctl->freeList[i].mutex));
 
 	/*
-	 * Allocate space for the next greater power of two number of buckets,
-	 * assuming a desired maximum load factor of 1.
+	 * We've already calculated these parameters when we calculated how much
+	 * space to allocate in hash_get_shared_size(). Be careful to keep these
+	 * two places in sync, so that we get the same parameters.
 	 */
-	nbuckets = next_pow2_int(nelem);
-
-	/*
-	 * In a partitioned table, nbuckets must be at least equal to
-	 * num_partitions; were it less, keys with apparently different partition
-	 * numbers would map to the same bucket, breaking partition independence.
-	 * (Normally nbuckets will be much bigger; this is just a safety check.)
-	 */
-	while (nbuckets < hctl->num_partitions)
-		nbuckets <<= 1;
+	compute_buckets_and_segs(nelem, hctl->num_partitions, hctl->ssize,
+							 &nbuckets, &nsegs);
 
 	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
 	hctl->high_mask = (nbuckets << 1) - 1;
 
-	/*
-	 * Figure number of directory segments needed, round up to a power of 2
-	 */
-	nsegs = (nbuckets - 1) / hctl->ssize + 1;
-	nsegs = next_pow2_int(nsegs);
-
 	/*
 	 * Make sure directory is big enough. If pre-allocated directory is too
 	 * small, choke (caller screwed up).
@@ -748,12 +794,22 @@ init_htab(HTAB *hashp, long nelem)
 	}
 
 	/* Allocate initial segments */
+	i = 0;
 	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
 	{
-		*segp = seg_alloc(hashp);
-		if (*segp == NULL)
-			return false;
+		/* Assign initial segments, which are also pre-allocated */
+		if (hashp->isshared)
+		{
+			*segp = (HASHSEGMENT) HASH_SEGMENT_PTR(hashp, i++);
+			MemSet(*segp, 0, HASH_SEGMENT_SIZE(hashp));
+		}
+		else
+		{
+			*segp = seg_alloc(hashp);
+			i++;
+		}
 	}
+	Assert(i == nsegs);
 
 	/* Choose number of entries to allocate at a time */
 	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
@@ -846,16 +902,60 @@ hash_select_dirsize(long num_entries)
 }
 
 /*
- * Compute the required initial memory allocation for a shared-memory
- * hashtable with the given parameters.  We need space for the HASHHDR
- * and for the (non expansible) directory.
+ * hash_get_shared_size -- determine memory needed for a new shared dynamic hash table
+ *
+ *	info: hash table parameters
+ *	flags: bitmask indicating which parameters to take from *info
+ *	nelem: maximum number of elements expected
+ *
+ * Compute the required initial memory allocation for a hashtable with the given
+ * parameters. We need space for the HASHHDR, for the directory, segments and
+ * preallocated elements.
+ *
+ * For shared hash tables the directory size is non-expansive, and we preallocate
+ * all elements (nelem).
  */
 Size
-hash_get_shared_size(HASHCTL *info, int flags)
+hash_get_shared_size(const HASHCTL *info, int flags, long nelem)
 {
+	int			nbuckets;
+	int			nsegs;
+	int			num_partitions;
+	long		ssize;
+	long		dsize;
+	Size		elementSize = HASH_ELEMENT_SIZE(info);
+
+#ifdef USE_ASSERT_CHECKING
+	/* shared hash tables have non-expansive directory */
+	Assert(flags & HASH_SHARED_MEM);
 	Assert(flags & HASH_DIRSIZE);
 	Assert(info->dsize == info->max_dsize);
-	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
+#endif
+
+	dsize = info->dsize;
+
+	if (flags & HASH_SEGMENT)
+		ssize = info->ssize;
+	else
+		ssize = DEF_SEGSIZE;
+
+	if (flags & HASH_PARTITION)
+	{
+		num_partitions = info->num_partitions;
+
+		/* Number of entries should be atleast equal to the freelists */
+		if (nelem < NUM_FREELISTS)
+			nelem = NUM_FREELISTS;
+	}
+	else
+		num_partitions = 0;
+
+	compute_buckets_and_segs(nelem, num_partitions, ssize,
+							 &nbuckets, &nsegs);
+
+	return sizeof(HASHHDR) + dsize * sizeof(HASHSEGMENT)
+		+ sizeof(HASHBUCKET) * ssize * nsegs
+		+ nelem * elementSize;
 }
 
 
@@ -1285,7 +1385,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 		 * Failing because the needed element is in a different freelist is
 		 * not acceptable.
 		 */
-		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
+		if ((newElement = element_alloc(hashp, hctl->nelem_alloc)) == NULL)
 		{
 			int			borrow_from_idx;
 
@@ -1322,6 +1422,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 			/* no elements available to borrow either, so out of memory */
 			return NULL;
 		}
+		element_add(hashp, newElement, hctl->nelem_alloc, freelist_idx);
 	}
 
 	/* remove entry from freelist, bump nentries */
@@ -1700,29 +1801,43 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the indicated free list
+ * allocate some new elements
  */
-static bool
-element_alloc(HTAB *hashp, int nelem, int freelist_idx)
+static HASHELEMENT *
+element_alloc(HTAB *hashp, int nelem)
 {
 	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
-	HASHELEMENT *firstElement;
-	HASHELEMENT *tmpElement;
-	HASHELEMENT *prevElement;
-	int			i;
+	HASHELEMENT *firstElement = NULL;
 
 	if (hashp->isfixed)
-		return false;
+		return NULL;
 
 	/* Each element has a HASHELEMENT header plus user data. */
-	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
-
+	elementSize = HASH_ELEMENT_SIZE(hctl);
 	CurrentDynaHashCxt = hashp->hcxt;
 	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
 
 	if (!firstElement)
-		return false;
+		return NULL;
+
+	return firstElement;
+}
+
+/*
+ * link the elements allocated by element_alloc into the indicated free list
+ */
+static void
+element_add(HTAB *hashp, HASHELEMENT *firstElement, int nelem, int freelist_idx)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	Size		elementSize;
+	HASHELEMENT *tmpElement;
+	HASHELEMENT *prevElement;
+	int			i;
+
+	/* Each element has a HASHELEMENT header plus user data. */
+	elementSize = HASH_ELEMENT_SIZE(hctl);
 
 	/* prepare to link all the new entries into the freelist */
 	prevElement = NULL;
@@ -1744,8 +1859,6 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 
 	if (IS_PARTITIONED(hctl))
 		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
-
-	return true;
 }
 
 /*
@@ -1957,3 +2070,34 @@ AtEOSubXact_HashTables(bool isCommit, int nestDepth)
 		}
 	}
 }
+
+/*
+ * Calculate the number of buckets and segments to store the given
+ * number of elements in a hash table. Segments contain buckets which
+ * in turn contain elements.
+ */
+static void
+compute_buckets_and_segs(long nelem, long num_partitions, long ssize,
+						 int *nbuckets, int *nsegments)
+{
+	/*
+	 * Allocate space for the next greater power of two number of buckets,
+	 * assuming a desired maximum load factor of 1.
+	 */
+	*nbuckets = next_pow2_int(nelem);
+
+	/*
+	 * In a partitioned table, nbuckets must be at least equal to
+	 * num_partitions; were it less, keys with apparently different partition
+	 * numbers would map to the same bucket, breaking partition independence.
+	 * (Normally nbuckets will be much bigger; this is just a safety check.)
+	 */
+	while ((*nbuckets) < num_partitions)
+		(*nbuckets) <<= 1;
+
+	/*
+	 * Figure number of directory segments needed, round up to a power of 2
+	 */
+	*nsegments = ((*nbuckets) - 1) / ssize + 1;
+	*nsegments = next_pow2_int(*nsegments);
+}
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 932cc4f34d9..5cefda1c22c 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -151,7 +151,8 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
 extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(long num_entries, Size entrysize);
 extern long hash_select_dirsize(long num_entries);
-extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern Size hash_get_shared_size(const HASHCTL *info, int flags,
+								 long nelem);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 
-- 
2.34.1

