From c13a7133ed455842b685426217a7b5079e6fc869 Mon Sep 17 00:00:00 2001
From: Rahila Syed <rahilasyed.90@gmail.com>
Date: Fri, 21 Feb 2025 15:08:12 +0530
Subject: [PATCH 1/2] Account for initial shared memory allocated during
 hash_create.

pg_shmem_allocations tracks the memory allocated by ShmemInitStruct,
which, in case of shared hash tables, only covers memory allocated
to the hash directory and control structure. The hash segments and
buckets are allocated using ShmemAllocNoError which does not attribute
the allocations to the hash table name. Thus, these allocations are
not tracked in pg_shmem_allocations.

Improve this include the alocation of segments and buckets or elements
in the initial allocation of shared hash directory. Since this adds numbers
to existing hash table entries, the resulting tuples in pg_shmem_allocations
represent the total size of the initial hash table including all the
buckets and the elements they contain, instead of just the directory size.
---
 src/backend/storage/ipc/shmem.c   |   3 +-
 src/backend/utils/hash/dynahash.c | 146 ++++++++++++++++++++++--------
 src/include/utils/dynahash.h      |   2 +
 src/include/utils/hsearch.h       |   2 +-
 4 files changed, 111 insertions(+), 42 deletions(-)

diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39..c56e9b6c77 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -73,6 +73,7 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/dynahash.h"
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 
@@ -346,7 +347,7 @@ ShmemInitHash(const char *name,		/* table string name for shmem index */
 
 	/* look it up in the shmem index */
 	location = ShmemInitStruct(name,
-							   hash_get_shared_size(infoP, hash_flags),
+							   hash_get_shared_size(infoP, hash_flags, init_size),
 							   &found);
 
 	/*
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index cd5a00132f..5203f5b30b 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -120,7 +120,6 @@
  * a good idea of the maximum number of entries!).  For non-shared hash
  * tables, the initial directory size can be left at the default.
  */
-#define DEF_SEGSIZE			   256
 #define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
 #define DEF_DIRSIZE			   256
 
@@ -265,7 +264,7 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx, HASHELEMENT *firstElement);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
@@ -276,11 +275,10 @@ static void hash_corrupted(HTAB *hashp) pg_attribute_noreturn();
 static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
 								  HASHBUCKET **bucketptr);
 static long next_pow2_long(long num);
-static int	next_pow2_int(long num);
 static void register_seq_scan(HTAB *hashp);
 static void deregister_seq_scan(HTAB *hashp);
 static bool has_seq_scans(HTAB *hashp);
-
+static int find_num_of_segs(long nelem, int *nbuckets, long num_partitions, long ssize);
 
 /*
  * memory allocation support
@@ -468,7 +466,11 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 	else
 		hashp->keycopy = memcpy;
 
-	/* And select the entry allocation function, too. */
+	/*
+	 * And select the entry allocation function, too. XXX should this also
+	 * Assert that flags & HASH_SHARED_MEM is true, since HASH_ALLOC is
+	 * currently only set with HASH_SHARED_MEM *
+	 */
 	if (flags & HASH_ALLOC)
 		hashp->alloc = info->alloc;
 	else
@@ -518,6 +520,7 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 
 	hashp->frozen = false;
 
+	/* Initializing the HASHHDR variables with default values */
 	hdefault(hashp);
 
 	hctl = hashp->hctl;
@@ -582,7 +585,8 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 					freelist_partitions,
 					nelem_alloc,
 					nelem_alloc_first;
-
+		void *curr_offset;
+	
 		/*
 		 * If hash table is partitioned, give each freelist an equal share of
 		 * the initial allocation.  Otherwise only freeList[0] is used.
@@ -592,6 +596,20 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		else
 			freelist_partitions = 1;
 
+		/*
+		 * If table is shared, calculate the offset at which to find the
+		 * the first partition of elements
+		 */
+		if (hashp->isshared)
+		{
+			int			nsegs;
+			int			nbuckets;
+			nsegs = find_num_of_segs(nelem, &nbuckets, hctl->num_partitions, hctl->ssize);
+			
+			curr_offset =  (((char *) hashp->hctl) + sizeof(HASHHDR) + (info->dsize * sizeof(HASHSEGMENT)) +
+                        + (sizeof(HASHBUCKET) * hctl->ssize * nsegs));
+		}
+
 		nelem_alloc = nelem / freelist_partitions;
 		if (nelem_alloc <= 0)
 			nelem_alloc = 1;
@@ -609,11 +627,20 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		for (i = 0; i < freelist_partitions; i++)
 		{
 			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
-
-			if (!element_alloc(hashp, temp, i))
+			HASHELEMENT *firstElement;
+ 			Size elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+
+			/* Memory is allocated as part of initial allocation in ShmemInitHash */
+			if (hashp->isshared)
+				firstElement = (HASHELEMENT *) curr_offset;
+			else
+				firstElement = NULL;		
+				
+			if (!element_alloc(hashp, temp, i, firstElement))
 				ereport(ERROR,
 						(errcode(ERRCODE_OUT_OF_MEMORY),
 						 errmsg("out of memory")));
+			curr_offset = (((char *)curr_offset) + (temp * elementSize));
 		}
 	}
 
@@ -693,7 +720,7 @@ init_htab(HTAB *hashp, long nelem)
 	int			nbuckets;
 	int			nsegs;
 	int			i;
-
+	
 	/*
 	 * initialize mutexes if it's a partitioned table
 	 */
@@ -701,30 +728,11 @@ init_htab(HTAB *hashp, long nelem)
 		for (i = 0; i < NUM_FREELISTS; i++)
 			SpinLockInit(&(hctl->freeList[i].mutex));
 
-	/*
-	 * Allocate space for the next greater power of two number of buckets,
-	 * assuming a desired maximum load factor of 1.
-	 */
-	nbuckets = next_pow2_int(nelem);
-
-	/*
-	 * In a partitioned table, nbuckets must be at least equal to
-	 * num_partitions; were it less, keys with apparently different partition
-	 * numbers would map to the same bucket, breaking partition independence.
-	 * (Normally nbuckets will be much bigger; this is just a safety check.)
-	 */
-	while (nbuckets < hctl->num_partitions)
-		nbuckets <<= 1;
+	nsegs = find_num_of_segs(nelem, &nbuckets, hctl->num_partitions, hctl->ssize);
 
 	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
 	hctl->high_mask = (nbuckets << 1) - 1;
 
-	/*
-	 * Figure number of directory segments needed, round up to a power of 2
-	 */
-	nsegs = (nbuckets - 1) / hctl->ssize + 1;
-	nsegs = next_pow2_int(nsegs);
-
 	/*
 	 * Make sure directory is big enough. If pre-allocated directory is too
 	 * small, choke (caller screwed up).
@@ -748,11 +756,19 @@ init_htab(HTAB *hashp, long nelem)
 	}
 
 	/* Allocate initial segments */
+	i = 0;
 	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
 	{
-		*segp = seg_alloc(hashp);
+		if (hashp->isshared)
+			*segp = (HASHBUCKET *) (((char *) hashp->hctl)
+				+ sizeof(HASHHDR)
+				+ (hashp->hctl->dsize * sizeof(HASHSEGMENT))
+				+ (i * sizeof(HASHBUCKET) * hashp->ssize)); 
+		else
+			*segp = seg_alloc(hashp);
 		if (*segp == NULL)
 			return false;
+		i = i + 1;
 	}
 
 	/* Choose number of entries to allocate at a time */
@@ -851,11 +867,32 @@ hash_select_dirsize(long num_entries)
  * and for the (non expansible) directory.
  */
 Size
-hash_get_shared_size(HASHCTL *info, int flags)
+hash_get_shared_size(HASHCTL *info, int flags, long init_size)
 {
+	int nbuckets;
+	int nsegs;
+	int num_partitions;
+	int ssize;
+	Size elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(info->entrysize);
+
 	Assert(flags & HASH_DIRSIZE);
 	Assert(info->dsize == info->max_dsize);
-	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
+
+	if (flags & HASH_PARTITION)
+		num_partitions = info->num_partitions;
+	else
+		num_partitions = 0;
+
+	if (flags & HASH_SEGMENT)
+		ssize = info->ssize;
+	else
+		ssize = DEF_SEGSIZE;
+
+	nsegs = find_num_of_segs(init_size, &nbuckets, num_partitions, ssize);
+
+	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT) +
+			+ sizeof(HASHBUCKET) * ssize * nsegs
+			+ init_size * elementSize;
 }
 
 
@@ -1285,7 +1322,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 		 * Failing because the needed element is in a different freelist is
 		 * not acceptable.
 		 */
-		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
+		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx, NULL))
 		{
 			int			borrow_from_idx;
 
@@ -1689,6 +1726,7 @@ seg_alloc(HTAB *hashp)
 	HASHSEGMENT segp;
 
 	CurrentDynaHashCxt = hashp->hcxt;
+	
 	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
 
 	if (!segp)
@@ -1703,11 +1741,10 @@ seg_alloc(HTAB *hashp)
  * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem, int freelist_idx)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx, HASHELEMENT *firstElement)
 {
 	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
-	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
 	HASHELEMENT *prevElement;
 	int			i;
@@ -1717,10 +1754,11 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 
 	/* Each element has a HASHELEMENT header plus user data. */
 	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
-
-	CurrentDynaHashCxt = hashp->hcxt;
-	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
-
+	if (!firstElement)
+	{
+		CurrentDynaHashCxt = hashp->hcxt;
+		firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
+	}
 	if (!firstElement)
 		return false;
 
@@ -1816,7 +1854,7 @@ next_pow2_long(long num)
 }
 
 /* calculate first power of 2 >= num, bounded to what will fit in an int */
-static int
+int
 next_pow2_int(long num)
 {
 	if (num > INT_MAX / 2)
@@ -1957,3 +1995,31 @@ AtEOSubXact_HashTables(bool isCommit, int nestDepth)
 		}
 	}
 }
+
+static int
+find_num_of_segs(long nelem, int *nbuckets, long num_partitions, long ssize)
+{
+	int nsegs;
+	/*
+	 * Allocate space for the next greater power of two number of buckets,
+	 * assuming a desired maximum load factor of 1.
+	 */
+	*nbuckets = next_pow2_int(nelem);
+
+	/*
+	 * In a partitioned table, nbuckets must be at least equal to
+	 * num_partitions; were it less, keys with apparently different partition
+	 * numbers would map to the same bucket, breaking partition independence.
+	 * (Normally nbuckets will be much bigger; this is just a safety check.)
+	 */
+	while ((*nbuckets) < num_partitions)
+		(*nbuckets) <<= 1;
+
+
+	/*
+	 * Figure number of directory segments needed, round up to a power of 2
+	 */
+	nsegs = ((*nbuckets) - 1) / ssize + 1;
+	nsegs = next_pow2_int(nsegs);
+	return nsegs;
+}
diff --git a/src/include/utils/dynahash.h b/src/include/utils/dynahash.h
index 8a31d9524e..9393018ef6 100644
--- a/src/include/utils/dynahash.h
+++ b/src/include/utils/dynahash.h
@@ -15,6 +15,8 @@
 #ifndef DYNAHASH_H
 #define DYNAHASH_H
 
+#define DEF_SEGSIZE			   256
 extern int	my_log2(long num);
+extern int	next_pow2_int(long num);
 
 #endif							/* DYNAHASH_H */
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 932cc4f34d..5e16bd4183 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -151,7 +151,7 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
 extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(long num_entries, Size entrysize);
 extern long hash_select_dirsize(long num_entries);
-extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern Size hash_get_shared_size(HASHCTL *info, int flags, long init_size);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 
-- 
2.34.1

