From 6d00b05e586467ec1b52a548f9f7dcbcd1b9c14f Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Thu, 21 Mar 2024 23:02:13 +0900
Subject: [PATCH v80 1/2] Rethink create and attach APIs of shared TidStore.

Previously, the behavior of TidStoreCreate() was inconsistent between
local and shared TidStore instances in terms of memory limitation. For
local TidStore, a memory context was created with initial and maximum
memory block sizes, as well as a minimum memory context size, based on
the specified memory limitation. However, for shared TidStore, the
provided DSA area was used for TID storage. Although commit XXX
allowed specifying the initial and maximum DSA segment sizes, callers
would have needed to clamp their own limits, which was not consistent
and user-friendly.

With this commit, when creating a shared TidStore, a dedicated DSA
area is created for TID storage instead of using a provided DSA
area. The initial and maximum DSA segment sizes are chosen based on
the specified max_bytes memory limitation. Other processes can attach
to the shared TidStore using the handle of the created DSA returned by
the new TidStoreGetDSA() function and the DSA pointer returned by
TidStoreGetHandle(). The created DSA has the same lifetime as the
shared TidStore and is deleted when all processes detach from it.

To improve clarity, the TidStoreCreate() function has been divided
into two separate functions: TidStoreCreateLocal() and
TidStoreCreateShared().

Reviewed-by: John Naylor
Discussion: https://postgr.es/m/CAD21AoAyc1j%3DBCdUqZfk6qbdjZ68UgRx1Gkpk0oah4K7S0Ri9g%40mail.gmail.com
---
 src/backend/access/common/tidstore.c          | 107 ++++++++++++++----
 src/include/access/tidstore.h                 |   7 +-
 .../modules/test_tidstore/test_tidstore.c     |  14 +--
 3 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c
index f79141590e..fcb8e839d2 100644
--- a/src/backend/access/common/tidstore.c
+++ b/src/backend/access/common/tidstore.c
@@ -7,9 +7,9 @@
  * Internally it uses a radix tree as the storage for TIDs. The key is the
  * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
  *
- * TidStore can be shared among parallel worker processes by passing DSA area
- * to TidStoreCreate(). Other backends can attach to the shared TidStore by
- * TidStoreAttach().
+ * TidStore can be shared among parallel worker processes by using
+ * TidStoreCreateShared(). Other backends can attach to the shared TidStore
+ * by TidStoreAttach().
  *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -111,15 +111,16 @@ static void tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
 /*
  * Create a TidStore. The TidStore will live in the memory context that is
  * CurrentMemoryContext at the time of this call. The TID storage, backed
- * by a radix tree, will live in its child memory context, rt_context. The
- * TidStore will be limited to (approximately) max_bytes total memory
- * consumption. If the 'area' is non-NULL, the radix tree is created in the
- * DSA area.
+ * by a radix tree, will live in its child memory context, rt_context.
  *
- * The returned object is allocated in backend-local memory.
+ * "max_bytes" is not an internally-enforced limit; it is used only as a
+ * hint to cap the memory block size of the memory context for TID storage.
+ * This reduces space wastage due to over-allocation. If the caller wants to
+ * monitor memory usage, it must compare its limit with the value reported
+ * by TidStoreMemoryUsage().
  */
 TidStore *
-TidStoreCreate(size_t max_bytes, dsa_area *area, int tranche_id)
+TidStoreCreateLocal(size_t max_bytes)
 {
 	TidStore   *ts;
 	size_t		initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
@@ -143,33 +144,74 @@ TidStoreCreate(size_t max_bytes, dsa_area *area, int tranche_id)
 										   initBlockSize,
 										   maxBlockSize);
 
-	if (area != NULL)
-	{
-		ts->tree.shared = shared_ts_create(ts->rt_context, area,
-										   tranche_id);
-		ts->area = area;
-	}
-	else
-		ts->tree.local = local_ts_create(ts->rt_context);
+	ts->tree.local = local_ts_create(ts->rt_context);
 
 	return ts;
 }
 
 /*
- * Attach to the shared TidStore using the given  handle. The returned object
- * is allocated in backend-local memory using the CurrentMemoryContext.
+ * Similar to TidStoreCreateLocal() but create a shared TidStore on a
+ * DSA area. The TID storage will live in the DSA area, and the memory
+ * context rt_context will have only meta data of the radix tree.
+ *
+ * The returned object is allocated in backend-local memory.
  */
 TidStore *
-TidStoreAttach(dsa_area *area, dsa_pointer handle)
+TidStoreCreateShared(size_t max_bytes, int tranche_id)
 {
 	TidStore   *ts;
+	dsa_area   *area;
+	size_t		dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
+	size_t		dsa_max_size = DSA_MAX_SEGMENT_SIZE;
 
-	Assert(area != NULL);
+	ts = palloc0(sizeof(TidStore));
+	ts->context = CurrentMemoryContext;
+
+	ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
+										   "TID storage meta data",
+										   ALLOCSET_SMALL_SIZES);
+
+	/*
+	 * Choose the initial and maximum DSA segment sizes to be no longer
+	 * than 1/8 of max_bytes.
+	 */
+	while (8 * dsa_max_size > max_bytes)
+		dsa_max_size >>= 1;
+
+	if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
+		dsa_max_size = DSA_MIN_SEGMENT_SIZE;
+
+	if (dsa_init_size > dsa_max_size)
+		dsa_init_size = dsa_max_size;
+
+	area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
+	ts->tree.shared = shared_ts_create(ts->rt_context, area,
+									   tranche_id);
+	ts->area = area;
+
+	return ts;
+}
+
+/*
+ * Attach to the shared TidStore. 'area_handle' is the DSA handle where
+ * the TidStore is created. 'handle' is the dsa_pointer returned by
+ * TidStoreGetHandle(). The returned object is allocated in backend-local
+ * memory using the CurrentMemoryContext.
+ */
+TidStore *
+TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
+{
+	TidStore   *ts;
+	dsa_area   *area;
+
+	Assert(area_handle != DSA_HANDLE_INVALID);
 	Assert(DsaPointerIsValid(handle));
 
 	/* create per-backend state */
 	ts = palloc0(sizeof(TidStore));
 
+	area = dsa_attach(area_handle);
+
 	/* Find the shared the shared radix tree */
 	ts->tree.shared = shared_ts_attach(area, handle);
 	ts->area = area;
@@ -178,10 +220,8 @@ TidStoreAttach(dsa_area *area, dsa_pointer handle)
 }
 
 /*
- * Detach from a TidStore. This detaches from radix tree and frees the
- * backend-local resources. The radix tree will continue to exist until
- * it is either explicitly destroyed, or the area that backs it is returned
- * to the operating system.
+ * Detach from a TidStore. This also detaches from radix tree and frees
+ * the backend-local resources.
  */
 void
 TidStoreDetach(TidStore *ts)
@@ -189,6 +229,8 @@ TidStoreDetach(TidStore *ts)
 	Assert(TidStoreIsShared(ts));
 
 	shared_ts_detach(ts->tree.shared);
+	dsa_detach(ts->area);
+
 	pfree(ts);
 }
 
@@ -234,7 +276,11 @@ TidStoreDestroy(TidStore *ts)
 {
 	/* Destroy underlying radix tree */
 	if (TidStoreIsShared(ts))
+	{
 		shared_ts_free(ts->tree.shared);
+
+		dsa_detach(ts->area);
+	}
 	else
 		local_ts_free(ts->tree.local);
 
@@ -420,6 +466,17 @@ TidStoreMemoryUsage(TidStore *ts)
 		return local_ts_memory_usage(ts->tree.local);
 }
 
+/*
+ * Return the DSA area where the TidStore lives.
+ */
+dsa_area *
+TidStoreGetDSA(TidStore *ts)
+{
+	Assert(TidStoreIsShared(ts));
+
+	return ts->area;
+}
+
 dsa_pointer
 TidStoreGetHandle(TidStore *ts)
 {
diff --git a/src/include/access/tidstore.h b/src/include/access/tidstore.h
index 09f7a9a474..1de8aa8035 100644
--- a/src/include/access/tidstore.h
+++ b/src/include/access/tidstore.h
@@ -29,9 +29,9 @@ typedef struct TidStoreIterResult
 	OffsetNumber *offsets;
 } TidStoreIterResult;
 
-extern TidStore *TidStoreCreate(size_t max_bytes, dsa_area *area,
-								int tranche_id);
-extern TidStore *TidStoreAttach(dsa_area *area, dsa_pointer handle);
+extern TidStore *TidStoreCreateLocal(size_t max_bytes);
+extern TidStore *TidStoreCreateShared(size_t max_bytes, int tranche_id);
+extern TidStore *TidStoreAttach(dsa_handle dsa_handle, dsa_pointer handle);
 extern void TidStoreDetach(TidStore *ts);
 extern void TidStoreLockExclusive(TidStore *ts);
 extern void TidStoreLockShare(TidStore *ts);
@@ -45,5 +45,6 @@ extern TidStoreIterResult *TidStoreIterateNext(TidStoreIter *iter);
 extern void TidStoreEndIterate(TidStoreIter *iter);
 extern size_t TidStoreMemoryUsage(TidStore *ts);
 extern dsa_pointer TidStoreGetHandle(TidStore *ts);
+extern dsa_area *TidStoreGetDSA(TidStore *ts);
 
 #endif							/* TIDSTORE_H */
diff --git a/src/test/modules/test_tidstore/test_tidstore.c b/src/test/modules/test_tidstore/test_tidstore.c
index c74ad2cf8b..3d4af77dda 100644
--- a/src/test/modules/test_tidstore/test_tidstore.c
+++ b/src/test/modules/test_tidstore/test_tidstore.c
@@ -34,7 +34,6 @@ PG_FUNCTION_INFO_V1(test_is_full);
 PG_FUNCTION_INFO_V1(test_destroy);
 
 static TidStore *tidstore = NULL;
-static dsa_area *dsa = NULL;
 static size_t tidstore_empty_size;
 
 /* array for verification of some tests */
@@ -94,7 +93,6 @@ test_create(PG_FUNCTION_ARGS)
 	size_t		array_init_size = 1024;
 
 	Assert(tidstore == NULL);
-	Assert(dsa == NULL);
 
 	/*
 	 * Create the TidStore on TopMemoryContext so that the same process use it
@@ -109,18 +107,16 @@ test_create(PG_FUNCTION_ARGS)
 		tranche_id = LWLockNewTrancheId();
 		LWLockRegisterTranche(tranche_id, "test_tidstore");
 
-		dsa = dsa_create(tranche_id);
+		tidstore = TidStoreCreateShared(tidstore_max_size, tranche_id);
 
 		/*
 		 * Remain attached until end of backend or explicitly detached so that
 		 * the same process use the tidstore for subsequent tests.
 		 */
-		dsa_pin_mapping(dsa);
-
-		tidstore = TidStoreCreate(tidstore_max_size, dsa, tranche_id);
+		dsa_pin_mapping(TidStoreGetDSA(tidstore));
 	}
 	else
-		tidstore = TidStoreCreate(tidstore_max_size, NULL, 0);
+		tidstore = TidStoreCreateLocal(tidstore_max_size);
 
 	tidstore_empty_size = TidStoreMemoryUsage(tidstore);
 
@@ -309,9 +305,5 @@ test_destroy(PG_FUNCTION_ARGS)
 	pfree(items.lookup_tids);
 	pfree(items.iter_tids);
 
-	if (dsa)
-		dsa_detach(dsa);
-	dsa = NULL;
-
 	PG_RETURN_VOID();
 }
-- 
2.39.3

