On Mon, Apr 1, 2024 at 9:54 AM Masahiko Sawada <sawada.m...@gmail.com> wrote:
>
> Thank you! I've attached the patch that I'm going to push tomorrow.

Excellent!

I've attached a mostly-polished update on runtime embeddable values,
storing up to 3 offsets in the child pointer (1 on 32-bit platforms).
As discussed, this includes a macro to cap max possible offset that
can be stored in the bitmap, which I believe only reduces the valid
offset range for 32kB pages on 32-bit platforms. Even there, it allows
for more line pointers than can possibly be useful. It also splits
into two parts for readability. It would be committed in two pieces as
well, since they are independently useful.
From 24bd672deb4a6fa14abfc8583b500d1cbc332032 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 5 Apr 2024 17:04:12 +0700
Subject: [PATCH v83 1/3] store offsets in the header

---
 src/backend/access/common/tidstore.c          | 52 +++++++++++++++++++
 .../test_tidstore/expected/test_tidstore.out  | 14 +++++
 .../test_tidstore/sql/test_tidstore.sql       |  5 ++
 3 files changed, 71 insertions(+)

diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c
index e1a7e82469..4eb5d46951 100644
--- a/src/backend/access/common/tidstore.c
+++ b/src/backend/access/common/tidstore.c
@@ -34,6 +34,9 @@
 /* number of active words for a page: */
 #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
 
+/* number of offsets we can store in the header of a BlocktableEntry */
+#define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint16)) / sizeof(OffsetNumber))
+
 /*
  * This is named similarly to PagetableEntry in tidbitmap.c
  * because the two have a similar function.
@@ -41,6 +44,10 @@
 typedef struct BlocktableEntry
 {
 	uint16		nwords;
+
+	/* We can store a small number of offsets here to avoid wasting space with a sparse bitmap. */
+	OffsetNumber full_offsets[NUM_FULL_OFFSETS];
+
 	bitmapword	words[FLEXIBLE_ARRAY_MEMBER];
 } BlocktableEntry;
 #define MaxBlocktableEntrySize \
@@ -316,6 +323,25 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 	for (int i = 1; i < num_offsets; i++)
 		Assert(offsets[i] > offsets[i - 1]);
 
+	memset(page, 0, offsetof(BlocktableEntry, words));
+
+	if (num_offsets <= NUM_FULL_OFFSETS)
+	{
+		for (int i = 0; i < num_offsets; i++)
+		{
+			OffsetNumber off = offsets[i];
+
+			/* safety check to ensure we don't overrun bit array bounds */
+			if (!OffsetNumberIsValid(off))
+				elog(ERROR, "tuple offset out of range: %u", off);
+
+			page->full_offsets[i] = off;
+		}
+
+		page->nwords = 0;
+	}
+	else
+	{
 	for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
 		 wordnum <= WORDNUM(offsets[num_offsets - 1]);
 		 wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
@@ -343,6 +369,7 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 
 	page->nwords = wordnum;
 	Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
+}
 
 	if (TidStoreIsShared(ts))
 		shared_ts_set(ts->tree.shared, blkno, page);
@@ -369,6 +396,18 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
 	if (page == NULL)
 		return false;
 
+	if (page->nwords == 0)
+	{
+		/* we have offsets in the header */
+		for (int i = 0; i < NUM_FULL_OFFSETS; i++)
+		{
+			if (page->full_offsets[i] == off)
+				return true;
+		}
+		return false;
+	}
+	else
+	{
 	wordnum = WORDNUM(off);
 	bitnum = BITNUM(off);
 
@@ -378,6 +417,7 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
 
 	return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
 }
+}
 
 /*
  * Prepare to iterate through a TidStore.
@@ -496,6 +536,17 @@ tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
 	result->num_offsets = 0;
 	result->blkno = blkno;
 
+	if (page->nwords == 0)
+	{
+		/* we have offsets in the header */
+		for (int i = 0; i < NUM_FULL_OFFSETS; i++)
+		{
+			if (page->full_offsets[i] != InvalidOffsetNumber)
+				result->offsets[result->num_offsets++] = page->full_offsets[i];
+		}
+	}
+	else
+	{
 	for (wordnum = 0; wordnum < page->nwords; wordnum++)
 	{
 		bitmapword	w = page->words[wordnum];
@@ -518,3 +569,4 @@ tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
 		}
 	}
 }
+}
diff --git a/src/test/modules/test_tidstore/expected/test_tidstore.out b/src/test/modules/test_tidstore/expected/test_tidstore.out
index 0ae2f970da..c40779859b 100644
--- a/src/test/modules/test_tidstore/expected/test_tidstore.out
+++ b/src/test/modules/test_tidstore/expected/test_tidstore.out
@@ -36,6 +36,20 @@ SELECT do_set_block_offsets(blk, array_agg(off)::int2[])
     (VALUES (0), (1), (:maxblkno / 2), (:maxblkno - 1), (:maxblkno)) AS blocks(blk),
     (VALUES (1), (2), (:maxoffset / 2), (:maxoffset - 1), (:maxoffset)) AS offsets(off)
   GROUP BY blk;
+-- Test offsets embedded in the bitmap header.
+SELECT do_set_block_offsets(501, array[greatest((random() * :maxoffset)::int, 1)]::int2[]);
+ do_set_block_offsets 
+----------------------
+                  501
+(1 row)
+
+SELECT do_set_block_offsets(502, array_agg(DISTINCT greatest((random() * :maxoffset)::int, 1))::int2[])
+  FROM generate_series(1, 3);
+ do_set_block_offsets 
+----------------------
+                  502
+(1 row)
+
 -- Add enough TIDs to cause the store to appear "full", compared
 -- to the allocated memory it started out with. This is easier
 -- with memory contexts in local memory.
diff --git a/src/test/modules/test_tidstore/sql/test_tidstore.sql b/src/test/modules/test_tidstore/sql/test_tidstore.sql
index e5edfbb264..1f4e4a807e 100644
--- a/src/test/modules/test_tidstore/sql/test_tidstore.sql
+++ b/src/test/modules/test_tidstore/sql/test_tidstore.sql
@@ -28,6 +28,11 @@ SELECT do_set_block_offsets(blk, array_agg(off)::int2[])
     (VALUES (1), (2), (:maxoffset / 2), (:maxoffset - 1), (:maxoffset)) AS offsets(off)
   GROUP BY blk;
 
+-- Test offsets embedded in the bitmap header.
+SELECT do_set_block_offsets(501, array[greatest((random() * :maxoffset)::int, 1)]::int2[]);
+SELECT do_set_block_offsets(502, array_agg(DISTINCT greatest((random() * :maxoffset)::int, 1))::int2[])
+  FROM generate_series(1, 3);
+
 -- Add enough TIDs to cause the store to appear "full", compared
 -- to the allocated memory it started out with. This is easier
 -- with memory contexts in local memory.
-- 
2.44.0

From 89d5abe4f0b945d07645ac7ead252c8aafe09331 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 5 Apr 2024 17:06:55 +0700
Subject: [PATCH v83 2/3] pgindent

---
 src/backend/access/common/tidstore.c | 99 ++++++++++++++--------------
 1 file changed, 51 insertions(+), 48 deletions(-)

diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c
index 4eb5d46951..26eb52948b 100644
--- a/src/backend/access/common/tidstore.c
+++ b/src/backend/access/common/tidstore.c
@@ -45,7 +45,10 @@ typedef struct BlocktableEntry
 {
 	uint16		nwords;
 
-	/* We can store a small number of offsets here to avoid wasting space with a sparse bitmap. */
+	/*
+	 * We can store a small number of offsets here to avoid wasting space with
+	 * a sparse bitmap.
+	 */
 	OffsetNumber full_offsets[NUM_FULL_OFFSETS];
 
 	bitmapword	words[FLEXIBLE_ARRAY_MEMBER];
@@ -342,35 +345,35 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 	}
 	else
 	{
-	for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
-		 wordnum <= WORDNUM(offsets[num_offsets - 1]);
-		 wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
-	{
-		word = 0;
-
-		while (idx < num_offsets)
+		for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
+			 wordnum <= WORDNUM(offsets[num_offsets - 1]);
+			 wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
 		{
-			OffsetNumber off = offsets[idx];
+			word = 0;
 
-			/* safety check to ensure we don't overrun bit array bounds */
-			if (!OffsetNumberIsValid(off))
-				elog(ERROR, "tuple offset out of range: %u", off);
+			while (idx < num_offsets)
+			{
+				OffsetNumber off = offsets[idx];
+
+				/* safety check to ensure we don't overrun bit array bounds */
+				if (!OffsetNumberIsValid(off))
+					elog(ERROR, "tuple offset out of range: %u", off);
 
-			if (off >= next_word_threshold)
-				break;
+				if (off >= next_word_threshold)
+					break;
 
-			word |= ((bitmapword) 1 << BITNUM(off));
-			idx++;
+				word |= ((bitmapword) 1 << BITNUM(off));
+				idx++;
+			}
+
+			/* write out offset bitmap for this wordnum */
+			page->words[wordnum] = word;
 		}
 
-		/* write out offset bitmap for this wordnum */
-		page->words[wordnum] = word;
+		page->nwords = wordnum;
+		Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
 	}
 
-	page->nwords = wordnum;
-	Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
-}
-
 	if (TidStoreIsShared(ts))
 		shared_ts_set(ts->tree.shared, blkno, page);
 	else
@@ -408,15 +411,15 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
 	}
 	else
 	{
-	wordnum = WORDNUM(off);
-	bitnum = BITNUM(off);
+		wordnum = WORDNUM(off);
+		bitnum = BITNUM(off);
 
-	/* no bitmap for the off */
-	if (wordnum >= page->nwords)
-		return false;
+		/* no bitmap for the off */
+		if (wordnum >= page->nwords)
+			return false;
 
-	return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
-}
+		return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
+	}
 }
 
 /*
@@ -547,26 +550,26 @@ tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
 	}
 	else
 	{
-	for (wordnum = 0; wordnum < page->nwords; wordnum++)
-	{
-		bitmapword	w = page->words[wordnum];
-		int			off = wordnum * BITS_PER_BITMAPWORD;
-
-		/* Make sure there is enough space to add offsets */
-		if ((result->num_offsets + BITS_PER_BITMAPWORD) > result->max_offset)
+		for (wordnum = 0; wordnum < page->nwords; wordnum++)
 		{
-			result->max_offset *= 2;
-			result->offsets = repalloc(result->offsets,
-									   sizeof(OffsetNumber) * result->max_offset);
-		}
-
-		while (w != 0)
-		{
-			if (w & 1)
-				result->offsets[result->num_offsets++] = (OffsetNumber) off;
-			off++;
-			w >>= 1;
+			bitmapword	w = page->words[wordnum];
+			int			off = wordnum * BITS_PER_BITMAPWORD;
+
+			/* Make sure there is enough space to add offsets */
+			if ((result->num_offsets + BITS_PER_BITMAPWORD) > result->max_offset)
+			{
+				result->max_offset *= 2;
+				result->offsets = repalloc(result->offsets,
+										   sizeof(OffsetNumber) * result->max_offset);
+			}
+
+			while (w != 0)
+			{
+				if (w & 1)
+					result->offsets[result->num_offsets++] = (OffsetNumber) off;
+				off++;
+				w >>= 1;
+			}
 		}
 	}
 }
-}
-- 
2.44.0

From 8547efd90cb438a2c9624a5dfbeef945e1ca97ac Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 5 Apr 2024 17:45:59 +0700
Subject: [PATCH v83 3/3] Teach radix tree to embed values at runtime

Previously, the decision to store values in leaves or within the child
pointer was made at compile time, with variable length values using
leaves by necessity. This commit allows introspecting the length of
variable length values at runtime for that decision. This requires
the ability to tell whether the last-level child pointer is actually
a value, so we use a pointer tag in the lowest level bit.

Use this in TID store. This entails adding a byte to the header to
reserve space for the tag. Commit XXXXXXXXX stores up to three offsets
within the header with no bitmap, and now the header can be embedded
as above. This reduces worst-case memory usage when TIDs are sparse.

Discussion: https://postgr.es/m/
---
 src/backend/access/common/tidstore.c | 79 ++++++++++++++++++++--------
 src/include/lib/radixtree.h          | 32 +++++++++++
 2 files changed, 88 insertions(+), 23 deletions(-)

diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c
index 26eb52948b..5784223b76 100644
--- a/src/backend/access/common/tidstore.c
+++ b/src/backend/access/common/tidstore.c
@@ -43,19 +43,50 @@
  */
 typedef struct BlocktableEntry
 {
-	uint16		nwords;
-
-	/*
-	 * We can store a small number of offsets here to avoid wasting space with
-	 * a sparse bitmap.
-	 */
-	OffsetNumber full_offsets[NUM_FULL_OFFSETS];
+	union
+	{
+		struct
+		{
+#ifndef WORDS_BIGENDIAN
+			/*
+			 * We need to position this member so that the backing radix tree
+			 * can use the lowest bit for a pointer tag. In particular, it
+			 * must be placed within 'header' so that it corresponds to the
+			 * lowest byte in 'ptr'. We position 'nwords' along with it to
+			 * avoid struct padding.
+			 */
+			uint8		flags;
+
+			int8		nwords;
+#endif
+
+			/*
+			 * We can store a small number of offsets here to avoid wasting
+			 * space with a sparse bitmap.
+			 */
+			OffsetNumber full_offsets[NUM_FULL_OFFSETS];
+
+#ifdef WORDS_BIGENDIAN
+			int8		nwords;
+			uint8		flags;
+#endif
+		};
+		uintptr_t	ptr;
+	}			header;
 
 	bitmapword	words[FLEXIBLE_ARRAY_MEMBER];
 } BlocktableEntry;
+
+/*
+ * The type of 'nwords' limits the max number of words in the 'words' array.
+ * This computes the max offset we can actually store in the bitmap. In
+ * practice, it's almost always the same as MaxOffsetNumber.
+ */
+#define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
+
 #define MaxBlocktableEntrySize \
 	offsetof(BlocktableEntry, words) + \
-		(sizeof(bitmapword) * WORDS_PER_PAGE(MaxOffsetNumber))
+		(sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
 
 #define RT_PREFIX local_ts
 #define RT_SCOPE static
@@ -64,7 +95,8 @@ typedef struct BlocktableEntry
 #define RT_VALUE_TYPE BlocktableEntry
 #define RT_VARLEN_VALUE_SIZE(page) \
 	(offsetof(BlocktableEntry, words) + \
-	sizeof(bitmapword) * (page)->nwords)
+	sizeof(bitmapword) * (page)->header.nwords)
+#define RT_RUNTIME_EMBEDDABLE_VALUE
 #include "lib/radixtree.h"
 
 #define RT_PREFIX shared_ts
@@ -75,7 +107,8 @@ typedef struct BlocktableEntry
 #define RT_VALUE_TYPE BlocktableEntry
 #define RT_VARLEN_VALUE_SIZE(page) \
 	(offsetof(BlocktableEntry, words) + \
-	sizeof(bitmapword) * (page)->nwords)
+	sizeof(bitmapword) * (page)->header.nwords)
+#define RT_RUNTIME_EMBEDDABLE_VALUE
 #include "lib/radixtree.h"
 
 /* Per-backend state for a TidStore */
@@ -335,13 +368,13 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 			OffsetNumber off = offsets[i];
 
 			/* safety check to ensure we don't overrun bit array bounds */
-			if (!OffsetNumberIsValid(off))
+			if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
 				elog(ERROR, "tuple offset out of range: %u", off);
 
-			page->full_offsets[i] = off;
+			page->header.full_offsets[i] = off;
 		}
 
-		page->nwords = 0;
+		page->header.nwords = 0;
 	}
 	else
 	{
@@ -356,7 +389,7 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 				OffsetNumber off = offsets[idx];
 
 				/* safety check to ensure we don't overrun bit array bounds */
-				if (!OffsetNumberIsValid(off))
+				if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
 					elog(ERROR, "tuple offset out of range: %u", off);
 
 				if (off >= next_word_threshold)
@@ -370,8 +403,8 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
 			page->words[wordnum] = word;
 		}
 
-		page->nwords = wordnum;
-		Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
+		page->header.nwords = wordnum;
+		Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
 	}
 
 	if (TidStoreIsShared(ts))
@@ -399,12 +432,12 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
 	if (page == NULL)
 		return false;
 
-	if (page->nwords == 0)
+	if (page->header.nwords == 0)
 	{
 		/* we have offsets in the header */
 		for (int i = 0; i < NUM_FULL_OFFSETS; i++)
 		{
-			if (page->full_offsets[i] == off)
+			if (page->header.full_offsets[i] == off)
 				return true;
 		}
 		return false;
@@ -415,7 +448,7 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
 		bitnum = BITNUM(off);
 
 		/* no bitmap for the off */
-		if (wordnum >= page->nwords)
+		if (wordnum >= page->header.nwords)
 			return false;
 
 		return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
@@ -539,18 +572,18 @@ tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
 	result->num_offsets = 0;
 	result->blkno = blkno;
 
-	if (page->nwords == 0)
+	if (page->header.nwords == 0)
 	{
 		/* we have offsets in the header */
 		for (int i = 0; i < NUM_FULL_OFFSETS; i++)
 		{
-			if (page->full_offsets[i] != InvalidOffsetNumber)
-				result->offsets[result->num_offsets++] = page->full_offsets[i];
+			if (page->header.full_offsets[i] != InvalidOffsetNumber)
+				result->offsets[result->num_offsets++] = page->header.full_offsets[i];
 		}
 	}
 	else
 	{
-		for (wordnum = 0; wordnum < page->nwords; wordnum++)
+		for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
 		{
 			bitmapword	w = page->words[wordnum];
 			int			off = wordnum * BITS_PER_BITMAPWORD;
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
index 6f36e8bfde..a9a2c90db2 100644
--- a/src/include/lib/radixtree.h
+++ b/src/include/lib/radixtree.h
@@ -105,6 +105,10 @@
  *   involving a pointer to the value type, to calculate size.
  *     NOTE: implies that the value is in fact variable-length,
  *     so do not set for fixed-length values.
+ * - RT_RUNTIME_EMBEDDABLE_VALUE - for variable length values, allows
+ *   storing the value in a child pointer slot, rather than as a single-
+ *   value leaf, if small enough. This requires that the value, when
+ *   read as a child pointer, can be tagged in the lowest bit.
  *
  * Optional parameters:
  * - RT_SHMEM - if defined, the radix tree is created in the DSA area
@@ -437,7 +441,13 @@ static inline bool
 RT_VALUE_IS_EMBEDDABLE(RT_VALUE_TYPE * value_p)
 {
 #ifdef RT_VARLEN_VALUE_SIZE
+
+#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
+	return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
+#else
 	return false;
+#endif
+
 #else
 	return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
 #endif
@@ -451,7 +461,19 @@ static inline bool
 RT_CHILDPTR_IS_VALUE(RT_PTR_ALLOC child)
 {
 #ifdef RT_VARLEN_VALUE_SIZE
+
+#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
+	/* check for pointer tag */
+#ifdef RT_SHMEM
+	return child & 1;
+#else
+	return ((uintptr_t) child) & 1;
+#endif
+
+#else
 	return false;
+#endif
+
 #else
 	return sizeof(RT_VALUE_TYPE) <= sizeof(RT_PTR_ALLOC);
 #endif
@@ -1728,6 +1750,15 @@ have_slot:
 	{
 		/* store value directly in child pointer slot */
 		memcpy(slot, value_p, value_sz);
+
+#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
+		/* tag child pointer */
+#ifdef RT_SHMEM
+		*slot |= 1;
+#else
+		*((uintptr_t *) slot) |= 1;
+#endif
+#endif
 	}
 	else
 	{
@@ -2879,6 +2910,7 @@ RT_DUMP_NODE(RT_NODE * node)
 #undef RT_DEFINE
 #undef RT_VALUE_TYPE
 #undef RT_VARLEN_VALUE_SIZE
+#undef RT_RUNTIME_EMBEDDABLE_VALUE
 #undef RT_SHMEM
 #undef RT_USE_DELETE
 #undef RT_DEBUG
-- 
2.44.0

Reply via email to