There are a few things up in the air, so I'm coming back to this list to
summarize and add a recent update:

On Mon, Nov 14, 2022 at 7:59 PM John Naylor <john.nay...@enterprisedb.com>
wrote:
>
> - See how much performance we actually gain from tagging the node kind.

This needs a benchmark with enough branch mispredicts and L2/L3 misses to
show a benefit. Otherwise it's either neutral or worse in its current form,
depending on the compiler(?). Put off for later.

> - Try additional size classes while keeping the node kinds to only four.

This is relatively simple and effective. If only one additional size class
(5 total) is coded as a placeholder, I imagine it will be easier to rebase
the shared memory logic on top than if we applied this technique everywhere
possible.

> - Optimize node128 insert.

I've attached a rough start at this. The basic idea is borrowed from our
bitmapset nodes: iterate over and operate on word-sized (32- or 64-bit)
chunks at a time, rather than single bytes. To make this easier, I've moved
some of the lower-level macros and types from bitmapset.h/.c to
pg_bitutils.h. That will probably need a separate email thread to resolve
the coding style clash it causes, so it can be put off for later. This
patch is not meant to be included in the next patchset.  For demonstration
purposes, I get these results with a function that repeatedly deletes the
last value from a mostly-full node128 leaf and re-inserts it:

select * from bench_node128_load(120);

v11

NOTICE:  num_keys = 14400, height = 1, n1 = 0, n4 = 0, n15 = 0, n32 = 0,
n61 = 0, n128 = 121, n256 = 0
 fanout | nkeys | rt_mem_allocated | rt_sparseload_ms
--------+-------+------------------+------------------
    120 | 14400 |           208304 |               56

v11 + 0006 addendum

NOTICE:  num_keys = 14400, height = 1, n1 = 0, n4 = 0, n15 = 0, n32 = 0,
n61 = 0, n128 = 121, n256 = 0
 fanout | nkeys | rt_mem_allocated | rt_sparseload_ms
--------+-------+------------------+------------------
    120 | 14400 |           208816 |               34

I didn't test inner nodes, but I imagine the difference would be bigger
there. This bitmap style should also be used for the node256-leaf isset
array, simply to be consistent and to avoid needing single-use macros, but
that hasn't been done yet. It won't make a difference for performance,
because there is no iteration there.
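
For reference, here is a minimal standalone sketch (not part of the patch,
and not PostgreSQL code) of the word-at-a-time slot search that
node128_find_unused_slot() does in the attached diff. It substitutes
__builtin_ctzll for bmw_rightmost_one_pos() and, like the patch, assumes the
caller has already made sure a free slot exists:

#include <stdint.h>
#include <stdio.h>

#define SLOTS 128
#define BITS_PER_WORD 64
#define NWORDS (SLOTS / BITS_PER_WORD)

/* Find the first clear bit, mark it set, and return its position. */
static int
find_unused_slot(uint64_t *isset)
{
	int			idx;
	uint64_t	inverse;

	/* get the first word with at least one bit not set */
	for (idx = 0; idx < NWORDS; idx++)
	{
		if (isset[idx] != ~UINT64_C(0))
			break;
	}

	/* the first unset bit in x is the first set bit in ~x */
	inverse = ~isset[idx];

	/* mark the slot used, by isolating the rightmost one bit of the inverse */
	isset[idx] |= inverse & (0 - inverse);

	return idx * BITS_PER_WORD + __builtin_ctzll(inverse);
}

int
main(void)
{
	/* slots 0..66 are in use, so the next free slot should be 67 */
	uint64_t	isset[NWORDS] = {~UINT64_C(0), UINT64_C(0x7)};

	printf("next free slot: %d\n", find_unused_slot(isset));
	return 0;
}

With gcc or clang this should print "next free slot: 67".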

> - Try templating out the differences between local and shared memory.

I hope to start on this sometime after the crashes on 32-bit platforms are
resolved.

--
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql
index 67ba568531..2fd689aa91 100644
--- a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql
+++ b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql
@@ -63,3 +63,14 @@ OUT rt_search_ms int8
 returns record
 as 'MODULE_PATHNAME'
 LANGUAGE C STRICT VOLATILE PARALLEL UNSAFE;
+
+create function bench_node128_load(
+fanout int4,
+OUT fanout int4,
+OUT nkeys int8,
+OUT rt_mem_allocated int8,
+OUT rt_sparseload_ms int8
+)
+returns record
+as 'MODULE_PATHNAME'
+LANGUAGE C STRICT VOLATILE PARALLEL UNSAFE;
diff --git a/contrib/bench_radix_tree/bench_radix_tree.c b/contrib/bench_radix_tree/bench_radix_tree.c
index e69be48448..b035b3a747 100644
--- a/contrib/bench_radix_tree/bench_radix_tree.c
+++ b/contrib/bench_radix_tree/bench_radix_tree.c
@@ -31,6 +31,7 @@ PG_FUNCTION_INFO_V1(bench_shuffle_search);
 PG_FUNCTION_INFO_V1(bench_load_random_int);
 PG_FUNCTION_INFO_V1(bench_fixed_height_search);
 PG_FUNCTION_INFO_V1(bench_search_random_nodes);
+PG_FUNCTION_INFO_V1(bench_node128_load);
 
 static uint64
 tid_to_key_off(ItemPointer tid, uint32 *off)
@@ -552,3 +553,85 @@ finish_search:
        rt_free(rt);
        PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+bench_node128_load(PG_FUNCTION_ARGS)
+{
+       int                     fanout = PG_GETARG_INT32(0);
+       radix_tree *rt;
+       TupleDesc       tupdesc;
+       TimestampTz start_time,
+                               end_time;
+       long            secs;
+       int                     usecs;
+       int64           rt_sparseload_ms;
+       Datum           values[5];
+       bool            nulls[5];
+
+       uint64          r,
+                               h;
+       int                     key_id;
+
+       /* Build a tuple descriptor for our result type */
+       if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
+
+       rt = rt_create(CurrentMemoryContext);
+
+       key_id = 0;
+
+       for (r = 1; r <= fanout; r++)
+       {
+               for (h = 1; h <= fanout; h++)
+               {
+                       uint64          key;
+
+                       key = (r << 8) | (h);
+
+                       key_id++;
+                       rt_set(rt, key, key_id);
+               }
+       }
+
+       rt_stats(rt);
+
+       /* measure sparse deletion and re-loading */
+       start_time = GetCurrentTimestamp();
+
+       for (int t = 0; t<10000; t++)
+       {
+               /* delete one key in each leaf */
+               for (r = 1; r <= fanout; r++)
+               {
+                       uint64          key;
+
+                       key = (r << 8) | (fanout);
+
+                       rt_delete(rt, key);
+               }
+
+               /* add them all back */
+               key_id = 0;
+               for (r = 1; r <= fanout; r++)
+               {
+                       uint64          key;
+
+                       key = (r << 8) | (fanout);
+
+                       key_id++;
+                       rt_set(rt, key, key_id);
+               }
+       }
+       end_time = GetCurrentTimestamp();
+       TimestampDifference(start_time, end_time, &secs, &usecs);
+       rt_sparseload_ms = secs * 1000 + usecs / 1000;
+
+       MemSet(nulls, false, sizeof(nulls));
+       values[0] = Int32GetDatum(fanout);
+       values[1] = Int64GetDatum(rt_num_entries(rt));
+       values[2] = Int64GetDatum(rt_memory_usage(rt));
+       values[3] = Int64GetDatum(rt_sparseload_ms);
+
+       rt_free(rt);
+       PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c
index f10abd8add..9cfed1624f 100644
--- a/src/backend/lib/radixtree.c
+++ b/src/backend/lib/radixtree.c
@@ -262,6 +262,9 @@ typedef struct rt_node_inner_128
 {
        rt_node_base_128 base;
 
+       /* isset is a bitmap to track which slot is in use */
+       bitmapword              isset[WORDNUM(128)];
+
        /* number of children depends on size class */
        rt_node    *children[FLEXIBLE_ARRAY_MEMBER];
 } rt_node_inner_128;
@@ -271,7 +274,7 @@ typedef struct rt_node_leaf_128
        rt_node_base_128 base;
 
        /* isset is a bitmap to track which slot is in use */
-       uint8           isset[RT_NODE_NSLOTS_BITS(128)];
+       bitmapword              isset[WORDNUM(128)];
 
        /* number of values depends on size class */
        uint64          values[FLEXIBLE_ARRAY_MEMBER];
@@ -679,14 +682,14 @@ static inline bool
 node_inner_128_is_slot_used(rt_node_inner_128 *node, uint8 slot)
 {
        Assert(!NODE_IS_LEAF(node));
-       return (node->children[slot] != NULL);
+       return (node->isset[WORDNUM(slot)] & ((bitmapword) 1 << BITNUM(slot))) != 0;
 }
 
 static inline bool
 node_leaf_128_is_slot_used(rt_node_leaf_128 *node, uint8 slot)
 {
        Assert(NODE_IS_LEAF(node));
-       return ((node->isset[RT_NODE_BITMAP_BYTE(slot)] & RT_NODE_BITMAP_BIT(slot)) != 0);
+       return (node->isset[WORDNUM(slot)] & ((bitmapword) 1 << BITNUM(slot))) != 0;
 }
 
 static inline rt_node *
@@ -707,7 +710,10 @@ node_leaf_128_get_value(rt_node_leaf_128 *node, uint8 chunk)
 static void
 node_inner_128_delete(rt_node_inner_128 *node, uint8 chunk)
 {
+       int                     slotpos = node->base.slot_idxs[chunk];
+
        Assert(!NODE_IS_LEAF(node));
+       node->isset[WORDNUM(slotpos)] &= ~((bitmapword) 1 << BITNUM(slotpos));
        node->base.slot_idxs[chunk] = RT_NODE_128_INVALID_IDX;
 }
 
@@ -717,41 +723,32 @@ node_leaf_128_delete(rt_node_leaf_128 *node, uint8 chunk)
        int                     slotpos = node->base.slot_idxs[chunk];
 
        Assert(NODE_IS_LEAF(node));
-       node->isset[RT_NODE_BITMAP_BYTE(slotpos)] &= ~(RT_NODE_BITMAP_BIT(slotpos));
+       node->isset[WORDNUM(slotpos)] &= ~((bitmapword) 1 << BITNUM(slotpos));
        node->base.slot_idxs[chunk] = RT_NODE_128_INVALID_IDX;
 }
 
 /* Return an unused slot in node-128 */
 static int
-node_inner_128_find_unused_slot(rt_node_inner_128 *node, uint8 chunk)
-{
-       int                     slotpos = 0;
-
-       Assert(!NODE_IS_LEAF(node));
-       while (node_inner_128_is_slot_used(node, slotpos))
-               slotpos++;
-
-       return slotpos;
-}
-
-static int
-node_leaf_128_find_unused_slot(rt_node_leaf_128 *node, uint8 chunk)
+node128_find_unused_slot(bitmapword *isset)
 {
        int                     slotpos;
+       int                     idx;
+       bitmapword      inverse;
 
-       Assert(NODE_IS_LEAF(node));
-
-       /* We iterate over the isset bitmap per byte then check each bit */
-       for (slotpos = 0; slotpos < RT_NODE_NSLOTS_BITS(128); slotpos++)
+       /* get the first word with at least one bit not set */
+       for (idx = 0; idx < WORDNUM(128); idx++)
        {
-               if (node->isset[slotpos] < 0xFF)
+               if (isset[idx] < ~((bitmapword) 0))
                        break;
        }
-       Assert(slotpos < RT_NODE_NSLOTS_BITS(128));
 
-       slotpos *= BITS_PER_BYTE;
-       while (node_leaf_128_is_slot_used(node, slotpos))
-               slotpos++;
+       /* To get the first unset bit in X, get the first set bit in ~X */
+       inverse = ~(isset[idx]);
+       slotpos = idx * BITS_PER_BITMAPWORD;
+       slotpos += bmw_rightmost_one_pos(inverse);
+
+       /* mark the slot used */
+       isset[idx] |= RIGHTMOST_ONE(inverse);
 
        return slotpos;
 }
@@ -763,8 +760,7 @@ node_inner_128_insert(rt_node_inner_128 *node, uint8 chunk, rt_node *child)
 
        Assert(!NODE_IS_LEAF(node));
 
-       /* find unused slot */
-       slotpos = node_inner_128_find_unused_slot(node, chunk);
+       slotpos = node128_find_unused_slot(node->isset);
 
        node->base.slot_idxs[chunk] = slotpos;
        node->children[slotpos] = child;
@@ -778,11 +774,9 @@ node_leaf_128_insert(rt_node_leaf_128 *node, uint8 chunk, uint64 value)
 
        Assert(NODE_IS_LEAF(node));
 
-       /* find unused slot */
-       slotpos = node_leaf_128_find_unused_slot(node, chunk);
+       slotpos = node128_find_unused_slot(node->isset);
 
        node->base.slot_idxs[chunk] = slotpos;
-       node->isset[RT_NODE_BITMAP_BYTE(slotpos)] |= RT_NODE_BITMAP_BIT(slotpos);
        node->values[slotpos] = value;
 }
 
@@ -2508,9 +2502,9 @@ rt_dump_node(rt_node *node, int level, bool recurse)
                                        rt_node_leaf_128 *n = (rt_node_leaf_128 *) node;
 
                                        fprintf(stderr, ", isset-bitmap:");
-                                       for (int i = 0; i < 16; i++)
+                                       for (int i = 0; i < WORDNUM(128); i++)
                                        {
-                                               fprintf(stderr, "%X ", (uint8) 
n->isset[i]);
+                                               fprintf(stderr, "%lX ", 
n->isset[i]);
                                        }
                                        fprintf(stderr, "\n");
                                }
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index b7b274aeff..3fe0fd88ce 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -23,49 +23,11 @@
 #include "common/hashfn.h"
 #include "nodes/bitmapset.h"
 #include "nodes/pg_list.h"
-#include "port/pg_bitutils.h"
 
 
-#define WORDNUM(x)     ((x) / BITS_PER_BITMAPWORD)
-#define BITNUM(x)      ((x) % BITS_PER_BITMAPWORD)
-
 #define BITMAPSET_SIZE(nwords) \
        (offsetof(Bitmapset, words) + (nwords) * sizeof(bitmapword))
 
-/*----------
- * This is a well-known cute trick for isolating the rightmost one-bit
- * in a word.  It assumes two's complement arithmetic.  Consider any
- * nonzero value, and focus attention on the rightmost one.  The value is
- * then something like
- *                             xxxxxx10000
- * where x's are unspecified bits.  The two's complement negative is formed
- * by inverting all the bits and adding one.  Inversion gives
- *                             yyyyyy01111
- * where each y is the inverse of the corresponding x.  Incrementing gives
- *                             yyyyyy10000
- * and then ANDing with the original value gives
- *                             00000010000
- * This works for all cases except original value = zero, where of course
- * we get zero.
- *----------
- */
-#define RIGHTMOST_ONE(x) ((signedbitmapword) (x) & -((signedbitmapword) (x)))
-
-#define HAS_MULTIPLE_ONES(x)   ((bitmapword) RIGHTMOST_ONE(x) != (x))
-
-/* Select appropriate bit-twiddling functions for bitmap word size */
-#if BITS_PER_BITMAPWORD == 32
-#define bmw_leftmost_one_pos(w)                pg_leftmost_one_pos32(w)
-#define bmw_rightmost_one_pos(w)       pg_rightmost_one_pos32(w)
-#define bmw_popcount(w)                                pg_popcount32(w)
-#elif BITS_PER_BITMAPWORD == 64
-#define bmw_leftmost_one_pos(w)                pg_leftmost_one_pos64(w)
-#define bmw_rightmost_one_pos(w)       pg_rightmost_one_pos64(w)
-#define bmw_popcount(w)                                pg_popcount64(w)
-#else
-#error "invalid BITS_PER_BITMAPWORD"
-#endif
-
 
 /*
  * bms_copy - make a palloc'd copy of a bitmapset
diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h
index 2792281658..06fa21ccaa 100644
--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -21,33 +21,13 @@
 #define BITMAPSET_H
 
 #include "nodes/nodes.h"
+#include "port/pg_bitutils.h"
 
 /*
  * Forward decl to save including pg_list.h
  */
 struct List;
 
-/*
- * Data representation
- *
- * Larger bitmap word sizes generally give better performance, so long as
- * they're not wider than the processor can handle efficiently.  We use
- * 64-bit words if pointers are that large, else 32-bit words.
- */
-#if SIZEOF_VOID_P >= 8
-
-#define BITS_PER_BITMAPWORD 64
-typedef uint64 bitmapword;             /* must be an unsigned type */
-typedef int64 signedbitmapword; /* must be the matching signed type */
-
-#else
-
-#define BITS_PER_BITMAPWORD 32
-typedef uint32 bitmapword;             /* must be an unsigned type */
-typedef int32 signedbitmapword; /* must be the matching signed type */
-
-#endif
-
 typedef struct Bitmapset
 {
        pg_node_attr(custom_copy_equal, special_read_write)
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 814e0b2dba..ad5aa2c5cf 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -17,6 +17,51 @@ extern PGDLLIMPORT const uint8 pg_leftmost_one_pos[256];
 extern PGDLLIMPORT const uint8 pg_rightmost_one_pos[256];
 extern PGDLLIMPORT const uint8 pg_number_of_ones[256];
 
+/*
+ * Platform-specific types
+ *
+ * Larger bitmap word sizes generally give better performance, so long as
+ * they're not wider than the processor can handle efficiently.  We use
+ * 64-bit words if pointers are that large, else 32-bit words.
+ */
+#if SIZEOF_VOID_P >= 8
+
+#define BITS_PER_BITMAPWORD 64
+typedef uint64 bitmapword;             /* must be an unsigned type */
+typedef int64 signedbitmapword; /* must be the matching signed type */
+
+#else
+
+#define BITS_PER_BITMAPWORD 32
+typedef uint32 bitmapword;             /* must be an unsigned type */
+typedef int32 signedbitmapword; /* must be the matching signed type */
+
+#endif
+
+#define WORDNUM(x)     ((x) / BITS_PER_BITMAPWORD)
+#define BITNUM(x)      ((x) % BITS_PER_BITMAPWORD)
+
+/*----------
+ * This is a well-known cute trick for isolating the rightmost one-bit
+ * in a word.  It assumes two's complement arithmetic.  Consider any
+ * nonzero value, and focus attention on the rightmost one.  The value is
+ * then something like
+ *                             xxxxxx10000
+ * where x's are unspecified bits.  The two's complement negative is formed
+ * by inverting all the bits and adding one.  Inversion gives
+ *                             yyyyyy01111
+ * where each y is the inverse of the corresponding x.  Incrementing gives
+ *                             yyyyyy10000
+ * and then ANDing with the original value gives
+ *                             00000010000
+ * This works for all cases except original value = zero, where of course
+ * we get zero.
+ *----------
+ */
+#define RIGHTMOST_ONE(x) ((signedbitmapword) (x) & -((signedbitmapword) (x)))
+
+#define HAS_MULTIPLE_ONES(x)   ((bitmapword) RIGHTMOST_ONE(x) != (x))
+
 /*
  * pg_leftmost_one_pos32
  *             Returns the position of the most significant set bit in "word",
@@ -291,4 +336,17 @@ pg_rotate_left32(uint32 word, int n)
 #define pg_prevpower2_size_t pg_prevpower2_64
 #endif
 
+/* variants of some functions for bitmap word size */
+#if BITS_PER_BITMAPWORD == 32
+#define bmw_leftmost_one_pos pg_leftmost_one_pos32
+#define bmw_rightmost_one_pos pg_rightmost_one_pos32
+#define bmw_popcount pg_popcount32
+#elif BITS_PER_BITMAPWORD == 64
+#define bmw_leftmost_one_pos pg_leftmost_one_pos64
+#define bmw_rightmost_one_pos pg_rightmost_one_pos64
+#define bmw_popcount pg_popcount64
+#else
+#error "invalid BITS_PER_BITMAPWORD"
+#endif
+
 #endif                                                 /* PG_BITUTILS_H */
