From c83fd58b09544b2debce1a7960f55cb252c26973 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Tue, 15 Jun 2021 16:21:48 +0900
Subject: [PATCH v10] Optimize get_partition_for_tuple by caching bound offset

For bulk loads into list and range partitioned tables, it is quite
likely that long runs of consecutive tuples route to the same
partition.  In such cases, we can perform the binary search to find
the partition's bound only once and cache the offset thus found.  For
subsequent tuples, we then only need to check whether they satisfy the
bound at the cached offset, which takes at most 2 comparisons, versus
the O(log num_parts) comparisons needed for the binary search.
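
To illustrate the scheme concretely, here is a rough standalone sketch
(illustrative only, not code from this patch: plain ints stand in for
partition bound datums, and bsearch_bound/route are made-up stand-ins
for partition_range_datum_bsearch/get_partition_for_tuple; the real
code also has the threshold logic described below):

    #include <stdio.h>

    /* Bounds of 4 ranges: [0,10), [10,20), [20,30), [30,40) */
    static const int bounds[] = {0, 10, 20, 30, 40};
    #define NBOUNDS 5

    static int cached_off = -1;    /* offset of last matching lower bound */

    /* Binary search: largest offset whose bound is <= v, or -1 */
    static int
    bsearch_bound(int v)
    {
        int lo = 0, hi = NBOUNDS - 1, off = -1;

        while (lo <= hi)
        {
            int mid = (lo + hi) / 2;

            if (bounds[mid] <= v)
            {
                off = mid;
                lo = mid + 1;
            }
            else
                hi = mid - 1;
        }
        return off;
    }

    /* Returns the 0-based range containing v, or -1 if out of range */
    static int
    route(int v)
    {
        /* Fast path: at most 2 comparisons against the cached bound pair */
        if (cached_off >= 0 &&
            v >= bounds[cached_off] && v < bounds[cached_off + 1])
            return cached_off;

        /* Slow path: O(log NBOUNDS) comparisons; cache the offset found */
        cached_off = bsearch_bound(v);
        if (cached_off == NBOUNDS - 1)
            cached_off = -1;    /* v >= all bounds; nothing to cache */
        return cached_off;
    }

    int
    main(void)
    {
        int vals[] = {3, 4, 5, 17, 18, 18, 33};
        int i;

        for (i = 0; i < 7; i++)
            printf("%d -> range %d\n", vals[i], route(vals[i]));
        return 0;
    }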

To avoid penalizing cases where such caching would be unproductive,
the cache is invalidated by the first tuple that no longer satisfies
the cached bound, and is re-enabled only if individual bound offsets
are again seen to repeat in succession over the span of a threshold
number of tuples, that is, only after that many tuples have been
processed.
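
As an illustrative example of the heuristic, with made-up numbers: with
the threshold set to 10, if the 10 tuples counted since the counters
were last reset went through bound offsets 3 3 3 3 7 7 7 7 7 7, the
routed-to offset changed only twice, so n_tups_inserted (10) exceeds
n_offset_changed (2) and the latest offset is cached; had every tuple
landed on a different offset, the two counters would be equal and
caching would remain disabled.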

Author: Hou Zhijie
Author: Amit Langote
---
 src/backend/executor/execPartition.c | 208 ++++++++++++++++++++++++---
 1 file changed, 186 insertions(+), 22 deletions(-)

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 90ed1485d1..0d9e524026 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -133,6 +133,13 @@ struct PartitionTupleRouting
  *		routing it through this table). A NULL value is stored if no tuple
  *		conversion is required.
  *
+ * cached_bound_offset
+ * last_seen_offset
+ * n_tups_inserted
+ * n_offset_changed
+ *		Fields to manage the state for bound offset caching; see
+ *		maybe_cache_partition_bound_offset()
+ *
  * indexes
  *		Array of partdesc->nparts elements.  For leaf partitions the index
  *		corresponds to the partition's ResultRelInfo in the encapsulating
@@ -150,6 +157,12 @@ typedef struct PartitionDispatchData
 	PartitionDesc partdesc;
 	TupleTableSlot *tupslot;
 	AttrMap    *tupmap;
+
+	int			cached_bound_offset;
+	int			last_seen_offset;
+	int			n_tups_inserted;
+	int			n_offset_changed;
+
 	int			indexes[FLEXIBLE_ARRAY_MEMBER];
 }			PartitionDispatchData;
 
@@ -1026,6 +1039,10 @@ ExecInitPartitionDispatchInfo(EState *estate,
 	pd->key = RelationGetPartitionKey(rel);
 	pd->keystate = NIL;
 	pd->partdesc = partdesc;
+
+	pd->cached_bound_offset = pd->last_seen_offset = -1;
+	pd->n_tups_inserted = pd->n_offset_changed = 0;
+
 	if (parent_pd != NULL)
 	{
 		TupleDesc	tupdesc = RelationGetDescr(rel);
@@ -1231,6 +1248,129 @@ FormPartitionKeyDatum(PartitionDispatch pd,
 		elog(ERROR, "wrong number of partition key expressions");
 }
 
+/*
+ * Threshold number of tuples that must have been processed before
+ * maybe_cache_partition_bound_offset() (re-)assesses whether caching should
+ * be enabled for subsequent tuples.
+ */
+#define	CACHE_BOUND_OFFSET_THRESHOLD_TUPS	10
+
+/*
+ * maybe_cache_partition_bound_offset
+ *		Conditionally sets pd->cached_bound_offset so that
+ *		get_cached_{list|range}_partition can be used for subsequent
+ *		tuples
+ *
+ * The offset is cached if it appears that the offsets observed over the
+ * last pd->n_tups_inserted tuples would have been reused, which is
+ * inferred from the number of tuples inserted being greater than the
+ * number of times the bound offset to which they were routed changed.
+ */
+static inline void
+maybe_cache_partition_bound_offset(PartitionDispatch pd, int offset)
+{
+	/* If the offset has changed, reset the cached value. */
+	if (offset != pd->last_seen_offset)
+	{
+		pd->last_seen_offset = offset;
+		pd->n_offset_changed += 1;
+		pd->cached_bound_offset = -1;
+	}
+
+	/*
+	 * Only consider (re-)enabling caching if we've seen at least a threshold
+	 * number of tuples.
+	 */
+	if (pd->n_tups_inserted < CACHE_BOUND_OFFSET_THRESHOLD_TUPS)
+		return;
+
+	/* Wouldn't get called if the cached bound offset worked. */
+	Assert(offset != pd->cached_bound_offset);
+
+	if (pd->n_tups_inserted > pd->n_offset_changed)
+		pd->cached_bound_offset = offset;
+
+	/* Reset the counters for the next run of tuples. */
+	pd->n_tups_inserted = pd->n_offset_changed = 0;
+}
+
+/*
+ * get_cached_{list|range}_partition
+ *		Checks whether the bound at the cached offset, if any, is satisfied
+ *		by the tuple specified in 'values' and if so, returns the index of
+ *		the partition corresponding to that bound, else -1
+ *
+ * Callers must ensure that none of the elements of 'values' is NULL.
+ */
+static inline int
+get_cached_list_partition(PartitionDispatch pd,
+						  PartitionBoundInfo boundinfo,
+						  PartitionKey key,
+						  Datum *values)
+{
+	int		part_index = -1;
+	int		cached_off = pd->cached_bound_offset;
+
+	if (cached_off >= 0)
+	{
+		Datum	bound_datum = boundinfo->datums[cached_off][0];
+		int32	cmpval;
+
+		cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
+												 key->partcollation[0],
+												 bound_datum,
+												 values[0]));
+		if (cmpval == 0)
+			part_index = boundinfo->indexes[cached_off];
+	}
+
+	return part_index;
+}
+
+static inline int
+get_cached_range_partition(PartitionDispatch pd,
+						   PartitionBoundInfo boundinfo,
+						   PartitionKey key,
+						   Datum *values)
+{
+	int		part_index = -1;
+	int		cached_off = pd->cached_bound_offset;
+
+	if (cached_off >= 0)
+	{
+		Datum   *bound_datums = boundinfo->datums[cached_off];
+		PartitionRangeDatumKind *bound_kind = boundinfo->kind[cached_off];
+		int32	cmpval;
+
+		/* Check if the tuple is at or above the lower bound */
+		cmpval = partition_rbound_datum_cmp(key->partsupfunc,
+											key->partcollation,
+											bound_datums,
+											bound_kind,
+											values,
+											key->partnatts);
+		if (cmpval == 0)
+			part_index = boundinfo->indexes[cached_off + 1];
+		else if (cmpval < 0 && cached_off + 1 < boundinfo->ndatums)
+		{
+			/* Check if the value is below the upper bound */
+			bound_datums = boundinfo->datums[cached_off + 1];
+			bound_kind = boundinfo->kind[cached_off + 1];
+			cmpval = partition_rbound_datum_cmp(key->partsupfunc,
+												key->partcollation,
+												bound_datums,
+												bound_kind,
+												values,
+												key->partnatts);
+
+			if (cmpval > 0)
+				part_index = boundinfo->indexes[cached_off + 1];
+		}
+	}
+
+	return part_index;
+}
+
 /*
  * get_partition_for_tuple
  *		Finds partition of relation which accepts the partition key specified
@@ -1248,6 +1388,8 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
 	PartitionDesc partdesc = pd->partdesc;
 	PartitionBoundInfo boundinfo = partdesc->boundinfo;
 
+	pd->n_tups_inserted += 1;
+
 	/* Route as appropriate based on partitioning strategy. */
 	switch (key->strategy)
 	{
@@ -1272,14 +1414,26 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
 			}
 			else
 			{
-				bool		equal = false;
-
-				bound_offset = partition_list_bsearch(key->partsupfunc,
-													  key->partcollation,
-													  boundinfo,
-													  values[0], &equal);
-				if (bound_offset >= 0 && equal)
-					part_index = boundinfo->indexes[bound_offset];
+				part_index = get_cached_list_partition(pd,
+													   boundinfo,
+													   key,
+													   values);
+				if (part_index < 0)
+				{
+					bool		equal = false;
+
+					bound_offset = partition_list_bsearch(key->partsupfunc,
+														  key->partcollation,
+														  boundinfo,
+														  values[0], &equal);
+					if (bound_offset >= 0 && equal)
+					{
+						part_index = boundinfo->indexes[bound_offset];
+						if (part_index >= 0)
+							maybe_cache_partition_bound_offset(pd,
+															   bound_offset);
+					}
+				}
 			}
 			break;
 
@@ -1304,20 +1458,30 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
 
 				if (!range_partkey_has_null)
 				{
-					bound_offset = partition_range_datum_bsearch(key->partsupfunc,
-																 key->partcollation,
-																 boundinfo,
-																 key->partnatts,
-																 values,
-																 &equal);
-
-					/*
-					 * The bound at bound_offset is less than or equal to the
-					 * tuple value, so the bound at offset+1 is the upper
-					 * bound of the partition we're looking for, if there
-					 * actually exists one.
-					 */
-					part_index = boundinfo->indexes[bound_offset + 1];
+					part_index = get_cached_range_partition(pd,
+															boundinfo,
+															key,
+															values);
+					if (part_index < 0)
+					{
+						bound_offset = partition_range_datum_bsearch(key->partsupfunc,
+																	 key->partcollation,
+																	 boundinfo,
+																	 key->partnatts,
+																	 values,
+																	 &equal);
+
+						/*
+						 * The bound at bound_offset is less than or equal to the
+						 * tuple value, so the bound at offset+1 is the upper
+						 * bound of the partition we're looking for, if there
+						 * actually exists one.
+						 */
+						part_index = boundinfo->indexes[bound_offset + 1];
+						if (part_index >= 0)
+							maybe_cache_partition_bound_offset(pd,
+															   bound_offset);
+					}
 				}
 			}
 			break;
-- 
2.24.1

