From 93964b65609ed406333aa073e8b4eac72981b45a Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com" <dgrowley@gmail.com>
Date: Thu, 26 Jul 2018 19:54:55 +1200
Subject: [PATCH v2] Speed up INSERT and UPDATE on partitioned tables

This is more or less a complete redesign of PartitionTupleRouting. The
aim here is to get rid of all the possibly large arrays that were being
allocated during ExecSetupPartitionTupleRouting().  We now allocate
small arrays to store the partition's ResultRelInfo and only enlarge
these when we run out of space.  The partitions array is now ordered
by the order in which the partition's ResultRelInfos are initialized
rather than in the same order as partdesc.

The slowest part of ExecSetupPartitionTupleRouting still remains: the
find_all_inheritors call is by far the slowest part of the function.
This patch just removes the other slow parts.

Initialization of the parent/child translation maps array is now only
performed when we need to store the first translation map.  If the column
order between the parent and its child is the same, then no map ever
needs to be stored, so previously this (possibly large) array served no
purpose.

In simple INSERTs hitting a single partition to a partitioned table with
many partitions the shutdown of the executor was also slow in comparison to
the actual execution, this was down to the loop which cleans up each
ResultRelInfo having to loop over an array which often contained mostly
NULLs, which had to be skipped.  Performance of this has now improved as
the array we loop over now no longer has to skip possibly many NULL
values.

David Rowley and Amit Langote
---
 src/backend/commands/copy.c            |  19 +-
 src/backend/executor/execPartition.c   | 744 +++++++++++++++++++--------------
 src/backend/executor/nodeModifyTable.c | 102 +----
 src/backend/utils/cache/partcache.c    |  11 +-
 src/include/catalog/partition.h        |   6 +-
 src/include/executor/execPartition.h   | 159 ++++---
 6 files changed, 555 insertions(+), 486 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 3a66cb5025..44cf3bba12 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2621,10 +2621,8 @@ CopyFrom(CopyState cstate)
 			 * will get us the ResultRelInfo and TupleConversionMap for the
 			 * partition, respectively.
 			 */
-			leaf_part_index = ExecFindPartition(resultRelInfo,
-												proute->partition_dispatch_info,
-												slot,
-												estate);
+			leaf_part_index = ExecFindPartition(mtstate, resultRelInfo,
+												proute, slot, estate);
 			Assert(leaf_part_index >= 0 &&
 				   leaf_part_index < proute->num_partitions);
 
@@ -2644,15 +2642,8 @@ CopyFrom(CopyState cstate)
 			 * to the selected partition.
 			 */
 			saved_resultRelInfo = resultRelInfo;
+			Assert(proute->partitions[leaf_part_index] != NULL);
 			resultRelInfo = proute->partitions[leaf_part_index];
-			if (resultRelInfo == NULL)
-			{
-				resultRelInfo = ExecInitPartitionInfo(mtstate,
-													  saved_resultRelInfo,
-													  proute, estate,
-													  leaf_part_index);
-				Assert(resultRelInfo != NULL);
-			}
 
 			/*
 			 * For ExecInsertIndexTuples() to work on the partition's indexes
@@ -2693,7 +2684,9 @@ CopyFrom(CopyState cstate)
 			 * We might need to convert from the parent rowtype to the
 			 * partition rowtype.
 			 */
-			tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index],
+			tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps ?
+												proute->parent_child_tupconv_maps[leaf_part_index] :
+												NULL,
 											  tuple,
 											  proute->partition_tuple_slot,
 											  &slot);
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 7a4665cc4e..d7b18f52ed 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -31,11 +31,19 @@
 #include "utils/rls.h"
 #include "utils/ruleutils.h"
 
+#define PARTITION_ROUTING_INITSIZE	8
 
-static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
-								 int *num_parted, List **leaf_part_oids);
-static void get_partition_dispatch_recurse(Relation rel, Relation parent,
-							   List **pds, List **leaf_part_oids);
+static void
+ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate,
+							   PartitionTupleRouting *proute);
+static void ExecExpandRoutingArrays(PartitionTupleRouting *proute);
+static int ExecInitPartitionInfo(ModifyTableState *mtstate,
+					  ResultRelInfo *rootResultRelInfo,
+					  PartitionTupleRouting *proute,
+					  EState *estate,
+					  PartitionDispatch parent, int partidx);
+static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute,
+						Oid partoid, PartitionDispatch parent_pd, int partidx);
 static void FormPartitionKeyDatum(PartitionDispatch pd,
 					  TupleTableSlot *slot,
 					  EState *estate,
@@ -62,134 +70,107 @@ static void find_matching_subplans_recurse(PartitionPruneState *prunestate,
  * Note that all the relations in the partition tree are locked using the
  * RowExclusiveLock mode upon return from this function.
  *
- * While we allocate the arrays of pointers of ResultRelInfo and
- * TupleConversionMap for all partitions here, actual objects themselves are
- * lazily allocated for a given partition if a tuple is actually routed to it;
- * see ExecInitPartitionInfo.  However, if the function is invoked for update
- * tuple routing, caller would already have initialized ResultRelInfo's for
- * some of the partitions, which are reused and assigned to their respective
- * slot in the aforementioned array.  For such partitions, we delay setting
- * up objects such as TupleConversionMap until those are actually chosen as
- * the partitions to route tuples to.  See ExecPrepareTupleRouting.
+ * Callers must use the returned PartitionTupleRouting during calls to
+ * ExecFindPartition.  The actual ResultRelInfos are allocated lazily by that
+ * function.
  */
 PartitionTupleRouting *
 ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 {
-	List	   *leaf_parts;
-	ListCell   *cell;
-	int			i;
-	ResultRelInfo *update_rri = NULL;
-	int			num_update_rri = 0,
-				update_rri_index = 0;
 	PartitionTupleRouting *proute;
-	int			nparts;
 	ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;
 
-	/*
-	 * Get the information about the partition tree after locking all the
-	 * partitions.
-	 */
+	/* Lock all the partitions. */
 	(void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
-	proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
-	proute->partition_dispatch_info =
-		RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch,
-										 &leaf_parts);
-	proute->num_partitions = nparts = list_length(leaf_parts);
-	proute->partitions =
-		(ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *));
-	proute->parent_child_tupconv_maps =
-		(TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *));
-	proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid));
-
-	/* Set up details specific to the type of tuple routing we are doing. */
-	if (node && node->operation == CMD_UPDATE)
-	{
-		update_rri = mtstate->resultRelInfo;
-		num_update_rri = list_length(node->plans);
-		proute->subplan_partition_offsets =
-			palloc(num_update_rri * sizeof(int));
-		proute->num_subplan_partition_offsets = num_update_rri;
 
-		/*
-		 * We need an additional tuple slot for storing transient tuples that
-		 * are converted to the root table descriptor.
-		 */
-		proute->root_tuple_slot = MakeTupleTableSlot(NULL);
-	}
+	/*
+	 * Here we attempt to expend as little effort as possible in setting up
+	 * the PartitionTupleRouting.  Each partition's ResultRelInfo is built
+	 * lazily, only when we actually need to route a tuple to that partition.
+	 * The reason for this is that a common case is for INSERT to insert a
+	 * single tuple into a single partition.
+	 *
+	 * We initially allocate enough memory to hold PARTITION_ROUTING_INITSIZE
+	 * PartitionDispatch and ResultRelInfo pointers in their respective arrays.
+	 * More space can be allocated later, if required via
+	 * ExecExpandRoutingArrays.
+	 *
+	 * We're certain to need only one PartitionDispatch; the one for the
+	 * partitioned table which is the target of the command.  We'll only set
+	 * up PartitionDispatch objects for subpartitions if tuples actually get
+	 * routed to (through) them.
+	 */
+	proute = (PartitionTupleRouting *) palloc(sizeof(PartitionTupleRouting));
+	proute->partition_root = rel;
+	proute->partition_dispatch_info = (PartitionDispatchData **)
+			palloc(sizeof(PartitionDispatchData *) * PARTITION_ROUTING_INITSIZE);
+	proute->num_dispatch = 0;
+	proute->dispatch_allocsize = PARTITION_ROUTING_INITSIZE;
+
+	proute->partitions = (ResultRelInfo **)
+			palloc(sizeof(ResultRelInfo *) * PARTITION_ROUTING_INITSIZE);
+	proute->num_partitions = 0;
+	proute->partitions_allocsize = PARTITION_ROUTING_INITSIZE;
+
+	/* We only allocate these arrays when we need to store the first map */
+	proute->parent_child_tupconv_maps = NULL;
+	proute->child_parent_tupconv_maps = NULL;
+	proute->child_parent_map_not_required = NULL;
 
 	/*
-	 * Initialize an empty slot that will be used to manipulate tuples of any
-	 * given partition's rowtype.  It is attached to the caller-specified node
-	 * (such as ModifyTableState) and released when the node finishes
-	 * processing.
+	 * Initialize this table's PartitionDispatch object.  Here we pass NULL
+	 * for the parent as we don't need to care about any parent of the
+	 * target partitioned table.
 	 */
-	proute->partition_tuple_slot = MakeTupleTableSlot(NULL);
+	(void) ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL,
+										 0);
 
-	i = 0;
-	foreach(cell, leaf_parts)
+	/*
+	 * If UPDATE needs to do tuple routing, we'll need a slot that will
+	 * transiently store the tuple being routed using the root parent's
+	 * rowtype.  We must set up at least this slot, because it's needed even
+	 * before tuple routing begins.  The remaining information needed for
+	 * routing tuples to UPDATE result rels is set up by the call to
+	 * ExecHashSubPlanResultRelsByOid below.
+	 */
+	if (node && node->operation == CMD_UPDATE)
 	{
-		ResultRelInfo *leaf_part_rri = NULL;
-		Oid			leaf_oid = lfirst_oid(cell);
-
-		proute->partition_oids[i] = leaf_oid;
-
-		/*
-		 * If the leaf partition is already present in the per-subplan result
-		 * rels, we re-use that rather than initialize a new result rel. The
-		 * per-subplan resultrels and the resultrels of the leaf partitions
-		 * are both in the same canonical order. So while going through the
-		 * leaf partition oids, we need to keep track of the next per-subplan
-		 * result rel to be looked for in the leaf partition resultrels.
-		 */
-		if (update_rri_index < num_update_rri &&
-			RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
-		{
-			leaf_part_rri = &update_rri[update_rri_index];
-
-			/*
-			 * This is required in order to convert the partition's tuple to
-			 * be compatible with the root partitioned table's tuple
-			 * descriptor.  When generating the per-subplan result rels, this
-			 * was not set.
-			 */
-			leaf_part_rri->ri_PartitionRoot = rel;
-
-			/* Remember the subplan offset for this ResultRelInfo */
-			proute->subplan_partition_offsets[update_rri_index] = i;
-
-			update_rri_index++;
-		}
-
-		proute->partitions[i] = leaf_part_rri;
-		i++;
+		ExecHashSubPlanResultRelsByOid(mtstate, proute);
+		proute->root_tuple_slot = MakeTupleTableSlot(NULL);
+	}
+	else
+	{
+		proute->subplan_partition_table = NULL;
+		proute->root_tuple_slot = NULL;
 	}
 
 	/*
-	 * For UPDATE, we should have found all the per-subplan resultrels in the
-	 * leaf partitions.  (If this is an INSERT, both values will be zero.)
+	 * Initialize an empty slot that will be used to manipulate tuples of any
+	 * given partition's rowtype.
 	 */
-	Assert(update_rri_index == num_update_rri);
+	proute->partition_tuple_slot = MakeTupleTableSlot(NULL);
 
 	return proute;
 }
 
 /*
- * ExecFindPartition -- Find a leaf partition in the partition tree rooted
- * at parent, for the heap tuple contained in *slot
+ * ExecFindPartition -- Find a leaf partition for the tuple contained in *slot
  *
  * estate must be non-NULL; we'll need it to compute any expressions in the
  * partition key(s)
  *
  * If no leaf partition is found, this routine errors out with the appropriate
- * error message, else it returns the leaf partition sequence number
- * as an index into the array of (ResultRelInfos of) all leaf partitions in
- * the partition tree.
+ * error message, else it returns the index of the leaf partition's
+ * ResultRelInfo in the proute->partitions array.
  */
 int
-ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
+ExecFindPartition(ModifyTableState *mtstate,
+				  ResultRelInfo *resultRelInfo,
+				  PartitionTupleRouting *proute,
 				  TupleTableSlot *slot, EState *estate)
 {
-	int			result;
+	PartitionDispatch *pd = proute->partition_dispatch_info;
+	int			result = -1;
 	Datum		values[PARTITION_MAX_KEYS];
 	bool		isnull[PARTITION_MAX_KEYS];
 	Relation	rel;
@@ -211,7 +192,7 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 		PartitionDesc partdesc;
 		TupleTableSlot *myslot = parent->tupslot;
 		TupleConversionMap *map = parent->tupmap;
-		int			cur_index = -1;
+		int			partidx = -1;
 
 		rel = parent->reldesc;
 		partdesc = RelationGetPartitionDesc(rel);
@@ -242,81 +223,226 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 		FormPartitionKeyDatum(parent, slot, estate, values, isnull);
 
 		/*
-		 * Nothing for get_partition_for_tuple() to do if there are no
-		 * partitions to begin with.
+		 * If this partitioned table has no partitions or no partition for
+		 * these values, then error out.
 		 */
-		if (partdesc->nparts == 0)
+		if (partdesc->nparts == 0 ||
+			(partidx = get_partition_for_tuple(rel, values, isnull)) < 0)
 		{
-			result = -1;
-			break;
+			char	   *val_desc;
+
+			val_desc = ExecBuildSlotPartitionKeyDescription(rel,
+															values, isnull, 64);
+			Assert(OidIsValid(RelationGetRelid(rel)));
+			ereport(ERROR,
+					(errcode(ERRCODE_CHECK_VIOLATION),
+					 errmsg("no partition of relation \"%s\" found for row",
+							RelationGetRelationName(rel)),
+					 val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0));
 		}
 
-		cur_index = get_partition_for_tuple(rel, values, isnull);
-
-		/*
-		 * cur_index < 0 means we failed to find a partition of this parent.
-		 * cur_index >= 0 means we either found the leaf partition, or the
-		 * next parent to find a partition of.
-		 */
-		if (cur_index < 0)
+		if (partdesc->is_leaf[partidx])
 		{
-			result = -1;
-			break;
+			/*
+			 * Get the index for PartitionTupleRouting->partitions array index
+			 * for this leaf partition.  This may require building a new
+			 * ResultRelInfo.
+			 */
+			if (likely(parent->indexes[partidx] >= 0))
+			{
+				/* ResultRelInfo already built */
+				Assert(parent->indexes[partidx] < proute->num_partitions);
+				result = parent->indexes[partidx];
+			}
+			else
+			{
+				if (proute->subplan_partition_table)
+				{
+					ResultRelInfo *rri;
+					Oid			partoid = partdesc->oids[partidx];
+
+					rri = hash_search(proute->subplan_partition_table,
+									  &partoid, HASH_FIND, NULL);
+
+					if (rri)
+					{
+						result = proute->num_partitions++;
+						parent->indexes[partidx] = result;
+
+						/* Allocate more space in the arrays, if required */
+						if (result >= proute->partitions_allocsize)
+							ExecExpandRoutingArrays(proute);
+
+						/* Save here for later use. */
+						proute->partitions[result] = rri;
+					}
+				}
+
+				/* We need to create one afresh. */
+				if (result < 0)
+				{
+					result = ExecInitPartitionInfo(mtstate, resultRelInfo,
+												   proute, estate,
+												   parent, partidx);
+					Assert(result >= 0 && result < proute->num_partitions);
+				}
+			}
+
+			ecxt->ecxt_scantuple = ecxt_scantuple_old;
+			return result;
 		}
-		else if (parent->indexes[cur_index] >= 0)
+		else
 		{
-			result = parent->indexes[cur_index];
-			break;
+			/*
+			 * Partition is a sub-partitioned table; get the PartitionDispatch
+			 */
+			if (likely(parent->indexes[partidx] >= 0))
+			{
+				/* Already built. */
+				Assert(parent->indexes[partidx] < proute->num_dispatch);
+				parent = pd[parent->indexes[partidx]];
+			}
+			else
+			{
+				/* Not yet built. Do that now. */
+				PartitionDispatch subparent;
+
+				subparent = ExecInitPartitionDispatchInfo(proute,
+													partdesc->oids[partidx],
+													parent, partidx);
+				Assert(parent->indexes[partidx] >= 0 &&
+					   parent->indexes[partidx] < proute->num_dispatch);
+				parent = subparent;
+			}
 		}
-		else
-			parent = pd[-parent->indexes[cur_index]];
 	}
+}
 
-	/* A partition was not found. */
-	if (result < 0)
+/*
+ * ExecHashSubPlanResultRelsByOid
+ *		Build a hash table to allow fast lookups of subplan ResultRelInfos by
+ *		partition Oid.  We also populate the subplan ResultRelInfo with an
+ *		ri_PartitionRoot.
+ */
+static void
+ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate,
+							   PartitionTupleRouting *proute)
+{
+	ModifyTable	   *node = (ModifyTable *) mtstate->ps.plan;
+	ResultRelInfo  *subplan_result_rels;
+	HASHCTL			ctl;
+	HTAB		   *htab;
+	int				nsubplans;
+	int				i;
+
+	subplan_result_rels = mtstate->resultRelInfo;
+	nsubplans = list_length(node->plans);
+
+	memset(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(Oid);
+	ctl.entrysize = sizeof(ResultRelInfo **);
+	ctl.hcxt = CurrentMemoryContext;
+
+	htab = hash_create("PartitionTupleRouting table", nsubplans, &ctl,
+					   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+	proute->subplan_partition_table = htab;
+
+	/* Hash all subplan by Oid */
+	for (i = 0; i < nsubplans; i++)
 	{
-		char	   *val_desc;
-
-		val_desc = ExecBuildSlotPartitionKeyDescription(rel,
-														values, isnull, 64);
-		Assert(OidIsValid(RelationGetRelid(rel)));
-		ereport(ERROR,
-				(errcode(ERRCODE_CHECK_VIOLATION),
-				 errmsg("no partition of relation \"%s\" found for row",
-						RelationGetRelationName(rel)),
-				 val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0));
+		ResultRelInfo *rri = &subplan_result_rels[i];
+		bool		found;
+		Oid			partoid = RelationGetRelid(rri->ri_RelationDesc);
+		ResultRelInfo **subplanrri;
+
+		subplanrri = (ResultRelInfo **) hash_search(htab, &partoid, HASH_ENTER,
+												   &found);
+
+		if (!found)
+			*subplanrri = rri;
+
+		/*
+		 * This is required in order to convert the partition's tuple
+		 * to be compatible with the root partitioned table's tuple
+		 * descriptor.  When generating the per-subplan result rels,
+		 * this was not set.
+		 */
+		rri->ri_PartitionRoot = proute->partition_root;
 	}
+}
 
-	ecxt->ecxt_scantuple = ecxt_scantuple_old;
-	return result;
+/*
+ * ExecExpandRoutingArrays
+ *		Double the size of the allocated arrays in 'proute'
+ */
+static void
+ExecExpandRoutingArrays(PartitionTupleRouting *proute)
+{
+	int		new_size = proute->partitions_allocsize * 2;
+	int		old_size = proute->partitions_allocsize;
+
+	proute->partitions_allocsize = new_size;
+
+	proute->partitions = (ResultRelInfo **)
+		repalloc(proute->partitions, sizeof(ResultRelInfo *) * new_size);
+
+	if (proute->parent_child_tupconv_maps != NULL)
+	{
+		proute->parent_child_tupconv_maps = (TupleConversionMap **)
+			repalloc( proute->parent_child_tupconv_maps,
+						sizeof(TupleConversionMap *) * new_size);
+		memset(&proute->parent_child_tupconv_maps[old_size], 0,
+			   sizeof(TupleConversionMap *) * (new_size - old_size));
+	}
+
+	if (proute->child_parent_tupconv_maps != NULL)
+	{
+		proute->child_parent_tupconv_maps = (TupleConversionMap **)
+			repalloc(proute->child_parent_tupconv_maps,
+					 sizeof(TupleConversionMap *) * new_size);
+		memset(&proute->child_parent_tupconv_maps[old_size], 0,
+			   sizeof(TupleConversionMap *) * (new_size - old_size));
+	}
+
+	if (proute->child_parent_map_not_required != NULL)
+	{
+		proute->child_parent_map_not_required = (bool *)
+			repalloc(proute->child_parent_map_not_required,
+					 sizeof(bool) * new_size);
+		memset(&proute->child_parent_map_not_required[old_size], 0,
+			   sizeof(bool) * (new_size - old_size));
+	}
 }
 
 /*
  * ExecInitPartitionInfo
  *		Initialize ResultRelInfo and other information for a partition
- *
- * Returns the ResultRelInfo
+ *		and store it in the next empty slot in 'proute's partitions array and
+ *		return the index of that element.
  */
-ResultRelInfo *
+static int
 ExecInitPartitionInfo(ModifyTableState *mtstate,
-					  ResultRelInfo *resultRelInfo,
+					  ResultRelInfo *rootResultRelInfo,
 					  PartitionTupleRouting *proute,
-					  EState *estate, int partidx)
+					  EState *estate,
+					  PartitionDispatch parent, int partidx)
 {
+	Oid			partoid = parent->partdesc->oids[partidx];
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
-	Relation	rootrel = resultRelInfo->ri_RelationDesc,
+	Relation	rootrel = rootResultRelInfo->ri_RelationDesc,
 				partrel;
 	Relation	firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
 	ResultRelInfo *leaf_part_rri;
 	MemoryContext oldContext;
 	AttrNumber *part_attnos = NULL;
 	bool		found_whole_row;
+	int			part_result_rel_index;
 
 	/*
 	 * We locked all the partitions in ExecSetupPartitionTupleRouting
 	 * including the leaf partitions.
 	 */
-	partrel = heap_open(proute->partition_oids[partidx], NoLock);
+	partrel = heap_open(partoid, NoLock);
 
 	/*
 	 * Keep ResultRelInfo and other information for this partition in the
@@ -492,15 +618,25 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 									&mtstate->ps, RelationGetDescr(partrel));
 	}
 
+	part_result_rel_index = proute->num_partitions++;
+	parent->indexes[partidx] = part_result_rel_index;
+
+	/* Allocate more space in the arrays, if required */
+	if (part_result_rel_index >= proute->partitions_allocsize)
+		ExecExpandRoutingArrays(proute);
+
+	/* Save here for later use. */
+	proute->partitions[part_result_rel_index] = leaf_part_rri;
+
 	/* Set up information needed for routing tuples to the partition. */
-	ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, partidx);
+	ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri,
+						part_result_rel_index);
 
 	/*
 	 * If there is an ON CONFLICT clause, initialize state for it.
 	 */
 	if (node && node->onConflictAction != ONCONFLICT_NONE)
 	{
-		TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx];
 		int			firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
 		TupleDesc	partrelDesc = RelationGetDescr(partrel);
 		ExprContext *econtext = mtstate->ps.ps_ExprContext;
@@ -513,7 +649,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 		 * list and searching for ancestry relationships to each index in the
 		 * ancestor table.
 		 */
-		if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) > 0)
+		if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0)
 		{
 			List	   *childIdxs;
 
@@ -526,7 +662,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 				ListCell   *lc2;
 
 				ancestors = get_partition_ancestors(childIdx);
-				foreach(lc2, resultRelInfo->ri_onConflictArbiterIndexes)
+				foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
 				{
 					if (list_member_oid(ancestors, lfirst_oid(lc2)))
 						arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
@@ -540,7 +676,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 		 * (This shouldn't happen, since arbiter index selection should not
 		 * pick up an invalid index.)
 		 */
-		if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) !=
+		if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
 			list_length(arbiterIndexes))
 			elog(ERROR, "invalid arbiter index list");
 		leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
@@ -550,8 +686,14 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 		 */
 		if (node->onConflictAction == ONCONFLICT_UPDATE)
 		{
+			TupleConversionMap *map;
+
+			map = proute->parent_child_tupconv_maps ?
+				proute->parent_child_tupconv_maps[part_result_rel_index] :
+				NULL;
+
 			Assert(node->onConflictSet != NIL);
-			Assert(resultRelInfo->ri_onConflict != NULL);
+			Assert(rootResultRelInfo->ri_onConflict != NULL);
 
 			/*
 			 * If the partition's tuple descriptor matches exactly the root
@@ -560,7 +702,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 			 * need to create state specific to this partition.
 			 */
 			if (map == NULL)
-				leaf_part_rri->ri_onConflict = resultRelInfo->ri_onConflict;
+				leaf_part_rri->ri_onConflict = rootResultRelInfo->ri_onConflict;
 			else
 			{
 				List	   *onconflset;
@@ -651,12 +793,9 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 		}
 	}
 
-	Assert(proute->partitions[partidx] == NULL);
-	proute->partitions[partidx] = leaf_part_rri;
-
 	MemoryContextSwitchTo(oldContext);
 
-	return leaf_part_rri;
+	return part_result_rel_index;
 }
 
 /*
@@ -671,6 +810,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 					int partidx)
 {
 	MemoryContext oldContext;
+	TupleConversionMap *map;
 
 	/*
 	 * Switch into per-query memory context.
@@ -681,10 +821,24 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 * Set up a tuple conversion map to convert a tuple routed to the
 	 * partition from the parent's type to the partition's.
 	 */
-	proute->parent_child_tupconv_maps[partidx] =
-		convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot),
-							   RelationGetDescr(partRelInfo->ri_RelationDesc),
-							   gettext_noop("could not convert row type"));
+	map = convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot),
+								 RelationGetDescr(partRelInfo->ri_RelationDesc),
+								 gettext_noop("could not convert row type"));
+
+	if (map)
+	{
+		int		new_size;
+
+		/* Allocate parent child map array only if we need to store a map */
+		if (proute->parent_child_tupconv_maps == NULL)
+		{
+			new_size = proute->partitions_allocsize;
+			proute->parent_child_tupconv_maps = (TupleConversionMap **)
+				palloc0(sizeof(TupleConversionMap *) * new_size);
+		}
+
+		proute->parent_child_tupconv_maps[partidx] = map;
+	}
 
 	/*
 	 * If the partition is a foreign table, let the FDW init itself for
@@ -699,6 +853,88 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	partRelInfo->ri_PartitionReadyForRouting = true;
 }
 
+/*
+ * ExecInitPartitionDispatchInfo
+ *		Initialize PartitionDispatch for a partitioned table
+ *
+ * This also stores it in the proute->partition_dispatch_info array at the
+ * specified index ('dispatchidx'), possibly expanding the array if there
+ * isn't space left in it.
+ */
+static PartitionDispatch
+ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
+							  PartitionDispatch parent_pd, int partidx)
+{
+	Relation	rel;
+	TupleDesc	tupdesc;
+	PartitionDesc partdesc;
+	PartitionKey partkey;
+	PartitionDispatch pd;
+	int			dispatchidx;
+
+	if (partoid != RelationGetRelid(proute->partition_root))
+		rel = heap_open(partoid, NoLock);
+	else
+		rel = proute->partition_root;
+	tupdesc = RelationGetDescr(rel);
+	partdesc = RelationGetPartitionDesc(rel);
+	partkey = RelationGetPartitionKey(rel);
+
+	pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
+	pd->reldesc = rel;
+	pd->key = partkey;
+	pd->keystate = NIL;
+	pd->partdesc = partdesc;
+	if (parent_pd != NULL)
+	{
+		/*
+		 * For every partitioned table other than the root, we must store a
+		 * tuple table slot initialized with its tuple descriptor and a tuple
+		 * conversion map to convert a tuple from its parent's rowtype to its
+		 * own. That is to make sure that we are looking at the correct row
+		 * using the correct tuple descriptor when computing its partition key
+		 * for tuple routing.
+		 */
+		pd->tupslot = MakeSingleTupleTableSlot(tupdesc);
+		pd->tupmap =
+				convert_tuples_by_name(RelationGetDescr(parent_pd->reldesc),
+									   tupdesc,
+									   gettext_noop("could not convert row type"));
+	}
+	else
+	{
+		/* Not required for the root partitioned table */
+		pd->tupslot = NULL;
+		pd->tupmap = NULL;
+	}
+
+	pd->indexes = (int *) palloc(sizeof(int) * partdesc->nparts);
+
+	/*
+	 * Initialize with -1 to signify that the corresponding partition's
+	 * ResultRelInfo or PartitionDispatch has not been created yet.
+	 */
+	memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
+
+	dispatchidx = proute->num_dispatch++;
+	if (parent_pd)
+		parent_pd->indexes[partidx] = dispatchidx;
+	if (dispatchidx >= proute->dispatch_allocsize)
+	{
+		/* Expand allocated space. */
+		proute->dispatch_allocsize *= 2;
+		proute->partition_dispatch_info = (PartitionDispatchData **)
+			repalloc(proute->partition_dispatch_info,
+					 sizeof(PartitionDispatchData *) *
+					 proute->dispatch_allocsize);
+	}
+
+	/* Save here for later use. */
+	proute->partition_dispatch_info[dispatchidx] = pd;
+
+	return pd;
+}
+
 /*
  * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition
  * child-to-root tuple conversion map array.
@@ -711,19 +947,22 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 void
 ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute)
 {
+	int			size;
+
 	Assert(proute != NULL);
 
+	size = proute->partitions_allocsize;
+
 	/*
 	 * These array elements get filled up with maps on an on-demand basis.
 	 * Initially just set all of them to NULL.
 	 */
 	proute->child_parent_tupconv_maps =
-		(TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) *
-										proute->num_partitions);
+		(TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * size);
 
 	/* Same is the case for this array. All the values are set to false */
-	proute->child_parent_map_not_required =
-		(bool *) palloc0(sizeof(bool) * proute->num_partitions);
+	proute->child_parent_map_not_required = (bool *) palloc0(sizeof(bool) *
+															 size);
 }
 
 /*
@@ -734,15 +973,15 @@ TupleConversionMap *
 TupConvMapForLeaf(PartitionTupleRouting *proute,
 				  ResultRelInfo *rootRelInfo, int leaf_index)
 {
-	ResultRelInfo **resultRelInfos = proute->partitions;
 	TupleConversionMap **map;
 	TupleDesc	tupdesc;
 
-	/* Don't call this if we're not supposed to be using this type of map. */
-	Assert(proute->child_parent_tupconv_maps != NULL);
+	/* If nobody else set up the per-leaf maps array, do so ourselves. */
+	if (proute->child_parent_tupconv_maps == NULL)
+		ExecSetupChildParentMapForLeaf(proute);
 
 	/* If it's already known that we don't need a map, return NULL. */
-	if (proute->child_parent_map_not_required[leaf_index])
+	else if (proute->child_parent_map_not_required[leaf_index])
 		return NULL;
 
 	/* If we've already got a map, return it. */
@@ -751,13 +990,16 @@ TupConvMapForLeaf(PartitionTupleRouting *proute,
 		return *map;
 
 	/* No map yet; try to create one. */
-	tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc);
+	tupdesc = RelationGetDescr(proute->partitions[leaf_index]->ri_RelationDesc);
 	*map =
 		convert_tuples_by_name(tupdesc,
 							   RelationGetDescr(rootRelInfo->ri_RelationDesc),
 							   gettext_noop("could not convert row type"));
 
-	/* If it turns out no map is needed, remember for next time. */
+	/*
+	 * If it turns out no map is needed, remember that so we don't try making
+	 * one again next time.
+	 */
 	proute->child_parent_map_not_required[leaf_index] = (*map == NULL);
 
 	return *map;
@@ -805,7 +1047,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 						PartitionTupleRouting *proute)
 {
 	int			i;
-	int			subplan_index = 0;
 
 	/*
 	 * Remember, proute->partition_dispatch_info[0] corresponds to the root
@@ -826,10 +1067,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 	{
 		ResultRelInfo *resultRelInfo = proute->partitions[i];
 
-		/* skip further processsing for uninitialized partitions */
-		if (resultRelInfo == NULL)
-			continue;
-
 		/* Allow any FDWs to shut down if they've been exercised */
 		if (resultRelInfo->ri_PartitionReadyForRouting &&
 			resultRelInfo->ri_FdwRoutine != NULL &&
@@ -838,21 +1075,20 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 														   resultRelInfo);
 
 		/*
-		 * If this result rel is one of the UPDATE subplan result rels, let
-		 * ExecEndPlan() close it. For INSERT or COPY,
-		 * proute->subplan_partition_offsets will always be NULL. Note that
-		 * the subplan_partition_offsets array and the partitions array have
-		 * the partitions in the same order. So, while we iterate over
-		 * partitions array, we also iterate over the
-		 * subplan_partition_offsets array in order to figure out which of the
-		 * result rels are present in the UPDATE subplans.
+		 * Check if this result rel is one belonging to the node's subplans,
+		 * if so, let ExecEndPlan() clean it up.
 		 */
-		if (proute->subplan_partition_offsets &&
-			subplan_index < proute->num_subplan_partition_offsets &&
-			proute->subplan_partition_offsets[subplan_index] == i)
+		if (proute->subplan_partition_table)
 		{
-			subplan_index++;
-			continue;
+			Oid			partoid;
+			bool		found;
+
+			partoid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+			(void) hash_search(proute->subplan_partition_table, &partoid,
+							   HASH_FIND, &found);
+			if (found)
+				continue;
 		}
 
 		ExecCloseIndices(resultRelInfo);
@@ -866,144 +1102,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 		ExecDropSingleTupleTableSlot(proute->partition_tuple_slot);
 }
 
-/*
- * RelationGetPartitionDispatchInfo
- *		Returns information necessary to route tuples down a partition tree
- *
- * The number of elements in the returned array (that is, the number of
- * PartitionDispatch objects for the partitioned tables in the partition tree)
- * is returned in *num_parted and a list of the OIDs of all the leaf
- * partitions of rel is returned in *leaf_part_oids.
- *
- * All the relations in the partition tree (including 'rel') must have been
- * locked (using at least the AccessShareLock) by the caller.
- */
-static PartitionDispatch *
-RelationGetPartitionDispatchInfo(Relation rel,
-								 int *num_parted, List **leaf_part_oids)
-{
-	List	   *pdlist = NIL;
-	PartitionDispatchData **pd;
-	ListCell   *lc;
-	int			i;
-
-	Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
-
-	*num_parted = 0;
-	*leaf_part_oids = NIL;
-
-	get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
-	*num_parted = list_length(pdlist);
-	pd = (PartitionDispatchData **) palloc(*num_parted *
-										   sizeof(PartitionDispatchData *));
-	i = 0;
-	foreach(lc, pdlist)
-	{
-		pd[i++] = lfirst(lc);
-	}
-
-	return pd;
-}
-
-/*
- * get_partition_dispatch_recurse
- *		Recursively expand partition tree rooted at rel
- *
- * As the partition tree is expanded in a depth-first manner, we maintain two
- * global lists: of PartitionDispatch objects corresponding to partitioned
- * tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
- *
- * Note that the order of OIDs of leaf partitions in leaf_part_oids matches
- * the order in which the planner's expand_partitioned_rtentry() processes
- * them.  It's not necessarily the case that the offsets match up exactly,
- * because constraint exclusion might prune away some partitions on the
- * planner side, whereas we'll always have the complete list; but unpruned
- * partitions will appear in the same order in the plan as they are returned
- * here.
- */
-static void
-get_partition_dispatch_recurse(Relation rel, Relation parent,
-							   List **pds, List **leaf_part_oids)
-{
-	TupleDesc	tupdesc = RelationGetDescr(rel);
-	PartitionDesc partdesc = RelationGetPartitionDesc(rel);
-	PartitionKey partkey = RelationGetPartitionKey(rel);
-	PartitionDispatch pd;
-	int			i;
-
-	check_stack_depth();
-
-	/* Build a PartitionDispatch for this table and add it to *pds. */
-	pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
-	*pds = lappend(*pds, pd);
-	pd->reldesc = rel;
-	pd->key = partkey;
-	pd->keystate = NIL;
-	pd->partdesc = partdesc;
-	if (parent != NULL)
-	{
-		/*
-		 * For every partitioned table other than the root, we must store a
-		 * tuple table slot initialized with its tuple descriptor and a tuple
-		 * conversion map to convert a tuple from its parent's rowtype to its
-		 * own. That is to make sure that we are looking at the correct row
-		 * using the correct tuple descriptor when computing its partition key
-		 * for tuple routing.
-		 */
-		pd->tupslot = MakeSingleTupleTableSlot(tupdesc);
-		pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
-											tupdesc,
-											gettext_noop("could not convert row type"));
-	}
-	else
-	{
-		/* Not required for the root partitioned table */
-		pd->tupslot = NULL;
-		pd->tupmap = NULL;
-	}
-
-	/*
-	 * Go look at each partition of this table.  If it's a leaf partition,
-	 * simply add its OID to *leaf_part_oids.  If it's a partitioned table,
-	 * recursively call get_partition_dispatch_recurse(), so that its
-	 * partitions are processed as well and a corresponding PartitionDispatch
-	 * object gets added to *pds.
-	 *
-	 * The 'indexes' array is used when searching for a partition matching a
-	 * given tuple.  The actual value we store here depends on whether the
-	 * array element belongs to a leaf partition or a subpartitioned table.
-	 * For leaf partitions we store the index into *leaf_part_oids, and for
-	 * sub-partitioned tables we store a negative version of the index into
-	 * the *pds list.  Both indexes are 0-based, but the first element of the
-	 * *pds list is the root partition, so 0 always means the first leaf. When
-	 * searching, if we see a negative value, the search must continue in the
-	 * corresponding sub-partition; otherwise, we've identified the correct
-	 * partition.
-	 */
-	pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
-	for (i = 0; i < partdesc->nparts; i++)
-	{
-		Oid			partrelid = partdesc->oids[i];
-
-		if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
-		{
-			*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
-			pd->indexes[i] = list_length(*leaf_part_oids) - 1;
-		}
-		else
-		{
-			/*
-			 * We assume all tables in the partition tree were already locked
-			 * by the caller.
-			 */
-			Relation	partrel = heap_open(partrelid, NoLock);
-
-			pd->indexes[i] = -list_length(*pds);
-			get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
-		}
-	}
-}
-
 /* ----------------
  *		FormPartitionKeyDatum
  *			Construct values[] and isnull[] arrays for the partition key
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index f535762e2d..6e0c7862dc 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -68,7 +68,6 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
 						ResultRelInfo *targetRelInfo,
 						TupleTableSlot *slot);
 static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
-static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate);
 static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
 static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
 						int whichplan);
@@ -1666,7 +1665,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
 	if (mtstate->mt_transition_capture != NULL ||
 		mtstate->mt_oc_transition_capture != NULL)
 	{
-		ExecSetupChildParentMapForTcs(mtstate);
+		ExecSetupChildParentMapForSubplan(mtstate);
 
 		/*
 		 * Install the conversion map for the first plan for UPDATE and DELETE
@@ -1709,21 +1708,12 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
 	 * value is to be used as an index into the arrays for the ResultRelInfo
 	 * and TupleConversionMap for the partition.
 	 */
-	partidx = ExecFindPartition(targetRelInfo,
-								proute->partition_dispatch_info,
-								slot,
-								estate);
+	partidx = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate);
 	Assert(partidx >= 0 && partidx < proute->num_partitions);
 
-	/*
-	 * Get the ResultRelInfo corresponding to the selected partition; if not
-	 * yet there, initialize it.
-	 */
+	/* Get the ResultRelInfo corresponding to the selected partition. */
+	Assert(proute->partitions[partidx] != NULL);
 	partrel = proute->partitions[partidx];
-	if (partrel == NULL)
-		partrel = ExecInitPartitionInfo(mtstate, targetRelInfo,
-										proute, estate,
-										partidx);
 
 	/*
 	 * Check whether the partition is routable if we didn't yet
@@ -1789,7 +1779,9 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
 	/*
 	 * Convert the tuple, if necessary.
 	 */
-	ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx],
+	ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps ?
+								proute->parent_child_tupconv_maps[partidx] :
+								NULL,
 							  tuple,
 							  proute->partition_tuple_slot,
 							  &slot);
@@ -1828,17 +1820,6 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
 	int			numResultRelInfos = mtstate->mt_nplans;
 	int			i;
 
-	/*
-	 * First check if there is already a per-subplan array allocated. Even if
-	 * there is already a per-leaf map array, we won't require a per-subplan
-	 * one, since we will use the subplan offset array to convert the subplan
-	 * index to per-leaf index.
-	 */
-	if (mtstate->mt_per_subplan_tupconv_maps ||
-		(mtstate->mt_partition_tuple_routing &&
-		 mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps))
-		return;
-
 	/*
 	 * Build array of conversion maps from each child's TupleDesc to the one
 	 * used in the target relation.  The map pointers may be NULL when no
@@ -1860,79 +1841,18 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
 	}
 }
 
-/*
- * Initialize the child-to-root tuple conversion map array required for
- * capturing transition tuples.
- *
- * The map array can be indexed either by subplan index or by leaf-partition
- * index.  For transition tables, we need a subplan-indexed access to the map,
- * and where tuple-routing is present, we also require a leaf-indexed access.
- */
-static void
-ExecSetupChildParentMapForTcs(ModifyTableState *mtstate)
-{
-	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-	/*
-	 * If partition tuple routing is set up, we will require partition-indexed
-	 * access. In that case, create the map array indexed by partition; we
-	 * will still be able to access the maps using a subplan index by
-	 * converting the subplan index to a partition index using
-	 * subplan_partition_offsets. If tuple routing is not set up, it means we
-	 * don't require partition-indexed access. In that case, create just a
-	 * subplan-indexed map.
-	 */
-	if (proute)
-	{
-		/*
-		 * If a partition-indexed map array is to be created, the subplan map
-		 * array has to be NULL.  If the subplan map array is already created,
-		 * we won't be able to access the map using a partition index.
-		 */
-		Assert(mtstate->mt_per_subplan_tupconv_maps == NULL);
-
-		ExecSetupChildParentMapForLeaf(proute);
-	}
-	else
-		ExecSetupChildParentMapForSubplan(mtstate);
-}
-
 /*
  * For a given subplan index, get the tuple conversion map.
  */
 static TupleConversionMap *
 tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan)
 {
-	/*
-	 * If a partition-index tuple conversion map array is allocated, we need
-	 * to first get the index into the partition array. Exactly *one* of the
-	 * two arrays is allocated. This is because if there is a partition array
-	 * required, we don't require subplan-indexed array since we can translate
-	 * subplan index into partition index. And, we create a subplan-indexed
-	 * array *only* if partition-indexed array is not required.
-	 */
 	/* If nobody else set the per-subplan array of maps, do so ourselves. */
 	if (mtstate->mt_per_subplan_tupconv_maps == NULL)
-	{
-		int			leaf_index;
-		PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-		/*
-		 * If subplan-indexed array is NULL, things should have been arranged
-		 * to convert the subplan index to partition index.
-		 */
-		Assert(proute && proute->subplan_partition_offsets != NULL &&
-			   whichplan < proute->num_subplan_partition_offsets);
-
-		leaf_index = proute->subplan_partition_offsets[whichplan];
+		ExecSetupChildParentMapForSubplan(mtstate);
 
-		return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate),
-								 leaf_index);
-	}
-	else
-	{
-		Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
-		return mtstate->mt_per_subplan_tupconv_maps[whichplan];
-	}
+	Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
+	return mtstate->mt_per_subplan_tupconv_maps[whichplan];
 }
 
 /* ----------------------------------------------------------------
diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c
index 115a9fe78f..aa82aa52eb 100644
--- a/src/backend/utils/cache/partcache.c
+++ b/src/backend/utils/cache/partcache.c
@@ -594,6 +594,7 @@ RelationBuildPartitionDesc(Relation rel)
 		int			next_index = 0;
 
 		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));
+		result->is_leaf = (bool *) palloc(nparts * sizeof(bool));
 
 		boundinfo = (PartitionBoundInfoData *)
 			palloc0(sizeof(PartitionBoundInfoData));
@@ -782,7 +783,15 @@ RelationBuildPartitionDesc(Relation rel)
 		 * defined by canonicalized representation of the partition bounds.
 		 */
 		for (i = 0; i < nparts; i++)
-			result->oids[mapping[i]] = oids[i];
+		{
+			int			index = mapping[i];
+
+			result->oids[index] = oids[i];
+			/* Record whether the partition is a leaf partition */
+			result->is_leaf[index] =
+				(get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
+		}
+
 		pfree(mapping);
 	}
 
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 1f49e5d3a9..4cc7508067 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -26,7 +26,11 @@
 typedef struct PartitionDescData
 {
 	int			nparts;			/* Number of partitions */
-	Oid		   *oids;			/* OIDs of partitions */
+	Oid		   *oids;			/* Array of length 'nparts' containing
+								 * partition OIDs in order of their
+								 * bounds */
+	bool	   *is_leaf;		/* Array of 'nparts' elements storing whether
+								 * a partition is a leaf partition or not */
 	PartitionBoundInfo boundinfo;	/* collection of partition bounds */
 } PartitionDescData;
 
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 862bf65060..1b421f2ec5 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -31,9 +31,13 @@
  *	tupmap		TupleConversionMap to convert from the parent's rowtype to
  *				this table's rowtype (when extracting the partition key of a
  *				tuple just before routing it through this table)
- *	indexes		Array with partdesc->nparts members (for details on what
- *				individual members represent, see how they are set in
- *				get_partition_dispatch_recurse())
+ *	indexes		Array with partdesc->nparts elements.  For leaf partitions the
+ *				index into the PartitionTupleRouting->partitions array is
+ *				stored.  When the partition is itself a partitioned table then
+ *				we store the index into
+ *				PartitionTupleRouting->partition_dispatch_info.  -1 means
+ *				we've not yet allocated anything in PartitionTupleRouting for
+ *				the partition.
  *-----------------------
  */
 typedef struct PartitionDispatchData
@@ -50,66 +54,106 @@ typedef struct PartitionDispatchData
 typedef struct PartitionDispatchData *PartitionDispatch;
 
 /*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to execute
- * tuple-routing between partitions.
- *
- * partition_dispatch_info		Array of PartitionDispatch objects with one
- *								entry for every partitioned table in the
- *								partition tree.
- * num_dispatch					number of partitioned tables in the partition
- *								tree (= length of partition_dispatch_info[])
- * partition_oids				Array of leaf partitions OIDs with one entry
- *								for every leaf partition in the partition tree,
- *								initialized in full by
- *								ExecSetupPartitionTupleRouting.
- * partitions					Array of ResultRelInfo* objects with one entry
- *								for every leaf partition in the partition tree,
- *								initialized lazily by ExecInitPartitionInfo.
- * num_partitions				Number of leaf partitions in the partition tree
- *								(= 'partitions_oid'/'partitions' array length)
- * parent_child_tupconv_maps	Array of TupleConversionMap objects with one
- *								entry for every leaf partition (required to
- *								convert tuple from the root table's rowtype to
- *								a leaf partition's rowtype after tuple routing
- *								is done)
- * child_parent_tupconv_maps	Array of TupleConversionMap objects with one
- *								entry for every leaf partition (required to
- *								convert an updated tuple from the leaf
- *								partition's rowtype to the root table's rowtype
- *								so that tuple routing can be done)
- * child_parent_map_not_required  Array of bool. True value means that a map is
- *								determined to be not required for the given
- *								partition. False means either we haven't yet
- *								checked if a map is required, or it was
- *								determined to be required.
- * subplan_partition_offsets	Integer array ordered by UPDATE subplans. Each
- *								element of this array has the index into the
- *								corresponding partition in partitions array.
- * num_subplan_partition_offsets  Length of 'subplan_partition_offsets' array
- * partition_tuple_slot			TupleTableSlot to be used to manipulate any
- *								given leaf partition's rowtype after that
- *								partition is chosen for insertion by
- *								tuple-routing.
- * root_tuple_slot				TupleTableSlot to be used to transiently hold
- *								copy of a tuple that's being moved across
- *								partitions in the root partitioned table's
- *								rowtype
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions
+ *
+ *	partition_root			Root table, that is, the table mentioned in the
+ *							command.
+ *
+ *	partition_dispatch_info	Contains PartitionDispatch objects for every
+ *							partitioned table touched by tuple routing.  The
+ *							entry for the root partitioned table is *always*
+ *							present as the first entry of this array.
+ *
+ *	num_dispatch			The number of existing entries and also serves as
+ *							the index of the next entry to be allocated and
+ *							placed in 'partition_dispatch_info'.
+ *
+ *	dispatch_allocsize		(>= 'num_dispatch') is the number of entries that
+ *							can be stored in 'partition_dispatch_info' before
+ *							needing to reallocate more space.
+ *
+ *	partitions				Contains pointers to the ResultRelInfos of all leaf
+ *							partitions touched by tuple routing.  Some of
+ *							these are pointers to "reused" ResultRelInfos,
+ *							that is, those that are created and destroyed
+ *							outside execPartition.c, for example, when tuple
+ *							routing is used for UPDATE queries that modify
+ *							the partition key.  Rest of them are pointers to
+ *							ResultRelInfos managed by execPartition.c itself
+ *
+ *	num_partitions			The number of existing entries and also serves as
+ *							the index of the next entry to be allocated and
+ *							placed in 'partitions'
+ *
+ *	partitions_allocsize	(>= 'num_partitions') is the number of entries
+ *							that can be stored in 'partitions',
+ *							'parent_child_tupconv_maps',
+ *							'child_parent_tupconv_maps' and
+ *							'child_parent_map_not_required' arrays before
+ *							needing to reallocate more space
+ *
+ *	parent_child_tupconv_maps	Contains information to convert tuples of the
+ *							root parent's rowtype to those of the leaf
+ *							partitions' rowtype, but only for those partitions
+ *							whose TupleDescs are physically different from the
+ *							root parent's.  If none of the partitions has such
+ *							a differing TupleDesc, then it's NULL.  If
+ *							non-NULL, is of the same size as 'partitions', to
+ *							be able to use the same array index.  Also, there
+ *							need not be more of these maps than there are
+ *							partitions that were touched.
+ *
+ *	partition_tuple_slot	This is a tuple slot used to store a tuple using
+ *							rowtype of the partition chosen by tuple
+ *							routing.  Maintained separately because partitions
+ *							may have different rowtype.
+ *
+ * Note: The following fields are used only when UPDATE ends up needing to
+ * do tuple routing.
+ *
+ *	child_parent_tupconv_maps	Information to convert tuples of the leaf
+ *							partitions' rowtype to the root parent's rowtype.
+ *							These are needed by transition table machinery
+ *							when storing tuples of partition's rowtype into
+ *							the transition table that can only store tuples of
+ *							the root parent's rowtype.  Like
+ *							'parent_child_tupconv_maps' it remains NULL if
+ *							none of the partitions selected by tuple routing
+ *							needed a conversion map.  Also, if non-NULL, is of
+ *							the same size as 'partitions'.
+ *
+ *	child_parent_map_not_required	Stores if we don't need a conversion
+ *							map for a partition so that TupConvMapForLeaf
+ *							can return without having to re-check if it needs
+ *							to build a map.
+ *
+ *	subplan_partition_table	Hash table to store subplan index by Oid.
+ *
+ *	root_tuple_slot			During UPDATE tuple routing, this tuple slot is
+ *							used to transiently store a tuple using the root
+ *							table's rowtype after converting it from the
+ *							tuple's source leaf partition's rowtype.  That is,
+ *							if leaf partition's rowtype is different.
  *-----------------------
  */
 typedef struct PartitionTupleRouting
 {
+	Relation	partition_root;
+
 	PartitionDispatch *partition_dispatch_info;
 	int			num_dispatch;
-	Oid		   *partition_oids;
+	int			dispatch_allocsize;
 	ResultRelInfo **partitions;
 	int			num_partitions;
+	int			partitions_allocsize;
 	TupleConversionMap **parent_child_tupconv_maps;
 	TupleConversionMap **child_parent_tupconv_maps;
 	bool	   *child_parent_map_not_required;
-	int		   *subplan_partition_offsets;
-	int			num_subplan_partition_offsets;
-	TupleTableSlot *partition_tuple_slot;
+	HTAB	   *subplan_partition_table;
 	TupleTableSlot *root_tuple_slot;
+	TupleTableSlot *partition_tuple_slot;
 } PartitionTupleRouting;
 
 /*-----------------------
@@ -186,14 +230,15 @@ typedef struct PartitionPruneState
 
 extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
 							   Relation rel);
-extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
-				  PartitionDispatch *pd,
+extern int ExecFindPartition(ModifyTableState *mtstate,
+				  ResultRelInfo *resultRelInfo,
+				  PartitionTupleRouting *proute,
 				  TupleTableSlot *slot,
 				  EState *estate);
-extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
-					  ResultRelInfo *resultRelInfo,
-					  PartitionTupleRouting *proute,
-					  EState *estate, int partidx);
+extern ResultRelInfo *ExecGetPartitionInfo(ModifyTableState *mtstate,
+					 ResultRelInfo *resultRelInfo,
+					 PartitionTupleRouting *proute,
+					 EState *estate, int partidx);
 extern void ExecInitRoutingInfo(ModifyTableState *mtstate,
 					EState *estate,
 					PartitionTupleRouting *proute,
-- 
2.16.2.windows.1

