From cb81abc25bcfcc146c5f0c46e5d1345790bd3d8e Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com" <dgrowley@gmail.com>
Date: Fri, 22 Jun 2018 15:05:42 +1200
Subject: [PATCH v1 1/2] Speed up INSERT and UPDATE on partitioned tables

Various changes have been made here to reduce the overhead of executor init
of INSERT and UPDATE plans which perform the operation on a partitioned
table.  Tests done against partitioned tables with many partitions show
that there are a number of bottlenecks in the
ExecSetupPartitionTupleRouting code.  One of these, locking all the
partitions when we may only need to insert into one of them, is quite a
costly overhead.  This commit does not change anything relating to the
locks; it does, however, remove all the other bottlenecks.  Lock reduction
will need to be left for another day.

This commit also moves some of the work being done in
ExecSetupPartitionTupleRouting and the functions which it calls so that the
setup work is pre-calculated by the relcache code.  Particular care has
been taken in get_partition_dispatch_recurse to speed up the code.
Dereferencing the input parameters once per call, rather than once per loop
iteration, gave a noticeable increase in performance.  Also, changing the
leaf_part_oids List into an array speeds things up considerably, both
because that's the final form we need that data in, and because it avoids
the repeated palloc() calls which are made by lappend_oid().

Initialization of the parent/child translation maps array is now only
performed when we need to store the first translation map.  If the column
order of the parent and its child is the same, then no map ever needs to
be stored, and previously this (possibly large) array did nothing.

In simple INSERTs which hit a single partition of a partitioned table with
many partitions, the shutdown of the executor was also slow in comparison
to the actual execution.  This was down to the loop which cleans up each
ResultRelInfo having to loop over an array which often contained mostly
NULLs, which had to be skipped.  To speed this up we now keep track of
exactly which ResultRelInfos have been initialized.  These are stored in a
new array which we expand on demand.  Technically we could initialize the
full array size on the first allocation, but profiles indicated a higher
overhead when that memory context was destroyed, presumably due to some
extra malloc/free calls resulting from the large array allocation.
---
 src/backend/commands/copy.c            |  17 +-
 src/backend/executor/execPartition.c   | 370 ++++++++++++++++++++++-----------
 src/backend/executor/nodeModifyTable.c |  16 +-
 src/backend/utils/cache/partcache.c    |  32 ++-
 src/include/catalog/partition.h        |  13 +-
 src/include/executor/execPartition.h   |  17 +-
 6 files changed, 319 insertions(+), 146 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 3a66cb5025..25bec76c1d 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2644,15 +2644,10 @@ CopyFrom(CopyState cstate)
 			 * to the selected partition.
 			 */
 			saved_resultRelInfo = resultRelInfo;
-			resultRelInfo = proute->partitions[leaf_part_index];
-			if (resultRelInfo == NULL)
-			{
-				resultRelInfo = ExecInitPartitionInfo(mtstate,
-													  saved_resultRelInfo,
-													  proute, estate,
-													  leaf_part_index);
-				Assert(resultRelInfo != NULL);
-			}
+			resultRelInfo = ExecGetPartitionInfo(mtstate,
+												 saved_resultRelInfo,
+												 proute, estate,
+												 leaf_part_index);
 
 			/*
 			 * For ExecInsertIndexTuples() to work on the partition's indexes
@@ -2693,7 +2688,9 @@ CopyFrom(CopyState cstate)
 			 * We might need to convert from the parent rowtype to the
 			 * partition rowtype.
 			 */
-			tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index],
+			tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps ?
+												proute->parent_child_tupconv_maps[leaf_part_index] :
+												NULL,
 											  tuple,
 											  proute->partition_tuple_slot,
 											  &slot);
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 7a4665cc4e..1a3a67dd0d 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -31,11 +31,17 @@
 #include "utils/rls.h"
 #include "utils/ruleutils.h"
 
-
+static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
+					  ResultRelInfo *resultRelInfo,
+					  PartitionTupleRouting *proute,
+					  EState *estate, int partidx);
 static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
-								 int *num_parted, List **leaf_part_oids);
+								 int *num_parted, Oid **leaf_part_oids,
+								 int *n_leaf_part_oids);
 static void get_partition_dispatch_recurse(Relation rel, Relation parent,
-							   List **pds, List **leaf_part_oids);
+							   List **pds, Oid **leaf_part_oids,
+							   int *n_leaf_part_oids,
+							   int *leaf_part_oid_size);
 static void FormPartitionKeyDatum(PartitionDispatch pd,
 					  TupleTableSlot *slot,
 					  EState *estate,
@@ -65,22 +71,18 @@ static void find_matching_subplans_recurse(PartitionPruneState *prunestate,
  * While we allocate the arrays of pointers of ResultRelInfo and
  * TupleConversionMap for all partitions here, actual objects themselves are
  * lazily allocated for a given partition if a tuple is actually routed to it;
- * see ExecInitPartitionInfo.  However, if the function is invoked for update
- * tuple routing, caller would already have initialized ResultRelInfo's for
- * some of the partitions, which are reused and assigned to their respective
- * slot in the aforementioned array.  For such partitions, we delay setting
- * up objects such as TupleConversionMap until those are actually chosen as
- * the partitions to route tuples to.  See ExecPrepareTupleRouting.
+ * see ExecInitPartitionInfo.  However, if the function is invoked for UPDATE
+ * tuple routing, the caller will have already initialized ResultRelInfo's for
+ * each partition present in the ModifyTable's subplans. These are reused and
+ * assigned to their respective slot in the aforementioned array.  For such
+ * partitions, we delay setting up objects such as TupleConversionMap until
+ * those are actually chosen as the partitions to route tuples to.  See
+ * ExecPrepareTupleRouting.
  */
 PartitionTupleRouting *
 ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 {
-	List	   *leaf_parts;
-	ListCell   *cell;
 	int			i;
-	ResultRelInfo *update_rri = NULL;
-	int			num_update_rri = 0,
-				update_rri_index = 0;
 	PartitionTupleRouting *proute;
 	int			nparts;
 	ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;
@@ -90,32 +92,36 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 	 * partitions.
 	 */
 	(void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
-	proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
+	proute = (PartitionTupleRouting *) palloc(sizeof(PartitionTupleRouting));
 	proute->partition_dispatch_info =
 		RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch,
-										 &leaf_parts);
-	proute->num_partitions = nparts = list_length(leaf_parts);
+										 &proute->partition_oids, &nparts);
+
+	proute->num_partitions = nparts;
 	proute->partitions =
-		(ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *));
-	proute->parent_child_tupconv_maps =
-		(TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *));
-	proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid));
+		(ResultRelInfo **) palloc0(nparts * sizeof(ResultRelInfo *));
 
-	/* Set up details specific to the type of tuple routing we are doing. */
-	if (node && node->operation == CMD_UPDATE)
-	{
-		update_rri = mtstate->resultRelInfo;
-		num_update_rri = list_length(node->plans);
-		proute->subplan_partition_offsets =
-			palloc(num_update_rri * sizeof(int));
-		proute->num_subplan_partition_offsets = num_update_rri;
+	/*
+	 * Allocate an array to store ResultRelInfos that we'll later allocate.
+	 * It is common that not all partitions will have tuples routed to them,
+	 * so we'll refrain from allocating enough space for all partitions here.
+	 * Let's just start with something small and make it bigger only when
+	 * needed.  Storing these separately rather than relying on the
+	 * 'partitions' array allows us to quickly identify which ResultRelInfos we
+	 * must tear down at the end.
+	 */
+	proute->partitions_init_size = Min(nparts, 8);
+
+	proute->partitions_init = (ResultRelInfo **)
+		palloc(proute->partitions_init_size * sizeof(ResultRelInfo *));
+
+	proute->num_partitions_init = 0;
+
+	/* We only allocate this when we need to store the first non-NULL map */
+	proute->parent_child_tupconv_maps = NULL;
+
+	proute->child_parent_tupconv_maps = NULL;
 
-		/*
-		 * We need an additional tuple slot for storing transient tuples that
-		 * are converted to the root table descriptor.
-		 */
-		proute->root_tuple_slot = MakeTupleTableSlot(NULL);
-	}
 
 	/*
 	 * Initialize an empty slot that will be used to manipulate tuples of any
@@ -125,50 +131,70 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 	 */
 	proute->partition_tuple_slot = MakeTupleTableSlot(NULL);
 
-	i = 0;
-	foreach(cell, leaf_parts)
+	/* Set up details specific to the type of tuple routing we are doing. */
+	if (node && node->operation == CMD_UPDATE)
 	{
-		ResultRelInfo *leaf_part_rri = NULL;
-		Oid			leaf_oid = lfirst_oid(cell);
+		ResultRelInfo *update_rri = NULL;
+		int			num_update_rri = 0,
+					update_rri_index = 0;
 
-		proute->partition_oids[i] = leaf_oid;
+		update_rri = mtstate->resultRelInfo;
+		num_update_rri = list_length(node->plans);
+		proute->subplan_partition_offsets =
+			palloc(num_update_rri * sizeof(int));
+		proute->num_subplan_partition_offsets = num_update_rri;
 
-		/*
-		 * If the leaf partition is already present in the per-subplan result
-		 * rels, we re-use that rather than initialize a new result rel. The
-		 * per-subplan resultrels and the resultrels of the leaf partitions
-		 * are both in the same canonical order. So while going through the
-		 * leaf partition oids, we need to keep track of the next per-subplan
-		 * result rel to be looked for in the leaf partition resultrels.
-		 */
-		if (update_rri_index < num_update_rri &&
-			RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
+		proute->root_tuple_slot = MakeTupleTableSlot(NULL);
+
+		for (i = 0; i < nparts; i++)
 		{
-			leaf_part_rri = &update_rri[update_rri_index];
+			Oid			leaf_oid = proute->partition_oids[i];
 
 			/*
-			 * This is required in order to convert the partition's tuple to
-			 * be compatible with the root partitioned table's tuple
-			 * descriptor.  When generating the per-subplan result rels, this
-			 * was not set.
+			 * If the leaf partition is already present in the per-subplan
+			 * result rels, we re-use that rather than initialize a new result
+			 * rel. The per-subplan resultrels and the resultrels of the leaf
+			 * partitions are both in the same canonical order. So while going
+			 * through the leaf partition oids, we need to keep track of the
+			 * next per-subplan result rel to be looked for in the leaf
+			 * partition resultrels.
 			 */
-			leaf_part_rri->ri_PartitionRoot = rel;
+			if (update_rri_index < num_update_rri &&
+				RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
+			{
+				ResultRelInfo *leaf_part_rri;
+
+				leaf_part_rri = &update_rri[update_rri_index];
+
+				/*
+				 * This is required in order to convert the partition's tuple
+				 * to be compatible with the root partitioned table's tuple
+				 * descriptor.  When generating the per-subplan result rels,
+				 * this was not set.
+				 */
+				leaf_part_rri->ri_PartitionRoot = rel;
+
+				/* Remember the subplan offset for this ResultRelInfo */
+				proute->subplan_partition_offsets[update_rri_index] = i;
 
-			/* Remember the subplan offset for this ResultRelInfo */
-			proute->subplan_partition_offsets[update_rri_index] = i;
+				update_rri_index++;
 
-			update_rri_index++;
+				proute->partitions[i] = leaf_part_rri;
+			}
 		}
 
-		proute->partitions[i] = leaf_part_rri;
-		i++;
+		/*
+		 * We should have found all the per-subplan resultrels in the leaf
+		 * partitions.
+		 */
+		Assert(update_rri_index == num_update_rri);
+	}
+	else
+	{
+		proute->root_tuple_slot = NULL;
+		proute->subplan_partition_offsets = NULL;
+		proute->num_subplan_partition_offsets = 0;
 	}
-
-	/*
-	 * For UPDATE, we should have found all the per-subplan resultrels in the
-	 * leaf partitions.  (If this is an INSERT, both values will be zero.)
-	 */
-	Assert(update_rri_index == num_update_rri);
 
 	return proute;
 }
@@ -291,13 +317,61 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 	return result;
 }
 
+/*
+ * ExecGetPartitionInfo
+ *		Fetch ResultRelInfo for partidx
+ *
+ * Sets up ResultRelInfo, if not done already.
+ */
+ResultRelInfo *
+ExecGetPartitionInfo(ModifyTableState *mtstate,
+					 ResultRelInfo *resultRelInfo,
+					 PartitionTupleRouting *proute,
+					 EState *estate, int partidx)
+{
+	ResultRelInfo *result = proute->partitions[partidx];
+
+	if (result)
+		return result;
+
+	result = ExecInitPartitionInfo(mtstate,
+								   resultRelInfo,
+								   proute,
+								   estate,
+								   partidx);
+	Assert(result);
+
+	proute->partitions[partidx] = result;
+
+	/*
+	 * Record the ones set up so far in setup order.  This makes the cleanup
+	 * operation more efficient when very few have been set up.
+	 */
+	if (proute->num_partitions_init == proute->partitions_init_size)
+	{
+		/* First allocate more space if the array is not large enough */
+		proute->partitions_init_size =
+			Min(proute->partitions_init_size * 2, proute->num_partitions);
+
+		proute->partitions_init = (ResultRelInfo **)
+				repalloc(proute->partitions_init,
+				proute->partitions_init_size * sizeof(ResultRelInfo *));
+	}
+
+	proute->partitions_init[proute->num_partitions_init++] = result;
+
+	Assert(proute->num_partitions_init <= proute->num_partitions);
+
+	return result;
+}
+
 /*
  * ExecInitPartitionInfo
  *		Initialize ResultRelInfo and other information for a partition
  *
  * Returns the ResultRelInfo
  */
-ResultRelInfo *
+static ResultRelInfo *
 ExecInitPartitionInfo(ModifyTableState *mtstate,
 					  ResultRelInfo *resultRelInfo,
 					  PartitionTupleRouting *proute,
@@ -500,7 +574,6 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 	 */
 	if (node && node->onConflictAction != ONCONFLICT_NONE)
 	{
-		TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx];
 		int			firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
 		TupleDesc	partrelDesc = RelationGetDescr(partrel);
 		ExprContext *econtext = mtstate->ps.ps_ExprContext;
@@ -550,6 +623,11 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
 		 */
 		if (node->onConflictAction == ONCONFLICT_UPDATE)
 		{
+			TupleConversionMap *map;
+
+			map = proute->parent_child_tupconv_maps ?
+				proute->parent_child_tupconv_maps[partidx] : NULL;
+
 			Assert(node->onConflictSet != NIL);
 			Assert(resultRelInfo->ri_onConflict != NULL);
 
@@ -671,6 +749,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 					int partidx)
 {
 	MemoryContext oldContext;
+	TupleConversionMap *map;
 
 	/*
 	 * Switch into per-query memory context.
@@ -681,10 +760,19 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 * Set up a tuple conversion map to convert a tuple routed to the
 	 * partition from the parent's type to the partition's.
 	 */
-	proute->parent_child_tupconv_maps[partidx] =
-		convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot),
-							   RelationGetDescr(partRelInfo->ri_RelationDesc),
-							   gettext_noop("could not convert row type"));
+	map = convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot),
+								 RelationGetDescr(partRelInfo->ri_RelationDesc),
+								 gettext_noop("could not convert row type"));
+
+	if (map)
+	{
+		/* Allocate parent child map array only if we need to store a map */
+		if (!proute->parent_child_tupconv_maps)
+			proute->parent_child_tupconv_maps = (TupleConversionMap **)
+				palloc0(proute->num_partitions * sizeof(TupleConversionMap *));
+
+		proute->parent_child_tupconv_maps[partidx] = map;
+	}
 
 	/*
 	 * If the partition is a foreign table, let the FDW init itself for
@@ -805,7 +893,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 						PartitionTupleRouting *proute)
 {
 	int			i;
-	int			subplan_index = 0;
 
 	/*
 	 * Remember, proute->partition_dispatch_info[0] corresponds to the root
@@ -822,13 +909,9 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 		ExecDropSingleTupleTableSlot(pd->tupslot);
 	}
 
-	for (i = 0; i < proute->num_partitions; i++)
+	for (i = 0; i < proute->num_partitions_init; i++)
 	{
-		ResultRelInfo *resultRelInfo = proute->partitions[i];
-
-		/* skip further processsing for uninitialized partitions */
-		if (resultRelInfo == NULL)
-			continue;
+		ResultRelInfo *resultRelInfo = proute->partitions_init[i];
 
 		/* Allow any FDWs to shut down if they've been exercised */
 		if (resultRelInfo->ri_PartitionReadyForRouting &&
@@ -837,24 +920,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 			resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
 														   resultRelInfo);
 
-		/*
-		 * If this result rel is one of the UPDATE subplan result rels, let
-		 * ExecEndPlan() close it. For INSERT or COPY,
-		 * proute->subplan_partition_offsets will always be NULL. Note that
-		 * the subplan_partition_offsets array and the partitions array have
-		 * the partitions in the same order. So, while we iterate over
-		 * partitions array, we also iterate over the
-		 * subplan_partition_offsets array in order to figure out which of the
-		 * result rels are present in the UPDATE subplans.
-		 */
-		if (proute->subplan_partition_offsets &&
-			subplan_index < proute->num_subplan_partition_offsets &&
-			proute->subplan_partition_offsets[subplan_index] == i)
-		{
-			subplan_index++;
-			continue;
-		}
-
 		ExecCloseIndices(resultRelInfo);
 		heap_close(resultRelInfo->ri_RelationDesc, NoLock);
 	}
@@ -868,31 +933,36 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
 
 /*
  * RelationGetPartitionDispatchInfo
- *		Returns information necessary to route tuples down a partition tree
- *
- * The number of elements in the returned array (that is, the number of
- * PartitionDispatch objects for the partitioned tables in the partition tree)
- * is returned in *num_parted and a list of the OIDs of all the leaf
- * partitions of rel is returned in *leaf_part_oids.
+ *		Returns an array of PartitionDispatch as is required for routing
+ *		tuples to the correct partition.
  *
+ * 'num_parted' is set to the size of the returned array.  'leaf_part_oids'
+ * is set to an array of the leaf partition Oids in the hierarchy (possibly
+ * rel's own PartitionDesc array) and 'n_leaf_part_oids' to its length.
  * All the relations in the partition tree (including 'rel') must have been
  * locked (using at least the AccessShareLock) by the caller.
  */
 static PartitionDispatch *
 RelationGetPartitionDispatchInfo(Relation rel,
-								 int *num_parted, List **leaf_part_oids)
+								 int *num_parted, Oid **leaf_part_oids,
+								 int *n_leaf_part_oids)
 {
 	List	   *pdlist = NIL;
 	PartitionDispatchData **pd;
 	ListCell   *lc;
 	int			i;
+	int			leaf_part_oid_size;
 
 	Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
 
 	*num_parted = 0;
-	*leaf_part_oids = NIL;
+	*n_leaf_part_oids = 0;
+
+	leaf_part_oid_size = 0;
+	*leaf_part_oids = NULL;
 
-	get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
+	get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids,
+								   n_leaf_part_oids, &leaf_part_oid_size);
 	*num_parted = list_length(pdlist);
 	pd = (PartitionDispatchData **) palloc(*num_parted *
 										   sizeof(PartitionDispatchData *));
@@ -909,9 +979,9 @@ RelationGetPartitionDispatchInfo(Relation rel,
  * get_partition_dispatch_recurse
  *		Recursively expand partition tree rooted at rel
  *
- * As the partition tree is expanded in a depth-first manner, we maintain two
- * global lists: of PartitionDispatch objects corresponding to partitioned
- * tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
+ * As the partition tree is expanded in a depth-first manner, we populate
+ * '*pds' with PartitionDispatch objects of each partitioned table we find,
+ * and populate leaf_part_oids with each leaf partition OID found.
  *
  * Note that the order of OIDs of leaf partitions in leaf_part_oids matches
  * the order in which the planner's expand_partitioned_rtentry() processes
@@ -920,16 +990,27 @@ RelationGetPartitionDispatchInfo(Relation rel,
  * planner side, whereas we'll always have the complete list; but unpruned
  * partitions will appear in the same order in the plan as they are returned
  * here.
+ *
+ * Note: Callers must not attempt to pfree the 'leaf_part_oids' array.
  */
 static void
 get_partition_dispatch_recurse(Relation rel, Relation parent,
-							   List **pds, List **leaf_part_oids)
+							   List **pds, Oid **leaf_part_oids,
+							   int *n_leaf_part_oids,
+							   int *leaf_part_oid_size)
 {
 	TupleDesc	tupdesc = RelationGetDescr(rel);
 	PartitionDesc partdesc = RelationGetPartitionDesc(rel);
 	PartitionKey partkey = RelationGetPartitionKey(rel);
 	PartitionDispatch pd;
 	int			i;
+	int			nparts;
+	int			oid_array_used;
+	int			oid_array_size;
+	Oid		   *oid_array;
+	Oid		   *partdesc_oids;
+	bool	   *partdesc_subpartitions;
+	int		   *indexes;
 
 	check_stack_depth();
 
@@ -960,6 +1041,21 @@ get_partition_dispatch_recurse(Relation rel, Relation parent,
 		/* Not required for the root partitioned table */
 		pd->tupslot = NULL;
 		pd->tupmap = NULL;
+
+		/*
+		 * If the root table has no sub-partitioned partitions then we can
+		 * skip calculating the leaf partitions and just return all the oids
+		 * at this level.  In this case, the indexes were also pre-calculated
+		 * for us by the relcache code.
+		 */
+		if (!partdesc->hassubpart)
+		{
+			*leaf_part_oids = partdesc->oids;
+			/* XXX or should we memcpy these arrays out of the relcache entry? */
+			pd->indexes = partdesc->indexes;
+			*n_leaf_part_oids = partdesc->nparts;
+			return;
+		}
 	}
 
 	/*
@@ -980,15 +1076,38 @@ get_partition_dispatch_recurse(Relation rel, Relation parent,
 	 * corresponding sub-partition; otherwise, we've identified the correct
 	 * partition.
 	 */
-	pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
-	for (i = 0; i < partdesc->nparts; i++)
+	oid_array_used = *n_leaf_part_oids;
+	oid_array_size = *leaf_part_oid_size;
+	oid_array = *leaf_part_oids;
+	nparts = partdesc->nparts;
+
+	if (!oid_array)
+	{
+		oid_array_size = *leaf_part_oid_size = nparts;
+		*leaf_part_oids = (Oid *) palloc(sizeof(Oid) * nparts);
+		oid_array = *leaf_part_oids;
+	}
+
+	partdesc_oids = partdesc->oids;
+	partdesc_subpartitions = partdesc->subpartitions;
+
+	pd->indexes = indexes = (int *) palloc(nparts * sizeof(int));
+
+	for (i = 0; i < nparts; i++)
 	{
-		Oid			partrelid = partdesc->oids[i];
+		Oid			partrelid = partdesc_oids[i];
 
-		if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
+		if (!partdesc_subpartitions[i])
 		{
-			*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
-			pd->indexes[i] = list_length(*leaf_part_oids) - 1;
+			if (oid_array_size <= oid_array_used)
+			{
+				oid_array_size *= 2;
+				oid_array = (Oid *) repalloc(oid_array,
+											 sizeof(Oid) * oid_array_size);
+			}
+
+			oid_array[oid_array_used] = partrelid;
+			indexes[i] = oid_array_used++;
 		}
 		else
 		{
@@ -998,10 +1117,23 @@ get_partition_dispatch_recurse(Relation rel, Relation parent,
 			 */
 			Relation	partrel = heap_open(partrelid, NoLock);
 
-			pd->indexes[i] = -list_length(*pds);
-			get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
+			*n_leaf_part_oids = oid_array_used;
+			*leaf_part_oid_size = oid_array_size;
+			*leaf_part_oids = oid_array;
+
+			indexes[i] = -list_length(*pds);
+			get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids,
+										   n_leaf_part_oids, leaf_part_oid_size);
+
+			oid_array_used = *n_leaf_part_oids;
+			oid_array_size = *leaf_part_oid_size;
+			oid_array = *leaf_part_oids;
 		}
 	}
+
+	*n_leaf_part_oids = oid_array_used;
+	*leaf_part_oid_size = oid_array_size;
+	*leaf_part_oids = oid_array;
 }
 
 /* ----------------
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 7e0b867971..8f62f35cd2 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1682,15 +1682,9 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
 								estate);
 	Assert(partidx >= 0 && partidx < proute->num_partitions);
 
-	/*
-	 * Get the ResultRelInfo corresponding to the selected partition; if not
-	 * yet there, initialize it.
-	 */
-	partrel = proute->partitions[partidx];
-	if (partrel == NULL)
-		partrel = ExecInitPartitionInfo(mtstate, targetRelInfo,
-										proute, estate,
-										partidx);
+	/* Get the ResultRelInfo corresponding to the selected partition. */
+	partrel = ExecGetPartitionInfo(mtstate, targetRelInfo, proute, estate,
+								   partidx);
 
 	/*
 	 * Check whether the partition is routable if we didn't yet
@@ -1756,7 +1750,9 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
 	/*
 	 * Convert the tuple, if necessary.
 	 */
-	ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx],
+	ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps ?
+								proute->parent_child_tupconv_maps[partidx] :
+								NULL,
 							  tuple,
 							  proute->partition_tuple_slot,
 							  &slot);
diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c
index 115a9fe78f..b36b7366e5 100644
--- a/src/backend/utils/cache/partcache.c
+++ b/src/backend/utils/cache/partcache.c
@@ -594,6 +594,7 @@ RelationBuildPartitionDesc(Relation rel)
 		int			next_index = 0;
 
 		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));
+		result->subpartitions = (bool *) palloc(nparts * sizeof(bool));
 
 		boundinfo = (PartitionBoundInfoData *)
 			palloc0(sizeof(PartitionBoundInfoData));
@@ -774,6 +775,7 @@ RelationBuildPartitionDesc(Relation rel)
 		}
 
 		result->boundinfo = boundinfo;
+		result->hassubpart = false; /* unless we discover otherwise below */
 
 		/*
 		 * Now assign OIDs from the original array into mapped indexes of the
@@ -782,7 +784,35 @@ RelationBuildPartitionDesc(Relation rel)
 		 * defined by canonicalized representation of the partition bounds.
 		 */
 		for (i = 0; i < nparts; i++)
-			result->oids[mapping[i]] = oids[i];
+		{
+			int			index = mapping[i];
+			bool		subpart;
+
+			result->oids[index] = oids[i];
+
+			subpart = (get_rel_relkind(oids[i]) == RELKIND_PARTITIONED_TABLE);
+			/* Record if the partition is a subpartitioned table */
+			result->subpartitions[index] = subpart;
+			result->hassubpart |= subpart;
+		}
+
+		/*
+		 * If there are no subpartitions then we can pre-calculate the
+		 * PartitionDispatch->indexes array.  Doing this here saves quite a
+		 * bit of overhead on simple queries which perform INSERTs or UPDATEs
+		 * on partitioned tables with many partitions.  The pre-calculation is
+		 * very simple.  All we need to store is a sequence of numbers from 0
+		 * to nparts - 1.
+		 */
+		if (!result->hassubpart)
+		{
+			result->indexes = (int *) palloc(nparts * sizeof(int));
+			for (i = 0; i < nparts; i++)
+				result->indexes[i] = i;
+		}
+		else
+			result->indexes = NULL;
+
 		pfree(mapping);
 	}
 
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 1f49e5d3a9..a8c69ff224 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -26,7 +26,18 @@
 typedef struct PartitionDescData
 {
 	int			nparts;			/* Number of partitions */
-	Oid		   *oids;			/* OIDs of partitions */
+	Oid		   *oids;			/* OIDs array of 'nparts' of partitions in
+								 * partbound order */
+	int		   *indexes;		/* Stores index for corresponding 'oids'
+								 * element for use in tuple routing, or NULL
+								 * if hassubpart is true.
+								 */
+	bool	   *subpartitions;	/* Array of 'nparts' set to true if the
+								 * corresponding 'oids' element is itself a
+								 * partitioned table.
+								 */
+	bool		hassubpart;		/* true if any partition is itself a
+								 * partitioned table */
 	PartitionBoundInfo boundinfo;	/* collection of partition bounds */
 } PartitionDescData;
 
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 862bf65060..822f66f5e2 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -65,13 +65,17 @@ typedef struct PartitionDispatchData *PartitionDispatch;
  * partitions					Array of ResultRelInfo* objects with one entry
  *								for every leaf partition in the partition tree,
  *								initialized lazily by ExecInitPartitionInfo.
+ * partitions_init				Array of ResultRelInfo* objects in the order
+ *								that they were lazily initialized.
  * num_partitions				Number of leaf partitions in the partition tree
  *								(= 'partitions_oid'/'partitions' array length)
+ * num_partitions_init			Number of leaf partitions lazily set up so far.
+ * partitions_init_size			Size of partitions_init array.
  * parent_child_tupconv_maps	Array of TupleConversionMap objects with one
  *								entry for every leaf partition (required to
  *								convert tuple from the root table's rowtype to
  *								a leaf partition's rowtype after tuple routing
- *								is done)
+ *								is done). Remains NULL if no maps to store.
  * child_parent_tupconv_maps	Array of TupleConversionMap objects with one
  *								entry for every leaf partition (required to
  *								convert an updated tuple from the leaf
@@ -102,7 +106,10 @@ typedef struct PartitionTupleRouting
 	int			num_dispatch;
 	Oid		   *partition_oids;
 	ResultRelInfo **partitions;
+	ResultRelInfo **partitions_init;
 	int			num_partitions;
+	int			num_partitions_init;
+	int			partitions_init_size;
 	TupleConversionMap **parent_child_tupconv_maps;
 	TupleConversionMap **child_parent_tupconv_maps;
 	bool	   *child_parent_map_not_required;
@@ -190,10 +197,10 @@ extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
 				  PartitionDispatch *pd,
 				  TupleTableSlot *slot,
 				  EState *estate);
-extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
-					  ResultRelInfo *resultRelInfo,
-					  PartitionTupleRouting *proute,
-					  EState *estate, int partidx);
+extern ResultRelInfo *ExecGetPartitionInfo(ModifyTableState *mtstate,
+					 ResultRelInfo *resultRelInfo,
+					 PartitionTupleRouting *proute,
+					 EState *estate, int partidx);
 extern void ExecInitRoutingInfo(ModifyTableState *mtstate,
 					EState *estate,
 					PartitionTupleRouting *proute,
-- 
2.16.2.windows.1

