From 7765ed870f34b0b2b6f0911d4f3ca6bfa129a0fe Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Wed, 28 Nov 2018 10:15:55 -0500
Subject: [PATCH v3 2/4] Ensure that repeated PartitionDesc lookups return the
 same answer.

The query planner will get confused if lookup up the PartitionDesc
for a particular relation does not return a consistent answer for
the entire duration of query planning.  Likewise, query execution
will get confused if the same relation seems to have a different
PartitionDesc at different times.  Invent a new PartitionDirectory
concept and use it to ensure consistency.

Note that this only ensures consistency within a single query
planning cycle or a single query execution.  It doesn't guarantee
that the answer can't change between planning and execution, nor
does it change the way a PartitionDesc is constructed in the first
place.

Since this allows pointers to old PartitionDesc entries to survive
even after a relcache rebuild, also postpone removing the old
PartitionDesc entry until we're certain no one is using it.
---
 src/backend/commands/copy.c            |  2 +-
 src/backend/executor/execPartition.c   | 28 ++++++--
 src/backend/executor/execUtils.c       |  8 +++
 src/backend/executor/nodeModifyTable.c |  2 +-
 src/backend/optimizer/plan/planner.c   |  4 ++
 src/backend/optimizer/util/inherit.c   |  9 ++-
 src/backend/optimizer/util/plancat.c   |  3 +-
 src/backend/partitioning/partdesc.c    | 91 +++++++++++++++++++++++++-
 src/backend/utils/cache/relcache.c     | 20 ++++++
 src/include/executor/execPartition.h   |  3 +-
 src/include/nodes/execnodes.h          |  2 +
 src/include/nodes/pathnodes.h          |  2 +
 src/include/partitioning/partdefs.h    |  2 +
 src/include/partitioning/partdesc.h    |  4 ++
 14 files changed, 167 insertions(+), 13 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index dbb06397e6..382c966a6e 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2559,7 +2559,7 @@ CopyFrom(CopyState cstate)
 	 * CopyFrom tuple routing.
 	 */
 	if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-		proute = ExecSetupPartitionTupleRouting(NULL, cstate->rel);
+		proute = ExecSetupPartitionTupleRouting(estate, NULL, cstate->rel);
 
 	if (cstate->whereClause)
 		cstate->qualexpr = ExecInitQual(castNode(List, cstate->whereClause),
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index e121c6c8ff..db133b37a5 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -167,7 +167,8 @@ static void ExecInitRoutingInfo(ModifyTableState *mtstate,
 					PartitionDispatch dispatch,
 					ResultRelInfo *partRelInfo,
 					int partidx);
-static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute,
+static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
+							  PartitionTupleRouting *proute,
 							  Oid partoid, PartitionDispatch parent_pd, int partidx);
 static void FormPartitionKeyDatum(PartitionDispatch pd,
 					  TupleTableSlot *slot,
@@ -201,7 +202,8 @@ static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
  * it should be estate->es_query_cxt.
  */
 PartitionTupleRouting *
-ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
+ExecSetupPartitionTupleRouting(EState *estate, ModifyTableState *mtstate,
+							   Relation rel)
 {
 	PartitionTupleRouting *proute;
 	ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;
@@ -223,7 +225,8 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 	 * parent as NULL as we don't need to care about any parent of the target
 	 * partitioned table.
 	 */
-	ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL, 0);
+	ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
+								  NULL, 0);
 
 	/*
 	 * If performing an UPDATE with tuple routing, we can reuse partition
@@ -424,7 +427,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 				 * Create the new PartitionDispatch.  We pass the current one
 				 * in as the parent PartitionDispatch
 				 */
-				subdispatch = ExecInitPartitionDispatchInfo(proute,
+				subdispatch = ExecInitPartitionDispatchInfo(mtstate->ps.state,
+															proute,
 															partdesc->oids[partidx],
 															dispatch, partidx);
 				Assert(dispatch->indexes[partidx] >= 0 &&
@@ -964,7 +968,8 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
  *		PartitionDispatch later.
  */
 static PartitionDispatch
-ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
+ExecInitPartitionDispatchInfo(EState *estate,
+							  PartitionTupleRouting *proute, Oid partoid,
 							  PartitionDispatch parent_pd, int partidx)
 {
 	Relation	rel;
@@ -973,6 +978,10 @@ ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
 	int			dispatchidx;
 	MemoryContext oldcxt;
 
+	if (estate->es_partition_directory == NULL)
+		estate->es_partition_directory =
+			CreatePartitionDirectory(estate->es_query_cxt);
+
 	oldcxt = MemoryContextSwitchTo(proute->memcxt);
 
 	/*
@@ -984,7 +993,7 @@ ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
 		rel = table_open(partoid, RowExclusiveLock);
 	else
 		rel = proute->partition_root;
-	partdesc = RelationGetPartitionDesc(rel);
+	partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
 
 	pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
 									partdesc->nparts * sizeof(int));
@@ -1530,6 +1539,10 @@ ExecCreatePartitionPruneState(PlanState *planstate,
 	ListCell   *lc;
 	int			i;
 
+	if (estate->es_partition_directory == NULL)
+		estate->es_partition_directory =
+			CreatePartitionDirectory(estate->es_query_cxt);
+
 	n_part_hierarchies = list_length(partitionpruneinfo->prune_infos);
 	Assert(n_part_hierarchies > 0);
 
@@ -1609,7 +1622,8 @@ ExecCreatePartitionPruneState(PlanState *planstate,
 			 */
 			partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex);
 			partkey = RelationGetPartitionKey(partrel);
-			partdesc = RelationGetPartitionDesc(partrel);
+			partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
+												partrel);
 
 			n_steps = list_length(pinfo->pruning_steps);
 
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index 312a0dc805..d19a747788 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -54,6 +54,7 @@
 #include "mb/pg_wchar.h"
 #include "nodes/nodeFuncs.h"
 #include "parser/parsetree.h"
+#include "partitioning/partdesc.h"
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
@@ -218,6 +219,13 @@ FreeExecutorState(EState *estate)
 		estate->es_jit = NULL;
 	}
 
+	/* release partition directory, if allocated */
+	if (estate->es_partition_directory)
+	{
+		DestroyPartitionDirectory(estate->es_partition_directory);
+		estate->es_partition_directory = NULL;
+	}
+
 	/*
 	 * Free the per-query memory context, thereby releasing all working
 	 * memory, including the EState node itself.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 566858c19b..b9ecd8d24e 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2229,7 +2229,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
 		(operation == CMD_INSERT || update_tuple_routing_needed))
 		mtstate->mt_partition_tuple_routing =
-			ExecSetupPartitionTupleRouting(mtstate, rel);
+			ExecSetupPartitionTupleRouting(estate, mtstate, rel);
 
 	/*
 	 * Build state for collecting transition tuples.  This requires having a
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index bc81535905..98dd5281ad 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -56,6 +56,7 @@
 #include "parser/analyze.h"
 #include "parser/parsetree.h"
 #include "parser/parse_agg.h"
+#include "partitioning/partdesc.h"
 #include "rewrite/rewriteManip.h"
 #include "storage/dsm_impl.h"
 #include "utils/rel.h"
@@ -567,6 +568,9 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
 			result->jitFlags |= PGJIT_DEFORM;
 	}
 
+	if (glob->partition_directory != NULL)
+		DestroyPartitionDirectory(glob->partition_directory);
+
 	return result;
 }
 
diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c
index a014a12060..1fa154e0cb 100644
--- a/src/backend/optimizer/util/inherit.c
+++ b/src/backend/optimizer/util/inherit.c
@@ -147,6 +147,10 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
 	{
 		Assert(rte->relkind == RELKIND_PARTITIONED_TABLE);
 
+		if (root->glob->partition_directory == NULL)
+			root->glob->partition_directory =
+				CreatePartitionDirectory(CurrentMemoryContext);
+
 		/*
 		 * If this table has partitions, recursively expand and lock them.
 		 * While at it, also extract the partition key columns of all the
@@ -246,7 +250,10 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
 	int			i;
 	RangeTblEntry *childrte;
 	Index		childRTindex;
-	PartitionDesc partdesc = RelationGetPartitionDesc(parentrel);
+	PartitionDesc partdesc;
+
+	partdesc = PartitionDirectoryLookup(root->glob->partition_directory,
+										parentrel);
 
 	check_stack_depth();
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 78a96b4ee2..30f4dc151b 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -2086,7 +2086,8 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel,
 
 	Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
 
-	partdesc = RelationGetPartitionDesc(relation);
+	partdesc = PartitionDirectoryLookup(root->glob->partition_directory,
+										relation);
 	partkey = RelationGetPartitionKey(relation);
 	rel->part_scheme = find_partition_scheme(root, relation);
 	Assert(partdesc != NULL && rel->part_scheme != NULL);
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
index 66b1e38527..dbbca84dcb 100644
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -21,12 +21,26 @@
 #include "storage/sinval.h"
 #include "utils/builtins.h"
 #include "utils/inval.h"
+#include "utils/hsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/partcache.h"
 #include "utils/syscache.h"
 
+typedef struct PartitionDirectoryData
+{
+	MemoryContext pdir_mcxt;
+	HTAB	   *pdir_hash;
+} PartitionDirectoryData;
+
+typedef struct PartitionDirectoryEntry
+{
+	Oid			reloid;
+	Relation	rel;
+	PartitionDesc pd;
+} PartitionDirectoryEntry;
+
 /*
  * RelationBuildPartitionDesc
  *		Form rel's partition descriptor
@@ -208,13 +222,88 @@ RelationBuildPartitionDesc(Relation rel)
 		partdesc->oids[index] = oids[i];
 		/* Record if the partition is a leaf partition */
 		partdesc->is_leaf[index] =
-				(get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
+			(get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
 	}
 	MemoryContextSwitchTo(oldcxt);
 
 	rel->rd_partdesc = partdesc;
 }
 
+/*
+ * CreatePartitionDirectory
+ *		Create a new partition directory object.
+ */
+PartitionDirectory
+CreatePartitionDirectory(MemoryContext mcxt)
+{
+	MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
+	PartitionDirectory pdir;
+	HASHCTL		ctl;
+
+	MemSet(&ctl, 0, sizeof(HASHCTL));
+	ctl.keysize = sizeof(Oid);
+	ctl.entrysize = sizeof(PartitionDirectoryEntry);
+	ctl.hcxt = mcxt;
+
+	pdir = palloc(sizeof(PartitionDirectoryData));
+	pdir->pdir_mcxt = mcxt;
+	pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
+								  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	MemoryContextSwitchTo(oldcontext);
+	return pdir;
+}
+
+/*
+ * PartitionDirectoryLookup
+ *		Look up the partition descriptor for a relation in the directory.
+ *
+ * The purpose of this function is to ensure that we get the same
+ * PartitionDesc for each relation every time we look it up.  In the
+ * face of current DDL, different PartitionDescs may be constructed with
+ * different views of the catalog state, but any single particular OID
+ * will always get the same PartitionDesc for as long as the same
+ * PartitionDirectory is used.
+ */
+PartitionDesc
+PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
+{
+	PartitionDirectoryEntry *pde;
+	Oid			relid = RelationGetRelid(rel);
+	bool		found;
+
+	pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
+	if (!found)
+	{
+		/*
+		 * We must keep a reference count on the relation so that the
+		 * PartitionDesc to which we are pointing can't get destroyed.
+		 */
+		RelationIncrementReferenceCount(rel);
+		pde->rel = rel;
+		pde->pd = RelationGetPartitionDesc(rel);
+		Assert(pde->pd != NULL);
+	}
+	return pde->pd;
+}
+
+/*
+ * DestroyPartitionDirectory
+ *		Destroy a partition directory.
+ *
+ * Release the reference counts we're holding.
+ */
+void
+DestroyPartitionDirectory(PartitionDirectory pdir)
+{
+	HASH_SEQ_STATUS	status;
+	PartitionDirectoryEntry *pde;
+
+	hash_seq_init(&status, pdir->pdir_hash);
+	while ((pde = hash_seq_search(&status)) != NULL)
+		RelationDecrementReferenceCount(pde->rel);
+}
+
 /*
  * equalPartitionDescs
  *		Compare two partition descriptors for logical equality
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 54a40ef00b..1495b60d11 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -2480,6 +2480,26 @@ RelationClearRelation(Relation relation, bool rebuild)
 			SWAPFIELD(PartitionDesc, rd_partdesc);
 			SWAPFIELD(MemoryContext, rd_pdcxt);
 		}
+		else if (rebuild && newrel->rd_pdcxt != NULL)
+		{
+			/*
+			 * We are rebuilding a partitioned relation with a non-zero
+			 * reference count, so keep the old partition descriptor around,
+			 * in case there's a PartitionDirectory with a pointer to it.
+			 * Attach it to the new rd_pdcxt so that it gets cleaned up
+			 * eventually.  In the case where the reference count is 0, this
+			 * code is not reached, which should be OK because in that case
+			 * there should be no PartitionDirectory with a pointer to the old
+			 * entry.
+			 *
+			 * Note that newrel and relation have already been swapped, so
+			 * the "old" partition descriptor is actually the one hanging off
+			 * of newrel.
+			 */
+			MemoryContextSetParent(newrel->rd_pdcxt, relation->rd_pdcxt);
+			newrel->rd_partdesc = NULL;
+			newrel->rd_pdcxt = NULL;
+		}
 
 #undef SWAPFIELD
 
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 2048c43c37..b363aba2a5 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -135,7 +135,8 @@ typedef struct PartitionPruneState
 	PartitionPruningData *partprunedata[FLEXIBLE_ARRAY_MEMBER];
 } PartitionPruneState;
 
-extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
+extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(EState *estate,
+							   ModifyTableState *mtstate,
 							   Relation rel);
 extern ResultRelInfo *ExecFindPartition(ModifyTableState *mtstate,
 				  ResultRelInfo *rootResultRelInfo,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 3b789ee7cf..84de8efeda 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -19,6 +19,7 @@
 #include "lib/pairingheap.h"
 #include "nodes/params.h"
 #include "nodes/plannodes.h"
+#include "partitioning/partdefs.h"
 #include "utils/hsearch.h"
 #include "utils/queryenvironment.h"
 #include "utils/reltrigger.h"
@@ -515,6 +516,7 @@ typedef struct EState
 	 */
 	ResultRelInfo *es_root_result_relations;	/* array of ResultRelInfos */
 	int			es_num_root_result_relations;	/* length of the array */
+	PartitionDirectory es_partition_directory;	/* for PartitionDesc lookup */
 
 	/*
 	 * The following list contains ResultRelInfos created by the tuple routing
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index a008ae07da..7b2cbdbefc 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -144,6 +144,8 @@ typedef struct PlannerGlobal
 	bool		parallelModeNeeded; /* parallel mode actually required? */
 
 	char		maxParallelHazard;	/* worst PROPARALLEL hazard level */
+
+	PartitionDirectory partition_directory; /* partition descriptors */
 } PlannerGlobal;
 
 /* macro for fetching the Plan associated with a SubPlan node */
diff --git a/src/include/partitioning/partdefs.h b/src/include/partitioning/partdefs.h
index 6e9c128b2c..aec3b3fe63 100644
--- a/src/include/partitioning/partdefs.h
+++ b/src/include/partitioning/partdefs.h
@@ -21,4 +21,6 @@ typedef struct PartitionBoundSpec PartitionBoundSpec;
 
 typedef struct PartitionDescData *PartitionDesc;
 
+typedef struct PartitionDirectoryData *PartitionDirectory;
+
 #endif							/* PARTDEFS_H */
diff --git a/src/include/partitioning/partdesc.h b/src/include/partitioning/partdesc.h
index f72b70dded..da19369e25 100644
--- a/src/include/partitioning/partdesc.h
+++ b/src/include/partitioning/partdesc.h
@@ -31,6 +31,10 @@ typedef struct PartitionDescData
 
 extern void RelationBuildPartitionDesc(Relation rel);
 
+extern PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt);
+extern PartitionDesc PartitionDirectoryLookup(PartitionDirectory, Relation);
+extern void DestroyPartitionDirectory(PartitionDirectory pdir);
+
 extern Oid	get_default_oid_from_partdesc(PartitionDesc partdesc);
 
 extern bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
-- 
2.17.2 (Apple Git-113)

