From e8ccff62337f109a0df98df6f2b03e97951f5fe2 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Thu, 3 Sep 2020 15:53:04 +0900
Subject: [PATCH v2] Check default partition's constraint even after tuple
 routing

A partition's constraint is not checked if a row has been inserted
into it via tuple routing, because the routing process ensures that
only the tuples satisfying the constraint make it to a given
partition.  For a default partition however, whose constraint can
change on-the-fly due to concurrent addition of partitions, this
assumption does not always hold.  So, check the constraint even
when inserting into a default partition via tuple routing.

This also adds an isolation test based on the reproduction steps
described by Hao Wu.

Reported by: Hao Wu <hawu@vmware.com>
---
 src/backend/executor/execPartition.c               | 125 +++++++++++++++++----
 .../expected/partition-concurrent-attach.out       |  49 ++++++++
 src/test/isolation/isolation_schedule              |   1 +
 .../specs/partition-concurrent-attach.spec         |  43 +++++++
 4 files changed, 195 insertions(+), 23 deletions(-)
 create mode 100644 src/test/isolation/expected/partition-concurrent-attach.out
 create mode 100644 src/test/isolation/specs/partition-concurrent-attach.spec

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 79fcbd6..5910d0e 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -51,6 +51,11 @@
  *		PartitionDispatchData->indexes for details on how this array is
  *		indexed.
  *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
  * num_dispatch
  *		The current number of items stored in the 'partition_dispatch_info'
  *		array.  Also serves as the index of the next free array element for
@@ -89,6 +94,7 @@ struct PartitionTupleRouting
 {
 	Relation	partition_root;
 	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
 	int			num_dispatch;
 	int			max_dispatch;
 	ResultRelInfo **partitions;
@@ -281,8 +287,10 @@ ExecFindPartition(ModifyTableState *mtstate,
 	PartitionDesc partdesc;
 	ExprContext *ecxt = GetPerTupleExprContext(estate);
 	TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
+	TupleTableSlot *rootslot = slot;
 	TupleTableSlot *myslot = NULL;
 	MemoryContext oldcxt;
+	ResultRelInfo *rri = NULL;
 
 	/* use per-tuple context here to avoid leaking memory */
 	oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
@@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 
 	/* start with the root partitioned table */
 	dispatch = pd[0];
-	while (true)
+	do
 	{
-		AttrMap    *map = dispatch->tupmap;
 		int			partidx = -1;
 
 		CHECK_FOR_INTERRUPTS();
@@ -307,17 +314,6 @@ ExecFindPartition(ModifyTableState *mtstate,
 		partdesc = dispatch->partdesc;
 
 		/*
-		 * Convert the tuple to this parent's layout, if different from the
-		 * current relation.
-		 */
-		myslot = dispatch->tupslot;
-		if (myslot != NULL)
-		{
-			Assert(map != NULL);
-			slot = execute_attr_map_slot(map, slot, myslot);
-		}
-
-		/*
 		 * Extract partition key from tuple. Expression evaluation machinery
 		 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
 		 * point to the correct tuple slot.  The slot might have changed from
@@ -352,8 +348,6 @@ ExecFindPartition(ModifyTableState *mtstate,
 
 		if (partdesc->is_leaf[partidx])
 		{
-			ResultRelInfo *rri;
-
 			/*
 			 * Look to see if we've already got a ResultRelInfo for this
 			 * partition.
@@ -401,13 +395,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 												rootResultRelInfo, partidx);
 			}
 
-			/* Release the tuple in the lowest parent's dedicated slot. */
-			if (slot == myslot)
-				ExecClearTuple(myslot);
-
-			MemoryContextSwitchTo(oldcxt);
-			ecxt->ecxt_scantuple = ecxt_scantuple_old;
-			return rri;
+			/* We've reached the leaf. */
+			dispatch = NULL;
 		}
 		else
 		{
@@ -419,6 +408,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 				/* Already built. */
 				Assert(dispatch->indexes[partidx] < proute->num_dispatch);
 
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
+
 				/*
 				 * Move down to the next partition level and search again
 				 * until we find a leaf partition that matches this tuple
@@ -440,10 +431,80 @@ ExecFindPartition(ModifyTableState *mtstate,
 															dispatch, partidx);
 				Assert(dispatch->indexes[partidx] >= 0 &&
 					   dispatch->indexes[partidx] < proute->num_dispatch);
+
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
 				dispatch = subdispatch;
 			}
+
+			/*
+			 * Convert the tuple to the new parent's layout, if different
+			 * from the previous parent.
+			 */
+			if (dispatch->tupslot)
+			{
+				AttrMap    *map = dispatch->tupmap;
+				TupleTableSlot *prev_myslot = myslot;
+
+				myslot = dispatch->tupslot;
+				Assert(map != NULL);
+				slot = execute_attr_map_slot(map, slot, myslot);
+
+				/* Clear previous parent's tuple if any. */
+				if (prev_myslot != NULL)
+					ExecClearTuple(prev_myslot);
+			}
 		}
-	}
+
+		/*
+		 * If this partition is the default one, we must check its partition
+		 * constraint, which, unlike a non-default partition's constraint, can
+		 * change due to partitions being added to the parent concurrently.
+		 *
+		 * The tuple must match the partition's layout for the constraint
+		 * expression to be evaluated successfully.  If the partition is
+		 * sub-partitioned, that would already be the case due to the code
+		 * above, but for a leaf partition the tuple still matches the
+		 * parent's layout.
+		 */
+		if (partidx == partdesc->boundinfo->default_index)
+		{
+			PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo;
+
+			if (partrouteinfo)
+			{
+				TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap;
+				TupleTableSlot *pslot = partrouteinfo->pi_PartitionTupleSlot;
+
+				/*
+				 * As the name says, we can only convert from root table's
+				 * layout, so use the slot received from the caller, rootslot.
+				 */
+				if (map)
+					slot = execute_attr_map_slot(map->attrMap, rootslot,
+												 pslot);
+				else
+					/*
+					 * Or use root layout slot as-is.  Note we don't use
+					 * the parent's slot, because we don't know if it's same
+					 * layout or different from the leaf partition, but we
+					 * know for sure that root and leaf are the same.
+					 */
+					slot = rootslot;
+			}
+
+			ExecPartitionCheck(rri, slot, estate, true);
+		}
+	} while (dispatch != NULL);
+
+	/* Release the tuple in the lowest parent's dedicated slot. */
+	if (myslot != NULL)
+		ExecClearTuple(myslot);
+	MemoryContextSwitchTo(oldcxt);
+	ecxt->ecxt_scantuple = ecxt_scantuple_old;
+
+	Assert(rri != NULL);
+
+	return rri;
 }
 
 /*
@@ -1060,6 +1121,8 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->max_dispatch = 4;
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 		else
 		{
@@ -1067,11 +1130,27 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				repalloc(proute->partition_dispatch_info,
 						 sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				repalloc(proute->nonleaf_partitions,
+						 sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 	}
 	proute->partition_dispatch_info[dispatchidx] = pd;
 
 	/*
+	 * If setting up a PartitionDispatch for a sub-partitioned table, we may
+	 * also need a fake ResultRelInfo for checking the partition constraint
+	 * later; set that up now.
+	 */
+	if (parent_pd)
+	{
+		ResultRelInfo *rri = makeNode(ResultRelInfo);
+
+		InitResultRelInfo(rri, rel, 1, proute->partition_root, 0);
+		proute->nonleaf_partitions[dispatchidx] = rri;
+	}
+
+	/*
 	 * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
 	 * install a downlink in the parent to allow quick descent.
 	 */
diff --git a/src/test/isolation/expected/partition-concurrent-attach.out b/src/test/isolation/expected/partition-concurrent-attach.out
new file mode 100644
index 0000000..17fac39
--- /dev/null
+++ b/src/test/isolation/expected/partition-concurrent-attach.out
@@ -0,0 +1,49 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s1a s2b s2i s1c s2c s2s
+step s1b: begin;
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
+step s2b: begin;
+step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
+step s1c: commit;
+step s2i: <... completed>
+error in steps s1c s2i: ERROR:  new row for relation "tpart_default" violates partition constraint
+step s2c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_2        110            xxx            
+tpart_2        120            yyy            
+tpart_2        150            zzz            
+
+starting permutation: s1b s1a s2b s2i2 s1c s2c s2s
+step s1b: begin;
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
+step s2b: begin;
+step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
+step s1c: commit;
+step s2i2: <... completed>
+error in steps s1c s2i2: ERROR:  new row for relation "tpart_default" violates partition constraint
+step s2c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_2        110            xxx            
+tpart_2        120            yyy            
+tpart_2        150            zzz            
+
+starting permutation: s1b s2b s2i s1a s2c s1c s2s
+step s1b: begin;
+step s2b: begin;
+step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); <waiting ...>
+step s2c: commit;
+step s1a: <... completed>
+error in steps s2c s1a: ERROR:  updated partition constraint for default partition "tpart_default_default" would be violated by some row
+step s1c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_default_default110            xxx            
+tpart_default_default120            yyy            
+tpart_default_default150            zzz            
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 65d1443..f838276 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -90,3 +90,4 @@ test: plpgsql-toast
 test: truncate-conflict
 test: serializable-parallel
 test: serializable-parallel-2
+test: partition-concurrent-attach
diff --git a/src/test/isolation/specs/partition-concurrent-attach.spec b/src/test/isolation/specs/partition-concurrent-attach.spec
new file mode 100644
index 0000000..48c3f83
--- /dev/null
+++ b/src/test/isolation/specs/partition-concurrent-attach.spec
@@ -0,0 +1,43 @@
+# Verify that default partition constraint is enforced correctly
+# in light of partitions being added concurrently to its parent
+setup {
+  drop table if exists tpart;
+  create table tpart(i int, j text) partition by range(i);
+  create table tpart_1(like tpart);
+  create table tpart_2(like tpart);
+  create table tpart_default (a int, j text, i int) partition by list (j);
+  create table tpart_default_default (a int, i int, b int, j text);
+  alter table tpart_default_default drop b;
+  alter table tpart_default attach partition tpart_default_default default;
+  alter table tpart_default drop a;
+  alter table tpart attach partition tpart_default default;
+  alter table tpart attach partition tpart_1 for values from(0) to (100);
+  insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
+}
+
+session "s1"
+step "s1b"	{	begin; }
+step "s1a"	{	alter table tpart attach partition tpart_2 for values from (100) to (200); }
+step "s1c"	{	commit; }
+
+session "s2"
+step "s2b"	{	begin; }
+step "s2i"	{	insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); }
+step "s2i2"	{	insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); }
+step "s2c"	{	commit; }
+step "s2s"	{	select tableoid::regclass, * from tpart; }
+
+teardown	{	drop table tpart; }
+
+# insert into tpart by s2 which routes to tpart_default due to not seeing
+# concurrently added tpart_2 should fail, because the partition constraint
+# of tpart_default would have changed due to tpart_2 having been added
+permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s"
+
+# similar to above, but now insert into sub-partitioned tpart_default
+permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s"
+
+# reverse: now the insert into tpart_default by s2 occurs first followed by
+# attach in s1, which should fail when it scans the leaf default partition
+# to find the violating rows
+permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s"
-- 
1.8.3.1

