From 9adaa32ff081af79176e0bc08a60bf873970c7b8 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Tue, 16 Mar 2021 14:03:05 +0900
Subject: [PATCH v2] Allow batching of inserts during cross-partition updates

Currently, the insert that runs internally as part of
cross-partition update of partitioned tables can't use batching
because ExecInitRoutingInfo() which initializes the insert target
partition allows batching only if the query's original operation is
CMD_INSERT.  This commit loosens that restriction to allow the
internal inserts of cross-partition use batching too, essentially
reverting a check that 927f453a94106 put in place.

There are two places that need fixing to correctly handle batching
in such cases:

* postgresGetForeignModifyBatchSize() must look at the appropriate
PgFdwModifyState, that is, one that belongs to the insert, not to the
original update operation, because only the former would be set up to
contain the information necessary to perform batching.

* ExecModifyTable(), when inserting any remaining batched tuples,
must look at the correct set of ResultRelInfos that would've been
used by such inserts, because failing to do so would result in those
tuples not actually getting inserted.  To fix, ExecModifyTable() is
now made to get the ResultRelInfos from the PartitionTupleRouting
data structure which contains the ResultRelInfo that would be used by
those internal inserts. To allow nodeModifyTable.c look inside
PartitionTupleRouting, its definition is now exposed whereas it was
previously local to execPartition.c.

This also adjusts the test case added by 927f453a94106 to confirm
that the insert performed when moving the rows into the foreign
partition indeed uses batched mode.
---
 .../postgres_fdw/expected/postgres_fdw.out    | 16 ++++-
 contrib/postgres_fdw/postgres_fdw.c           | 11 ++-
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 11 ++-
 src/backend/executor/execPartition.c          | 72 +------------------
 src/backend/executor/nodeModifyTable.c        | 50 +++++++++----
 src/include/executor/execPartition.h          | 72 ++++++++++++++++++-
 6 files changed, 133 insertions(+), 99 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 0649b6b81c..a415cb6786 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -9414,8 +9414,8 @@ SELECT COUNT(*) FROM batch_table;
     66
 (1 row)
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched inserts also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -9425,7 +9425,7 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
 CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
 -- The following moves a row from the local partition to the foreign one
 UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
 SELECT tableoid::regclass, * FROM batch_cp_upd_test;
@@ -9435,5 +9435,15 @@ SELECT tableoid::regclass, * FROM batch_cp_upd_test;
  batch_cp_upd_test1_f | 1
 (2 rows)
 
+-- To confirm that they were indeed inserted in batch mode, that is, with a
+-- single command, check cmin of the moved rows in the foreign partition's base
+-- table
+SELECT cmin, * FROM batch_cp_upd_test1;
+ cmin | a 
+------+---
+    1 | 1
+    1 | 1
+(2 rows)
+
 -- Clean up
 DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 35b48575c5..53472cf73c 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -1934,18 +1934,17 @@ static int
 postgresGetForeignModifyBatchSize(ResultRelInfo *resultRelInfo)
 {
 	int	batch_size;
-	PgFdwModifyState *fmstate = resultRelInfo->ri_FdwState ?
-							(PgFdwModifyState *) resultRelInfo->ri_FdwState :
-							NULL;
+	PgFdwModifyState *fmstate = (PgFdwModifyState *) resultRelInfo->ri_FdwState;
 
 	/* should be called only once */
 	Assert(resultRelInfo->ri_BatchSize == 0);
 
 	/*
-	 * Should never get called when the insert is being performed as part of
-	 * a row movement operation.
+	 * Use the auxiliary state if any; see postgresBeginForeignInsert() for
+	 * details on what it represents.
 	 */
-	Assert(fmstate == NULL || fmstate->aux_fmstate == NULL);
+	if (fmstate && fmstate->aux_fmstate != NULL)
+		fmstate = fmstate->aux_fmstate;
 
 	/*
 	 * In EXPLAIN without ANALYZE, ri_fdwstate is NULL, so we have to lookup
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 2b525ea44a..68dd6b1011 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2909,8 +2909,8 @@ CREATE TABLE batch_table_p2
 INSERT INTO batch_table SELECT * FROM generate_series(1, 66) i;
 SELECT COUNT(*) FROM batch_table;
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched inserts also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -2920,11 +2920,16 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
 CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
 
 -- The following moves a row from the local partition to the foreign one
 UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
 SELECT tableoid::regclass, * FROM batch_cp_upd_test;
 
+-- To confirm that they were indeed inserted in batch mode, that is, with a
+-- single command, check cmin of the moved rows in the foreign partition's base
+-- table
+SELECT cmin, * FROM batch_cp_upd_test1;
+
 -- Clean up
 DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index b8da4c5967..811997b4c8 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -35,75 +35,6 @@
 #include "utils/ruleutils.h"
 
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to
- * route a tuple inserted into a partitioned table to one of its leaf
- * partitions.
- *
- * partition_root
- *		The partitioned table that's the target of the command.
- *
- * partition_dispatch_info
- *		Array of 'max_dispatch' elements containing a pointer to a
- *		PartitionDispatch object for every partitioned table touched by tuple
- *		routing.  The entry for the target partitioned table is *always*
- *		present in the 0th element of this array.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * nonleaf_partitions
- *		Array of 'max_dispatch' elements containing pointers to fake
- *		ResultRelInfo objects for nonleaf partitions, useful for checking
- *		the partition constraint.
- *
- * num_dispatch
- *		The current number of items stored in the 'partition_dispatch_info'
- *		array.  Also serves as the index of the next free array element for
- *		new PartitionDispatch objects that need to be stored.
- *
- * max_dispatch
- *		The current allocated size of the 'partition_dispatch_info' array.
- *
- * partitions
- *		Array of 'max_partitions' elements containing a pointer to a
- *		ResultRelInfo for every leaf partitions touched by tuple routing.
- *		Some of these are pointers to ResultRelInfos which are borrowed out of
- *		'subplan_resultrel_htab'.  The remainder have been built especially
- *		for tuple routing.  See comment for PartitionDispatchData->indexes for
- *		details on how this array is indexed.
- *
- * num_partitions
- *		The current number of items stored in the 'partitions' array.  Also
- *		serves as the index of the next free array element for new
- *		ResultRelInfo objects that need to be stored.
- *
- * max_partitions
- *		The current allocated size of the 'partitions' array.
- *
- * subplan_resultrel_htab
- *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
- *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
- *		NULL in other cases.  Some of these may be useful for tuple routing
- *		to save having to build duplicates.
- *
- * memcxt
- *		Memory context used to allocate subsidiary structs.
- *-----------------------
- */
-struct PartitionTupleRouting
-{
-	Relation	partition_root;
-	PartitionDispatch *partition_dispatch_info;
-	ResultRelInfo **nonleaf_partitions;
-	int			num_dispatch;
-	int			max_dispatch;
-	ResultRelInfo **partitions;
-	int			num_partitions;
-	int			max_partitions;
-	HTAB	   *subplan_resultrel_htab;
-	MemoryContext memcxt;
-};
-
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
  * hierarchy required to route a tuple to any of its partitions.  A
@@ -1000,8 +931,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 *
 	 * If the FDW does not support batching, we set the batch size to 1.
 	 */
-	if (mtstate->operation == CMD_INSERT &&
-		partRelInfo->ri_FdwRoutine != NULL &&
+	if (partRelInfo->ri_FdwRoutine != NULL &&
 		partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
 		partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
 		partRelInfo->ri_BatchSize =
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2993ba43e3..5f92854c94 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2059,8 +2059,6 @@ ExecModifyTable(PlanState *pstate)
 	HeapTupleData oldtupdata;
 	HeapTuple	oldtuple;
 	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
-	List				  *relinfos = NIL;
-	ListCell			  *lc;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -2277,22 +2275,46 @@ ExecModifyTable(PlanState *pstate)
 	}
 
 	/*
-	 * Insert remaining tuples for batch insert.
+	 * Insert any remaining batched tuples.
+	 *
+	 * If the query's main target relation is a partitioned table, any inserts
+	 * would have been performed using tuple routing, so use the appropriate
+	 * set of target relations.  Note that this also covers any inserts
+	 * performed during cross-partition UPDATEs that would have occurred
+	 * through tuple routing.
 	 */
 	if (proute)
-		relinfos = estate->es_tuple_routing_result_relations;
-	else
-		relinfos = estate->es_opened_result_relations;
+	{
+		int			i;
 
-	foreach(lc, relinfos)
+		Assert(node->rootResultRelInfo->ri_RelationDesc->rd_rel->relkind ==
+			   RELKIND_PARTITIONED_TABLE);
+		for (i = 0; i < proute->num_partitions; i++)
+		{
+			resultRelInfo = proute->partitions[i];
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
+	}
+	else
 	{
-		resultRelInfo = lfirst(lc);
-		if (resultRelInfo->ri_NumSlots > 0)
-			ExecBatchInsert(node, resultRelInfo,
-						   resultRelInfo->ri_Slots,
-						   resultRelInfo->ri_PlanSlots,
-						   resultRelInfo->ri_NumSlots,
-						   estate, node->canSetTag);
+		ListCell *lc;
+
+		/* Otherwise, use result relations from the range table. */
+		foreach(lc, estate->es_opened_result_relations)
+		{
+			resultRelInfo = lfirst(lc);
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
 	}
 
 	/*
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index d30ffde7d9..846261ef88 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,9 +18,77 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definitions. */
+/* See execPartition.c for the definition. */
 typedef struct PartitionDispatchData *PartitionDispatch;
-typedef struct PartitionTupleRouting PartitionTupleRouting;
+
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partitions touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out of
+ *		'subplan_resultrel_htab'.  The remainder have been built especially
+ *		for tuple routing.  See comment for PartitionDispatchData->indexes for
+ *		details on how this array is indexed.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *		NULL in other cases.  Some of these may be useful for tuple routing
+ *		to save having to build duplicates.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	int			num_partitions;
+	int			max_partitions;
+	HTAB	   *subplan_resultrel_htab;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
-- 
2.24.1

