From 72d33b12941f29d26ea6a07ded0a839486523ddb Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Tue, 16 Mar 2021 14:03:05 +0900
Subject: [PATCH v8 2/2] Allow batching of inserts during cross-partition
 updates

Currently, the insert that runs internally as part of
cross-partition update of partitioned tables can't use batching
because ExecInitRoutingInfo(), which initializes the insert target
partition, allows batching only if the query's original operation is
CMD_INSERT.  This commit loosens that restriction to allow the
internal inserts of cross-partition updates use batching too,
essentially reverting a check that 927f453a94106 put in place.

There are two places that needed to be fixed in order to correctly
handle batching in such cases:

* postgresGetForeignModifyBatchSize() must look at the appropriate
PgFdwModifyState, that is, one that belongs to the insert, not to the
original update operation, because only the former would be set up to
contain the information necessary to perform batching.

* ExecModifyTable(), when inserting any remaining batched tuples,
must look at the correct set of ResultRelInfos that would've been
used by such inserts, because failing to do so would result in those
tuples not actually getting inserted.  To fix, ExecModifyTable() is
now made to get the ResultRelInfos from the PartitionTupleRouting
data structure which contains the ResultRelInfo that would be used by
those internal inserts. To allow nodeModifyTable.c look inside
PartitionTupleRouting, its definition, which was previously local to
execPartition.c, is exposed via execPartition.h.

This also adjusts the test case added by 927f453a94106 to confirm
that the insert performed when moving the rows into the foreign
partition indeed uses batched mode.
---
 .../postgres_fdw/expected/postgres_fdw.out    | 53 +++++++++++---
 contrib/postgres_fdw/postgres_fdw.c           | 13 ++--
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 44 +++++++++---
 src/backend/executor/execPartition.c          | 72 +------------------
 src/backend/executor/nodeModifyTable.c        | 56 ++++++++++-----
 src/include/executor/execPartition.h          | 72 ++++++++++++++++++-
 6 files changed, 195 insertions(+), 115 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index ed25e7a743..9522a9f80c 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -9763,8 +9763,8 @@ SELECT COUNT(*) FROM batch_table;
     66
 (1 row)
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched mode also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -9772,21 +9772,52 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	FOR VALUES IN (1)
 	SERVER loopback
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
-CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+CREATE TABLE batch_cp_upd_test2 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
--- The following moves a row from the local partition to the foreign one
-UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
-ERROR:  cannot route tuples into foreign table to be updated "batch_cp_upd_test1_f"
-SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+CREATE TABLE batch_cp_upd_test3 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test3_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (3)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test3', batch_size '1');
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
+-- Create statement triggers on remote tables that "log" any INSERTs
+-- performed on them.
+CREATE TABLE cmdlog (cmd text);
+CREATE FUNCTION log_stmt () RETURNS TRIGGER LANGUAGE plpgsql AS $$
+	BEGIN INSERT INTO public.cmdlog VALUES (TG_OP || ' on ' || TG_RELNAME); RETURN NULL; END;
+$$;
+CREATE TRIGGER stmt_trig AFTER INSERT ON batch_cp_upd_test1
+	FOR EACH STATEMENT EXECUTE FUNCTION log_stmt();
+CREATE TRIGGER stmt_trig AFTER INSERT ON batch_cp_upd_test3
+	FOR EACH STATEMENT EXECUTE FUNCTION log_stmt();
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching enabled, so 1 INSERT for both rows.
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching disabled, so separate INSERTs for each row.
+UPDATE batch_cp_upd_test t SET a = 3 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+SELECT tableoid::regclass, * FROM batch_cp_upd_test ORDER BY 1;
        tableoid       | a 
 ----------------------+---
  batch_cp_upd_test1_f | 1
- batch_cp_up_test1    | 2
-(2 rows)
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test3_f | 3
+ batch_cp_upd_test3_f | 3
+(4 rows)
+
+-- Should see 1 INSERT on batch_cp_upd_test1 and 2 on batch_cp_upd_test3.
+SELECT * FROM cmdlog ORDER BY 1;
+             cmd              
+------------------------------
+ INSERT on batch_cp_upd_test1
+ INSERT on batch_cp_upd_test3
+ INSERT on batch_cp_upd_test3
+(3 rows)
 
 -- Clean up
-DROP TABLE batch_table, batch_cp_upd_test, batch_table_p0, batch_table_p1 CASCADE;
+DROP TABLE batch_table, batch_cp_upd_test, cmdlog, batch_table_p0, batch_table_p1 CASCADE;
 -- Use partitioning
 ALTER SERVER loopback OPTIONS (ADD batch_size '10');
 CREATE TABLE batch_table ( x int, field1 text, field2 text) PARTITION BY HASH (x);
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index f15c97ad7a..634d278097 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2008,25 +2008,24 @@ static int
 postgresGetForeignModifyBatchSize(ResultRelInfo *resultRelInfo)
 {
 	int			batch_size;
-	PgFdwModifyState *fmstate = resultRelInfo->ri_FdwState ?
-	(PgFdwModifyState *) resultRelInfo->ri_FdwState :
-	NULL;
+	PgFdwModifyState *fmstate = (PgFdwModifyState *) resultRelInfo->ri_FdwState;
 
 	/* should be called only once */
 	Assert(resultRelInfo->ri_BatchSize == 0);
 
 	/*
-	 * Should never get called when the insert is being performed as part of a
-	 * row movement operation.
+	 * Use the auxiliary state if any; see postgresBeginForeignInsert() for
+	 * details on what it represents.
 	 */
-	Assert(fmstate == NULL || fmstate->aux_fmstate == NULL);
+	if (fmstate != NULL && fmstate->aux_fmstate != NULL)
+		fmstate = fmstate->aux_fmstate;
 
 	/*
 	 * In EXPLAIN without ANALYZE, ri_FdwState is NULL, so we have to lookup
 	 * the option directly in server/table options. Otherwise just use the
 	 * value we determined earlier.
 	 */
-	if (fmstate)
+	if (fmstate != NULL)
 		batch_size = fmstate->batch_size;
 	else
 		batch_size = get_batch_size_option(resultRelInfo->ri_RelationDesc);
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 02a6b15a13..ffd5a037d8 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -3071,8 +3071,8 @@ CREATE TABLE batch_table_p2
 INSERT INTO batch_table SELECT * FROM generate_series(1, 66) i;
 SELECT COUNT(*) FROM batch_table;
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched mode also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -3080,16 +3080,44 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	FOR VALUES IN (1)
 	SERVER loopback
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
-CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+CREATE TABLE batch_cp_upd_test2 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
+CREATE TABLE batch_cp_upd_test3 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test3_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (3)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test3', batch_size '1');
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
+
+-- Create statement triggers on remote tables that "log" any INSERTs
+-- performed on them.
+CREATE TABLE cmdlog (cmd text);
+CREATE FUNCTION log_stmt () RETURNS TRIGGER LANGUAGE plpgsql AS $$
+	BEGIN INSERT INTO public.cmdlog VALUES (TG_OP || ' on ' || TG_RELNAME); RETURN NULL; END;
+$$;
+CREATE TRIGGER stmt_trig AFTER INSERT ON batch_cp_upd_test1
+	FOR EACH STATEMENT EXECUTE FUNCTION log_stmt();
+CREATE TRIGGER stmt_trig AFTER INSERT ON batch_cp_upd_test3
+	FOR EACH STATEMENT EXECUTE FUNCTION log_stmt();
+
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching enabled, so 1 INSERT for both rows.
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+
+INSERT INTO batch_cp_upd_test VALUES (2), (2);
+
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching disabled, so separate INSERTs for each row.
+UPDATE batch_cp_upd_test t SET a = 3 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+
+SELECT tableoid::regclass, * FROM batch_cp_upd_test ORDER BY 1;
 
--- The following moves a row from the local partition to the foreign one
-UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
-SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+-- Should see 1 INSERT on batch_cp_upd_test1 and 2 on batch_cp_upd_test3.
+SELECT * FROM cmdlog ORDER BY 1;
 
 -- Clean up
-DROP TABLE batch_table, batch_cp_upd_test, batch_table_p0, batch_table_p1 CASCADE;
+DROP TABLE batch_table, batch_cp_upd_test, cmdlog, batch_table_p0, batch_table_p1 CASCADE;
 
 -- Use partitioning
 ALTER SERVER loopback OPTIONS (ADD batch_size '10');
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 5c723bc54e..3dc80f7399 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -35,75 +35,6 @@
 #include "utils/ruleutils.h"
 
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to
- * route a tuple inserted into a partitioned table to one of its leaf
- * partitions.
- *
- * partition_root
- *		The partitioned table that's the target of the command.
- *
- * partition_dispatch_info
- *		Array of 'max_dispatch' elements containing a pointer to a
- *		PartitionDispatch object for every partitioned table touched by tuple
- *		routing.  The entry for the target partitioned table is *always*
- *		present in the 0th element of this array.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * nonleaf_partitions
- *		Array of 'max_dispatch' elements containing pointers to fake
- *		ResultRelInfo objects for nonleaf partitions, useful for checking
- *		the partition constraint.
- *
- * num_dispatch
- *		The current number of items stored in the 'partition_dispatch_info'
- *		array.  Also serves as the index of the next free array element for
- *		new PartitionDispatch objects that need to be stored.
- *
- * max_dispatch
- *		The current allocated size of the 'partition_dispatch_info' array.
- *
- * partitions
- *		Array of 'max_partitions' elements containing a pointer to a
- *		ResultRelInfo for every leaf partition touched by tuple routing.
- *		Some of these are pointers to ResultRelInfos which are borrowed out of
- *		the owning ModifyTableState node.  The remainder have been built
- *		especially for tuple routing.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * is_borrowed_rel
- *		Array of 'max_partitions' booleans recording whether a given entry
- *		in 'partitions' is a ResultRelInfo pointer borrowed from the owning
- *		ModifyTableState node, rather than being built here.
- *
- * num_partitions
- *		The current number of items stored in the 'partitions' array.  Also
- *		serves as the index of the next free array element for new
- *		ResultRelInfo objects that need to be stored.
- *
- * max_partitions
- *		The current allocated size of the 'partitions' array.
- *
- * memcxt
- *		Memory context used to allocate subsidiary structs.
- *-----------------------
- */
-struct PartitionTupleRouting
-{
-	Relation	partition_root;
-	PartitionDispatch *partition_dispatch_info;
-	ResultRelInfo **nonleaf_partitions;
-	int			num_dispatch;
-	int			max_dispatch;
-	ResultRelInfo **partitions;
-	bool	   *is_borrowed_rel;
-	int			num_partitions;
-	int			max_partitions;
-	MemoryContext memcxt;
-};
-
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
  * hierarchy required to route a tuple to any of its partitions.  A
@@ -923,8 +854,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 *
 	 * If the FDW does not support batching, we set the batch size to 1.
 	 */
-	if (mtstate->operation == CMD_INSERT &&
-		partRelInfo->ri_FdwRoutine != NULL &&
+	if (partRelInfo->ri_FdwRoutine != NULL &&
 		partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
 		partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
 		partRelInfo->ri_BatchSize =
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index dad459a467..2a97b719b6 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2360,9 +2360,6 @@ ExecModifyTable(PlanState *pstate)
 	ItemPointerData tuple_ctid;
 	HeapTupleData oldtupdata;
 	HeapTuple	oldtuple;
-	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
-	List	   *relinfos = NIL;
-	ListCell   *lc;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -2617,22 +2614,49 @@ ExecModifyTable(PlanState *pstate)
 	}
 
 	/*
-	 * Insert remaining tuples for batch insert.
+	 * Insert any remaining batched tuples.
+	 *
+	 * If the query's main target relation is a partitioned table, any inserts
+	 * would have been performed using tuple routing, so use the appropriate
+	 * set of target relations.  Note that this also covers any inserts
+	 * performed during cross-partition UPDATEs that would have occurred
+	 * through tuple routing.
 	 */
-	if (proute)
-		relinfos = estate->es_tuple_routing_result_relations;
-	else
-		relinfos = estate->es_opened_result_relations;
+	if (node->mt_partition_tuple_routing)
+	{
+		PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
+		int			i;
 
-	foreach(lc, relinfos)
+		Assert(node->rootResultRelInfo);
+		Assert(node->rootResultRelInfo->ri_RelationDesc->rd_rel->relkind ==
+			   RELKIND_PARTITIONED_TABLE);
+		for (i = 0; i < proute->num_partitions; i++)
+		{
+			resultRelInfo = proute->partitions[i];
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
+	}
+	else
 	{
-		resultRelInfo = lfirst(lc);
-		if (resultRelInfo->ri_NumSlots > 0)
-			ExecBatchInsert(node, resultRelInfo,
-							resultRelInfo->ri_Slots,
-							resultRelInfo->ri_PlanSlots,
-							resultRelInfo->ri_NumSlots,
-							estate, node->canSetTag);
+		ListCell *lc;
+
+		/* Otherwise, use result relations from the range table. */
+		foreach(lc, estate->es_opened_result_relations)
+		{
+			resultRelInfo = lfirst(lc);
+			Assert(resultRelInfo != NULL);
+			if (resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
 	}
 
 	/*
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 694e38b7dd..29d485e914 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,9 +18,77 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definitions. */
+/* See execPartition.c for the definition. */
 typedef struct PartitionDispatchData *PartitionDispatch;
-typedef struct PartitionTupleRouting PartitionTupleRouting;
+
+/*
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partition touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out of
+ *		the owning ModifyTableState node.  The remainder have been built
+ *		especially for tuple routing.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * is_borrowed_rel
+ *		Array of 'max_partitions' booleans recording whether a given entry
+ *		in 'partitions' is a ResultRelInfo pointer borrowed from the owning
+ *		ModifyTableState node, rather than being built here.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	bool	   *is_borrowed_rel;
+	int			num_partitions;
+	int			max_partitions;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
-- 
2.24.1

