From 58a1e12f1740680ae810e5a42c8de59d4e348549 Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Sat, 23 Jun 2018 15:32:53 +0530
Subject: [PATCH] Allow using the updated tuple while moving it to a different
 partition.

An update that causes the tuple to be moved to a different partition was
missing out on re-constructing the to-be-updated tuple, based on the latest
tuple in the update chain.  Instead, it's simply deleting the latest tuple
and inserting a new tuple in the new partition based on the old tuple.
Commit 2f17844104 didn't consider this case, so some of the updates were
getting lost.

Reported-by: Pavan Deolasee
Author: Amit Khandekar, with minor changes by me
Reviewed-by: Dilip Kumar and Amit Kapila
Discussion: https://postgr.es/m/CAJ3gD9fRbEzDqdeDq1jxqZUb47kJn+tQ7=Bcgjc8quqKsDViKQ@mail.gmail.com
---
 src/backend/commands/trigger.c                     | 22 ++++++-
 src/backend/executor/execReplication.c             |  2 +-
 src/backend/executor/nodeModifyTable.c             | 63 +++++++++++++-----
 src/include/commands/trigger.h                     |  3 +-
 .../isolation/expected/partition-key-update-4.out  | 60 +++++++++++++++++
 src/test/isolation/isolation_schedule              |  1 +
 .../isolation/specs/partition-key-update-4.spec    | 76 ++++++++++++++++++++++
 7 files changed, 208 insertions(+), 19 deletions(-)
 create mode 100644 src/test/isolation/expected/partition-key-update-4.out
 create mode 100644 src/test/isolation/specs/partition-key-update-4.spec

diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 57519fe..2436692 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
 							  false, NULL, NULL, NIL, NULL, transition_capture);
 }
 
+/*
+ * Execute BEFORE ROW DELETE triggers.
+ *
+ * True indicates caller can proceed with the delete.  False indicates caller
+ * need to suppress the delete and additionally if requested, we need to pass
+ * back the concurrently updated tuple if any.
+ */
 bool
 ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
 					 ResultRelInfo *relinfo,
 					 ItemPointer tupleid,
-					 HeapTuple fdw_trigtuple)
+					 HeapTuple fdw_trigtuple,
+					 TupleTableSlot **epqslot)
 {
 	TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
 	bool		result = true;
@@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
 									   LockTupleExclusive, &newSlot);
 		if (trigtuple == NULL)
 			return false;
+
+		/*
+		 * If the tuple was concurrently updated and the caller of this
+		 * function requested for the updated tuple, skip the trigger
+		 * execution.
+		 */
+		if (newSlot != NULL && epqslot != NULL)
+		{
+			*epqslot = newSlot;
+			heap_freetuple(trigtuple);
+			return false;
+		}
 	}
 	else
 		trigtuple = fdw_trigtuple;
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 41e857e..5006baa 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate,
 	{
 		skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
 										   &searchslot->tts_tuple->t_self,
-										   NULL);
+										   NULL, NULL);
 	}
 
 	if (!skip_tuple)
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 7e0b867..58bd0f1 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -609,7 +609,9 @@ ExecInsert(ModifyTableState *mtstate,
  *		foreign table, tupleid is invalid; the FDW has to figure out
  *		which row to delete using data from the planSlot.  oldtuple is
  *		passed to foreign table triggers; it is NULL when the foreign
- *		table has no relevant triggers.
+ *		table has no relevant triggers. When this DELETE is a part of
+ *		an UPDATE of partition-key, then the slot returned by
+ *		EvalPlanQual() is passed back using output parameter epqslot.
  *
  *		Returns RETURNING result if any, otherwise NULL.
  * ----------------------------------------------------------------
@@ -622,6 +624,7 @@ ExecDelete(ModifyTableState *mtstate,
 		   EPQState *epqstate,
 		   EState *estate,
 		   bool *tupleDeleted,
+		   TupleTableSlot **epqslot,
 		   bool processReturning,
 		   bool canSetTag,
 		   bool changingPart)
@@ -649,7 +652,7 @@ ExecDelete(ModifyTableState *mtstate,
 		bool		dodelete;
 
 		dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
-										tupleid, oldtuple);
+										tupleid, oldtuple, epqslot);
 
 		if (!dodelete)			/* "do nothing" */
 			return NULL;
@@ -769,19 +772,30 @@ ldelete:;
 
 				if (!ItemPointerEquals(tupleid, &hufd.ctid))
 				{
-					TupleTableSlot *epqslot;
-
-					epqslot = EvalPlanQual(estate,
-										   epqstate,
-										   resultRelationDesc,
-										   resultRelInfo->ri_RangeTableIndex,
-										   LockTupleExclusive,
-										   &hufd.ctid,
-										   hufd.xmax);
-					if (!TupIsNull(epqslot))
+					TupleTableSlot *my_epqslot;
+
+					my_epqslot = EvalPlanQual(estate,
+											  epqstate,
+											  resultRelationDesc,
+											  resultRelInfo->ri_RangeTableIndex,
+											  LockTupleExclusive,
+											  &hufd.ctid,
+											  hufd.xmax);
+					if (!TupIsNull(my_epqslot))
 					{
 						*tupleid = hufd.ctid;
-						goto ldelete;
+
+						/*
+						 * If requested, skip delete and pass back the updated
+						 * row.
+						 */
+						if (epqslot)
+						{
+							*epqslot = my_epqslot;
+							return NULL;
+						}
+						else
+							goto ldelete;
 					}
 				}
 				/* tuple already deleted; nothing to do */
@@ -1052,6 +1066,7 @@ lreplace:;
 		{
 			bool		tuple_deleted;
 			TupleTableSlot *ret_slot;
+			TupleTableSlot *epqslot = NULL;
 			PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
 			int			map_index;
 			TupleConversionMap *tupconv_map;
@@ -1081,7 +1096,7 @@ lreplace:;
 			 * processing. We want to return rows from INSERT.
 			 */
 			ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate,
-					   estate, &tuple_deleted, false,
+					   estate, &tuple_deleted, &epqslot, false,
 					   false /* canSetTag */ , true /* changingPart */ );
 
 			/*
@@ -1105,7 +1120,23 @@ lreplace:;
 			 * resurrect it.
 			 */
 			if (!tuple_deleted)
-				return NULL;
+			{
+				/*
+				 * epqslot will be typically NULL.  But when ExecDelete()
+				 * finds that another transaction has concurrently updated the
+				 * same row, it re-fetches the row, skips the delete, and
+				 * epqslot is set to the re-fetched tuple slot. In that case,
+				 * we need to do all the checks again.
+				 */
+				if (TupIsNull(epqslot))
+					return NULL;
+				else
+				{
+					slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
+					tuple = ExecMaterializeSlot(slot);
+					goto lreplace;
+				}
+			}
 
 			/*
 			 * Updates set the transition capture map only when a new subplan
@@ -2136,7 +2167,7 @@ ExecModifyTable(PlanState *pstate)
 			case CMD_DELETE:
 				slot = ExecDelete(node, tupleid, oldtuple, planSlot,
 								  &node->mt_epqstate, estate,
-								  NULL, true, node->canSetTag,
+								  NULL, NULL, true, node->canSetTag,
 								  false /* changingPart */ );
 				break;
 			default:
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h
index a5b8610..1031448 100644
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate,
 					 EPQState *epqstate,
 					 ResultRelInfo *relinfo,
 					 ItemPointer tupleid,
-					 HeapTuple fdw_trigtuple);
+					 HeapTuple fdw_trigtuple,
+					 TupleTableSlot **epqslot);
 extern void ExecARDeleteTriggers(EState *estate,
 					 ResultRelInfo *relinfo,
 					 ItemPointer tupleid,
diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out
new file mode 100644
index 0000000..774a7fa
--- /dev/null
+++ b/src/test/isolation/expected/partition-key-update-4.out
@@ -0,0 +1,60 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s2b s2u1 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo2           2              ABC update2 update1
+
+starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg2        2              ABC update2 update1
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
+1              ABC update2 trigger
+
+starting permutation: s1b s2b s2u2 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo1           1              EFG            
+
+starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg1        1              EFG            
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 0e99721..d5594e8 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -74,4 +74,5 @@ test: predicate-gin-nomatch
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
+test: partition-key-update-4
 test: plpgsql-toast
diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec
new file mode 100644
index 0000000..1d53a7d
--- /dev/null
+++ b/src/test/isolation/specs/partition-key-update-4.spec
@@ -0,0 +1,76 @@
+# Test that a row that ends up in a new partition contains changes made by
+# a concurrent transaction.
+
+setup
+{
+  --
+  -- Setup to test concurrent handling of ExecDelete().
+  --
+  CREATE TABLE foo (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1);
+  CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2);
+  INSERT INTO foo VALUES (1, 'ABC');
+
+  --
+  -- Setup to test concurrent handling of GetTupleForTrigger().
+  --
+  CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE triglog as select * from footrg;
+  CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1);
+  CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2);
+  INSERT INTO footrg VALUES (1, 'ABC');
+  CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$
+  BEGIN
+	 OLD.b = OLD.b || ' trigger';
+
+	 -- This will verify that the trigger is not run *before* the row is
+	 -- refetched by EvalPlanQual. The OLD row should contain the changes made
+	 -- by the concurrent session.
+     INSERT INTO triglog select OLD.*;
+
+     RETURN OLD;
+  END $$ LANGUAGE PLPGSQL;
+  CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1
+   FOR EACH ROW EXECUTE PROCEDURE func_footrg();
+
+}
+
+teardown
+{
+  DROP TABLE foo;
+  DROP TRIGGER footrg_ondel ON footrg1;
+  DROP FUNCTION func_footrg();
+  DROP TABLE footrg;
+  DROP TABLE triglog;
+}
+
+session "s1"
+step "s1b"	 { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s1u"   { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1ut"  { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1s"   { SELECT tableoid::regclass, * FROM foo ORDER BY a; }
+step "s1st"  { SELECT tableoid::regclass, * FROM footrg ORDER BY a; }
+step "s1stl"  { SELECT * FROM triglog ORDER BY a; }
+step "s1c"	 { COMMIT; }
+
+session "s2"
+step "s2b"	 { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s2u1"  { UPDATE foo SET b = b || ' update2' WHERE a = 1; }
+step "s2u2"  { UPDATE foo SET b = 'EFG' WHERE a = 1; }
+step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; }
+step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; }
+step "s2c"	 { COMMIT; }
+
+
+# Session s1 is moving a row into another partition, but is waiting for
+# another session s2 that is updating the original row. The row that ends up
+# in the new partition should contain the changes made by session s2.
+permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s"
+
+# Same as above, except, session s1 is waiting in GetTupleTrigger().
+permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl"
+
+# Below two cases are similar to the above two; except that the session s1
+# fails EvalPlanQual() test, so partition key update does not happen.
+permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s"
+permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"
-- 
1.8.3.1

