From e4c78647c044b818ad4694e0eeb42db3aac4e9d2 Mon Sep 17 00:00:00 2001
From: Amul Sul <sulamul@gmail.com>
Date: Mon, 19 Mar 2018 11:31:18 +0530
Subject: [PATCH 1/2] Invalidate ip_blkid v7

v7: Update w.r.t Amit Kapila's review comments[12]
 - Reverted ItemPointerData comment changes did in v2 patch.
 - Updated t_ctid comment in htup_details.h.

v6:
 - Changes w.r.t Amit's suggestion[11].
 - AFAICS, Other than previous places(v6-wip), I didn't found any
   other places where we need the check for invalid block numberother than.
 - Minor change in the comments : 'partition key' -> 'the partition key'
 - ran pgindent.

v6-wip: Update w.r.t Andres Freund review comments[8]
 - Added HeapTupleHeaderValidBlockNumber, HeapTupleHeaderSetBlockNumber
   and ItemPointerValidBlockNumber macro.
 - Fixed comments as per Andres' suggestions.
 - Also added invalid block number check in heap_get_latest_tid as
   discussed in the thread[9], similar change made in heap_lock_updated_tuple_rec.
 - In heap_lock_updated_tuple, rewrite_heap_tuple & EvalPlanQualFetch,
   I've added check for invalid block number where ItemPointerEquals used
   to conclude tuple has been updated/deleted.

 TODO:
 1. Yet to test changes made in the heap_lock_updated_tuple,
    rewrite_heap_tuple & EvalPlanQualFetch function are valid or not.
    Also, address TODO tags in the same function.
 2. Also check there are other places where similar changes like
    above needed(wrt Pavan Deolasee[10] concern)

v5:
 - Added code in heap_mask to skip wal_consistency_checking[7]
 - Fixed previous todos.

v5-wip2:
 - Minor changes in RelationFindReplTupleByIndex() and
   RelationFindReplTupleSeq()

 - TODO;
   Same as the privious

v5-wip: Update w.r.t Amit Kapila's comments[6].
 - Reverted error message in nodeModifyTable.c from 'tuple to be locked'
   to 'tuple to be updated'.

 - TODO:
 1. Yet to made a decision of having LOG/ELOG/ASSERT in the
    RelationFindReplTupleByIndex() and RelationFindReplTupleSeq().

v4: Rebased on "UPDATE of partition key v35" patch[5].

v3: Update w.r.t Amit Kapila's[3] & Alvaro Herrera[4] comments
 - typo in all error message and comment : "to an another" -> "to another"
 - error message change : "tuple to be updated" -> "tuple to be locked"
 - In ExecOnConflictUpdate(), error report converted into assert &
  comments added.

v2: Updated w.r.t Robert review comments[2]
 - Updated couple of comment of heap_delete argument and ItemPointerData
 - Added same concurrent update error logic in ExecOnConflictUpdate,
   RelationFindReplTupleByIndex and RelationFindReplTupleSeq

v1: Initial version -- as per Amit Kapila's suggestions[1]
 - When tuple is being moved to another partition then ip_blkid in the
   tuple header mark to InvalidBlockNumber.

 -------------
  References:
 -------------
 1] https://postgr.es/m/CAA4eK1KEZQ%2BCyXbBzfn1jFHoEfa_OemDLhLyy7xfD1QUZLo1DQ%40mail.gmail.com
 2] https://postgr.es/m/CA%2BTgmoYY98AEjh7RDtuzaLC--_0smCozXRu6bFmZTaX5Ne%3DB5Q%40mail.gmail.com
 3] https://postgr.es/m/CAA4eK1LQS6TmsGaEwR9HgF-9TZTHxrdAELuX6wOZBDbbjOfDjQ@mail.gmail.com
 4] https://postgr.es/m/20171124160756.eyljpmpfzwd6jmnr@alvherre.pgsql
 5] https://postgr.es/m/CAJ3gD9dixkkMzNnnP1CaZ1H17-U17ok_sVbjZZo+wnB=rJH6yg@mail.gmail.com
 6] https://postgr.es/m/CAA4eK1LHVnNWYF53F1gUGx6CTxuvznozvU-Lr-dfE=Qeu1gEcg@mail.gmail.com
 7] https://postgr.es/m/CAAJ_b94_29wiUA83W8LQjtfjv9XNV=+PT8+ioWRPjnnFHe3eqw@mail.gmail.com
 8] https://postgr.es/m/20180305232353.gpue7jldnm4bjf4i@alap3.anarazel.de
 9] https://postgr.es/m/CAAJ_b97BBkRWFowGRs9VNzFykoK0ikGB1yYEsWfYK8xR5enSrw@mail.gmail.com
10] https://postgr.es/m/CABOikdPXwqkLGgTZZm2qYwTn4L69V36rCh55fFma1fAYbon7Vg@mail.gmail.com
11] https://postgr.es/m/CAAJ_b97ohg=+WfiFT3g2x14rvXsXOFXvjH43GkYbgcLZvF7k+w@mail.gmail.com
12] https://postgr.es/m/CAA4eK1LbML3=uruBA46qGA9ZE_F3Co8MsjefLrJoLz+Q_TW3vg@mail.gmail.com
---
 src/backend/access/heap/heapam.c       | 37 +++++++++++++++++++++++++++++++---
 src/backend/access/heap/rewriteheap.c  |  1 +
 src/backend/commands/trigger.c         |  5 +++++
 src/backend/executor/execMain.c        |  7 ++++++-
 src/backend/executor/execReplication.c | 26 ++++++++++++++++--------
 src/backend/executor/nodeLockRows.c    |  5 +++++
 src/backend/executor/nodeModifyTable.c | 28 +++++++++++++++++++++----
 src/include/access/heapam.h            |  2 +-
 src/include/access/htup_details.h      | 12 +++++++++--
 src/include/storage/itemptr.h          |  7 +++++++
 10 files changed, 111 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index c08ab14c02..54e6c6bde0 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2304,6 +2304,7 @@ heap_get_latest_tid(Relation relation,
 		 */
 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
 			HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
+			!HeapTupleHeaderValidBlockNumber(tp.t_data) ||
 			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
 		{
 			UnlockReleaseBuffer(buffer);
@@ -2314,6 +2315,9 @@ heap_get_latest_tid(Relation relation,
 		priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
 		UnlockReleaseBuffer(buffer);
 	}							/* end of loop */
+
+	/* Make sure that the return value has a valid block number */
+	Assert(ItemPointerValidBlockNumber(tid));
 }
 
 
@@ -3037,6 +3041,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
  *	crosscheck - if not InvalidSnapshot, also check tuple against this
  *	wait - true if should wait for any conflicting update to commit/abort
  *	hufd - output parameter, filled in failure cases (see below)
+ *	changing_part - true iff the tuple is being moved to another partition
+ *					table due to an update of the partition key. Otherwise,
+ *					false.
  *
  * Normal, successful return value is HeapTupleMayBeUpdated, which
  * actually means we did delete it.  Failure return codes are
@@ -3052,7 +3059,7 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
 HTSU_Result
 heap_delete(Relation relation, ItemPointer tid,
 			CommandId cid, Snapshot crosscheck, bool wait,
-			HeapUpdateFailureData *hufd)
+			HeapUpdateFailureData *hufd, bool changing_part)
 {
 	HTSU_Result result;
 	TransactionId xid = GetCurrentTransactionId();
@@ -3320,6 +3327,13 @@ l1:
 	/* Make sure there is no forward chain link in t_ctid */
 	tp.t_data->t_ctid = tp.t_self;
 
+	/*
+	 * Sets a block identifier to the InvalidBlockNumber to indicate such an
+	 * update being moved tuple to another partition.
+	 */
+	if (changing_part)
+		HeapTupleHeaderSetBlockNumber(tp.t_data, InvalidBlockNumber);
+
 	MarkBufferDirty(buffer);
 
 	/*
@@ -3445,7 +3459,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
 	result = heap_delete(relation, tid,
 						 GetCurrentCommandId(true), InvalidSnapshot,
 						 true /* wait for commit */ ,
-						 &hufd);
+						 &hufd, false);
 	switch (result)
 	{
 		case HeapTupleSelfUpdated:
@@ -5956,6 +5970,7 @@ l4:
 next:
 		/* if we find the end of update chain, we're done. */
 		if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
+			!HeapTupleHeaderValidBlockNumber(mytup.t_data) ||
 			ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
 			HeapTupleHeaderIsOnlyLocked(mytup.t_data))
 		{
@@ -6007,7 +6022,8 @@ static HTSU_Result
 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
 						TransactionId xid, LockTupleMode mode)
 {
-	if (!ItemPointerEquals(&tuple->t_self, ctid))
+	if (ItemPointerValidBlockNumber(ctid) &&
+		!ItemPointerEquals(&tuple->t_self, ctid))
 	{
 		/*
 		 * If this is the first possibly-multixact-able operation in the
@@ -9322,6 +9338,21 @@ heap_mask(char *pagedata, BlockNumber blkno)
 			 */
 			if (HeapTupleHeaderIsSpeculative(page_htup))
 				ItemPointerSet(&page_htup->t_ctid, blkno, off);
+
+			/*
+			 * For a deleted tuple, a block identifier is set to
+			 * InvalidBlockNumber to indicate that the tuple has been moved to
+			 * another partition due to an update of the partition key.
+			 *
+			 * During redo, heap_xlog_delete sets t_ctid to current block
+			 * number and self offset number.  It doesn't verify the tuple is
+			 * deleted by usual delete/update or deleted by the update of the
+			 * partition key on the master.  Hence, like speculative tuple, to
+			 * ignore any inconsistency set block identifier to current block
+			 * number.
+			 */
+			if (!HeapTupleHeaderValidBlockNumber(page_htup))
+				HeapTupleHeaderSetBlockNumber(page_htup, blkno);
 		}
 
 		/*
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 7d466c2588..0943d95ea1 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -424,6 +424,7 @@ rewrite_heap_tuple(RewriteState state,
 	 */
 	if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
 		  HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) &&
+		HeapTupleHeaderValidBlockNumber(old_tuple->t_data) &&
 		!(ItemPointerEquals(&(old_tuple->t_self),
 							&(old_tuple->t_data->t_ctid))))
 	{
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index fbd176b5d0..93c1f2a51f 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -3071,6 +3071,11 @@ ltrmark:;
 					ereport(ERROR,
 							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 							 errmsg("could not serialize access due to concurrent update")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
+
 				if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self))
 				{
 					/* it was updated, so look at the updated version */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 91ba939bdc..884c012e5b 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -2712,6 +2712,10 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
 						ereport(ERROR,
 								(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 								 errmsg("could not serialize access due to concurrent update")));
+					if (!ItemPointerValidBlockNumber(&hufd.ctid))
+						ereport(ERROR,
+								(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+								 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
 
 					/* Should not encounter speculative tuple on recheck */
 					Assert(!HeapTupleHeaderIsSpeculative(tuple.t_data));
@@ -2780,7 +2784,8 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
 		 * As above, it should be safe to examine xmax and t_ctid without the
 		 * buffer content lock, because they can't be changing.
 		 */
-		if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid))
+		if (!HeapTupleHeaderValidBlockNumber(tuple.t_data) ||
+			ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid))
 		{
 			/* deleted, so forget about it */
 			ReleaseBuffer(buffer);
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 32891abbdf..8430420de7 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -190,10 +190,15 @@ retry:
 			case HeapTupleMayBeUpdated:
 				break;
 			case HeapTupleUpdated:
-				/* XXX: Improve handling here */
-				ereport(LOG,
-						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
-						 errmsg("concurrent update, retrying")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(LOG,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
+				else
+					/* XXX: Improve handling here */
+					ereport(LOG,
+							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+							 errmsg("concurrent update, retrying")));
 				goto retry;
 			case HeapTupleInvisible:
 				elog(ERROR, "attempted to lock invisible tuple");
@@ -348,10 +353,15 @@ retry:
 			case HeapTupleMayBeUpdated:
 				break;
 			case HeapTupleUpdated:
-				/* XXX: Improve handling here */
-				ereport(LOG,
-						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
-						 errmsg("concurrent update, retrying")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(LOG,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
+				else
+					/* XXX: Improve handling here */
+					ereport(LOG,
+							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+							 errmsg("concurrent update, retrying")));
 				goto retry;
 			case HeapTupleInvisible:
 				elog(ERROR, "attempted to lock invisible tuple");
diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c
index b39ccf7dc1..dd4d5f25ca 100644
--- a/src/backend/executor/nodeLockRows.c
+++ b/src/backend/executor/nodeLockRows.c
@@ -218,6 +218,11 @@ lnext:
 					ereport(ERROR,
 							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 							 errmsg("could not serialize access due to concurrent update")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
+
 				if (ItemPointerEquals(&hufd.ctid, &tuple.t_self))
 				{
 					/* Tuple was deleted, so don't return it */
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 3332ae4bf3..784695aa1d 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -719,7 +719,8 @@ ExecDelete(ModifyTableState *mtstate,
 		   EState *estate,
 		   bool *tupleDeleted,
 		   bool processReturning,
-		   bool canSetTag)
+		   bool canSetTag,
+		   bool changing_part)
 {
 	ResultRelInfo *resultRelInfo;
 	Relation	resultRelationDesc;
@@ -810,7 +811,8 @@ ldelete:;
 							 estate->es_output_cid,
 							 estate->es_crosscheck_snapshot,
 							 true /* wait for commit */ ,
-							 &hufd);
+							 &hufd,
+							 changing_part);
 		switch (result)
 		{
 			case HeapTupleSelfUpdated:
@@ -856,6 +858,11 @@ ldelete:;
 					ereport(ERROR,
 							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 							 errmsg("could not serialize access due to concurrent update")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be updated was already moved to another partition due to concurrent update")));
+
 				if (!ItemPointerEquals(tupleid, &hufd.ctid))
 				{
 					TupleTableSlot *epqslot;
@@ -1158,7 +1165,7 @@ lreplace:;
 			 * processing. We want to return rows from INSERT.
 			 */
 			ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate,
-					   &tuple_deleted, false, false);
+					   &tuple_deleted, false, false, true);
 
 			/*
 			 * For some reason if DELETE didn't happen (e.g. trigger prevented
@@ -1303,6 +1310,11 @@ lreplace:;
 					ereport(ERROR,
 							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 							 errmsg("could not serialize access due to concurrent update")));
+				if (!ItemPointerValidBlockNumber(&hufd.ctid))
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("tuple to be updated was already moved to another partition due to concurrent update")));
+
 				if (!ItemPointerEquals(tupleid, &hufd.ctid))
 				{
 					TupleTableSlot *epqslot;
@@ -1473,6 +1485,14 @@ ExecOnConflictUpdate(ModifyTableState *mtstate,
 						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 						 errmsg("could not serialize access due to concurrent update")));
 
+			/*
+			 * As long as we don't support an UPDATE of INSERT ON CONFLICT for
+			 * a partitioned table we shouldn't reach to a case where tuple to
+			 * be lock is moved to another partition due to concurrent update
+			 * of the partition key.
+			 */
+			Assert(ItemPointerValidBlockNumber(&hufd.ctid));
+
 			/*
 			 * Tell caller to try again from the very start.
 			 *
@@ -2062,7 +2082,7 @@ ExecModifyTable(PlanState *pstate)
 			case CMD_DELETE:
 				slot = ExecDelete(node, tupleid, oldtuple, planSlot,
 								  &node->mt_epqstate, estate,
-								  NULL, true, node->canSetTag);
+								  NULL, true, node->canSetTag, false);
 				break;
 			default:
 				elog(ERROR, "unknown operation");
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 4c0256b18a..e8da83c303 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -156,7 +156,7 @@ extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 				  CommandId cid, int options, BulkInsertState bistate);
 extern HTSU_Result heap_delete(Relation relation, ItemPointer tid,
 			CommandId cid, Snapshot crosscheck, bool wait,
-			HeapUpdateFailureData *hufd);
+			HeapUpdateFailureData *hufd, bool changing_part);
 extern void heap_finish_speculative(Relation relation, HeapTuple tuple);
 extern void heap_abort_speculative(Relation relation, HeapTuple tuple);
 extern HTSU_Result heap_update(Relation relation, ItemPointer otid,
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index 2ab1815390..964104b1d1 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -83,8 +83,10 @@
  *
  * A word about t_ctid: whenever a new tuple is stored on disk, its t_ctid
  * is initialized with its own TID (location).  If the tuple is ever updated,
- * its t_ctid is changed to point to the replacement version of the tuple.
- * Thus, a tuple is the latest version of its row iff XMAX is invalid or
+ * its t_ctid is changed to point to the replacement version of the tuple or
+ * the block number (ip_blkid) is invalidated if the tuple is moved from one
+ * partition to another partition relation due to an update of the partition
+ * key.  Thus, a tuple is the latest version of its row iff XMAX is invalid or
  * t_ctid points to itself (in which case, if XMAX is valid, the tuple is
  * either locked or deleted).  One can follow the chain of t_ctid links
  * to find the newest version of the row.  Beware however that VACUUM might
@@ -441,6 +443,12 @@ do { \
 	ItemPointerSet(&(tup)->t_ctid, token, SpecTokenOffsetNumber) \
 )
 
+#define HeapTupleHeaderSetBlockNumber(tup, blkno) \
+		ItemPointerSetBlockNumber(&(tup)->t_ctid, blkno)
+
+#define HeapTupleHeaderValidBlockNumber(tup) \
+		ItemPointerValidBlockNumber(&(tup)->t_ctid)
+
 #define HeapTupleHeaderGetDatumLength(tup) \
 	VARSIZE(tup)
 
diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h
index 6c9ed3696b..b71449e712 100644
--- a/src/include/storage/itemptr.h
+++ b/src/include/storage/itemptr.h
@@ -60,6 +60,13 @@ typedef ItemPointerData *ItemPointer;
 #define ItemPointerIsValid(pointer) \
 	((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
 
+/*
+ * ItemPointerIsValid
+ *		True iff the block number of the item pointer is valid.
+ */
+#define ItemPointerValidBlockNumber(pointer) \
+	((bool) (BlockNumberIsValid(ItemPointerGetBlockNumberNoCheck(pointer))))
+
 /*
  * ItemPointerGetBlockNumberNoCheck
  *		Returns the block number of a disk item pointer.
-- 
2.14.1

