From 772103d67b8b518541209b0460f80e3342a406e6 Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Mon, 22 Jun 2020 10:05:21 +0530
Subject: [PATCH] Introduce vacuum errcontext to display additional
 information.

The additional information displayed is the block number for errors that
occur while processing the heap, and the index name for errors that occur
while processing an index.

This will help us in diagnosing the problems that occur during a vacuum.
For example, if we get an error while vacuuming because of corruption
(caused either by bad hardware or by some bug), it can help us identify
the affected block in the heap and/or the index involved.

It sets up an error context callback to display additional information
with the error.  During different phases of vacuum (heap scan, heap
vacuum, index vacuum, index clean up, heap truncate), we update the error
context callback to display appropriate information.  We could extend this
to a more granular level, for example by adding phases for FSM operations
or for prefetching the blocks while truncating.  However, I felt that this
would require many more error callback function calls and could make the
code a bit complex, so I have left those out for now.
---
 src/backend/access/heap/vacuumlazy.c | 185 +++++++++++++++++++++++++++++++++--
 src/tools/pgindent/typedefs.list     |   1 +
 2 files changed, 178 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 3d4719d..a3f8f0b 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -110,8 +110,21 @@
  */
 #define PREFETCH_SIZE			((BlockNumber) 32)
 
+/* Phases of vacuum during which we report error context. */
+typedef enum
+{
+	VACUUM_ERRCB_PHASE_UNKNOWN, /* no context line is added to errors */
+	VACUUM_ERRCB_PHASE_SCAN_HEAP,	/* reported with the heap block number */
+	VACUUM_ERRCB_PHASE_VACUUM_INDEX,	/* reported with the index name */
+	VACUUM_ERRCB_PHASE_VACUUM_HEAP, /* reported with the heap block number */
+	VACUUM_ERRCB_PHASE_INDEX_CLEANUP,	/* reported with the index name */
+	VACUUM_ERRCB_PHASE_TRUNCATE /* reported with the target block count */
+} VacErrPhase;
+
 typedef struct LVRelStats
 {
+	char	   *relnamespace;
+	char	   *relname;
 	/* useindex = true means two-pass strategy; false means one-pass */
 	bool		useindex;
 	/* Overall statistics about rel */
@@ -136,6 +149,11 @@ typedef struct LVRelStats
 	int			num_index_scans;
 	TransactionId latestRemovedXid;
 	bool		lock_waiter_detected;
+
+	/* Used for error callback */
+	char	   *indname;
+	BlockNumber blkno;			/* used only for heap operations */
+	VacErrPhase phase;
 } LVRelStats;
 
 
@@ -175,6 +193,9 @@ static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
 static int	vac_cmp_itemptr(const void *left, const void *right);
 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
 									 TransactionId *visibility_cutoff_xid, bool *all_frozen);
+static void vacuum_error_callback(void *arg);
+static void update_vacuum_error_info(LVRelStats *errinfo, int phase,
+									 BlockNumber blkno, char *indname);
 
 
 /*
@@ -208,6 +229,7 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
 	double		new_live_tuples;
 	TransactionId new_frozen_xid;
 	MultiXactId new_min_multi;
+	ErrorContextCallback errcallback;
 
 	Assert(params != NULL);
 	Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
@@ -257,6 +279,10 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
 
 	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
 
+	vacrelstats->relnamespace = get_namespace_name(RelationGetNamespace(onerel));
+	vacrelstats->relname = pstrdup(RelationGetRelationName(onerel));
+	vacrelstats->indname = NULL;
+	vacrelstats->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
 	vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
 	vacrelstats->old_live_tuples = onerel->rd_rel->reltuples;
 	vacrelstats->num_index_scans = 0;
@@ -268,6 +294,22 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
 	vacrelstats->useindex = (nindexes > 0 &&
 							 params->index_cleanup == VACOPT_TERNARY_ENABLED);
 
+	/*
+	 * Setup error traceback support for ereport().  The idea is to set up an
+	 * error context callback to display additional information on any error
+	 * during a vacuum.  During different phases of vacuum (heap scan, heap
+	 * vacuum, index vacuum, index clean up, heap truncate), we update the
+	 * error context callback to display appropriate information.
+	 *
+	 * Note that the index vacuum and heap vacuum phases may be called
+	 * multiple times in the middle of the heap scan phase.  So the old phase
+	 * information is restored at the end of those phases.
+	 */
+	errcallback.callback = vacuum_error_callback;
+	errcallback.arg = vacrelstats;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
 	/* Do the vacuuming */
 	lazy_scan_heap(onerel, params, vacrelstats, Irel, nindexes, aggressive);
 
@@ -294,7 +336,19 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
 	 * Optionally truncate the relation.
 	 */
 	if (should_attempt_truncation(params, vacrelstats))
+	{
+		/*
+		 * Update error traceback information.  This is the last phase during
+		 * which we add context information to errors, so we don't need to
+		 * revert to the previous phase.
+		 */
+		update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_TRUNCATE,
+								 vacrelstats->nonempty_pages, NULL);
 		lazy_truncate_heap(onerel, vacrelstats);
+	}
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
 
 	/* Report that we are now doing final cleanup */
 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
@@ -483,7 +537,6 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
 	BlockNumber nblocks,
 				blkno;
 	HeapTupleData tuple;
-	char	   *relname;
 	TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
 	TransactionId relminmxid = onerel->rd_rel->relminmxid;
 	BlockNumber empty_pages,
@@ -511,17 +564,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
 
 	pg_rusage_init(&ru0);
 
-	relname = RelationGetRelationName(onerel);
 	if (aggressive)
 		ereport(elevel,
 				(errmsg("aggressively vacuuming \"%s.%s\"",
-						get_namespace_name(RelationGetNamespace(onerel)),
-						relname)));
+						vacrelstats->relnamespace,
+						vacrelstats->relname)));
 	else
 		ereport(elevel,
 				(errmsg("vacuuming \"%s.%s\"",
-						get_namespace_name(RelationGetNamespace(onerel)),
-						relname)));
+						vacrelstats->relnamespace,
+						vacrelstats->relname)));
 
 	empty_pages = vacuumed_pages = 0;
 	next_fsm_block_to_vacuum = (BlockNumber) 0;
@@ -642,6 +694,9 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
 
 		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
 
+		update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_SCAN_HEAP,
+								 blkno, NULL);
+
 		if (blkno == next_unskippable_block)
 		{
 			/* Time to advance next_unskippable_block */
@@ -1309,7 +1364,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
 				 && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
-				 relname, blkno);
+				 vacrelstats->relname, blkno);
 			visibilitymap_clear(onerel, blkno, vmbuffer,
 								VISIBILITYMAP_VALID_BITS);
 		}
@@ -1330,7 +1385,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
 		else if (PageIsAllVisible(page) && has_dead_tuples)
 		{
 			elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
-				 relname, blkno);
+				 vacrelstats->relname, blkno);
 			PageClearAllVisible(page);
 			MarkBufferDirty(buf);
 			visibilitymap_clear(onerel, blkno, vmbuffer,
@@ -1513,6 +1568,12 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 	int			npages;
 	PGRUsage	ru0;
 	Buffer		vmbuffer = InvalidBuffer;
+	LVRelStats	olderrinfo;
+
+	/* Update error traceback information */
+	olderrinfo = *vacrelstats;
+	update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
+							 InvalidBlockNumber, NULL);
 
 	pg_rusage_init(&ru0);
 	npages = 0;
@@ -1528,6 +1589,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 		vacuum_delay_point();
 
 		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
+		vacrelstats->blkno = tblk;
 		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
 								 vac_strategy);
 		if (!ConditionalLockBufferForCleanup(buf))
@@ -1559,6 +1621,12 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 					RelationGetRelationName(onerel),
 					tupindex, npages),
 			 errdetail_internal("%s", pg_rusage_show(&ru0))));
+
+	/* Revert to the previous phase information for error traceback */
+	update_vacuum_error_info(vacrelstats,
+							 olderrinfo.phase,
+							 olderrinfo.blkno,
+							 olderrinfo.indname);
 }
 
 /*
@@ -1580,9 +1648,15 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 	int			uncnt = 0;
 	TransactionId visibility_cutoff_xid;
 	bool		all_frozen;
+	LVRelStats	olderrinfo;
 
 	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
 
+	/* Update error traceback information */
+	olderrinfo = *vacrelstats;
+	update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
+							 blkno, NULL);
+
 	START_CRIT_SECTION();
 
 	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
@@ -1659,6 +1733,11 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 							  *vmbuffer, visibility_cutoff_xid, flags);
 	}
 
+	/* Revert to the previous phase information for error traceback */
+	update_vacuum_error_info(vacrelstats,
+							 olderrinfo.phase,
+							 olderrinfo.blkno,
+							 olderrinfo.indname);
 	return tupindex;
 }
 
@@ -1729,6 +1808,7 @@ lazy_vacuum_index(Relation indrel,
 {
 	IndexVacuumInfo ivinfo;
 	PGRUsage	ru0;
+	LVRelStats      olderrinfo;
 
 	pg_rusage_init(&ru0);
 
@@ -1741,6 +1821,13 @@ lazy_vacuum_index(Relation indrel,
 	ivinfo.num_heap_tuples = vacrelstats->old_live_tuples;
 	ivinfo.strategy = vac_strategy;
 
+	/* Update error traceback information */
+	olderrinfo = *vacrelstats;
+	update_vacuum_error_info(vacrelstats,
+							 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
+							 InvalidBlockNumber,
+							 RelationGetRelationName(indrel));
+
 	/* Do bulk deletion */
 	*stats = index_bulk_delete(&ivinfo, *stats,
 							   lazy_tid_reaped, (void *) vacrelstats);
@@ -1750,6 +1837,12 @@ lazy_vacuum_index(Relation indrel,
 					RelationGetRelationName(indrel),
 					vacrelstats->num_dead_tuples),
 			 errdetail_internal("%s", pg_rusage_show(&ru0))));
+
+	/* Revert to the previous phase information for error traceback */
+	update_vacuum_error_info(vacrelstats,
+							 olderrinfo.phase,
+							 olderrinfo.blkno,
+							 olderrinfo.indname);
 }
 
 /*
@@ -1762,6 +1855,7 @@ lazy_cleanup_index(Relation indrel,
 {
 	IndexVacuumInfo ivinfo;
 	PGRUsage	ru0;
+	LVRelStats      olderrcbarg;
 
 	pg_rusage_init(&ru0);
 
@@ -1779,8 +1873,21 @@ lazy_cleanup_index(Relation indrel,
 	ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
 	ivinfo.strategy = vac_strategy;
 
+	/* Update error traceback information */
+	olderrcbarg = *vacrelstats;
+	update_vacuum_error_info(vacrelstats,
+							VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
+							InvalidBlockNumber,
+							RelationGetRelationName(indrel));
+
 	stats = index_vacuum_cleanup(&ivinfo, stats);
 
+	/* Revert back to the old phase information for error traceback */
+	update_vacuum_error_info(vacrelstats,
+							olderrcbarg.phase,
+							olderrcbarg.blkno,
+							olderrcbarg.indname);
+
 	if (!stats)
 		return;
 
@@ -1936,6 +2043,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 		 * were vacuuming.
 		 */
 		new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
+		vacrelstats->blkno = new_rel_pages;
 
 		if (new_rel_pages >= old_rel_pages)
 		{
@@ -2339,3 +2447,64 @@ heap_page_is_all_visible(Relation rel, Buffer buf,
 
 	return all_visible;
 }
+
+/*
+ * Error context callback for errors occurring during vacuum.  'arg' is the
+ * LVRelStats whose phase, blkno and indname fields select the context line.
+ */
+static void
+vacuum_error_callback(void *arg)
+{
+	LVRelStats *errinfo = arg;
+
+	switch (errinfo->phase)
+	{
+		case VACUUM_ERRCB_PHASE_SCAN_HEAP:
+			if (BlockNumberIsValid(errinfo->blkno)) /* else add no context */
+				errcontext("while scanning block %u of relation \"%s.%s\"",
+						   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+			break;
+
+		case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
+			if (BlockNumberIsValid(errinfo->blkno)) /* else add no context */
+				errcontext("while vacuuming block %u of relation \"%s.%s\"",
+						   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
+			break;
+
+		case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
+			errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
+					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
+			break;
+
+		case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
+			errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
+					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
+			break;
+
+		case VACUUM_ERRCB_PHASE_TRUNCATE:
+			/* here blkno holds a block count, not a block being processed */
+			if (BlockNumberIsValid(errinfo->blkno))
+				errcontext("while truncating relation \"%s.%s\" to %u blocks",
+						   errinfo->relnamespace, errinfo->relname, errinfo->blkno);
+			break;
+
+		case VACUUM_ERRCB_PHASE_UNKNOWN:
+		default:
+			return;		/* do nothing; the errinfo may not be initialized */
+	}
+}
+
+/* Update vacuum error callback for the current phase, block, and index. */
+static void
+update_vacuum_error_info(LVRelStats *errinfo, int phase, BlockNumber blkno,
+						 char *indname)
+{
+	errinfo->blkno = blkno;
+	errinfo->phase = phase;
+
+	/* Free index name from any previous phase */
+	if (errinfo->indname)
+		pfree(errinfo->indname);
+
+	/* For index phases, save the name of the current index for the callback */
+	errinfo->indname = indname ? pstrdup(indname) : NULL;
+}
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 2f23dc3..022a3d4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2551,6 +2551,7 @@ UserMapping
 UserOpts
 VacAttrStats
 VacAttrStatsP
+VacErrPhase
 VacOptTernaryValue
 VacuumParams
 VacuumRelation
-- 
1.8.3.1

