From b7ace6f7eab4a2c181427ac4a7719da62faf78ee Mon Sep 17 00:00:00 2001
From: Baji Shaik <baji.pgdev@gmail.com>
Date: Sun, 17 May 2026 23:32:42 +0000
Subject: [PATCH] Fix REPACK decoding worker not cleaned up on FATAL exit

When the launching backend of REPACK (CONCURRENTLY) is terminated via
pg_terminate_backend(), ProcDiePending causes ereport(FATAL) which
bypasses PG_FINALLY blocks.  As a result, stop_repack_decoding_worker()
is never called, leaving the decoding worker running indefinitely and
holding its temporary replication slot.

Fix by using PG_ENSURE_ERROR_CLEANUP, which handles both ERROR and
FATAL exits.  Unlike before_shmem_exit, this does not leak callback
slots when the same backend runs multiple REPACK (CONCURRENTLY)
commands, and unlike on_proc_exit, it runs before memory contexts
are destroyed so the worker handle is still valid.
---
 src/backend/commands/repack.c | 42 +++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c
index fae88d6bb83..17eb9eb029c 100644
--- a/src/backend/commands/repack.c
+++ b/src/backend/commands/repack.c
@@ -64,6 +64,7 @@
 #include "pgstat.h"
 #include "replication/logicalrelation.h"
 #include "storage/bufmgr.h"
+#include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -211,6 +212,7 @@ static Oid	determine_clustered_index(Relation rel, bool usingindex,
 
 static void start_repack_decoding_worker(Oid relid);
 static void stop_repack_decoding_worker(void);
+static void repack_decoding_worker_cleanup_cb(int code, Datum arg);
 static Snapshot get_initial_snapshot(DecodingWorker *worker);
 
 static void ProcessRepackMessage(StringInfo msg);
@@ -660,24 +662,25 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
 		TransferPredicateLocksToHeapRelation(OldHeap);
 
 	/* rebuild_relation does all the dirty work */
-	PG_TRY();
-	{
-		rebuild_relation(OldHeap, index, verbose, ident_idx);
-	}
-	PG_FINALLY();
+	if (concurrent)
 	{
-		if (concurrent)
+		/*
+		 * Use PG_ENSURE_ERROR_CLEANUP so that the decoding worker is stopped
+		 * on both ERROR and FATAL exits.  PG_FINALLY only handles ERROR;
+		 * FATAL (e.g. from pg_terminate_backend) would bypass it, leaving
+		 * the worker running and holding its replication slot indefinitely.
+		 */
+		PG_ENSURE_ERROR_CLEANUP(repack_decoding_worker_cleanup_cb, (Datum) 0);
 		{
-			/*
-			 * Since during normal operation the worker was already asked to
-			 * exit, stopping it explicitly is especially important on ERROR.
-			 * However it still seems a good practice to make sure that the
-			 * worker never survives the REPACK command.
-			 */
-			stop_repack_decoding_worker();
+			rebuild_relation(OldHeap, index, verbose, ident_idx);
 		}
+		PG_END_ENSURE_ERROR_CLEANUP(repack_decoding_worker_cleanup_cb, (Datum) 0);
+		stop_repack_decoding_worker();
+	}
+	else
+	{
+		rebuild_relation(OldHeap, index, verbose, ident_idx);
 	}
-	PG_END_TRY();
 
 	/* rebuild_relation closes OldHeap, and index if valid */
 
@@ -3482,6 +3485,17 @@ start_repack_decoding_worker(Oid relid)
 	ConditionVariableCancelSleep();
 }
 
+/*
+ * PG_ENSURE_ERROR_CLEANUP callback to stop the decoding worker.
+ * This ensures the worker is terminated on both ERROR and FATAL exits,
+ * unlike PG_FINALLY which only handles ERROR.
+ */
+static void
+repack_decoding_worker_cleanup_cb(int code, Datum arg)
+{
+	stop_repack_decoding_worker();
+}
+
 /*
  * Stop the decoding worker and cleanup the related resources.
  *
-- 
2.50.1

