From 518b70713ba08956364f5d48b632ff403eaa8d72 Mon Sep 17 00:00:00 2001
From: Mikhail Kharitonov <mikhail.kharitonov.dev@gmail.com>
Date: Wed, 3 Dec 2025 11:17:14 +0300
Subject: [PATCH] VACUUM: use relation birth XID to refine data horizon for new
 relations

Store a relation birth XID in pg_class and use it to refine the
VISHORIZON_DATA horizon so that transactions that started before the
relation existed do not hold back vacuuming of that relation.
---
 src/backend/catalog/heap.c                    |   9 +
 src/backend/storage/ipc/procarray.c           | 108 ++++++++-
 src/backend/utils/cache/relcache.c            |  25 +++
 src/include/catalog/pg_class.h                |   5 +-
 src/include/utils/rel.h                       |   4 +
 .../recovery/t/121_vacuum_xid_horizons.pl     | 211 ++++++++++++++++++
 6 files changed, 360 insertions(+), 2 deletions(-)
 create mode 100644 src/test/recovery/t/121_vacuum_xid_horizons.pl

diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index fd6537567ea..2e3d051eaa6 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -917,6 +917,7 @@ InsertPgClassTuple(Relation pg_class_desc,
 	Datum		values[Natts_pg_class];
 	bool		nulls[Natts_pg_class];
 	HeapTuple	tup;
+	TransactionId cxid = InvalidTransactionId;
 
 	/* This is a tad tedious, but way cleaner than what we used to do... */
 	memset(values, 0, sizeof(values));
@@ -953,6 +954,14 @@ InsertPgClassTuple(Relation pg_class_desc,
 	values[Anum_pg_class_relrewrite - 1] = ObjectIdGetDatum(rd_rel->relrewrite);
 	values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid);
 	values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid);
+	if (!rd_rel->relisshared && !IsBootstrapProcessingMode() && !RecoveryInProgress())
+	{
+		(void)GetCurrentTransactionId();
+		cxid = GetTopTransactionIdIfAny();
+	}
+
+	values[Anum_pg_class_relminxid - 1] = TransactionIdGetDatum(cxid);
+
 	if (relacl != (Datum) 0)
 		values[Anum_pg_class_relacl - 1] = relacl;
 	else
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 200f72c6e25..31852e59a88 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1938,6 +1938,107 @@ GlobalVisHorizonKindForRel(Relation rel)
 		return VISHORIZON_TEMP;
 }
 
+static TransactionId
+ComputeDataHorizonForRelation(TransactionId create_xid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	bool in_recovery = RecoveryInProgress();
+	TransactionId kaxmin = InvalidTransactionId;
+	FullTransactionId latest_completed = TransamVariables->latestCompletedXid;
+
+	TransactionId initial = XidFromFullTransactionId(latest_completed);
+	TransactionId rel_data_oldest_nonremovable = initial;
+
+	Assert(TransactionIdIsValid(initial));
+	TransactionIdAdvance(initial);
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	/*
+	 * Take replication slot xmin into account globally; slots always limit
+	 * how far horizons can advance.
+	 */
+	TransactionId slot_xmin = procArray->replication_slot_xmin;
+
+	for (int i = 0; i < arrayP->numProcs; i++)
+	{
+		int pgprocno = arrayP->pgprocnos[i];
+		PGPROC *proc = &allProcs[pgprocno];
+		int8 flags = ProcGlobal->statusFlags[i];
+		TransactionId xid;
+		TransactionId xmin;
+		TransactionId eff;
+
+		/* As in ComputeXidHorizons: skip VACUUM and logical decoding. */
+		if (flags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+			continue;
+
+		/*
+		 * Only consider backends in the current database (as usual for
+		 * the data horizon).
+		 */
+		if (!(proc->databaseId == MyDatabaseId ||
+			  MyDatabaseId == InvalidOid || /* starting backend */
+			  (flags & PROC_AFFECTS_ALL_HORIZONS) ||
+			  in_recovery))
+			continue;
+
+		xid = pg_atomic_read_u64(&ProcGlobal->xids[i]);
+		xmin = pg_atomic_read_u64(&proc->xmin);
+
+		/*
+		 * As in the generic horizon computation, use the older of
+		 * xid/xmin.
+		 */
+		eff = TransactionIdOlder(xmin, xid);
+
+		/* If neither xid nor xmin is valid, this backend does not matter. */
+		if (!TransactionIdIsValid(eff))
+			continue;
+
+		/*
+		 * Filter by relation creation xid:
+		 *
+		 * If the transaction started before create_xid (xid < create_xid)
+		 * and either it has no snapshot yet (xmin invalid), or its
+		 * snapshot also predates the relation's creation (xmin <
+		 * create_xid), then this backend cannot see any tuples in the
+		 * relation and can be ignored for this horizon.
+		 */
+		if (TransactionIdIsValid(create_xid))
+		{
+			bool started_before_create = TransactionIdPrecedes(xid, create_xid);
+			bool snapshot_before_create;
+
+			snapshot_before_create =
+				(!TransactionIdIsValid(xmin)) ||
+				TransactionIdPrecedes(xmin, create_xid);
+
+			if (started_before_create && snapshot_before_create)
+				continue; /* cannot require preserving tuples in this relation */
+		}
+
+		/* Otherwise, include its effective xid/xmin in the MIN() computation. */
+		rel_data_oldest_nonremovable =
+			TransactionIdOlder(rel_data_oldest_nonremovable, eff);
+	}
+
+	if (in_recovery)
+		kaxmin = KnownAssignedXidsGetOldestXmin();
+
+	LWLockRelease(ProcArrayLock);
+
+	if (in_recovery)
+		rel_data_oldest_nonremovable =
+			TransactionIdOlder(rel_data_oldest_nonremovable, kaxmin);
+
+	/* Replication slots still limit the horizon. */
+	rel_data_oldest_nonremovable =
+		TransactionIdOlder(rel_data_oldest_nonremovable, slot_xmin);
+
+	return rel_data_oldest_nonremovable;
+}
+
 /*
  * Return the oldest XID for which deleted tuples must be preserved in the
  * passed table.
@@ -1963,7 +2064,12 @@ GetOldestNonRemovableTransactionId(Relation rel)
 		case VISHORIZON_CATALOG:
 			return horizons.catalog_oldest_nonremovable;
 		case VISHORIZON_DATA:
-			return horizons.data_oldest_nonremovable;
+			TransactionId create_xid = RelationGetCreationXid(rel);
+
+			if (TransactionIdIsValid(create_xid))
+				return ComputeDataHorizonForRelation(create_xid);
+			else
+				return horizons.data_oldest_nonremovable;
 		case VISHORIZON_TEMP:
 			return horizons.temp_oldest_nonremovable;
 	}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 915d0bc9084..a67498d6585 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1316,6 +1316,9 @@ retry:
 	/* It's fully valid */
 	relation->rd_isvalid = true;
 
+	relation->rd_creation_xid = InvalidTransactionId;
+	relation->rd_creation_xid_valid = false;
+
 #ifdef MAYBE_RECOVER_RELATION_BUILD_MEMORY
 	if (tmpcxt)
 	{
@@ -6993,3 +6996,25 @@ ResOwnerReleaseRelation(Datum res)
 
 	RelationCloseCleanup((Relation) DatumGetPointer(res));
 }
+
+TransactionId
+RelationGetCreationXid(Relation rel)
+{
+	if (rel == NULL)
+		return InvalidTransactionId;
+
+	if (rel->rd_creation_xid_valid)
+		return rel->rd_creation_xid;
+
+	if (rel->rd_rel)
+	{
+		TransactionId x = rel->rd_rel->relminxid;
+		if (TransactionIdIsNormal(x))
+		{
+			rel->rd_creation_xid = x;
+			rel->rd_creation_xid_valid = true;
+			return x;
+		}
+	}
+	return InvalidTransactionId;
+}
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index 07d182da796..6e2408fda71 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -131,6 +131,9 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
 	/* all multixacts in this rel are >= this; it is really a MultiXactId */
 	TransactionId relminmxid BKI_DEFAULT(1);	/* FirstMultiXactId */
 
+	/* Birth XID of the relation. 0 = InvalidTransactionId = unknown. */
+	TransactionId relminxid BKI_DEFAULT(0);
+
 #ifdef CATALOG_VARLEN			/* variable-length fields start here */
 	/* NOTE: These fields are not present in a relcache entry's rd_rel field. */
 	/* access permissions */
@@ -146,7 +149,7 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
 
 /* Size of fixed part of pg_class tuples, not counting var-length fields */
 #define CLASS_TUPLE_SIZE \
-	 (offsetof(FormData_pg_class,relminmxid) + sizeof(TransactionId))
+	 (offsetof(FormData_pg_class,relminxid) + sizeof(TransactionId))
 
 /* ----------------
  *		Form_pg_class corresponds to a pointer to a tuple with
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 80286076a11..fb4eafeea93 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -108,6 +108,9 @@ typedef struct RelationData
 													 * any value */
 	SubTransactionId rd_droppedSubid;	/* dropped with another Subid set */
 
+	TransactionId rd_creation_xid;
+	bool rd_creation_xid_valid;
+
 	Form_pg_class rd_rel;		/* RELATION tuple */
 	TupleDesc	rd_att;			/* tuple descriptor */
 	Oid			rd_id;			/* relation's object id */
@@ -718,4 +721,5 @@ RelationCloseSmgr(Relation relation)
 extern void RelationIncrementReferenceCount(Relation rel);
 extern void RelationDecrementReferenceCount(Relation rel);
 
+extern TransactionId RelationGetCreationXid(Relation rel);
 #endif							/* REL_H */
diff --git a/src/test/recovery/t/121_vacuum_xid_horizons.pl b/src/test/recovery/t/121_vacuum_xid_horizons.pl
new file mode 100644
index 00000000000..58bd45d8a1a
--- /dev/null
+++ b/src/test/recovery/t/121_vacuum_xid_horizons.pl
@@ -0,0 +1,211 @@
+use strict;
+use warnings;
+use Test::More;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+# Start a minimal cluster to exercise VACUUM and XID horizon behavior.
+my $node = PostgreSQL::Test::Cluster->new('relhorizon');
+$node->init;
+
+# Keep autovacuum out of the way and force VACUUM VERBOSE messages to be in
+# English and to go to stderr.
+$node->append_conf('postgresql.conf', q{
+  client_min_messages = info
+  autovacuum = off
+  lc_messages = 'C'
+});
+$node->start;
+
+# Run VACUUM (VERBOSE) and return all of its output.
+# VACUUM VERBOSE writes to stderr, so collect both stdout and stderr.
+sub vacuum_verbose {
+    my ($relname) = @_;
+    my ($out, $err) = ('', '');
+    $node->psql('postgres', "VACUUM (VERBOSE) $relname;", stdout => \$out, stderr => \$err);
+    return $out . $err;
+}
+
+# Helper regexp: either VACUUM truncated some table or removed some tuples.
+# We know which relation we vacuumed from the test context, so we do not insist
+# on a specific table name in the message.
+my $vac_removed_or_truncated_re =
+  qr/(table "[^"]+": truncated \d+ to \d+ pages|tuples:\s+\d+\s+removed)/;
+
+# Helper regexp: VACUUM reports dead tuples that are "not yet removable".
+my $vac_not_yet_removable_re =
+  qr/\b[1-9]\d*\s+are dead but not yet removable/;
+
+# One "old" session kept open to emulate a long-running transaction that
+# influences the global XID horizon.
+my $sess_old = $node->background_psql('postgres');
+
+# ============================================================
+# T1: relation created after the long-running transaction;
+# VACUUM on such a relation should not be constrained by it
+# ============================================================
+$sess_old->query('BEGIN;');
+$sess_old->query('SELECT txid_current();');    # ensure the session has an XID
+
+$node->safe_psql('postgres', q{
+  DROP TABLE IF EXISTS t1;
+  CREATE TABLE t1(id int);
+  INSERT INTO t1 SELECT generate_series(1,1000);
+  DELETE FROM t1;
+});
+
+my $vac_t1 = vacuum_verbose('t1');
+
+# VACUUM should be able to either truncate the relation or remove dead tuples.
+like(
+    $vac_t1,
+    $vac_removed_or_truncated_re,
+    'T1: VACUUM on relation created after old transaction can remove dead tuples'
+);
+
+# There must be no "are dead but not yet removable" due to the old transaction.
+unlike(
+    $vac_t1,
+    $vac_not_yet_removable_re,
+    'T1: old long-running transaction does not retain dead tuples in new relation'
+);
+
+# Statistics should also report no remaining dead tuples.
+my $dead_t1 = $node->safe_psql('postgres', q{
+  SELECT n_dead_tup FROM pg_stat_all_tables WHERE relname = 't1';
+});
+chomp $dead_t1;
+is($dead_t1, '0', 'T1: pg_stat_all_tables reports zero dead tuples');
+
+# ============================================================
+# T2: REPEATABLE READ transaction started after relation creation
+# must retain dead tuples until it commits
+# ============================================================
+
+# Create the test relation.
+$node->safe_psql('postgres', q{
+  DROP TABLE IF EXISTS t2;
+  CREATE TABLE t2(id int);
+  INSERT INTO t2 SELECT generate_series(1,5);
+});
+
+# Start a separate REPEATABLE READ transaction and take a snapshot.
+my $sess_rr = $node->background_psql('postgres');
+$sess_rr->query('BEGIN ISOLATION LEVEL REPEATABLE READ;');
+$sess_rr->query('SELECT * FROM t2;');  # take a snapshot
+
+# Delete rows in the main session.
+$node->safe_psql('postgres', q{
+  DELETE FROM t2;
+});
+
+# While the REPEATABLE READ transaction is open, VACUUM should report
+# dead tuples as "not yet removable".
+my $vac_t2_hold = vacuum_verbose('t2');
+like(
+    $vac_t2_hold,
+    $vac_not_yet_removable_re,
+    'T2: REPEATABLE READ snapshot keeps dead tuples while transaction is open'
+);
+
+# Once the REPEATABLE READ transaction commits, VACUUM can remove them.
+$sess_rr->query('COMMIT;');
+
+my $vac_t2_after = vacuum_verbose('t2');
+like(
+    $vac_t2_after,
+    $vac_removed_or_truncated_re,
+    'T2: after REPEATABLE READ commit VACUUM can remove dead tuples'
+);
+
+# ============================================================
+# T3: VACUUM FULL rewrite must preserve the "birth XID" so that
+# subsequent VACUUM can still remove dead tuples correctly
+# ============================================================
+
+$node->safe_psql('postgres', q{
+  DROP TABLE IF EXISTS t3;
+  CREATE TABLE t3(id int);
+  INSERT INTO t3 SELECT generate_series(1,10);
+  DELETE FROM t3;
+});
+
+my $vac_t3_first = vacuum_verbose('t3');
+like(
+    $vac_t3_first,
+    $vac_removed_or_truncated_re,
+    'T3: initial VACUUM on rewritten candidate relation works'
+);
+
+# Rewrite the relation with VACUUM FULL, then create and delete tuples again.
+$node->safe_psql('postgres', q{
+  VACUUM FULL t3;
+  INSERT INTO t3 SELECT generate_series(11,20);
+  DELETE FROM t3;
+});
+
+my $vac_t3_second = vacuum_verbose('t3');
+like(
+    $vac_t3_second,
+    $vac_removed_or_truncated_re,
+    'T3: VACUUM after VACUUM FULL rewrite still removes dead tuples'
+);
+
+# ============================================================
+# T4: partitioned table — a partition created before the long-running
+# transaction must be held; a newer partition must not
+# ============================================================
+
+$node->safe_psql('postgres', q{
+  DROP TABLE IF EXISTS p_parent CASCADE;
+  CREATE TABLE p_parent(id int, payload text) PARTITION BY RANGE (id);
+  CREATE TABLE p_child1 PARTITION OF p_parent FOR VALUES FROM (1) TO (1000);
+});
+
+# Restart the long-running transaction after the first partition exists,
+# so that p_child1 is older than its XID.
+$sess_old->query('COMMIT;');
+$sess_old->query('BEGIN;');
+$sess_old->query('SELECT txid_current();');
+
+# Create a newer partition and populate/delete tuples in both partitions.
+$node->safe_psql('postgres', q{
+  CREATE TABLE p_child2 PARTITION OF p_parent FOR VALUES FROM (1000) TO (2000);
+  INSERT INTO p_child1 SELECT generate_series(1,100), repeat('x',10);
+  DELETE FROM p_child1;
+  INSERT INTO p_child2 SELECT generate_series(1000,1100), repeat('x',10);
+  DELETE FROM p_child2;
+});
+
+# The partition created after the long-running transaction should be fully cleaned.
+my $vac_p2 = vacuum_verbose('p_child2');
+like(
+    $vac_p2,
+    $vac_removed_or_truncated_re,
+    'T4: partition created after old transaction can be vacuumed fully'
+);
+
+# The older partition should still retain dead tuples while the transaction is open.
+my $vac_p1_held = vacuum_verbose('p_child1');
+like(
+    $vac_p1_held,
+    $vac_not_yet_removable_re,
+    'T4: partition created before old transaction retains dead tuples'
+);
+
+# After the long-running transaction commits, VACUUM can remove tuples
+# from the older partition as well.
+$sess_old->query('COMMIT;');
+
+my $vac_p1_after = vacuum_verbose('p_child1');
+like(
+    $vac_p1_after,
+    $vac_removed_or_truncated_re,
+    'T4: after old transaction commits older partition can be vacuumed fully'
+);
+
+# Cleanup.
+$sess_rr->quit if $sess_rr;
+$sess_old->quit;
+$node->stop;
+done_testing();
-- 
2.34.1

