From 0cedbc2e826af3f5917626f2ea52e4a7668049b5 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 14 Jul 2022 15:14:56 -0400
Subject: [PATCH 1/2] Allow TRUNCATE pg_largeobject in binary upgrade mode.

Because the goal here is to be able to change the relfilenode of
this system table and its index, we need to use the code path that
is normally used for transactional rewrites. However, because we
want to be able to use this code path even when the relfilenode
hasn't changed, we need to deviate from ordinary transactional
semantics by nuking the old storage in its entirety before creating
the new storage.
---
 src/backend/commands/tablecmds.c   | 11 ++++-
 src/backend/storage/smgr/md.c      | 27 +++++++++---
 src/backend/utils/cache/relcache.c | 66 +++++++++++++++++++++++++++---
 3 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index a2f577024a..35f9f6d8d9 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -40,6 +40,7 @@
 #include "catalog/pg_depend.h"
 #include "catalog/pg_foreign_table.h"
 #include "catalog/pg_inherits.h"
+#include "catalog/pg_largeobject.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_statistic_ext.h"
@@ -2185,7 +2186,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple)
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 				 errmsg("\"%s\" is not a table", relname)));
 
-	if (!allowSystemTableMods && IsSystemClass(relid, reltuple))
+	/*
+	 * Most system catalogs can't be truncated at all, or at least not unless
+	 * allow_system_table_mods=on. As an exception, however, we allow
+	 * pg_largeobject to be truncated as part of pg_upgrade, because we need
+	 * to change its relfilenode to match the old cluster, and allowing a
+	 * TRUNCATE command to be executed is the easiest way of doing that.
+	 */
+	if (!allowSystemTableMods && IsSystemClass(relid, reltuple)
+		&& (!IsBinaryUpgrade || relid != LargeObjectRelationId))
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("permission denied: \"%s\" is a system catalog",
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 3998296a62..3deac496ee 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -319,6 +319,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
 {
 	char	   *path;
 	int			ret;
+	BlockNumber segno = 0;
 
 	path = relpath(rlocator, forkNum);
 
@@ -353,8 +354,22 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
 		/* Prevent other backends' fds from holding on to the disk space */
 		ret = do_truncate(path);
 
-		/* Register request to unlink first segment later */
-		register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
+		/*
+		 * Except during a binary upgrade, register request to unlink first
+		 * segment later, rather than now.
+		 *
+		 * If we're performing a binary upgrade, the dangers described in the
+		 * header comments for mdunlink() do not exist, since after a crash
+		 * or even a simple ERROR, the upgrade fails and the whole new cluster
+		 * must be recreated from scratch. And, on the other hand, it is
+		 * important to remove the files from disk immediately, because we
+		 * may be about to reuse the same relfilenumber.
+		 */
+		if (!IsBinaryUpgrade)
+		{
+			register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
+			++segno;
+		}
 	}
 
 	/*
@@ -363,15 +378,17 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
 	if (ret >= 0)
 	{
 		char	   *segpath = (char *) palloc(strlen(path) + 12);
-		BlockNumber segno;
 
 		/*
 		 * Note that because we loop until getting ENOENT, we will correctly
 		 * remove all inactive segments as well as active ones.
 		 */
-		for (segno = 1;; segno++)
+		for (;; segno++)
 		{
-			sprintf(segpath, "%s.%u", path, segno);
+			if (segno == 0)
+				strcpy(segpath, path);
+			else
+				sprintf(segpath, "%s.%u", path, segno);
 
 			if (!RelFileLocatorBackendIsTemp(rlocator))
 			{
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index bdb771d278..5658e48777 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -41,6 +41,7 @@
 #include "access/tupdesc_details.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "catalog/binary_upgrade.h"
 #include "catalog/catalog.h"
 #include "catalog/indexing.h"
 #include "catalog/namespace.h"
@@ -3707,9 +3708,36 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
 	TransactionId freezeXid = InvalidTransactionId;
 	RelFileLocator newrlocator;
 
-	/* Allocate a new relfilenumber */
-	newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
-										   NULL, persistence);
+	if (!IsBinaryUpgrade)
+	{
+		/* Allocate a new relfilenumber */
+		newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
+											   NULL, persistence);
+	}
+	else if (relation->rd_rel->relkind == RELKIND_INDEX)
+	{
+		if (!OidIsValid(binary_upgrade_next_index_pg_class_relfilenumber))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("index relfilenumber value not set when in binary upgrade mode")));
+
+		newrelfilenumber = binary_upgrade_next_index_pg_class_relfilenumber;
+		binary_upgrade_next_index_pg_class_relfilenumber = InvalidOid;
+	}
+	else if (relation->rd_rel->relkind == RELKIND_RELATION)
+	{
+		if (!OidIsValid(binary_upgrade_next_heap_pg_class_relfilenumber))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("heap relfilenumber value not set when in binary upgrade mode")));
+
+		newrelfilenumber = binary_upgrade_next_heap_pg_class_relfilenumber;
+		binary_upgrade_next_heap_pg_class_relfilenumber = InvalidOid;
+	}
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("unexpected request for new relfilenumber in binary upgrade mode")));
 
 	/*
 	 * Get a writable copy of the pg_class tuple for the given relation.
@@ -3724,9 +3752,37 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
 	classform = (Form_pg_class) GETSTRUCT(tuple);
 
 	/*
-	 * Schedule unlinking of the old storage at transaction commit.
+	 * Schedule unlinking of the old storage at transaction commit, except
+	 * when performing a binary upgrade, when we must do it immediately.
 	 */
-	RelationDropStorage(relation);
+	if (IsBinaryUpgrade)
+	{
+		SMgrRelation	srel;
+
+		/*
+		 * During a binary upgrade, we use this code path to ensure that
+		 * pg_largeobject and its index have the same relfilenumbers as in
+		 * the old cluster. This is necessary because pg_upgrade treats
+		 * pg_largebject like a user table, not a system table. It is however
+		 * possible that a table or index may need to end up with the same
+		 * relfilenumber in the new cluster as what it had in the old cluster.
+		 * Hence, we can't wait until commit time to remove the old storage.
+		 *
+		 * In general, this function needs to have transactional semantics,
+		 * and removing the old storage before commit time surely isn't.
+		 * However, it doesn't really matter, because if a binary upgrade
+		 * fails at this stage, the new cluster will need to be recreated
+		 * anyway.
+		 */
+		srel = smgropen(relation->rd_locator, relation->rd_backend);
+		smgrdounlinkall(&srel, 1, false);
+		smgrclose(srel);
+	}
+	else
+	{
+		/* Not a binary upgrade, so just schedule it to happen later. */
+		RelationDropStorage(relation);
+	}
 
 	/*
 	 * Create storage for the main fork of the new relfilenumber.  If it's a
-- 
2.24.3 (Apple Git-128)

