From ff0e75ea7ff0e6eb791e6d60333de2c45790b4af Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Mon, 11 Jul 2022 16:01:27 -0400
Subject: [PATCH v1] rough draft of removing relmap size restriction

---
 doc/src/sgml/monitoring.sgml            |  6 +-
 src/backend/utils/activity/wait_event.c |  3 +
 src/backend/utils/cache/relmapper.c     | 99 +++++++++++++++++--------
 src/include/utils/wait_event.h          |  1 +
 4 files changed, 77 insertions(+), 32 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 4549c2560e..2ba1c157ac 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1403,9 +1403,13 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry><literal>RelationMapRead</literal></entry>
       <entry>Waiting for a read of the relation map file.</entry>
      </row>
+     <row>
+      <entry><literal>RelationMapReplace</literal></entry>
+      <entry>Waiting for durable replacement of a relation map file.</entry>
+     </row>
      <row>
       <entry><literal>RelationMapSync</literal></entry>
-      <entry>Waiting for the relation map file to reach durable storage.</entry>
+      <entry>Waiting for a temporary relation map file to reach durable storage.</entry>
      </row>
      <row>
       <entry><literal>RelationMapWrite</literal></entry>
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 87c15b9c6f..9f3715fa31 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -630,6 +630,9 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_RELATION_MAP_READ:
 			event_name = "RelationMapRead";
 			break;
+		case WAIT_EVENT_RELATION_MAP_REPLACE:
+			event_name = "RelationMapReplace";
+			break;
 		case WAIT_EVENT_RELATION_MAP_SYNC:
 			event_name = "RelationMapSync";
 			break;
diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c
index 8e5595b468..977223fedf 100644
--- a/src/backend/utils/cache/relmapper.c
+++ b/src/backend/utils/cache/relmapper.c
@@ -71,6 +71,7 @@
  * worth the trouble given the intended size of the mapping sets.
  */
 #define RELMAPPER_FILENAME		"pg_filenode.map"
+#define RELMAPPER_TEMP_FILENAME	"pg_filenode.map.tmp"
 
 #define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */
 
@@ -877,6 +878,7 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
 {
 	int			fd;
 	char		mapfilename[MAXPGPATH];
+	char		maptempfilename[MAXPGPATH];
 
 	/*
 	 * Fill in the overhead fields and update CRC.
@@ -890,17 +892,62 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
 	FIN_CRC32C(newmap->crc);
 
 	/*
-	 * Open the target file.  We prefer to do this before entering the
-	 * critical section, so that an open() failure need not force PANIC.
+	 * Construct filenames -- a temporary file that we'll create to write the
+	 * data initially, and then the permanent name to which we will rename it.
 	 */
 	snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
 			 dbpath, RELMAPPER_FILENAME);
-	fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY);
+	snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
+			 dbpath, RELMAPPER_TEMP_FILENAME);
+
+	/*
+	 * Open a temporary file. If a file already exists with this name, it must
+	 * be left over from a previous crash, so we can overwrite it. Concurrent
+	 * calls to this function are not allowed.
+	 */
+	fd = OpenTransientFile(maptempfilename,
+						   O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m",
-						mapfilename)));
+						maptempfilename)));
+
+	/* Write new data to the file. */
+	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
+	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write file \"%s\": %m",
+						maptempfilename)));
+	}
+	pgstat_report_wait_end();
+
+	/*
+	 * Make sure it's durably on disk.
+	 *
+	 * We must fsync() the parent directory too, to make sure that the new
+	 * file can't vanish after a crash.
+	 */
+	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
+	if (pg_fsync(fd) != 0)
+		ereport(data_sync_elevel(ERROR),
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m",
+						maptempfilename)));
+	fsync_fname_ext(dbpath, true, false, ERROR);
+	pgstat_report_wait_end();
+
+	/* And close the file. */
+	if (CloseTransientFile(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m",
+						maptempfilename)));
 
 	if (write_wal)
 	{
@@ -924,40 +971,30 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
 		XLogFlush(lsn);
 	}
 
-	errno = 0;
-	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
-	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
-	{
-		/* if write didn't set errno, assume problem is no disk space */
-		if (errno == 0)
-			errno = ENOSPC;
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not write file \"%s\": %m",
-						mapfilename)));
-	}
-	pgstat_report_wait_end();
-
 	/*
-	 * We choose to fsync the data to disk before considering the task done.
-	 * It would be possible to relax this if it turns out to be a performance
-	 * issue, but it would complicate checkpointing --- see notes for
+	 * We could use durable_rename() here and skip the calls to pg_fsync()
+	 * and fsync_fname_ext() above, but by doing it this way, we minimize
+	 * the amount of work that must be done in the critical section. We first
+	 * rename the file, and then fsync it under the new name, and also the
+	 * containing directory.
+	 *
+	 * It is possible that we could avoid waiting for fsync() here as well,
+	 * but it would complicate checkpointing --- see notes for
 	 * CheckPointRelationMap.
+	 *
+	 * NB: Although we instruct fsync_fname_ext() to use ERROR, we will often
+	 * be in a critical section at this point; if so, ERROR will become PANIC.
 	 */
-	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
-	if (pg_fsync(fd) != 0)
+	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
+	if (rename(maptempfilename, mapfilename) < 0)
 		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
-				 errmsg("could not fsync file \"%s\": %m",
-						mapfilename)));
+				 errmsg("could not rename file \"%s\" to \"%s\": %m",
+						maptempfilename, mapfilename)));
+	fsync_fname_ext(mapfilename, false, false, ERROR);
+	fsync_fname_ext(dbpath, true, false, ERROR);
 	pgstat_report_wait_end();
 
-	if (CloseTransientFile(fd) != 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not close file \"%s\": %m",
-						mapfilename)));
-
 	/*
 	 * Now that the file is safely on disk, send sinval message to let other
 	 * backends know to re-read it.  We must do this inside the critical
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index b578e2ec75..f570c6b7f3 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -193,6 +193,7 @@ typedef enum
 	WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
 	WAIT_EVENT_LOGICAL_REWRITE_WRITE,
 	WAIT_EVENT_RELATION_MAP_READ,
+	WAIT_EVENT_RELATION_MAP_REPLACE,
 	WAIT_EVENT_RELATION_MAP_SYNC,
 	WAIT_EVENT_RELATION_MAP_WRITE,
 	WAIT_EVENT_REORDER_BUFFER_READ,
-- 
2.24.3 (Apple Git-128)

