Hi,

On 2022-02-22 01:11:21 -0800, Andres Freund wrote:
> I've started to work on a few debugging aids to find problem like
> these. Attached are two WIP patches:

Forgot to attach. Also importantly includes a tap test for several of these
issues

Greetings,

Andres Freund
>From 0bc64874f8e5faae9a38731a83aa7b001095cc35 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 21 Feb 2022 15:44:02 -0800
Subject: [PATCH v1 1/4] WIP: test for file reuse dangers around database and
 tablespace commands.

---
 src/test/recovery/t/029_relfilenode_reuse.pl | 233 +++++++++++++++++++
 1 file changed, 233 insertions(+)
 create mode 100644 src/test/recovery/t/029_relfilenode_reuse.pl

diff --git a/src/test/recovery/t/029_relfilenode_reuse.pl b/src/test/recovery/t/029_relfilenode_reuse.pl
new file mode 100644
index 00000000000..22d8e85614c
--- /dev/null
+++ b/src/test/recovery/t/029_relfilenode_reuse.pl
@@ -0,0 +1,233 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::Basename;
+
+
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf('postgresql.conf', q[
+allow_in_place_tablespaces = true
+log_connections=on
+# to avoid "repairing" corruption
+full_page_writes=off
+log_min_messages=debug2
+autovacuum_naptime=1s
+shared_buffers=1MB
+]);
+$node_primary->start;
+
+
+# Create streaming standby linking to primary
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+$node_standby->start;
+
+# To avoid hanging while expecting some specific input from a psql
+# instance being driven by us, add a timeout high enough that it
+# should never trigger even on very slow machines, unless something
+# is really wrong.
+my $psql_timeout = IPC::Run::timer(300);
+
+my %psql_primary = (stdin => '', stdout => '', stderr => '');
+$psql_primary{run} = IPC::Run::start(
+	[ 'psql', '-XA', '-f', '-', '-d', $node_primary->connstr('postgres') ],
+	'<',
+	\$psql_primary{stdin},
+	'>',
+	\$psql_primary{stdout},
+	'2>',
+	\$psql_primary{stderr},
+	$psql_timeout);
+
+my %psql_standby = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby{run} = IPC::Run::start(
+	[ 'psql', '-XA', '-f', '-', '-d', $node_standby->connstr('postgres') ],
+	'<',
+	\$psql_standby{stdin},
+	'>',
+	\$psql_standby{stdout},
+	'2>',
+	\$psql_standby{stderr},
+	$psql_timeout);
+
+
+# Create template database with a table that we'll update, to trigger dirty
+# rows. Using a template database + preexisting rows makes it a bit easier to
+# reproduce, because there's no cache invalidations generated.
+
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db_template OID = 50000;");
+$node_primary->safe_psql('conflict_db_template', q[
+    CREATE TABLE large(id serial primary key, dataa text, datab text);
+    INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]);
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
+
+$node_primary->safe_psql('postgres', q[
+    CREATE EXTENSION pg_prewarm;
+    CREATE TABLE replace_sb(data text);
+    INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]);
+
+# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files
+send_query_and_wait(
+	\%psql_primary,
+	q[BEGIN;],
+	qr/BEGIN/m);
+send_query_and_wait(
+	\%psql_standby,
+	q[BEGIN;],
+	qr/BEGIN/m);
+
+# Cause lots of dirty rows in shared_buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 1;");
+
+# Now do a bunch of work in another database. That will end up needing to
+# write back dirty data from the previous step, opening the relevant file
+# descriptors
+cause_eviction(\%psql_primary, \%psql_standby);
+
+# drop and recreate database
+$node_primary->safe_psql('postgres', "DROP DATABASE conflict_db;");
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
+
+verify($node_primary, $node_standby, 1,
+	   "initial contents as expected");
+
+# Again cause lots of dirty rows in shared_buffers, but use a different update
+# value so we can check everything is OK
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 2;");
+
+# Again cause a lot of IO. That'll again write back dirty data, but uses (XXX
+# adjust after bugfix) the already opened file descriptor.
+# FIXME
+cause_eviction(\%psql_primary, \%psql_standby);
+
+verify($node_primary, $node_standby, 2,
+	   "update to reused relfilenode (due to DB oid conflict) is not lost");
+
+
+$node_primary->safe_psql('conflict_db', "VACUUM FULL large;");
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 3;");
+
+verify($node_primary, $node_standby, 3,
+	   "restored contents as expected");
+
+# Test for old filehandles after moving a database in / out of tablespace
+$node_primary->safe_psql('postgres', q[CREATE TABLESPACE test_tablespace LOCATION '']);
+
+# cause dirty buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 4;");
+# cause files to be opened in backend in other database
+cause_eviction(\%psql_primary, \%psql_standby);
+
+# move database back / forth
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE pg_default');
+
+# cause dirty buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 5;");
+cause_eviction(\%psql_primary, \%psql_standby);
+
+verify($node_primary, $node_standby, 5,
+	   "post move contents as expected");
+
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
+
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 7;");
+cause_eviction(\%psql_primary, \%psql_standby);
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 8;");
+$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db');
+$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace');
+
+$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database');
+
+
+# explicitly shut down psql instances gracefully - to avoid hangs
+# or worse on windows
+$psql_primary{stdin} .= "\\q\n";
+$psql_primary{run}->finish;
+$psql_standby{stdin} .= "\\q\n";
+$psql_standby{run}->finish;
+
+$node_primary->stop();
+$node_standby->stop();
+
+# Make sure that there weren't crashes during shutdown
+
+command_like([ 'pg_controldata', $node_primary->data_dir ],
+	qr/Database cluster state:\s+shut down\n/, 'primary shut down ok');
+command_like([ 'pg_controldata', $node_standby->data_dir ],
+	qr/Database cluster state:\s+shut down in recovery\n/, 'standby shut down ok');
+done_testing();
+
+sub verify
+{
+	my ($primary, $standby, $counter, $message) = @_;
+
+	my $query = "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10";
+	is($primary->safe_psql('conflict_db', $query),
+	   "$counter|4000",
+	   "primary: $message");
+
+	$primary->wait_for_catchup($standby);
+	is($standby->safe_psql('conflict_db', $query),
+	   "$counter|4000",
+	   "standby: $message");
+}
+
+sub cause_eviction
+{
+	my ($psql_primary, $psql_standby) = @_;
+
+	send_query_and_wait(
+		$psql_primary,
+		q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
+		qr/warmed_buffers/m);
+
+	send_query_and_wait(
+		$psql_standby,
+		q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
+		qr/warmed_buffers/m);
+}
+
+# Send query, wait until string matches
+sub send_query_and_wait
+{
+	my ($psql, $query, $untl) = @_;
+	my $ret;
+
+	# send query
+	$$psql{stdin} .= $query;
+	$$psql{stdin} .= "\n";
+
+	# wait for query results
+	$$psql{run}->pump_nb();
+	while (1)
+	{
+		last if $$psql{stdout} =~ /$untl/;
+
+		if ($psql_timeout->is_expired)
+		{
+			BAIL_OUT("aborting wait: program timed out\n"
+				  . "stream contents: >>$$psql{stdout}<<\n"
+				  . "pattern searched for: $untl\n");
+			return 0;
+		}
+		if (not $$psql{run}->pumpable())
+		{
+			BAIL_OUT("aborting wait: program died\n"
+				  . "stream contents: >>$$psql{stdout}<<\n"
+				  . "pattern searched for: $untl\n");
+			return 0;
+		}
+		$$psql{run}->pump();
+	}
+
+	$$psql{stdout} = '';
+
+	return 1;
+}
-- 
2.34.0

>From a416c09f33c429aea0fd2b3cd1fa03878c520d8d Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 21 Feb 2022 15:41:23 -0800
Subject: [PATCH v1 2/4] WIP: AssertFileNotDeleted(fd).

Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/fd.h              |  1 +
 src/backend/access/transam/slru.c     |  2 +
 src/backend/access/transam/xlog.c     |  2 +
 src/backend/replication/walreceiver.c |  2 +
 src/backend/storage/file/fd.c         | 83 +++++++++++++++++++++++++++
 5 files changed, 90 insertions(+)

diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 29209e27243..3bb8f669b64 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -191,6 +191,7 @@ extern int	durable_rename_excl(const char *oldfile, const char *newfile, int log
 extern void SyncDataDirectory(void);
 extern int	data_sync_elevel(int elevel);
 
+extern void AssertFileNotDeleted(int fd);
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
 #define PG_TEMP_FILE_PREFIX "pgsql_tmp"
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 30a476ed5dc..e4907ea5356 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -871,6 +871,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
 		}
 	}
 
+	AssertFileNotDeleted(fd);
+
 	errno = 0;
 	pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
 	if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a3576..e1225bc0627 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2198,6 +2198,8 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 				if (track_wal_io_timing)
 					INSTR_TIME_SET_CURRENT(start);
 
+				AssertFileNotDeleted(openLogFile);
+
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
 				written = pg_pwrite(openLogFile, from, nleft, startoffset);
 				pgstat_report_wait_end();
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index ceaff097b97..23bf982b545 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -912,6 +912,8 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli)
 		else
 			segbytes = nbytes;
 
+		AssertFileNotDeleted(recvFile);
+
 		/* OK to write the logs */
 		errno = 0;
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 14b77f28617..123815e4a80 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -93,6 +93,7 @@
 #include "common/file_perm.h"
 #include "common/file_utils.h"
 #include "common/pg_prng.h"
+#include "common/string.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/pg_iovec.h"
@@ -2073,6 +2074,8 @@ FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
 	if (returnCode < 0)
 		return returnCode;
 
+	AssertFileNotDeleted(VfdCache[file].fd);
+
 	pgstat_report_wait_start(wait_event_info);
 	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
 							   POSIX_FADV_WILLNEED);
@@ -2103,6 +2106,11 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 	if (returnCode < 0)
 		return;
 
+	/*
+	 * XXX: can't assert non-use of fd right now,
+	 * ScheduleBufferTagForWriteback can end up writing at a later time.
+	 */
+
 	pgstat_report_wait_start(wait_event_info);
 	pg_flush_data(VfdCache[file].fd, offset, nbytes);
 	pgstat_report_wait_end();
@@ -2128,6 +2136,8 @@ FileRead(File file, char *buffer, int amount, off_t offset,
 
 	vfdP = &VfdCache[file];
 
+	AssertFileNotDeleted(vfdP->fd);
+
 retry:
 	pgstat_report_wait_start(wait_event_info);
 	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
@@ -2184,6 +2194,8 @@ FileWrite(File file, char *buffer, int amount, off_t offset,
 
 	vfdP = &VfdCache[file];
 
+	AssertFileNotDeleted(vfdP->fd);
+
 	/*
 	 * If enforcing temp_file_limit and it's a temp file, check to see if the
 	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
@@ -2276,6 +2288,8 @@ FileSync(File file, uint32 wait_event_info)
 	if (returnCode < 0)
 		return returnCode;
 
+	AssertFileNotDeleted(VfdCache[file].fd);
+
 	pgstat_report_wait_start(wait_event_info);
 	returnCode = pg_fsync(VfdCache[file].fd);
 	pgstat_report_wait_end();
@@ -2297,6 +2311,8 @@ FileSize(File file)
 			return (off_t) -1;
 	}
 
+	AssertFileNotDeleted(VfdCache[file].fd);
+
 	return lseek(VfdCache[file].fd, 0, SEEK_END);
 }
 
@@ -2314,6 +2330,8 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info)
 	if (returnCode < 0)
 		return returnCode;
 
+	AssertFileNotDeleted(VfdCache[file].fd);
+
 	pgstat_report_wait_start(wait_event_info);
 	returnCode = ftruncate(VfdCache[file].fd, offset);
 	pgstat_report_wait_end();
@@ -3828,6 +3846,71 @@ data_sync_elevel(int elevel)
 	return data_sync_retry ? elevel : PANIC;
 }
 
+void
+AssertFileNotDeleted(int fd)
+{
+	struct stat statbuf;
+	int			ret;
+	char		deleted_filename[MAXPGPATH];
+	bool		have_filename = false;
+
+	/*
+	 * fstat shouldn't fail, so it seems ok to error out, even if it's
+	 * just a debugging aid.
+	 *
+	 * XXX: Figure out which operating systems this works on.
+	 */
+	ret = fstat(fd, &statbuf);
+	if (ret != 0)
+		elog(ERROR, "fstat failed: %m");
+
+	/*
+	 * On several operating systems st_nlink == 0 indicates that the file has
+	 * been deleted. On some OS/filesystem combinations a deleted file may
+	 * still show up with nlink > 0, but nlink == 0 shouldn't be returned
+	 * spuriously. Hardlinks obviously can prevent this from working, but we
+	 * don't expect any, so that's fine.
+	 */
+	if (statbuf.st_nlink > 0)
+		return;
+
+#if defined(__linux__)
+	{
+		char        path[MAXPGPATH];
+		const char *const deleted_suffix = " (deleted)";
+
+		/*
+		 * On linux we can figure out what the file name
+		 */
+		sprintf(path, "/proc/self/fd/%d", fd);
+		ret = readlink(path, deleted_filename, sizeof(deleted_filename) - 1);
+
+		// FIXME: Tolerate most errors here
+		if (ret == -1)
+			elog(PANIC, "readlink failed: %m");
+
+		/* readlink doesn't null terminate */
+		deleted_filename[ret] = 0;
+		have_filename = true;
+
+		/* chop off the " (deleted)" */
+		if (pg_str_endswith(deleted_filename, deleted_suffix))
+		{
+			Size		len = strlen(deleted_filename);
+
+			deleted_filename[len - strlen(deleted_suffix)] = 0;
+		}
+	}
+#endif
+
+	if (have_filename)
+		elog(PANIC, "file descriptor %d for file %s is of a deleted file",
+			 fd, deleted_filename);
+	else
+		elog(PANIC, "file descriptor %d is of a deleted file",
+			 fd);
+}
+
 /*
  * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
  * error is returned, it is unspecified how much has been written.
-- 
2.34.0

>From d4aeb3e17d7fb053af0ca94f5f59ef1ea0c34f98 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 21 Feb 2022 16:42:16 -0800
Subject: [PATCH v1 3/4] WIP: fix old-fd issues using global barriers
 everywhere.

---
 src/include/pg_config_manual.h      |  9 ---------
 src/include/storage/smgr.h          |  1 +
 src/backend/commands/tablespace.c   | 10 +++++-----
 src/backend/storage/buffer/bufmgr.c | 11 ++++++++---
 src/backend/storage/smgr/smgr.c     | 20 ++++++++++++++++++++
 5 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 84ce5a4a5d7..1eb39884144 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -152,16 +152,7 @@
 #define EXEC_BACKEND
 #endif
 
-/*
- * If USE_BARRIER_SMGRRELEASE is defined, certain code paths that unlink
- * directories will ask other backends to close all smgr file descriptors.
- * This is enabled on Windows, because otherwise unlinked but still open files
- * can prevent rmdir(containing_directory) from succeeding.  On other
- * platforms, it can be defined to exercise those code paths.
- */
-#if defined(WIN32)
 #define USE_BARRIER_SMGRRELEASE
-#endif
 
 /*
  * Define this if your operating system supports link()
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 8e3ef92cda1..d774fc98cf8 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -79,6 +79,7 @@ typedef SMgrRelationData *SMgrRelation;
 
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
+extern SMgrRelation smgropen_cond(RelFileNode rnode, BackendId backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
 extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln);
 extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln);
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 40514ab550f..0c433360b63 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -1574,6 +1574,11 @@ tblspc_redo(XLogReaderState *record)
 	{
 		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
 
+#if defined(USE_BARRIER_SMGRRELEASE)
+		/* Close all smgr fds in all backends. */
+		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
+#endif
+
 		/*
 		 * If we issued a WAL record for a drop tablespace it implies that
 		 * there were no files in it at all when the DROP was done. That means
@@ -1591,11 +1596,6 @@ tblspc_redo(XLogReaderState *record)
 		 */
 		if (!destroy_tablespace_directories(xlrec->ts_id, true))
 		{
-#if defined(USE_BARRIER_SMGRRELEASE)
-			/* Close all smgr fds in all backends. */
-			WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
-#endif
-
 			ResolveRecoveryConflictWithTablespace(xlrec->ts_id);
 
 			/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f5459c68f89..e9fd512df7c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4819,9 +4819,14 @@ IssuePendingWritebacks(WritebackContext *context)
 
 		i += ahead;
 
-		/* and finally tell the kernel to write the data to storage */
-		reln = smgropen(tag.rnode, InvalidBackendId);
-		smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
+		/*
+		 * Finally tell the kernel to write the data to storage. Don't smgr if
+		 * previously closed, otherwise we could end up evading fd-reuse
+		 * protection.
+		 */
+		reln = smgropen_cond(tag.rnode, InvalidBackendId);
+		if (reln)
+			smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
 	}
 
 	context->nr_pending = 0;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d71a557a352..efa83ae3942 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -190,6 +190,26 @@ smgropen(RelFileNode rnode, BackendId backend)
 	return reln;
 }
 
+/*
+ * Like smgropen(), but returns NULL if relation is not already open.
+ */
+SMgrRelation
+smgropen_cond(RelFileNode rnode, BackendId backend)
+{
+	RelFileNodeBackend brnode;
+
+	if (SMgrRelationHash == NULL)
+		return NULL;
+
+	/* Look up or create an entry */
+	brnode.node = rnode;
+	brnode.backend = backend;
+
+	return (SMgrRelation) hash_search(SMgrRelationHash,
+									  (void *) &brnode,
+									  HASH_FIND, NULL);
+}
+
 /*
  * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
  *
-- 
2.34.0

>From 85eb22f6d290765c8b7b931f4e968b4808ff33a3 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 21 Feb 2022 16:47:26 -0800
Subject: [PATCH v1 4/4] WIP: Add linux-only AssertNoDeletedFilesOpenPid().

Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/fd.h            |  3 ++
 src/backend/commands/dbcommands.c   |  6 ++++
 src/backend/storage/file/fd.c       | 52 +++++++++++++++++++++++++++++
 src/backend/storage/ipc/procarray.c | 22 ++++++++++++
 4 files changed, 83 insertions(+)

diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 3bb8f669b64..d12482be90d 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -192,6 +192,9 @@ extern void SyncDataDirectory(void);
 extern int	data_sync_elevel(int elevel);
 
 extern void AssertFileNotDeleted(int fd);
+extern void AssertNoDeletedFilesOpen(void);
+extern void AssertNoDeletedFilesOpenPid(int pid);
+
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
 #define PG_TEMP_FILE_PREFIX "pgsql_tmp"
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index c37e3c9a9a4..2b9ccddbde7 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1069,6 +1069,8 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
 #endif
 
+	AssertNoDeletedFilesOpen();
+
 	/*
 	 * Remove all tablespace subdirs belonging to the database.
 	 */
@@ -1322,6 +1324,8 @@ movedb(const char *dbname, const char *tblspcname)
 	WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
 #endif
 
+	AssertNoDeletedFilesOpen();
+
 	/*
 	 * Now drop all buffers holding data of the target database; they should
 	 * no longer be dirty so DropDatabaseBuffers is safe.
@@ -2453,6 +2457,8 @@ dbase_redo(XLogReaderState *record)
 		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
 #endif
 
+		AssertNoDeletedFilesOpen();
+
 		for (i = 0; i < xlrec->ntablespaces; i++)
 		{
 			dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_ids[i]);
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 123815e4a80..782412b7d32 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3846,6 +3846,58 @@ data_sync_elevel(int elevel)
 	return data_sync_retry ? elevel : PANIC;
 }
 
+void
+AssertNoDeletedFilesOpenPid(int pid)
+{
+#if defined(__linux__)
+	const char *const deleted_suffix = " (deleted)";
+	char		proc_dir[MAXPGPATH];
+	struct dirent *de;
+	DIR		   *dirdesc;
+
+	sprintf(proc_dir, "/proc/%d/fd", pid);
+
+	elog(LOG, "checking %s", proc_dir);
+
+	dirdesc = AllocateDir(proc_dir);
+	if (dirdesc == NULL)
+	{
+		elog(WARNING, "could not open %s", proc_dir);
+		return;
+	}
+
+	while ((de = ReadDir(dirdesc, proc_dir)) != NULL)
+	{
+		char        path[MAXPGPATH];
+		char        target[MAXPGPATH];
+		int			ret;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		sprintf(path, "/proc/%d/fd/%s", pid, de->d_name);
+
+		ret = readlink(path, target, sizeof(path) - 1);
+
+		// FIXME: Tolerate most errors here
+		if (ret == -1)
+			elog(PANIC, "readlink failed: %m");
+
+		/* readlink doesn't null terminate */
+		target[ret] = 0;
+
+		if (pg_str_endswith(target, deleted_suffix))
+		{
+			elog(PANIC, "pid %d has deleted file %s open in fd %s",
+				 pid, target, de->d_name);
+		}
+	}
+
+	FreeDir(dirdesc);
+#endif
+}
+
 void
 AssertFileNotDeleted(int fd)
 {
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 13d192ec2b4..e33d16f7588 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -58,6 +58,7 @@
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "storage/fd.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/spin.h"
@@ -3772,6 +3773,27 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
 	return true;				/* timed out, still conflicts */
 }
 
+void
+AssertNoDeletedFilesOpen(void)
+{
+#if defined(__linux__)
+	ProcArrayStruct *arrayP = procArray;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int index = 0; index < arrayP->numProcs; index++)
+	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc = &allProcs[pgprocno];
+
+		AssertNoDeletedFilesOpenPid(proc->pid);
+	}
+
+	LWLockRelease(ProcArrayLock);
+#endif
+}
+
+
 /*
  * Terminate existing connections to the specified database. This routine
  * is used by the DROP DATABASE command when user has asked to forcefully
-- 
2.34.0

Reply via email to