Hi, On 2022-02-22 01:11:21 -0800, Andres Freund wrote: > I've started to work on a few debugging aids to find problem like > these. Attached are two WIP patches:
Forgot to attach. Also importantly includes a tap test for several of these issues Greetings, Andres Freund
>From 0bc64874f8e5faae9a38731a83aa7b001095cc35 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Mon, 21 Feb 2022 15:44:02 -0800 Subject: [PATCH v1 1/4] WIP: test for file reuse dangers around database and tablespace commands. --- src/test/recovery/t/029_relfilenode_reuse.pl | 233 +++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 src/test/recovery/t/029_relfilenode_reuse.pl diff --git a/src/test/recovery/t/029_relfilenode_reuse.pl b/src/test/recovery/t/029_relfilenode_reuse.pl new file mode 100644 index 00000000000..22d8e85614c --- /dev/null +++ b/src/test/recovery/t/029_relfilenode_reuse.pl @@ -0,0 +1,233 @@ +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Basename; + + +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf('postgresql.conf', q[ +allow_in_place_tablespaces = true +log_connections=on +# to avoid "repairing" corruption +full_page_writes=off +log_min_messages=debug2 +autovacuum_naptime=1s +shared_buffers=1MB +]); +$node_primary->start; + + +# Create streaming standby linking to primary +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +# To avoid hanging while expecting some specific input from a psql +# instance being driven by us, add a timeout high enough that it +# should never trigger even on very slow machines, unless something +# is really wrong. +my $psql_timeout = IPC::Run::timer(300); + +my %psql_primary = (stdin => '', stdout => '', stderr => ''); +$psql_primary{run} = IPC::Run::start( + [ 'psql', '-XA', '-f', '-', '-d', $node_primary->connstr('postgres') ], + '<', + \$psql_primary{stdin}, + '>', + \$psql_primary{stdout}, + '2>', + \$psql_primary{stderr}, + $psql_timeout); + +my %psql_standby = ('stdin' => '', 'stdout' => '', 'stderr' => ''); +$psql_standby{run} = IPC::Run::start( + [ 'psql', '-XA', '-f', '-', '-d', $node_standby->connstr('postgres') ], + '<', + \$psql_standby{stdin}, + '>', + \$psql_standby{stdout}, + '2>', + \$psql_standby{stderr}, + $psql_timeout); + + +# Create template database with a table that we'll update, to trigger dirty +# rows. Using a template database + preexisting rows makes it a bit easier to +# reproduce, because there's no cache invalidations generated. + +$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db_template OID = 50000;"); +$node_primary->safe_psql('conflict_db_template', q[ + CREATE TABLE large(id serial primary key, dataa text, datab text); + INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]); +$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;"); + +$node_primary->safe_psql('postgres', q[ + CREATE EXTENSION pg_prewarm; + CREATE TABLE replace_sb(data text); + INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]); + +# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files +send_query_and_wait( + \%psql_primary, + q[BEGIN;], + qr/BEGIN/m); +send_query_and_wait( + \%psql_standby, + q[BEGIN;], + qr/BEGIN/m); + +# Cause lots of dirty rows in shared_buffers +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 1;"); + +# Now do a bunch of work in another database. That will end up needing to +# write back dirty data from the previous step, opening the relevant file +# descriptors +cause_eviction(\%psql_primary, \%psql_standby); + +# drop and recreate database +$node_primary->safe_psql('postgres', "DROP DATABASE conflict_db;"); +$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;"); + +verify($node_primary, $node_standby, 1, + "initial contents as expected"); + +# Again cause lots of dirty rows in shared_buffers, but use a different update +# value so we can check everything is OK +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 2;"); + +# Again cause a lot of IO. That'll again write back dirty data, but uses (XXX +# adjust after bugfix) the already opened file descriptor. +# FIXME +cause_eviction(\%psql_primary, \%psql_standby); + +verify($node_primary, $node_standby, 2, + "update to reused relfilenode (due to DB oid conflict) is not lost"); + + +$node_primary->safe_psql('conflict_db', "VACUUM FULL large;"); +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 3;"); + +verify($node_primary, $node_standby, 3, + "restored contents as expected"); + +# Test for old filehandles after moving a database in / out of tablespace +$node_primary->safe_psql('postgres', q[CREATE TABLESPACE test_tablespace LOCATION '']); + +# cause dirty buffers +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 4;"); +# cause files to be opened in backend in other database +cause_eviction(\%psql_primary, \%psql_standby); + +# move database back / forth +$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace'); +$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE pg_default'); + +# cause dirty buffers +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 5;"); +cause_eviction(\%psql_primary, \%psql_standby); + +verify($node_primary, $node_standby, 5, + "post move contents as expected"); + +$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace'); + +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 7;"); +cause_eviction(\%psql_primary, \%psql_standby); +$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 8;"); +$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db'); +$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace'); + +$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database'); + + +# explicitly shut down psql instances gracefully - to avoid hangs +# or worse on windows +$psql_primary{stdin} .= "\\q\n"; +$psql_primary{run}->finish; +$psql_standby{stdin} .= "\\q\n"; +$psql_standby{run}->finish; + +$node_primary->stop(); +$node_standby->stop(); + +# Make sure that there weren't crashes during shutdown + +command_like([ 'pg_controldata', $node_primary->data_dir ], + qr/Database cluster state:\s+shut down\n/, 'primary shut down ok'); +command_like([ 'pg_controldata', $node_standby->data_dir ], + qr/Database cluster state:\s+shut down in recovery\n/, 'standby shut down ok'); +done_testing(); + +sub verify +{ + my ($primary, $standby, $counter, $message) = @_; + + my $query = "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10"; + is($primary->safe_psql('conflict_db', $query), + "$counter|4000", + "primary: $message"); + + $primary->wait_for_catchup($standby); + is($standby->safe_psql('conflict_db', $query), + "$counter|4000", + "standby: $message"); +} + +sub cause_eviction +{ + my ($psql_primary, $psql_standby) = @_; + + send_query_and_wait( + $psql_primary, + q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;], + qr/warmed_buffers/m); + + send_query_and_wait( + $psql_standby, + q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;], + qr/warmed_buffers/m); +} + +# Send query, wait until string matches +sub send_query_and_wait +{ + my ($psql, $query, $untl) = @_; + my $ret; + + # send query + $$psql{stdin} .= $query; + $$psql{stdin} .= "\n"; + + # wait for query results + $$psql{run}->pump_nb(); + while (1) + { + last if $$psql{stdout} =~ /$untl/; + + if ($psql_timeout->is_expired) + { + BAIL_OUT("aborting wait: program timed out\n" + . "stream contents: >>$$psql{stdout}<<\n" + . "pattern searched for: $untl\n"); + return 0; + } + if (not $$psql{run}->pumpable()) + { + BAIL_OUT("aborting wait: program died\n" + . "stream contents: >>$$psql{stdout}<<\n" + . "pattern searched for: $untl\n"); + return 0; + } + $$psql{run}->pump(); + } + + $$psql{stdout} = ''; + + return 1; +} -- 2.34.0
>From a416c09f33c429aea0fd2b3cd1fa03878c520d8d Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Mon, 21 Feb 2022 15:41:23 -0800 Subject: [PATCH v1 2/4] WIP: AssertFileNotDeleted(fd). Author: Reviewed-By: Discussion: https://postgr.es/m/ Backpatch: --- src/include/storage/fd.h | 1 + src/backend/access/transam/slru.c | 2 + src/backend/access/transam/xlog.c | 2 + src/backend/replication/walreceiver.c | 2 + src/backend/storage/file/fd.c | 83 +++++++++++++++++++++++++++ 5 files changed, 90 insertions(+) diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 29209e27243..3bb8f669b64 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -191,6 +191,7 @@ extern int durable_rename_excl(const char *oldfile, const char *newfile, int log extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); +extern void AssertFileNotDeleted(int fd); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 30a476ed5dc..e4907ea5356 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -871,6 +871,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) } } + AssertFileNotDeleted(fd); + errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 0d2bd7a3576..e1225bc0627 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2198,6 +2198,8 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) if (track_wal_io_timing) INSTR_TIME_SET_CURRENT(start); + AssertFileNotDeleted(openLogFile); + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index ceaff097b97..23bf982b545 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -912,6 +912,8 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) else segbytes = nbytes; + AssertFileNotDeleted(recvFile); + /* OK to write the logs */ errno = 0; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 14b77f28617..123815e4a80 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -93,6 +93,7 @@ #include "common/file_perm.h" #include "common/file_utils.h" #include "common/pg_prng.h" +#include "common/string.h" #include "miscadmin.h" #include "pgstat.h" #include "port/pg_iovec.h" @@ -2073,6 +2074,8 @@ FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info) if (returnCode < 0) return returnCode; + AssertFileNotDeleted(VfdCache[file].fd); + pgstat_report_wait_start(wait_event_info); returnCode = posix_fadvise(VfdCache[file].fd, offset, amount, POSIX_FADV_WILLNEED); @@ -2103,6 +2106,11 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) if (returnCode < 0) return; + /* + * XXX: can't assert non-use of fd right now, + * ScheduleBufferTagForWriteback can end up writing at a later time. + */ + pgstat_report_wait_start(wait_event_info); pg_flush_data(VfdCache[file].fd, offset, nbytes); pgstat_report_wait_end(); @@ -2128,6 +2136,8 @@ FileRead(File file, char *buffer, int amount, off_t offset, vfdP = &VfdCache[file]; + AssertFileNotDeleted(vfdP->fd); + retry: pgstat_report_wait_start(wait_event_info); returnCode = pg_pread(vfdP->fd, buffer, amount, offset); @@ -2184,6 +2194,8 @@ FileWrite(File file, char *buffer, int amount, off_t offset, vfdP = &VfdCache[file]; + AssertFileNotDeleted(vfdP->fd); + /* * If enforcing temp_file_limit and it's a temp file, check to see if the * write would overrun temp_file_limit, and throw error if so. Note: it's @@ -2276,6 +2288,8 @@ FileSync(File file, uint32 wait_event_info) if (returnCode < 0) return returnCode; + AssertFileNotDeleted(VfdCache[file].fd); + pgstat_report_wait_start(wait_event_info); returnCode = pg_fsync(VfdCache[file].fd); pgstat_report_wait_end(); @@ -2297,6 +2311,8 @@ FileSize(File file) return (off_t) -1; } + AssertFileNotDeleted(VfdCache[file].fd); + return lseek(VfdCache[file].fd, 0, SEEK_END); } @@ -2314,6 +2330,8 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info) if (returnCode < 0) return returnCode; + AssertFileNotDeleted(VfdCache[file].fd); + pgstat_report_wait_start(wait_event_info); returnCode = ftruncate(VfdCache[file].fd, offset); pgstat_report_wait_end(); @@ -3828,6 +3846,71 @@ data_sync_elevel(int elevel) return data_sync_retry ? elevel : PANIC; } +void +AssertFileNotDeleted(int fd) +{ + struct stat statbuf; + int ret; + char deleted_filename[MAXPGPATH]; + bool have_filename = false; + + /* + * fstat shouldn't fail, so it seems ok to error out, even if it's + * just a debugging aid. + * + * XXX: Figure out which operating systems this works on. + */ + ret = fstat(fd, &statbuf); + if (ret != 0) + elog(ERROR, "fstat failed: %m"); + + /* + * On several operating systems st_nlink == 0 indicates that the file has + * been deleted. On some OS/filesystem combinations a deleted file may + * still show up with nlink > 0, but nlink == 0 shouldn't be returned + * spuriously. Hardlinks obviously can prevent this from working, but we + * don't expect any, so that's fine. + */ + if (statbuf.st_nlink > 0) + return; + +#if defined(__linux__) + { + char path[MAXPGPATH]; + const char *const deleted_suffix = " (deleted)"; + + /* + * On linux we can figure out what the file name + */ + sprintf(path, "/proc/self/fd/%d", fd); + ret = readlink(path, deleted_filename, sizeof(deleted_filename) - 1); + + // FIXME: Tolerate most errors here + if (ret == -1) + elog(PANIC, "readlink failed: %m"); + + /* readlink doesn't null terminate */ + deleted_filename[ret] = 0; + have_filename = true; + + /* chop off the " (deleted)" */ + if (pg_str_endswith(deleted_filename, deleted_suffix)) + { + Size len = strlen(deleted_filename); + + deleted_filename[len - strlen(deleted_suffix)] = 0; + } + } +#endif + + if (have_filename) + elog(PANIC, "file descriptor %d for file %s is of a deleted file", + fd, deleted_filename); + else + elog(PANIC, "file descriptor %d is of a deleted file", + fd); +} + /* * A convenience wrapper for pg_pwritev() that retries on partial write. If an * error is returned, it is unspecified how much has been written. -- 2.34.0
>From d4aeb3e17d7fb053af0ca94f5f59ef1ea0c34f98 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Mon, 21 Feb 2022 16:42:16 -0800 Subject: [PATCH v1 3/4] WIP: fix old-fd issues using global barriers everywhere. --- src/include/pg_config_manual.h | 9 --------- src/include/storage/smgr.h | 1 + src/backend/commands/tablespace.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 11 ++++++++--- src/backend/storage/smgr/smgr.c | 20 ++++++++++++++++++++ 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 84ce5a4a5d7..1eb39884144 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -152,16 +152,7 @@ #define EXEC_BACKEND #endif -/* - * If USE_BARRIER_SMGRRELEASE is defined, certain code paths that unlink - * directories will ask other backends to close all smgr file descriptors. - * This is enabled on Windows, because otherwise unlinked but still open files - * can prevent rmdir(containing_directory) from succeeding. On other - * platforms, it can be defined to exercise those code paths. - */ -#if defined(WIN32) #define USE_BARRIER_SMGRRELEASE -#endif /* * Define this if your operating system supports link() diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 8e3ef92cda1..d774fc98cf8 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -79,6 +79,7 @@ typedef SMgrRelationData *SMgrRelation; extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern SMgrRelation smgropen_cond(RelFileNode rnode, BackendId backend); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 40514ab550f..0c433360b63 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -1574,6 +1574,11 @@ tblspc_redo(XLogReaderState *record) { xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record); +#if defined(USE_BARRIER_SMGRRELEASE) + /* Close all smgr fds in all backends. */ + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); +#endif + /* * If we issued a WAL record for a drop tablespace it implies that * there were no files in it at all when the DROP was done. That means @@ -1591,11 +1596,6 @@ tblspc_redo(XLogReaderState *record) */ if (!destroy_tablespace_directories(xlrec->ts_id, true)) { -#if defined(USE_BARRIER_SMGRRELEASE) - /* Close all smgr fds in all backends. */ - WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); -#endif - ResolveRecoveryConflictWithTablespace(xlrec->ts_id); /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f5459c68f89..e9fd512df7c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4819,9 +4819,14 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; - /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId); - smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); + /* + * Finally tell the kernel to write the data to storage. Don't smgr if + * previously closed, otherwise we could end up evading fd-reuse + * protection. + */ + reln = smgropen_cond(tag.rnode, InvalidBackendId); + if (reln) + smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } context->nr_pending = 0; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d71a557a352..efa83ae3942 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -190,6 +190,26 @@ smgropen(RelFileNode rnode, BackendId backend) return reln; } +/* + * Like smgropen(), but returns NULL if relation is not already open. + */ +SMgrRelation +smgropen_cond(RelFileNode rnode, BackendId backend) +{ + RelFileNodeBackend brnode; + + if (SMgrRelationHash == NULL) + return NULL; + + /* Look up or create an entry */ + brnode.node = rnode; + brnode.backend = backend; + + return (SMgrRelation) hash_search(SMgrRelationHash, + (void *) &brnode, + HASH_FIND, NULL); +} + /* * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object * -- 2.34.0
>From 85eb22f6d290765c8b7b931f4e968b4808ff33a3 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Mon, 21 Feb 2022 16:47:26 -0800 Subject: [PATCH v1 4/4] WIP: Add linux-only AssertNoDeletedFilesOpenPid(). Author: Reviewed-By: Discussion: https://postgr.es/m/ Backpatch: --- src/include/storage/fd.h | 3 ++ src/backend/commands/dbcommands.c | 6 ++++ src/backend/storage/file/fd.c | 52 +++++++++++++++++++++++++++++ src/backend/storage/ipc/procarray.c | 22 ++++++++++++ 4 files changed, 83 insertions(+) diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 3bb8f669b64..d12482be90d 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -192,6 +192,9 @@ extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); extern void AssertFileNotDeleted(int fd); +extern void AssertNoDeletedFilesOpen(void); +extern void AssertNoDeletedFilesOpenPid(int pid); + /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index c37e3c9a9a4..2b9ccddbde7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1069,6 +1069,8 @@ dropdb(const char *dbname, bool missing_ok, bool force) WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); #endif + AssertNoDeletedFilesOpen(); + /* * Remove all tablespace subdirs belonging to the database. */ @@ -1322,6 +1324,8 @@ movedb(const char *dbname, const char *tblspcname) WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); #endif + AssertNoDeletedFilesOpen(); + /* * Now drop all buffers holding data of the target database; they should * no longer be dirty so DropDatabaseBuffers is safe. @@ -2453,6 +2457,8 @@ dbase_redo(XLogReaderState *record) WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); #endif + AssertNoDeletedFilesOpen(); + for (i = 0; i < xlrec->ntablespaces; i++) { dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_ids[i]); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 123815e4a80..782412b7d32 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -3846,6 +3846,58 @@ data_sync_elevel(int elevel) return data_sync_retry ? elevel : PANIC; } +void +AssertNoDeletedFilesOpenPid(int pid) +{ +#if defined(__linux__) + const char *const deleted_suffix = " (deleted)"; + char proc_dir[MAXPGPATH]; + struct dirent *de; + DIR *dirdesc; + + sprintf(proc_dir, "/proc/%d/fd", pid); + + elog(LOG, "checking %s", proc_dir); + + dirdesc = AllocateDir(proc_dir); + if (dirdesc == NULL) + { + elog(WARNING, "could not open %s", proc_dir); + return; + } + + while ((de = ReadDir(dirdesc, proc_dir)) != NULL) + { + char path[MAXPGPATH]; + char target[MAXPGPATH]; + int ret; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + sprintf(path, "/proc/%d/fd/%s", pid, de->d_name); + + ret = readlink(path, target, sizeof(path) - 1); + + // FIXME: Tolerate most errors here + if (ret == -1) + elog(PANIC, "readlink failed: %m"); + + /* readlink doesn't null terminate */ + target[ret] = 0; + + if (pg_str_endswith(target, deleted_suffix)) + { + elog(PANIC, "pid %d has deleted file %s open in fd %s", + pid, target, de->d_name); + } + } + + FreeDir(dirdesc); +#endif +} + void AssertFileNotDeleted(int fd) { diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 13d192ec2b4..e33d16f7588 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -58,6 +58,7 @@ #include "commands/dbcommands.h" #include "miscadmin.h" #include "pgstat.h" +#include "storage/fd.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/spin.h" @@ -3772,6 +3773,27 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) return true; /* timed out, still conflicts */ } +void +AssertNoDeletedFilesOpen(void) +{ +#if defined(__linux__) + ProcArrayStruct *arrayP = procArray; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (int index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + AssertNoDeletedFilesOpenPid(proc->pid); + } + + LWLockRelease(ProcArrayLock); +#endif +} + + /* * Terminate existing connections to the specified database. This routine * is used by the DROP DATABASE command when user has asked to forcefully -- 2.34.0