A race with KeepFileRestoredFromArchive() can cause a restartpoint to fail, as
seen once on the buildfarm[1].  The attached patch adds a test case; it
applies atop the "stop events" patch[2].  We have two systems for adding
long-term pg_wal directory entries.  KeepFileRestoredFromArchive() adds them
during archive recovery, while InstallXLogFileSegment() does so at all times.
Unfortunately, InstallXLogFileSegment() happens throughout archive recovery,
via the checkpointer recycling segments and calling PreallocXlogFiles().
Multiple processes can run InstallXLogFileSegment(), which uses
ControlFileLock to represent the authority to modify the directory listing of
pg_wal.  KeepFileRestoredFromArchive() just assumes it controls pg_wal.

Recycling and preallocation are wasteful during archive recovery, because
KeepFileRestoredFromArchive() unlinks every entry in its path.  I propose to
fix the race by adding an XLogCtl flag indicating which regime currently owns
the right to add long-term pg_wal directory entries.  In the archive recovery
regime, the checkpointer will not preallocate and will unlink old segments
instead of recycling them (like wal_recycle=off).  XLogFileInit() will fail.

Notable alternatives:

- Release ControlFileLock at the end of XLogFileInit(), not at the end of
  InstallXLogFileSegment().  Add ControlFileLock acquisition to
  KeepFileRestoredFromArchive().  This provides adequate mutual exclusion, but
  XLogFileInit() could return a file descriptor for an unlinked file.  That's
  fine for PreallocXlogFiles(), but it feels wrong.

- During restartpoints, never preallocate or recycle segments.  (Just delete
  obsolete WAL.)  By denying those benefits, this presumably makes streaming
  recovery less efficient.

- Make KeepFileRestoredFromArchive() call XLogFileInit() to open a segment,
  then copy bytes.  This is simple, but it multiplies I/O.  That might be
  negligible on account of caching, or it might not be.  A variant, incurring
  extra fsyncs, would be to use durable_rename() to replace the segment we get
  from XLogFileInit().

- Make KeepFileRestoredFromArchive() rename without first unlinking.  This
  avoids checkpoint failure, but a race could trigger noise from the LOG
  message in InstallXLogFileSegment -> durable_rename_excl.

Does anyone prefer some alternative?  It's probably not worth back-patching
anything for a restartpoint failure this rare, because most restartpoint
outcomes are not user-visible.

Thanks,
nm

[1] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=mandrill&dt=2020-10-05%2023%3A02%3A17
[2] 
https://postgr.es/m/CAPpHfdtSEOHX8dSk9Qp%2BZ%2B%2Bi4BGQoffKip6JDWngEA%2Bg7Z-XmQ%40mail.gmail.com
Author:     Noah Misch <n...@leadboat.com>
Commit:     Noah Misch <n...@leadboat.com>

    Not for commit; for demonstration only.
    
    Before applying this, apply
    
https://postgr.es/m/CAPpHfdtSEOHX8dSk9Qp%2BZ%2B%2Bi4BGQoffKip6JDWngEA%2Bg7Z-XmQ%40mail.gmail.com

diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 199d911..1c0a988 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -71,6 +71,7 @@
 #include "storage/reinit.h"
 #include "storage/smgr.h"
 #include "storage/spin.h"
+#include "storage/stopevent.h"
 #include "storage/sync.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
@@ -3583,6 +3584,23 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, 
XLogSegNo srcsegno,
                elog(ERROR, "InstallXLogFileSegment should not have failed");
 }
 
+static Jsonb *
+InstallXLogFileSegmentEndStopEventParams(XLogSegNo segno)
+{
+       MemoryContext oldCxt;
+       JsonbParseState *state = NULL;
+       Jsonb      *res;
+
+       stopevents_make_cxt();
+       oldCxt = MemoryContextSwitchTo(stopevents_cxt);
+       pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+       jsonb_push_int8_key(&state, "segno", segno);
+       res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL));
+       MemoryContextSwitchTo(oldCxt);
+
+       return res;
+}
+
 /*
  * Install a new XLOG segment file as a current or future log segment.
  *
@@ -3664,6 +3682,9 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
        if (use_lock)
                LWLockRelease(ControlFileLock);
 
+       STOPEVENT(InstallXLogFileSegmentEndStopEvent,
+                         InstallXLogFileSegmentEndStopEventParams(*segno));
+
        return true;
 }
 
diff --git a/src/backend/access/transam/xlogarchive.c 
b/src/backend/access/transam/xlogarchive.c
index 1c5a4f8..d9c5a3a 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -30,6 +30,7 @@
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pmsignal.h"
+#include "storage/stopevent.h"
 
 /*
  * Attempt to retrieve the specified file from off-line archival storage.
@@ -372,6 +373,23 @@ ExecuteRecoveryCommand(const char *command, const char 
*commandName, bool failOn
 }
 
 
+static Jsonb *
+KeepFileRestoredFromArchiveStopEventParams(const char *xlogfname)
+{
+       MemoryContext oldCxt;
+       JsonbParseState *state = NULL;
+       Jsonb      *res;
+
+       stopevents_make_cxt();
+       oldCxt = MemoryContextSwitchTo(stopevents_cxt);
+       pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+       jsonb_push_string_key(&state, "xlogfname", xlogfname);
+       res = JsonbValueToJsonb(pushJsonbValue(&state, WJB_END_OBJECT, NULL));
+       MemoryContextSwitchTo(oldCxt);
+
+       return res;
+}
+
 /*
  * A file was restored from the archive under a temporary filename (path),
  * and now we want to keep it. Rename it under the permanent filename in
@@ -386,6 +404,9 @@ KeepFileRestoredFromArchive(const char *path, const char 
*xlogfname)
 
        snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname);
 
+       STOPEVENT(KeepFileRestoredFromArchiveStopEvent,
+                         
KeepFileRestoredFromArchiveStopEventParams(xlogfname));
+
        if (stat(xlogfpath, &statbuf) == 0)
        {
                char            oldpath[MAXPGPATH];
@@ -424,6 +445,9 @@ KeepFileRestoredFromArchive(const char *path, const char 
*xlogfname)
                reload = true;
        }
 
+       STOPEVENT(KeepFileRestoredFromArchivePreRenameStopEvent,
+                         
KeepFileRestoredFromArchiveStopEventParams(xlogfname));
+
        durable_rename(path, xlogfpath, ERROR);
 
        /*
diff --git a/src/backend/storage/lmgr/stopevent.c 
b/src/backend/storage/lmgr/stopevent.c
index 6b6eb2b..251bdfe 100644
--- a/src/backend/storage/lmgr/stopevent.c
+++ b/src/backend/storage/lmgr/stopevent.c
@@ -280,12 +280,14 @@ handle_stopevent(int event_id, Jsonb *params)
        if (trace_stopevents)
        {
                char       *params_string;
+               MemoryContext oldCxt;
 
+               oldCxt = MemoryContextSwitchTo(stopevents_cxt);
                params_string = DatumGetCString(DirectFunctionCall1(jsonb_out, 
PointerGetDatum(params)));
                elog(DEBUG2, "stop event \"%s\", params \"%s\"",
                         stopeventnames[event_id],
                         params_string);
-               pfree(params_string);
+               MemoryContextSwitchTo(oldCxt);
        }
 
        MemoryContextReset(stopevents_cxt);
@@ -342,7 +344,12 @@ void
 relation_stopevent_params(JsonbParseState **state, Relation relation)
 {
        jsonb_push_int8_key(state, "datoid", MyDatabaseId);
+#if 0
+       /* infinite recursion to segfault */
        jsonb_push_string_key(state, "datname", 
get_database_name(MyDatabaseId));
+#else
+       jsonb_push_string_key(state, "datname", "fake");
+#endif
        jsonb_push_int8_key(state, "reloid", relation->rd_id);
        jsonb_push_string_key(state, "relname", 
NameStr(relation->rd_rel->relname));
 }
diff --git a/src/backend/storage/lmgr/stopeventnames.txt 
b/src/backend/storage/lmgr/stopeventnames.txt
index b5330a1..aadff7a 100644
--- a/src/backend/storage/lmgr/stopeventnames.txt
+++ b/src/backend/storage/lmgr/stopeventnames.txt
@@ -1,2 +1,5 @@
+InstallXLogFileSegmentEndStopEvent
+KeepFileRestoredFromArchiveStopEvent
+KeepFileRestoredFromArchivePreRenameStopEvent
 ReleaseAndReadBufferStopEvent
 EntryFindPostingLeafPageStopEvent
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index 84472b9..4c217d5 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -23,6 +23,7 @@
 
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
+#include "storage/stopevent.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
@@ -126,6 +127,10 @@ MemoryContextInit(void)
                                                                                
 8 * 1024,
                                                                                
 8 * 1024);
        MemoryContextAllowInCriticalSection(ErrorContext, true);
+
+       stopevents_make_cxt();
+       /* not really okay; for a demo patch, I'll take my chances */
+       MemoryContextAllowInCriticalSection(stopevents_cxt, true);
 }
 
 /*
diff --git a/src/include/storage/stopevent.h b/src/include/storage/stopevent.h
index a8b62f2..53dbf98 100644
--- a/src/include/storage/stopevent.h
+++ b/src/include/storage/stopevent.h
@@ -2,6 +2,7 @@
 #define SRC_STOPEVENT_H
 
 #include "utils/jsonb.h"
+#include "utils/rel.h"
 #include "storage/stopeventnames.h"
 
 extern bool enable_stopevents;
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 9667f76..caa6de5 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -1737,7 +1737,7 @@ sub poll_query_until
 
        my $cmd = [ 'psql', '-XAt', '-c', $query, '-d', $self->connstr($dbname) 
];
        my ($stdout, $stderr);
-       my $max_attempts = 180 * 10;
+       my $max_attempts = 3 * 10;
        my $attempts     = 0;
 
        while ($attempts < $max_attempts)
diff --git a/src/test/recovery/t/022_install_xlog.pl 
b/src/test/recovery/t/022_install_xlog.pl
new file mode 100644
index 0000000..16d8a22
--- /dev/null
+++ b/src/test/recovery/t/022_install_xlog.pl
@@ -0,0 +1,96 @@
+#
+# Test InstallXLogFileSegment()/KeepFileRestoredFromArchive() mutual exclusion
+#
+# Specifically, recreate this sequence of events:
+#
+# checkpointer enters XLogFileInit, finds no segno:2 file
+# checkpointer finishes InstallXLogFileSegment, creating segno:2 file
+# startupProc enters KeepFileRestoredFromArchive, unlinks segno:2
+# checkpointer enters the last BasicOpenFile of XLogFileInit, which fails
+# startupProc finishes KeepFileRestoredFromArchive (too late)
+#
+use strict;
+use warnings;
+use PostgresNode;
+use Test::More tests => 1;
+
+my $primary = get_new_node('primary');
+$primary->init(
+       has_archiving    => 1,
+       allows_streaming => 1);
+$primary->append_conf('postgresql.conf', 'autovacuum = off');
+$primary->start;
+$primary->stop('immediate');
+$primary->backup_fs_cold('backup');
+$primary->start;
+
+# Standby will get these two segments from the archive.
+$primary->safe_psql(
+       'postgres', q{
+       -- Standby makes a restart point from this checkpoint.
+       CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
+       CHECKPOINT;
+       SELECT pg_switch_wal();
+       -- Standby restores this segment in parallel with its restart point,
+       -- exercising the race condition.
+       INSERT INTO mine SELECT generate_series(10,20) AS x;
+       SELECT pg_switch_wal();
+});
+
+
+my $standby = get_new_node('standby');
+$standby->init_from_backup($primary, 'backup', has_restoring => 1);
+# exercise PreallocXlogFiles(), not recycling
+$standby->append_conf('postgresql.conf', 'wal_keep_size = 160MB');
+# break restore_command so we can first inject stop events
+$standby->append_conf('postgresql.conf', 'restore_command = false');
+$standby->start;
+
+# register all stop events
+$standby->safe_psql(
+       'postgres', q{
+ select pg_stopevent_set('InstallXLogFileSegmentEnd', '$.segno == 2');
+ select pg_stopevent_set('KeepFileRestoredFromArchive',
+  '$.xlogfname == "000000010000000000000002"');
+ select pg_stopevent_set('KeepFileRestoredFromArchivePreRename',
+  '$.xlogfname == "000000010000000000000002"');
+});
+# repair restore_command
+$standby->enable_restoring($primary, 1);
+$standby->safe_psql('postgres', q{SELECT pg_reload_conf();});
+
+
+# poll until some process is blocked as a pg_stopevent_set() directed
+sub poll_until_event
+{
+       my $event = shift;
+       $standby->poll_query_until(
+               'postgres',
+               qq{ SELECT true FROM pg_stopevents() WHERE cardinality(waiters) 
<> 0
+                       AND event_name = '$event'; }
+       ) or die "Timed out while waiting for $event on standby";
+}
+
+poll_until_event 'KeepFileRestoredFromArchive';    # segno==2 restored
+
+# Begin a restartpoint, pausing at CreateRestartPoint -> PreallocXlogFiles ->
+# XLogFileInit -> InstallXLogFileSegment -> InstallXLogFileSegmentEnd.
+my $in    = '';
+my $out   = '';
+my $timer = IPC::Run::timeout(180);
+my $h     = $standby->background_psql('postgres', \$in, \$out, $timer);
+$in .= "CHECKPOINT;\n";
+$h->pump_nb;
+poll_until_event 'InstallXLogFileSegmentEnd';
+
+# release startupProc to unlink segno==2
+$standby->safe_psql('postgres',
+       q{SELECT pg_stopevent_reset('KeepFileRestoredFromArchive');});
+poll_until_event 'KeepFileRestoredFromArchivePreRename';
+# release checkpointer to open segno==2
+$standby->safe_psql('postgres',
+       q{SELECT pg_stopevent_reset('InstallXLogFileSegmentEnd');});
+ok($h->finish, 'CHECKPOINT completed');
+# release startupProc to rename file onto segno==2, but it's too late
+$standby->safe_psql('postgres',
+       q{SELECT pg_stopevent_reset('KeepFileRestoredFromArchivePreRename');});

Reply via email to