On Wed, Sep 9, 2020 at 3:49 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> On Thu, Sep 3, 2020 at 11:30 AM Thomas Munro <thomas.mu...@gmail.com> wrote:
> > On Wed, May 27, 2020 at 12:31 AM Craig Ringer <cr...@2ndquadrant.com> wrote:
> > > On Tue, 12 May 2020, 08:42 Paul Guo, <p...@pivotal.io> wrote:
> > >> 1. StartupXLOG() does fsync on the whole data directory early in the 
> > >> crash recovery. I'm wondering if we could skip some directories (at 
> > >> least the pg_log/, table directories) since wal, etc could ensure 
> > >> consistency. Here is the related code.
> > >>
> > >>       if (ControlFile->state != DB_SHUTDOWNED &&
> > >>           ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
> > >>       {
> > >>           RemoveTempXlogFiles();
> > >>           SyncDataDirectory();
> > >>       }
>
> > 4.  In datadir_fsync_fname(), if ParseRelationPath() is able to
> > recognise a file as being a relation file, build a FileTag and call
> > RegisterSyncRequest() to tell the checkpointer to sync this file as
> > part of the end checkpoint (currently the end-of-recovery checkpoint,
> > but that could also be relaxed).
>
> For the record, Andres Freund mentioned a few problems with this
> off-list and suggested we consider calling Linux syncfs() for each top
> level directory that could potentially be on a different filesystem.
> That seems like a nice idea to look into.

Here's an experimental patch to try that.  One problem is that before
Linux 5.8, syncfs() doesn't report failures[1].  I'm not sure what to
think about that; in the current coding we just log them and carry on
anyway, but any theoretical problems that causes for BLK_DONE should
be moot anyway because of FPIs which result in more writes and syncs.
Another is that it may affect other files that aren't under pgdata as
collateral damage, but that seems acceptable.  It also makes me a bit
sad that this wouldn't help other OSes.

(Archeological note:  The improved syncfs() error reporting is linked
to 2018 PostgreSQL/Linux hacker discussions[2], because it was thought
that syncfs() might be useful for checkpointing, though I believe
since then things have moved on and the new thinking is that we'd use
a new proposed interface to read per-filesystem I/O error counters
while checkpointing.)

[1] https://man7.org/linux/man-pages/man2/sync.2.html
[2] https://lwn.net/Articles/752063/
From ca0fd0769f08e8cdf65fc04baf12dcbf095b79a0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 27 Sep 2020 17:22:10 +1300
Subject: [PATCH] Use syncfs() for SyncDataDirectory() on Linux.

Opening every file can be slow, as the first step before crash recovery
can begin.  For the same effect, call syncfs() on every possibly
different filesystem.  This avoids faulting in inodes for potentially
very many inodes.
---
 configure                     |  2 +-
 configure.ac                  |  1 +
 src/backend/storage/file/fd.c | 59 +++++++++++++++++++++++++++++++++++
 src/include/pg_config.h.in    |  3 ++
 4 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 19a3cd09a0..a3e5d6459d 100755
--- a/configure
+++ b/configure
@@ -15155,7 +15155,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 6b9d0487a8..bad4159a94 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1670,6 +1670,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	strchrnul
 	strsignal
 	symlink
+	syncfs
 	sync_file_range
 	uselocale
 	wcstombs_l
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index bd72a87ee3..6da498b08a 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -72,9 +72,11 @@
 
 #include "postgres.h"
 
+#include <dirent.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -330,10 +332,12 @@ static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
 					bool process_symlinks,
 					int elevel);
+#ifndef HAVE_SYNCFS
 #ifdef PG_FLUSH_DATA_WORKS
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+#endif
 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
 
 static int	fsync_parent_path(const char *fname, int elevel);
@@ -3231,6 +3235,23 @@ looks_like_temp_rel_name(const char *name)
 	return true;
 }
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int		fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+	{
+		elog(LOG, "could not open %s: %m", path);
+		return;
+	}
+	if (syncfs(fd) < 0)
+		elog(LOG, "syncfs failed for %s: %m", path);
+	close(fd);
+}
+#endif
 
 /*
  * Issue fsync recursively on PGDATA and all its contents.
@@ -3257,6 +3278,10 @@ void
 SyncDataDirectory(void)
 {
 	bool		xlog_is_symlink;
+#ifdef HAVE_SYNCFS
+	DIR		   *dir;
+	struct dirent *de;
+#endif
 
 	/* We can skip this whole thing if fsync is disabled. */
 	if (!enableFsync)
@@ -3285,6 +3310,36 @@ SyncDataDirectory(void)
 		xlog_is_symlink = true;
 #endif
 
+#ifdef HAVE_SYNCFS
+
+	/*
+	 * On Linux, we don't have to open every single file one by one.  We can
+	 * use syncfs() to sync whole filesystems.  We only expect filesystem
+	 * boundaries to exist where we tolerate symlinks, namely pg_wal and the
+	 * tablespaces, so we call syncfs() for each of those directories.
+	 */
+
+	/* Sync the top level pgdata directory. */
+	do_syncfs(".");
+	/* If any tablespaces are configured, sync each of those. */
+	dir = AllocateDir("pg_tblspc");
+	while ((de = ReadDir(dir, "pg_tblspc")))
+	{
+		char		path[MAXPGPATH];
+
+		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+		do_syncfs(path);
+	}
+	FreeDir(dir);
+	/* If pg_wal is a symlink, process that too. */
+	if (xlog_is_symlink)
+		do_syncfs("pg_wal");
+
+#else
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.  Errors in this step are even less
@@ -3310,6 +3365,8 @@ SyncDataDirectory(void)
 	if (xlog_is_symlink)
 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
+
+#endif		/* !HAVE_SYNCFS */
 }
 
 /*
@@ -3382,6 +3439,7 @@ walkdir(const char *path,
 }
 
 
+#ifndef HAVE_SYNCFS
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
@@ -3434,6 +3492,7 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel)
 	 */
 	fsync_fname_ext(fname, isdir, true, elevel);
 }
+#endif
 
 static void
 unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index fb270df678..68a5c4bb1b 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -575,6 +575,9 @@
 /* Define to 1 if you have the `symlink' function. */
 #undef HAVE_SYMLINK
 
+/* Define to 1 if you have the `syncfs' function. */
+#undef HAVE_SYNCFS
+
 /* Define to 1 if you have the `sync_file_range' function. */
 #undef HAVE_SYNC_FILE_RANGE
 
-- 
2.20.1

Reply via email to