On Thu, Mar 18, 2021 at 8:52 PM Paul Guo <gu...@vmware.com> wrote:
> About the syncfs patch, my first impression on the guc name sync_after_crash
> is that it is a boolean type. Not sure about other people's feeling. Do you 
> guys think
> It is better to rename it to a clearer name like sync_method_after_crash or 
> others?

Works for me.  Here is a new version like that, also including the
documentation change discussed with Fujii-san, and a couple of
cosmetic changes.
From 68491a974fe9690b2c616d965066266ec6727e7d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 27 Sep 2020 17:22:10 +1300
Subject: [PATCH v3] Optionally use syncfs() for SyncDataDirectory() on Linux.

On some systems, opening every file in the data directory can be very
slow, as the first step before crash recovery can begin.  Provide an
alternative method, where we call syncfs() on every possibly different
filesystem under the data directory.  This avoids faulting in inodes for
potentially very many inodes.  It comes with some caveats, so it's not
the default.

The option can be controlled with a new setting:

sync_method_after_crash={fsync,syncfs}

Reported-by: Michael Brown <michael.br...@discourse.org>
Reviewed-by: Fujii Masao <masao.fu...@oss.nttdata.com>
Reviewed-by: Paul Guo <gu...@vmware.com>
Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
---
 configure                                     |  2 +-
 configure.ac                                  |  1 +
 doc/src/sgml/config.sgml                      | 30 +++++++++
 src/backend/storage/file/fd.c                 | 61 ++++++++++++++++++-
 src/backend/utils/misc/guc.c                  | 17 ++++++
 src/backend/utils/misc/postgresql.conf.sample |  1 +
 src/include/pg_config.h.in                    |  3 +
 src/include/storage/fd.h                      |  6 ++
 src/tools/msvc/Solution.pm                    |  1 +
 9 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 3fd4cecbeb..6a2051da9d 100755
--- a/configure
+++ b/configure
@@ -15239,7 +15239,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev
+for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 2f1585adc0..dd819ab2c2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1681,6 +1681,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	strchrnul
 	strsignal
 	symlink
+	syncfs
 	sync_file_range
 	uselocale
 	wcstombs_l
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 8603cf3f94..f11af3b763 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9704,6 +9704,36 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-sync-method-after-crash" xreflabel="sync_method_after_crash">
+      <term><varname>sync_method_after_crash</varname> (<type>enum</type>)
+       <indexterm>
+        <primary><varname>sync_method_after_crash</varname> configuration parameter</primary>
+       </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to <literal>fsync</literal>, which is the default,
+        <productname>PostgreSQL</productname> will recursively open and fsync
+        all files in the data directory before crash recovery begins.  This
+        is intended to make sure that all WAL and data files are durably stored
+        on disk before replaying changes.  This applies whenever starting a
+        database cluster that did not shut down cleanly, including copies
+        created with pg_basebackup.
+       </para>
+       <para>
+        On Linux, <literal>syncfs</literal> may be used instead, to ask the
+        operating system to flush the whole file system.  This may be a lot
+        faster, because it doesn't need to open each file one by one.  On the
+        other hand, it may be slower if the file system is shared by other
+        applications that modify a lot of files, since those files will also
+        be flushed to disk.  Furthermore, on versions of Linux before 5.8, I/O
+        errors encountered while writing data to disk may not be reported to
+        <productname>PostgreSQL</productname>, and relevant error messages may
+        appear only in kernel logs.
+       </para>
+      </listitem>
+     </varlistentry>
+
     </variablelist>
 
    </sect1>
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b58502837a..f83a74e969 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -72,9 +72,11 @@
 
 #include "postgres.h"
 
+#include <dirent.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -158,6 +160,9 @@ int			max_safe_fds = FD_MINFREE;	/* default if not changed */
 /* Whether it is safe to continue running after fsync() fails. */
 bool		data_sync_retry = false;
 
+/* How SyncDataDirectory() should do its job. */
+int			sync_method_after_crash = SYNC_METHOD_AFTER_CRASH_FSYNC;
+
 /* Debugging.... */
 
 #ifdef FDDEBUG
@@ -3263,9 +3268,27 @@ looks_like_temp_rel_name(const char *name)
 	return true;
 }
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int		fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+	{
+		elog(LOG, "could not open %s: %m", path);
+		return;
+	}
+	if (syncfs(fd) < 0)
+		elog(LOG, "syncfs failed for %s: %m", path);
+	close(fd);
+}
+#endif
 
 /*
- * Issue fsync recursively on PGDATA and all its contents.
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystem, depending on sync_method_after_crash setting.
  *
  * We fsync regular files and directories wherever they are, but we
  * follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3317,6 +3340,42 @@ SyncDataDirectory(void)
 		xlog_is_symlink = true;
 #endif
 
+#ifdef HAVE_SYNCFS
+	if (sync_method_after_crash == SYNC_METHOD_AFTER_CRASH_SYNCFS)
+	{
+		DIR		   *dir;
+		struct dirent *de;
+
+		/*
+		 * On Linux, we don't have to open every single file one by one.  We
+		 * can use syncfs() to sync whole filesystems.  We only expect
+		 * filesystem boundaries to exist where we tolerate symlinks, namely
+		 * pg_wal and the tablespaces, so we call syncfs() for each of those
+		 * directories.
+		 */
+
+		/* Sync the top level pgdata directory. */
+		do_syncfs(".");
+		/* If any tablespaces are configured, sync each of those. */
+		dir = AllocateDir("pg_tblspc");
+		while ((de = ReadDir(dir, "pg_tblspc")))
+		{
+			char		path[MAXPGPATH];
+
+			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+				continue;
+
+			snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+			do_syncfs(path);
+		}
+		FreeDir(dir);
+		/* If pg_wal is a symlink, process that too. */
+		if (xlog_is_symlink)
+			do_syncfs("pg_wal");
+		return;
+	}
+#endif		/* !HAVE_SYNCFS */
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.  Errors in this step are even less
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b263e3493b..70d7a2741f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -487,6 +487,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = {
 StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2),
 				 "array length mismatch");
 
+static struct config_enum_entry sync_method_after_crash_options[] = {
+	{"fsync", SYNC_METHOD_AFTER_CRASH_FSYNC, false},
+#ifdef HAVE_SYNCFS
+	{"syncfs", SYNC_METHOD_AFTER_CRASH_SYNCFS, false},
+#endif
+	{NULL, 0, false}
+};
+
 static struct config_enum_entry shared_memory_options[] = {
 #ifndef WIN32
 	{"sysv", SHMEM_TYPE_SYSV, false},
@@ -4850,6 +4858,15 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"sync_method_after_crash", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+			gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
+		},
+		&sync_method_after_crash,
+		SYNC_METHOD_AFTER_CRASH_FSYNC, sync_method_after_crash_options,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 6647f8fd6e..747f29393a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -758,6 +758,7 @@
 
 #exit_on_error = off			# terminate session on any error?
 #restart_after_crash = on		# reinitialize after backend crash?
+#sync_method_after_crash = fsync	# fsync or syncfs (Linux only, 5.8+)
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 7a7cc21d8d..7bb5743e3d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -590,6 +590,9 @@
 /* Define to 1 if you have the `symlink' function. */
 #undef HAVE_SYMLINK
 
+/* Define to 1 if you have the `syncfs' function. */
+#undef HAVE_SYNCFS
+
 /* Define to 1 if you have the `sync_file_range' function. */
 #undef HAVE_SYNC_FILE_RANGE
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 30bf7d2193..ee26915c06 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -45,6 +45,11 @@
 
 #include <dirent.h>
 
+typedef enum SyncMethodAfterCrash {
+	SYNC_METHOD_AFTER_CRASH_FSYNC,
+	SYNC_METHOD_AFTER_CRASH_SYNCFS
+} SyncMethodAfterCrash;
+
 struct iovec;					/* avoid including port/pg_iovec.h here */
 
 typedef int File;
@@ -53,6 +58,7 @@ typedef int File;
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
+extern int sync_method_after_crash;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index a4f5cc4bdb..43d171e544 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -388,6 +388,7 @@ sub GenerateFiles
 		HAVE_STRUCT_TM_TM_ZONE                   => undef,
 		HAVE_SYNC_FILE_RANGE                     => undef,
 		HAVE_SYMLINK                             => 1,
+		HAVE_SYNCFS                              => undef,
 		HAVE_SYSLOG                              => undef,
 		HAVE_SYS_EPOLL_H                         => undef,
 		HAVE_SYS_EVENT_H                         => undef,
-- 
2.30.1

Reply via email to