Hi.

Here's an updated patch for the fsync problem(s).

A few points that may be worth considering:

1. I've made the ReadDir → ReadDirExtended change, but in retrospect I
   don't think it's really worth it. Using ReadDir and letting it die
   is probably fine. (Aside: I left ReadDirExtended static for now.)

2. Robert, are you comfortable with what fsync_pgdata() does in xlog.c?
   I wasn't sure if I should move that to fd.c as well. I think it's
   borderline OK for now.

3. There's a comment in fsync_fname that we need to open directories
   with O_RDONLY and non-directories with O_RDWR. Does this also apply
   to pre_sync_fname (i.e. sync_file_range and posix_fadvise) on any
   platforms we care about? It doesn't seem so, but I'm not sure.

4. As a partial aside, why does fsync_fname use OpenTransientFile? It
   looks like it should use BasicOpenFile as pre_sync_fname does. We
   close the fd immediately after calling fsync anyway. Or is there
   some reason I'm missing that we should use OpenTransientFile in
   pre_sync_fname too?

   (Aside²: it's pointless to specify S_IRUSR|S_IWUSR in fsync_fname
   since we're not setting O_CREAT. Minor, so will submit separate
   patch.)

5. I made walktblspc_entries use stat() instead of lstat(), so that we
   can handle symlinks and directories the same way. Note that we won't
   continue to follow links inside the sub-directories because we use
   walkdir instead of recursing.

   But this depends on S_ISDIR() returning true after stat() on Windows
   with a junction. Does that work? And are the entries in pb_tblspc on
   Windows actually junctions?

I've tested this with unreadable files in PGDATA and junk inside
pb_tblspc. Feedback welcome.

I have a separate patch to initdb with the corresponding changes, which
I will post after dinner and a bit more testing.

-- Abhijit
>From 491dcb8c7203fd992dd9c441f5463a75c88e6f52 Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <a...@2ndquadrant.com>
Date: Thu, 28 May 2015 17:22:18 +0530
Subject: Skip unreadable files in fsync_pgdata; follow only expected symlinks

---
 src/backend/access/transam/xlog.c |  40 +++++-
 src/backend/storage/file/fd.c     | 262 ++++++++++++++++++++++++++------------
 src/include/storage/fd.h          |   6 +-
 3 files changed, 226 insertions(+), 82 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 087b6be..32447e7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -11605,20 +11605,53 @@ SetWalWriterSleeping(bool sleeping)
 
 /*
  * Issue fsync recursively on PGDATA and all its contents.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_xlog and under pg_tblspc.
  */
 static void
 fsync_pgdata(char *datadir)
 {
+	bool		xlog_is_symlink;
+	char		pg_xlog[MAXPGPATH];
+	char		pg_tblspc[MAXPGPATH];
+	struct stat st;
+
 	if (!enableFsync)
 		return;
 
+	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", datadir);
+	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", datadir);
+
+	/*
+	 * If pg_xlog is a symlink, then we need to recurse into it separately,
+	 * because the first walkdir below will ignore it.
+	 */
+	xlog_is_symlink = false;
+
+	if (lstat(pg_xlog, &st) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("could not stat directory \"pg_xlog\": %m")));
+
+#ifndef WIN32
+	if (S_ISLNK(st.st_mode))
+		xlog_is_symlink = true;
+#else
+	if (pgwin32_is_junction(subpath))
+		xlog_is_symlink = true;
+#endif
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.
 	 */
 #if defined(HAVE_SYNC_FILE_RANGE) || \
 	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
-	walkdir(datadir, pre_sync_fname);
+	walkdir(DEBUG1, datadir, pre_sync_fname);
+	if (xlog_is_symlink)
+		walkdir(DEBUG1, pg_xlog, pre_sync_fname);
+	walktblspc_entries(DEBUG1, pg_tblspc, pre_sync_fname);
 #endif
 
 	/*
@@ -11628,5 +11661,8 @@ fsync_pgdata(char *datadir)
 	 * file fsyncs don't guarantee that the directory entry for the file is
 	 * synced.
 	 */
-	walkdir(datadir, fsync_fname);
+	walkdir(LOG, datadir, fsync_fname_ext);
+	if (xlog_is_symlink)
+		walkdir(LOG, pg_xlog, fsync_fname_ext);
+	walktblspc_entries(LOG, pg_tblspc, fsync_fname_ext);
 }
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 68d43c6..916f8b5 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1921,6 +1921,37 @@ TryAgain:
 }
 
 /*
+ * Read a directory opened with AllocateDir and return the next dirent.
+ * On error, ereports at a caller-specified level and returns NULL if
+ * the level is not ERROR or more severe.
+ */
+static struct dirent *
+ReadDirExtended(int elevel, DIR *dir, const char *dirname)
+{
+	struct dirent *dent;
+
+	/* Give a generic message for AllocateDir failure, if caller didn't */
+	if (dir == NULL)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open directory \"%s\": %m", dirname)));
+		return NULL;
+	}
+
+	errno = 0;
+	if ((dent = readdir(dir)) != NULL)
+		return dent;
+
+	if (errno)
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not read directory \"%s\": %m", dirname)));
+
+	return NULL;
+}
+
+/*
  * Read a directory opened with AllocateDir, ereport'ing any error.
  *
  * This is easier to use than raw readdir() since it takes care of some
@@ -1943,25 +1974,7 @@ TryAgain:
 struct dirent *
 ReadDir(DIR *dir, const char *dirname)
 {
-	struct dirent *dent;
-
-	/* Give a generic message for AllocateDir failure, if caller didn't */
-	if (dir == NULL)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not open directory \"%s\": %m",
-						dirname)));
-
-	errno = 0;
-	if ((dent = readdir(dir)) != NULL)
-		return dent;
-
-	if (errno)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not read directory \"%s\": %m",
-						dirname)));
-	return NULL;
+	return ReadDirExtended(ERROR, dir, dirname);
 }
 
 /*
@@ -2443,48 +2456,122 @@ looks_like_temp_rel_name(const char *name)
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
- * Adapted from pre_sync_fname in initdb.c
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
  */
 void
-pre_sync_fname(char *fname, bool isdir)
+pre_sync_fname(int elevel, char *fname, bool isdir)
 {
 	int			fd;
 
 	fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
 
+	if (fd < 0)
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
+
+#ifdef ETXTBSY
+		if (errno == ETXTBSY)
+			return;
+#endif
+
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+		return;
+	}
+
+	(void) pg_flush_data(fd, 0, 0);
+	(void) close(fd);
+}
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * Ignores errors trying to open unreadable files, or trying to fsync
+ * directories on systems where that isn't allowed/required, and logs other
+ * errors at a caller-specified level.
+ *
+ * The only reason fsync_fname doesn't call fsync_fname_ext(ERROR, …) is
+ * that EACCES and ETXTBSY are silently ignored here.
+ */
+void
+fsync_fname_ext(int elevel, char *fname, bool isdir)
+{
+	int			fd;
+	int			flags;
+	int			returncode;
+
 	/*
-	 * Some OSs don't allow us to open directories at all (Windows returns
-	 * EACCES)
+	 * Some OSs require directories to be opened read-only whereas other
+	 * systems don't allow us to fsync files opened read-only; so we need both
+	 * cases here
 	 */
-	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
-		return;
+	flags = PG_BINARY;
+	if (!isdir)
+		flags |= O_RDWR;
+	else
+		flags |= O_RDONLY;
 
+	/*
+	 * Open the file, silently ignoring errors about unreadable files
+	 * (or unsupported operations, e.g. opening a directory under
+	 * Windows), and logging others.
+	 */
+	fd = OpenTransientFile(fname, flags, 0);
 	if (fd < 0)
-		ereport(FATAL,
-				(errmsg("could not open file \"%s\": %m",
-						fname)));
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
+
+#ifdef ETXTBSY
+		if (errno == ETXTBSY)
+			return;
+#endif
 
-	pg_flush_data(fd, 0, 0);
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+		return;
+	}
+
+	returncode = pg_fsync(fd);
+
+	/*
+	 * Some OSes don't allow us to fsync directories at all, so we can ignore
+	 * those errors. Anything else needs to be logged.
+	 */
+	if (returncode != 0 && !(isdir && errno == EBADF))
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m", fname)));
 
-	close(fd);
+	CloseTransientFile(fd);
 }
 
 /*
  * walkdir: recursively walk a directory, applying the action to each
- * regular file and directory (including the named directory itself)
- * and following symbolic links.
+ * regular file and directory (including the named directory itself).
  *
- * NB: There is another version of walkdir in initdb.c, but that version
- * behaves differently with respect to symbolic links.  Caveat emptor!
+ * Adapted from walkdir in initdb.c
  */
 void
-walkdir(char *path, void (*action) (char *fname, bool isdir))
+walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir))
 {
 	DIR		   *dir;
 	struct dirent *de;
 
 	dir = AllocateDir(path);
-	while ((de = ReadDir(dir, path)) != NULL)
+	if (dir == NULL)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open directory \"%s\": %m", path)));
+		return;
+	}
+
+	while ((de = ReadDirExtended(elevel, dir, path)) != NULL)
 	{
 		char		subpath[MAXPGPATH];
 		struct stat fst;
@@ -2498,59 +2585,78 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
 		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
 
 		if (lstat(subpath, &fst) < 0)
-			ereport(ERROR,
+		{
+			ereport(elevel,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m", subpath)));
+			continue;
+		}
 
 		if (S_ISREG(fst.st_mode))
-			(*action) (subpath, false);
+			(*action) (elevel, subpath, false);
 		else if (S_ISDIR(fst.st_mode))
-			walkdir(subpath, action);
-#ifndef WIN32
-		else if (S_ISLNK(fst.st_mode))
-#else
-		else if (pgwin32_is_junction(subpath))
-#endif
-		{
-#if defined(HAVE_READLINK) || defined(WIN32)
-			char		linkpath[MAXPGPATH];
-			int			len;
-			struct stat lst;
+			walkdir(elevel, subpath, action);
+	}
+	FreeDir(dir);
 
-			len = readlink(subpath, linkpath, sizeof(linkpath) - 1);
-			if (len < 0)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not read symbolic link \"%s\": %m",
-								subpath)));
+	(*action) (elevel, path, true);
+}
 
-			if (len >= sizeof(linkpath) - 1)
-				ereport(ERROR,
-						(errmsg("symbolic link \"%s\" target is too long",
-								subpath)));
+/*
+ * walktblspc_entries -- apply the action to the entries in pb_tblspc
+ *
+ * We expect to find only symlinks to tablespace directories here, but
+ * we'll apply the action to regular files, and call walkdir() on any
+ * directories as well.
+ *
+ * Adapted from walktblspc_links in initdb.c
+ */
+void
+walktblspc_entries(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+	if (dir == NULL)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open directory \"%s\": %m", path)));
+		return;
+	}
 
-			linkpath[len] = '\0';
+	while ((de = ReadDirExtended(elevel, dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
 
-			if (lstat(linkpath, &lst) == 0)
-			{
-				if (S_ISREG(lst.st_mode))
-					(*action) (linkpath, false);
-				else if (S_ISDIR(lst.st_mode))
-					walkdir(subpath, action);
-			}
-			else if (errno != ENOENT)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not stat file \"%s\": %m", linkpath)));
-#else
-			ereport(WARNING,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("this platform does not support symbolic links; ignoring \"%s\"",
-							subpath)));
-#endif
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		/*
+		 * We use stat rather than lstat here because we want to know if
+		 * a link points to a directory or not.
+		 */
+		if (stat(subpath, &fst) < 0)
+		{
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", subpath)));
+			continue;
 		}
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (elevel, subpath, false);
+		else if (S_ISDIR(fst.st_mode))
+			walkdir(elevel, subpath, action);
 	}
 	FreeDir(dir);
 
-	(*action) (path, true);
+	(*action) (elevel, path, true);
 }
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5b563a6..60f4d9b 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -114,8 +114,10 @@ extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
-extern void pre_sync_fname(char *fname, bool isdir);
-extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
+extern void fsync_fname_ext(int elevel, char *fname, bool isdir);
+extern void pre_sync_fname(int elevel, char *fname, bool isdir);
+extern void walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir));
+extern void walktblspc_entries(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
1.9.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to