Hi. Here's an updated patch for the fsync problem(s).
A few points that may be worth considering: 1. I've made the ReadDir → ReadDirExtended change, but in retrospect I don't think it's really worth it. Using ReadDir and letting it die is probably fine. (Aside: I left ReadDirExtended static for now.) 2. Robert, are you comfortable with what fsync_pgdata() does in xlog.c? I wasn't sure if I should move that to fd.c as well. I think it's borderline OK for now. 3. There's a comment in fsync_fname that we need to open directories with O_RDONLY and non-directories with O_RDWR. Does this also apply to pre_sync_fname (i.e. sync_file_range and posix_fadvise) on any platforms we care about? It doesn't seem so, but I'm not sure. 4. As a partial aside, why does fsync_fname use OpenTransientFile? It looks like it should use BasicOpenFile as pre_sync_fname does. We close the fd immediately after calling fsync anyway. Or is there some reason I'm missing that we should use OpenTransientFile in pre_sync_fname too? (Aside²: it's pointless to specify S_IRUSR|S_IWUSR in fsync_fname since we're not setting O_CREAT. Minor, so will submit separate patch.) 5. I made walktblspc_entries use stat() instead of lstat(), so that we can handle symlinks and directories the same way. Note that we won't continue to follow links inside the sub-directories because we use walkdir instead of recursing. But this depends on S_ISDIR() returning true after stat() on Windows with a junction. Does that work? And are the entries in pb_tblspc on Windows actually junctions? I've tested this with unreadable files in PGDATA and junk inside pb_tblspc. Feedback welcome. I have a separate patch to initdb with the corresponding changes, which I will post after dinner and a bit more testing. -- Abhijit
>From 491dcb8c7203fd992dd9c441f5463a75c88e6f52 Mon Sep 17 00:00:00 2001 From: Abhijit Menon-Sen <a...@2ndquadrant.com> Date: Thu, 28 May 2015 17:22:18 +0530 Subject: Skip unreadable files in fsync_pgdata; follow only expected symlinks --- src/backend/access/transam/xlog.c | 40 +++++- src/backend/storage/file/fd.c | 262 ++++++++++++++++++++++++++------------ src/include/storage/fd.h | 6 +- 3 files changed, 226 insertions(+), 82 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 087b6be..32447e7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -11605,20 +11605,53 @@ SetWalWriterSleeping(bool sleeping) /* * Issue fsync recursively on PGDATA and all its contents. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_xlog and under pg_tblspc. */ static void fsync_pgdata(char *datadir) { + bool xlog_is_symlink; + char pg_xlog[MAXPGPATH]; + char pg_tblspc[MAXPGPATH]; + struct stat st; + if (!enableFsync) return; + snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", datadir); + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", datadir); + + /* + * If pg_xlog is a symlink, then we need to recurse into it separately, + * because the first walkdir below will ignore it. + */ + xlog_is_symlink = false; + + if (lstat(pg_xlog, &st) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat directory \"pg_xlog\": %m"))); + +#ifndef WIN32 + if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; +#else + if (pgwin32_is_junction(subpath)) + xlog_is_symlink = true; +#endif + /* * If possible, hint to the kernel that we're soon going to fsync the data * directory and its contents. */ #if defined(HAVE_SYNC_FILE_RANGE) || \ (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)) - walkdir(datadir, pre_sync_fname); + walkdir(DEBUG1, datadir, pre_sync_fname); + if (xlog_is_symlink) + walkdir(DEBUG1, pg_xlog, pre_sync_fname); + walktblspc_entries(DEBUG1, pg_tblspc, pre_sync_fname); #endif /* @@ -11628,5 +11661,8 @@ fsync_pgdata(char *datadir) * file fsyncs don't guarantee that the directory entry for the file is * synced. */ - walkdir(datadir, fsync_fname); + walkdir(LOG, datadir, fsync_fname_ext); + if (xlog_is_symlink) + walkdir(LOG, pg_xlog, fsync_fname_ext); + walktblspc_entries(LOG, pg_tblspc, fsync_fname_ext); } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 68d43c6..916f8b5 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1921,6 +1921,37 @@ TryAgain: } /* + * Read a directory opened with AllocateDir and return the next dirent. + * On error, ereports at a caller-specified level and returns NULL if + * the level is not ERROR or more severe. + */ +static struct dirent * +ReadDirExtended(int elevel, DIR *dir, const char *dirname) +{ + struct dirent *dent; + + /* Give a generic message for AllocateDir failure, if caller didn't */ + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", dirname))); + return NULL; + } + + errno = 0; + if ((dent = readdir(dir)) != NULL) + return dent; + + if (errno) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not read directory \"%s\": %m", dirname))); + + return NULL; +} + +/* * Read a directory opened with AllocateDir, ereport'ing any error. * * This is easier to use than raw readdir() since it takes care of some @@ -1943,25 +1974,7 @@ TryAgain: struct dirent * ReadDir(DIR *dir, const char *dirname) { - struct dirent *dent; - - /* Give a generic message for AllocateDir failure, if caller didn't */ - if (dir == NULL) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open directory \"%s\": %m", - dirname))); - - errno = 0; - if ((dent = readdir(dir)) != NULL) - return dent; - - if (errno) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read directory \"%s\": %m", - dirname))); - return NULL; + return ReadDirExtended(ERROR, dir, dirname); } /* @@ -2443,48 +2456,122 @@ looks_like_temp_rel_name(const char *name) /* * Hint to the OS that it should get ready to fsync() this file. * - * Adapted from pre_sync_fname in initdb.c + * Ignores errors trying to open unreadable files, and logs other errors at a + * caller-specified level. */ void -pre_sync_fname(char *fname, bool isdir) +pre_sync_fname(int elevel, char *fname, bool isdir) { int fd; fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; +#endif + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } + + (void) pg_flush_data(fd, 0, 0); + (void) close(fd); +} + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * Ignores errors trying to open unreadable files, or trying to fsync + * directories on systems where that isn't allowed/required, and logs other + * errors at a caller-specified level. + * + * The only reason fsync_fname doesn't call fsync_fname_ext(ERROR, …) is + * that EACCES and ETXTBSY are silently ignored here. + */ +void +fsync_fname_ext(int elevel, char *fname, bool isdir) +{ + int fd; + int flags; + int returncode; + /* - * Some OSs don't allow us to open directories at all (Windows returns - * EACCES) + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here */ - if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) - return; + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + /* + * Open the file, silently ignoring errors about unreadable files + * (or unsupported operations, e.g. opening a directory under + * Windows), and logging others. + */ + fd = OpenTransientFile(fname, flags, 0); if (fd < 0) - ereport(FATAL, - (errmsg("could not open file \"%s\": %m", - fname))); + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; +#endif - pg_flush_data(fd, 0, 0); + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } + + returncode = pg_fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be logged. + */ + if (returncode != 0 && !(isdir && errno == EBADF)) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); - close(fd); + CloseTransientFile(fd); } /* * walkdir: recursively walk a directory, applying the action to each - * regular file and directory (including the named directory itself) - * and following symbolic links. + * regular file and directory (including the named directory itself). * - * NB: There is another version of walkdir in initdb.c, but that version - * behaves differently with respect to symbolic links. Caveat emptor! + * Adapted from walkdir in initdb.c */ void -walkdir(char *path, void (*action) (char *fname, bool isdir)) +walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir)) { DIR *dir; struct dirent *de; dir = AllocateDir(path); - while ((de = ReadDir(dir, path)) != NULL) + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", path))); + return; + } + + while ((de = ReadDirExtended(elevel, dir, path)) != NULL) { char subpath[MAXPGPATH]; struct stat fst; @@ -2498,59 +2585,78 @@ walkdir(char *path, void (*action) (char *fname, bool isdir)) snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); if (lstat(subpath, &fst) < 0) - ereport(ERROR, + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", subpath))); + continue; + } if (S_ISREG(fst.st_mode)) - (*action) (subpath, false); + (*action) (elevel, subpath, false); else if (S_ISDIR(fst.st_mode)) - walkdir(subpath, action); -#ifndef WIN32 - else if (S_ISLNK(fst.st_mode)) -#else - else if (pgwin32_is_junction(subpath)) -#endif - { -#if defined(HAVE_READLINK) || defined(WIN32) - char linkpath[MAXPGPATH]; - int len; - struct stat lst; + walkdir(elevel, subpath, action); + } + FreeDir(dir); - len = readlink(subpath, linkpath, sizeof(linkpath) - 1); - if (len < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read symbolic link \"%s\": %m", - subpath))); + (*action) (elevel, path, true); +} - if (len >= sizeof(linkpath) - 1) - ereport(ERROR, - (errmsg("symbolic link \"%s\" target is too long", - subpath))); +/* + * walktblspc_entries -- apply the action to the entries in pb_tblspc + * + * We expect to find only symlinks to tablespace directories here, but + * we'll apply the action to regular files, and call walkdir() on any + * directories as well. + * + * Adapted from walktblspc_links in initdb.c + */ +void +walktblspc_entries(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir)) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir(path); + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", path))); + return; + } - linkpath[len] = '\0'; + while ((de = ReadDirExtended(elevel, dir, path)) != NULL) + { + char subpath[MAXPGPATH]; + struct stat fst; - if (lstat(linkpath, &lst) == 0) - { - if (S_ISREG(lst.st_mode)) - (*action) (linkpath, false); - else if (S_ISDIR(lst.st_mode)) - walkdir(subpath, action); - } - else if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", linkpath))); -#else - ereport(WARNING, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("this platform does not support symbolic links; ignoring \"%s\"", - subpath))); -#endif + CHECK_FOR_INTERRUPTS(); + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); + + /* + * We use stat rather than lstat here because we want to know if + * a link points to a directory or not. + */ + if (stat(subpath, &fst) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", subpath))); + continue; } + + if (S_ISREG(fst.st_mode)) + (*action) (elevel, subpath, false); + else if (S_ISDIR(fst.st_mode)) + walkdir(elevel, subpath, action); } FreeDir(dir); - (*action) (path, true); + (*action) (elevel, path, true); } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 5b563a6..60f4d9b 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -114,8 +114,10 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(char *fname, bool isdir); -extern void pre_sync_fname(char *fname, bool isdir); -extern void walkdir(char *path, void (*action) (char *fname, bool isdir)); +extern void fsync_fname_ext(int elevel, char *fname, bool isdir); +extern void pre_sync_fname(int elevel, char *fname, bool isdir); +extern void walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir)); +extern void walktblspc_entries(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir)); /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp" -- 1.9.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers