At 2014-10-30 14:30:27 +0530, a...@2ndquadrant.com wrote: > > Here's a proposed patch to initdb to make initdb -S fsync everything > under pg_tblspc.
Oops, I meant to include the corresponding patch to xlog.c, which does the same thing at startup. It's based on the initdb patch, but modified to report errors with ereport() rather than fprintf/exit_nicely, and so on. (Note that this was written as a single chunk to aid backpatching; no attempt is made to share code between the patches in this set.) Now attached. -- Abhijit
>From ae91da4df7ee60e6f83c98801e979929442f588a Mon Sep 17 00:00:00 2001 From: Abhijit Menon-Sen <a...@2ndquadrant.com> Date: Thu, 6 Nov 2014 00:45:56 +0530 Subject: If we need to perform crash recovery, fsync PGDATA recursively This is so that we don't lose older unflushed writes in the event of a power failure after crash recovery, where more recent writes are preserved. See 20140918083148.ga17...@alap3.anarazel.de for details. --- src/backend/access/transam/xlog.c | 184 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ab04380..ef95f64 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -830,6 +830,12 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static void pre_sync_fname(char *fname, bool isdir); +static void walkdir(char *path, void (*action) (char *fname, bool isdir)); +static void walktblspc_links(char *path, + void (*action) (char *fname, bool isdir)); +static void perform_fsync(char *pg_data); + /* * Insert an XLOG record having the specified RMID and info bytes, * with the body of the record being the data chunk(s) described by @@ -5981,6 +5987,19 @@ StartupXLOG(void) ereport(FATAL, (errmsg("control file contains invalid data"))); + /* + * If we need to perform crash recovery, we issue an fsync on the + * data directory and its contents to try to ensure that any data + * written before the crash are flushed to disk. Otherwise a power + * failure in the near future might cause earlier unflushed writes + * to be lost, even though more recent data written to disk from + * here on would be persisted. 
+ */ + + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + perform_fsync(data_directory); + if (ControlFile->state == DB_SHUTDOWNED) { /* This is the expected case, so don't be chatty in standalone mode */ @@ -11262,3 +11281,168 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * fname is the path to open; isdir says whether it names a directory. + * If neither sync_file_range() nor a usable posix_fadvise() is available + * at compile time, this function does nothing. + * + * Adapted from pre_sync_fname in initdb.c + */ +static void +pre_sync_fname(char *fname, bool isdir) +{ +#if defined(HAVE_SYNC_FILE_RANGE) || \ + (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)) + int fd; + + fd = open(fname, O_RDONLY | PG_BINARY); + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES). That is not an error here; we just skip the hint for such + * a directory. + * + * Any other failure to open the file is reported as FATAL below. + * NOTE(review): unlike the lstat() failure report in walkdir(), that + * report has no errcode_for_file_access() and no %m, so the reason for + * the failure is not shown. + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return; + + if (fd < 0) + ereport(FATAL, + (errmsg("could not open file \"%s\" before fsync", + fname))); + + /* + * Prefer sync_file_range, else use posix_fadvise. We ignore any error + * here since this operation is only a hint anyway. + */ +#if defined(HAVE_SYNC_FILE_RANGE) + sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE); +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + + close(fd); +#endif +} + +/* + * walkdir: recursively walk a directory, applying the action to each + * regular file and directory (including the named directory itself). + * + * Adapted from copydir() in copydir.c. 
+ */ +static void +walkdir(char *path, void (*action) (char *fname, bool isdir)) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir(path); + while ((de = ReadDir(dir, path)) != NULL) + { + char subpath[MAXPGPATH]; + struct stat fst; + + CHECK_FOR_INTERRUPTS(); + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); + + if (lstat(subpath, &fst) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", subpath))); + + if (S_ISDIR(fst.st_mode)) + walkdir(subpath, action); + else if (S_ISREG(fst.st_mode)) + (*action) (subpath, false); + } + FreeDir(dir); + + /* + * It's important to apply the action (ultimately an fsync) to the + * directory itself as well, as individual file fsyncs don't guarantee + * that the directory entry for the file is synced. Recent versions of + * ext4 have made the window much wider but it's been an issue for ext3 + * and other filesystems in the past. The directory is processed last, + * after all of its contents. + */ + (*action) (path, true); +} + +/* + * walktblspc_links: call walkdir on each entry under the given + * pg_tblspc directory, or do nothing if that directory cannot be opened + * (e.g. because pg_tblspc doesn't exist). + * + * walkdir() examines entries with lstat() and so does not descend through + * symbolic links; here every entry is passed to walkdir() directly, with + * no such check. + * + * Adapted from walktblspc_links in initdb.c + */ +static void +walktblspc_links(char *path, void (*action) (char *fname, bool isdir)) +{ + DIR *tblspc_dir; + struct dirent *de; + + tblspc_dir = AllocateDir(path); + if (tblspc_dir == NULL) + return; + + while ((de = ReadDir(tblspc_dir, path)) != NULL) + { + char subpath[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); + + walkdir(subpath, action); + } + FreeDir(tblspc_dir); +} + +/* + * Issue fsync recursively on PGDATA and all its contents, including the + * links under pg_tblspc. 
+ * + * Adapted from perform_fsync in initdb.c + */ +static void +perform_fsync(char *pg_data) +{ + char pdir[MAXPGPATH]; + char pg_tblspc[MAXPGPATH]; + + /* + * We need to name the parent of PGDATA. get_parent_directory() isn't + * enough here, because it can result in an empty string. So we append + * "/.." to the PGDATA path and canonicalize the result instead. + */ + snprintf(pdir, MAXPGPATH, "%s/..", pg_data); + canonicalize_path(pdir); + + /* + * Hint to the OS that we're going to fsync each of these files soon. + */ + + /* first the parent of the PGDATA directory */ + pre_sync_fname(pdir, true); + + /* then recursively through the directory */ + walkdir(pg_data, pre_sync_fname); + + /* + * now do the same thing for everything under pg_tblspc; walkdir() uses + * lstat() and does not follow the links there, so they need their own + * pass + */ + + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data); + walktblspc_links(pg_tblspc, pre_sync_fname); + + /* + * Now, do the fsync()s in the same order: the parent directory, then + * PGDATA itself, then everything under pg_tblspc. + */ + + /* first the parent of the PGDATA directory */ + fsync_fname(pdir, true); + + /* then recursively through the directory */ + walkdir(pg_data, fsync_fname); + + walktblspc_links(pg_tblspc, fsync_fname); +} -- 1.9.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers