At 2014-10-30 14:30:27 +0530, a...@2ndquadrant.com wrote:
>
> Here's a proposed patch to initdb to make initdb -S fsync everything
> under pg_tblspc.

Oops, I meant to include the corresponding patch to xlog.c to do the
same at startup. It's based on the initdb patch, but modified to not
use fprintf/exit_nicely and so on. (Note that this was written in a
single chunk to aid backpatching. There's no attempt made to share
code in this set of patches.)

Now attached.

-- Abhijit
From ae91da4df7ee60e6f83c98801e979929442f588a Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <a...@2ndquadrant.com>
Date: Thu, 6 Nov 2014 00:45:56 +0530
Subject: If we need to perform crash recovery, fsync PGDATA recursively

Without this, a power failure after crash recovery could lose older,
still-unflushed writes even though more recent writes had been
preserved.

See 20140918083148.ga17...@alap3.anarazel.de for details.
---
 src/backend/access/transam/xlog.c | 184 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ab04380..ef95f64 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -830,6 +830,12 @@ static void WALInsertLockAcquireExclusive(void);
 static void WALInsertLockRelease(void);
 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
+static void pre_sync_fname(char *fname, bool isdir);
+static void walkdir(char *path, void (*action) (char *fname, bool isdir));
+static void walktblspc_links(char *path,
+							 void (*action) (char *fname, bool isdir));
+static void perform_fsync(char *pg_data);
+
 /*
  * Insert an XLOG record having the specified RMID and info bytes,
  * with the body of the record being the data chunk(s) described by
@@ -5981,6 +5987,19 @@ StartupXLOG(void)
 		ereport(FATAL,
 				(errmsg("control file contains invalid data")));
 
+	/*
+	 * If we need to perform crash recovery, we issue an fsync on the
+	 * data directory and its contents to try to ensure that any data
+	 * written before the crash are flushed to disk. Otherwise a power
+	 * failure in the near future might cause earlier unflushed writes
+	 * to be lost, even though more recent data written to disk from
+	 * here on would be persisted.
+	 */
+
+	if (ControlFile->state != DB_SHUTDOWNED &&
+		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+		perform_fsync(data_directory);
+
 	if (ControlFile->state == DB_SHUTDOWNED)
 	{
 		/* This is the expected case, so don't be chatty in standalone mode */
@@ -11262,3 +11281,168 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
+
+/*
+ * Hint to the OS that it should get ready to fsync() the file "fname"
+ * ("isdir" tells whether it is a directory).  Failure to open the file
+ * is FATAL, except for directories on systems that cannot open them.
+ * Adapted from pre_sync_fname in initdb.c
+ */
+static void
+pre_sync_fname(char *fname, bool isdir)
+{
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+	int			fd;
+
+	fd = open(fname, O_RDONLY | PG_BINARY);
+	/*
+	 * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES); such a directory is skipped without complaint.
+	 */
+	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+		return;
+
+	if (fd < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+
+	/*
+	 * Prefer sync_file_range, else use posix_fadvise.  We ignore any error
+	 * here since this operation is only a hint anyway.
+	 */
+#if defined(HAVE_SYNC_FILE_RANGE)
+	sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+
+	close(fd);
+#endif
+}
+
+/*
+ * walkdir: depth-first traversal of the directory "path".  "action" is
+ * called for every regular file found and for every directory, the
+ * starting directory included, once that directory's entries are done.
+ * Adapted from copydir() in copydir.c.
+ */
+static void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *dirdesc;
+	struct dirent *entry;
+
+	dirdesc = AllocateDir(path);
+	while ((entry = ReadDir(dirdesc, path)) != NULL)
+	{
+		char		child[MAXPGPATH];
+		struct stat statbuf;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* ignore the self and parent entries */
+		if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+			continue;
+
+		snprintf(child, sizeof(child), "%s/%s", path, entry->d_name);
+
+		if (lstat(child, &statbuf) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", child)));
+
+		if (S_ISREG(statbuf.st_mode))
+			(*action) (child, false);
+		else if (S_ISDIR(statbuf.st_mode))
+			walkdir(child, action);
+		/* whatever lstat() reports as neither of those is left alone */
+	}
+	FreeDir(dirdesc);
+
+	/*
+	 * Finish with the directory itself.  When the action is an fsync this
+	 * matters: syncing the files alone does not guarantee that their
+	 * directory entries have been synced as well.
+	 */
+	(*action) (path, true);
+}
+
+/*
+ * walktblspc_links: call walkdir on each entry under the given
+ * pg_tblspc directory, or do nothing if pg_tblspc doesn't exist.
+ * Any other failure to open the directory is reported, not ignored.
+ * Adapted from walktblspc_links in initdb.c
+ */
+static void
+walktblspc_links(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *tblspc_dir;
+	struct dirent *de;
+
+	/* Only ENOENT is benign; ReadDir() raises the error for a NULL dir. */
+	tblspc_dir = AllocateDir(path);
+	if (tblspc_dir == NULL && errno == ENOENT)
+		return;
+
+	while ((de = ReadDir(tblspc_dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+		walkdir(subpath, action);
+	}
+	FreeDir(tblspc_dir);
+}
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents, including the
+ * links under pg_tblspc, and pg_xlog when that is itself a symbolic link.
+ * Adapted from perform_fsync in initdb.c
+ */
+static void
+perform_fsync(char *pg_data)
+{
+	char		pdir[MAXPGPATH];
+	char		pg_tblspc[MAXPGPATH];
+	char		pg_xlog[MAXPGPATH];
+	bool		xlog_is_link;
+#ifndef WIN32
+	struct stat st;
+#endif
+
+	/*
+	 * We need to name the parent of PGDATA.  get_parent_directory() isn't
+	 * enough here, because it can result in an empty string.
+	 */
+	snprintf(pdir, MAXPGPATH, "%s/..", pg_data);
+	canonicalize_path(pdir);
+	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
+	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data);
+
+	/* walkdir() lstat()s entries, so it won't descend a symlinked pg_xlog */
+#ifndef WIN32
+	xlog_is_link = (lstat(pg_xlog, &st) == 0 && S_ISLNK(st.st_mode));
+#else
+	xlog_is_link = pgwin32_is_junction(pg_xlog);
+#endif
+
+	/* First hint to the OS that we're going to fsync each of these soon: */
+	pre_sync_fname(pdir, true);			/* the parent of PGDATA */
+	walkdir(pg_data, pre_sync_fname);	/* PGDATA itself, recursively */
+	if (xlog_is_link)
+		walkdir(pg_xlog, pre_sync_fname);	/* the linked-to WAL directory */
+	walktblspc_links(pg_tblspc, pre_sync_fname);	/* entries of pg_tblspc */
+
+	/* Now do the fsync()s themselves, in the same order. */
+	fsync_fname(pdir, true);
+	walkdir(pg_data, fsync_fname);
+	if (xlog_is_link)
+		walkdir(pg_xlog, fsync_fname);
+	walktblspc_links(pg_tblspc, fsync_fname);
+}
-- 
1.9.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to