At 2015-05-01 08:10:16 -0400, robertmh...@gmail.com wrote:
>
> It seems to me that, at a minimum, it would be good to split those
> controversial and definitely not-back-patchable changes into their
> own patch.

OK, split here (0002*).

> I do mind putting it into xlog.c instead of some place that's actually
> appropriate.

OK, moved to storage/file/fd.c (0001*).

-- Abhijit
>From 088b80eb0825339eca688e4347a4ef547edcadbb Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <a...@2ndquadrant.com>
Date: Thu, 6 Nov 2014 00:45:56 +0530
Subject: Recursively fsync PGDATA at startup after a crash

After a crash, writes made before the crash may still be unflushed.  If
power fails after crash recovery, those older writes could be lost even
though more recent writes are preserved, so fsync everything at startup.

See 20140918083148.ga17...@alap3.anarazel.de for details.
---
 src/backend/access/transam/xlog.c |  49 +++++++++++++++++
 src/backend/storage/file/fd.c     | 112 ++++++++++++++++++++++++++++++++++++++
 src/include/storage/fd.h          |   2 +
 3 files changed, 163 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6cf4415..084174d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -845,6 +845,8 @@ static void WALInsertLockAcquireExclusive(void);
 static void WALInsertLockRelease(void);
 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
+static void fsync_pgdata(char *datadir);
+
 /*
  * Insert an XLOG record represented by an already-constructed chain of data
  * chunks.  This is a low-level routine; to construct the WAL record header
@@ -5878,6 +5880,25 @@ StartupXLOG(void)
 		ereport(FATAL,
 				(errmsg("control file contains invalid data")));
 
+	/*
+	 * If we need to perform crash recovery, we issue an fsync on the
+	 * data directory and its contents to try to ensure that any data
+	 * written before the crash are flushed to disk. Otherwise a power
+	 * failure in the near future might cause earlier unflushed writes
+	 * to be lost, even though more recent data written to disk from
+	 * here on would be persisted.
+	 *
+	 * We also do this if the control file indicates that fsync was
+	 * disabled at some point while the server was running earlier.
+	 */
+
+	if (enableFsync &&
+		(ControlFile->state != DB_SHUTDOWNED &&
+		 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+	{
+		fsync_pgdata(data_directory);
+	}
+
 	if (ControlFile->state == DB_SHUTDOWNED)
 	{
 		/* This is the expected case, so don't be chatty in standalone mode */
@@ -11123,3 +11144,31 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
+
+/*
+ * fsync_pgdata: recursively fsync datadir and all of its contents.  This is
+ * a no-op when fsync is disabled (enableFsync is false).
+ */
+static void
+fsync_pgdata(char *datadir)
+{
+	if (enableFsync)
+	{
+		/*
+		 * Where supported, first hint to the kernel that these files are
+		 * about to be fsynced.
+		 */
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+		walkdir(datadir, pre_sync_fname);
+#endif
+
+		/*
+		 * Then make the same pass again, this time really fsyncing.  walkdir
+		 * also applies the action to each directory, which matters because
+		 * fsyncing a file does not guarantee that its directory entry is
+		 * synced.
+		 */
+		walkdir(datadir, fsync_fname);
+	}
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index f796717..aba12ca 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2439,3 +2439,115 @@ looks_like_temp_rel_name(const char *name)
 		return false;
 	return true;
 }
+
+/*
+ * pre_sync_fname: hint to the OS that it should get ready to fsync() "fname"
+ * ("isdir" says whether it is a directory).  Any failure to open it, other
+ * than the directory case handled below, is FATAL.  Adapted from initdb.c.
+ */
+void
+pre_sync_fname(char *fname, bool isdir)
+{
+	int			fd;
+
+	/* Not plain two-argument open(): the Win32 open() macro needs a mode. */
+	fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
+
+	/*
+	 * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES)
+	 */
+	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+		return;
+
+	if (fd < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" before fsync: %m", fname)));
+
+	pg_flush_data(fd, 0, 0);
+	close(fd);
+}
+
+/*
+ * walkdir: recursively walk "path", calling action(fname, isdir) on every
+ * regular file (isdir = false) and on every directory, including "path"
+ * itself (isdir = true), each directory after its contents.  Symbolic links
+ * to files and directories are followed; dangling links are skipped.
+ */
+void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		if (lstat(subpath, &fst) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", subpath)));
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (subpath, false);
+		else if (S_ISDIR(fst.st_mode))
+			walkdir(subpath, action);
+#ifndef WIN32
+		else if (S_ISLNK(fst.st_mode))
+#else
+		else if (pgwin32_is_junction(subpath))
+#endif
+		{
+#if defined(HAVE_READLINK) || defined(WIN32)
+			char		linkpath[MAXPGPATH];
+			int			len;
+			struct stat lst;
+
+			len = readlink(subpath, linkpath, sizeof(linkpath)-1);
+			if (len < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read symbolic link \"%s\": %m",
+								subpath)));
+
+			if (len >= sizeof(linkpath)-1)
+				ereport(ERROR,
+						(errmsg("symbolic link \"%s\" target is too long",
+								subpath)));
+
+			linkpath[len] = '\0';
+
+			if (lstat(linkpath, &lst) == 0)
+			{
+				if (S_ISREG(lst.st_mode))
+					(*action) (linkpath, false);
+				else if (S_ISDIR(lst.st_mode))
+					walkdir(subpath, action);
+			}
+			else if (errno != ENOENT)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m", linkpath)));
+#else
+			ereport(WARNING,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("this platform does not support symbolic links; ignoring \"%s\"",
+							subpath)));
+#endif
+		}
+	}
+	FreeDir(dir);
+
+	(*action) (path, true);
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5e9571c..5b563a6 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -114,6 +114,8 @@ extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
+extern void pre_sync_fname(char *fname, bool isdir);
+extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
1.9.1

>From 1768680b672bcb037446230323cabcf9960f7f9a Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <a...@2ndquadrant.com>
Date: Fri, 1 May 2015 17:59:51 +0530
Subject: Recursively fsync PGDATA on the next restart after fsync was disabled

Even if we didn't crash, we want to fsync PGDATA on startup if we know
the server ran with fsync=off at some point since the previous restart.
Otherwise, starting the server with fsync=off for initial data loading
and then restarting it opens you to data loss on a power failure after
the restart.
---
 src/backend/access/transam/xlog.c       | 9 +++++++--
 src/bin/pg_controldata/pg_controldata.c | 2 ++
 src/include/catalog/pg_control.h        | 8 +++++++-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 084174d..af12992 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4823,6 +4823,7 @@ BootStrapXLOG(void)
 	ControlFile->checkPoint = checkPoint.redo;
 	ControlFile->checkPointCopy = checkPoint;
 	ControlFile->unloggedLSN = 1;
+	ControlFile->fsync_disabled = false;
 
 	/* Set important parameter values for use when replaying WAL */
 	ControlFile->MaxConnections = MaxConnections;
@@ -5893,10 +5894,12 @@ StartupXLOG(void)
 	 */
 
 	if (enableFsync &&
-		(ControlFile->state != DB_SHUTDOWNED &&
-		 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+		(ControlFile->fsync_disabled ||
+		 (ControlFile->state != DB_SHUTDOWNED &&
+		  ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)))
 	{
 		fsync_pgdata(data_directory);
+		ControlFile->fsync_disabled = false;
 	}
 
 	if (ControlFile->state == DB_SHUTDOWNED)
@@ -8272,6 +8275,8 @@ CreateCheckPoint(int flags)
 	/* crash recovery should always recover to the end of WAL */
 	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = 0;
+	if (!enableFsync)
+		ControlFile->fsync_disabled = true;
 
 	/*
 	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index d8cfe5e..e99014f 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -290,6 +290,8 @@ main(int argc, char *argv[])
 		   (uint32) ControlFile.backupEndPoint);
 	printf(_("End-of-backup record required:        %s\n"),
 		   ControlFile.backupEndRequired ? _("yes") : _("no"));
+	printf(_("Fsync disabled at runtime:            %s\n"),
+		   ControlFile.fsync_disabled ? _("yes") : _("no"));
 	printf(_("Current wal_level setting:            %s\n"),
 		   wal_level_str(ControlFile.wal_level));
 	printf(_("Current wal_log_hints setting:        %s\n"),
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 2e4c381..a71d73e 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	942
+#define PG_CONTROL_VERSION	943
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -182,6 +182,12 @@ typedef struct ControlFileData
 	bool		track_commit_timestamp;
 
 	/*
+	 * Indicates whether fsync was ever disabled since the last restart.
+	 * Tested and set at checkpoints, reset at startup.
+	 */
+	bool		fsync_disabled;
+
+	/*
 	 * This data is used to check for hardware-architecture compatibility of
 	 * the database and the backend executable.  We need not check endianness
 	 * explicitly, since the pg_control version will surely look wrong to a
-- 
1.9.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to