At 2015-01-15 14:32:45 +0100, and...@2ndquadrant.com wrote:
>
> What I am thinking of is that, currently, if you start the server for
> initial loading with fsync=off, and then restart it, you're open to
> data loss. So when the current config file setting is changed from off
> to on, we should fsync the data directory. Even if there was no crash
> restart.

Patch attached.

Changes:

1. Renamed perform_fsync to fsync_recursively (otherwise it would read
   "fsync_pgdata(pg_data)")
2. Added ControlFile->fsync_disabled to record whether fsync was ever
   disabled while the server was running (tested in CreateCheckPoint)
3. Run fsync_recursively at startup only if fsync is enabled
4. Run it if we're doing crash recovery, or fsync was disabled
5. Use pg_flush_data in pre_sync_fname
6. Issue fsync on directories too
7. Tested that it works if pg_xlog is a symlink (no changes).

(In short, everything you mentioned in your earlier mail.)

Note that I set ControlFile->fsync_disabled to false in BootStrapXLOG,
but it gets set to true during a later CreateCheckPoint(). This means
we run fsync again at startup after initdb. I'm not sure what to do
about that.

Is this about what you had in mind?

-- Abhijit
>From bb2b5f130525dd44a10eab6829b9802b8f6f7eed Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <a...@2ndquadrant.com>
Date: Thu, 6 Nov 2014 00:45:56 +0530
Subject: Recursively fsync PGDATA at startup if needed

It's needed if we need to perform crash recovery or if fsync was
disabled at some point while the server was running earlier (and
we must store that information in the control file).

This is so that we don't lose older unflushed writes in the event of
a power failure after crash recovery, where more recent writes are
preserved.

See 20140918083148.ga17...@alap3.anarazel.de for details.
---
 src/backend/access/transam/xlog.c       | 171 ++++++++++++++++++++++++++++++++
 src/bin/pg_controldata/pg_controldata.c |   2 +
 src/include/catalog/pg_control.h        |   8 +-
 3 files changed, 180 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 71cbe0e..75a6862 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -833,6 +833,12 @@ static void WALInsertLockAcquireExclusive(void);
 static void WALInsertLockRelease(void);
 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
+static void pre_sync_fname(char *fname, bool isdir);
+static void walkdir(char *path, void (*action) (char *fname, bool isdir));
+static void walktblspc_links(char *path,
+							 void (*action) (char *fname, bool isdir));
+static void fsync_recursively(char *pg_data);
+
 /*
  * Insert an XLOG record represented by an already-constructed chain of data
  * chunks.  This is a low-level routine; to construct the WAL record header
@@ -4721,6 +4727,7 @@ BootStrapXLOG(void)
 	ControlFile->checkPoint = checkPoint.redo;
 	ControlFile->checkPointCopy = checkPoint;
 	ControlFile->unloggedLSN = 1;
+	ControlFile->fsync_disabled = false;
 
 	/* Set important parameter values for use when replaying WAL */
 	ControlFile->MaxConnections = MaxConnections;
@@ -5787,6 +5794,27 @@ StartupXLOG(void)
 		ereport(FATAL,
 				(errmsg("control file contains invalid data")));
 
+	/*
+	 * If we need to perform crash recovery, we issue an fsync on the
+	 * data directory and its contents to try to ensure that any data
+	 * written before the crash are flushed to disk. Otherwise a power
+	 * failure in the near future might cause earlier unflushed writes
+	 * to be lost, even though more recent data written to disk from
+	 * here on would be persisted.
+	 *
+	 * We also do this if the control file indicates that fsync was
+	 * disabled at some point while the server was running earlier.
+	 */
+
+	if (enableFsync &&
+		(ControlFile->fsync_disabled ||
+		 (ControlFile->state != DB_SHUTDOWNED &&
+		  ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)))
+	{
+		fsync_recursively(data_directory);
+		ControlFile->fsync_disabled = false;
+	}
+
 	if (ControlFile->state == DB_SHUTDOWNED)
 	{
 		/* This is the expected case, so don't be chatty in standalone mode */
@@ -8137,6 +8165,8 @@ CreateCheckPoint(int flags)
 	/* crash recovery should always recover to the end of WAL */
 	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = 0;
+	if (!enableFsync)
+		ControlFile->fsync_disabled = true;
 
 	/*
 	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
@@ -11008,3 +11038,144 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ * fname is the path to open; isdir says whether it names a directory.
+ * Adapted from pre_sync_fname in initdb.c
+ */
+static void
+pre_sync_fname(char *fname, bool isdir)
+{
+	int			fd;
+
+	fd = open(fname, O_RDONLY | PG_BINARY);
+
+	/*
+	 * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES), so quietly skip a directory that fails in that way.
+	 */
+	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+		return;
+
+	if (fd < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" before fsync: %m", fname)));
+
+	pg_flush_data(fd, 0, 0);	/* only a hint; its result is not checked */
+
+	close(fd);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action once to each
+ * regular file and directory (including the named directory itself, after
+ * its contents).  Entries of any other type, e.g. symlinks, are skipped.
+ * Adapted from copydir() in copydir.c.
+ */
+static void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		if (lstat(subpath, &fst) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", subpath)));
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (subpath, false);
+		else if (S_ISDIR(fst.st_mode))
+		{
+			/* the recursive call applies the action to subpath itself */
+			walkdir(subpath, action);
+		}
+	}
+	FreeDir(dir);
+
+	/*
+	 * It's important to fsync the directory itself as individual file
+	 * fsyncs don't guarantee that the directory entry for the file is
+	 * synced.  Recent versions of ext4 have made the window much wider but
+	 * it's been an issue for ext3 and other filesystems in the past.
+	 */
+	(*action) (path, true);
+}
+
+/*
+ * walktblspc_links: call walkdir on each entry under the given
+ * pg_tblspc directory, or do nothing if pg_tblspc doesn't exist.
+ * Any other failure to open it is left for ReadDir to report.
+ * Adapted from walktblspc_links in initdb.c
+ */
+static void
+walktblspc_links(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *tblspc_dir;
+	struct dirent *de;
+
+	tblspc_dir = AllocateDir(path);
+	if (tblspc_dir == NULL && errno == ENOENT)
+		return;
+
+	while ((de = ReadDir(tblspc_dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		walkdir(subpath, action);
+	}
+	FreeDir(tblspc_dir);
+}
+
+/*
+ * fsync_recursively: issue fsync on PGDATA and everything beneath it,
+ * including the links under pg_tblspc.
+ *
+ * Modelled on perform_fsync in initdb.c
+ */
+static void
+fsync_recursively(char *pg_data)
+{
+	char		tblspc_path[MAXPGPATH];
+
+	snprintf(tblspc_path, sizeof(tblspc_path), "%s/pg_tblspc", pg_data);
+
+	/*
+	 * First pass, compiled in only where a suitable call is available:
+	 * hint to the kernel that the data directory, its contents, and
+	 * everything under pg_tblspc are about to be fsynced.
+	 */
+
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+	walkdir(pg_data, pre_sync_fname);
+	walktblspc_links(tblspc_path, pre_sync_fname);
+#endif
+
+	/* Second pass: the actual fsync()s, made in the same order. */
+
+	walkdir(pg_data, fsync_fname);
+	walktblspc_links(tblspc_path, fsync_fname);
+}
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index a838bb5..094f920 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -290,6 +290,8 @@ main(int argc, char *argv[])
 		   (uint32) ControlFile.backupEndPoint);
 	printf(_("End-of-backup record required:        %s\n"),
 		   ControlFile.backupEndRequired ? _("yes") : _("no"));
+	printf(_("Fsync disabled at runtime:            %s\n"),
+		   ControlFile.fsync_disabled ? _("yes") : _("no"));
 	printf(_("Current wal_level setting:            %s\n"),
 		   wal_level_str(ControlFile.wal_level));
 	printf(_("Current wal_log_hints setting:        %s\n"),
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 31232b1..c9d7680 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	942
+#define PG_CONTROL_VERSION	943
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -182,6 +182,12 @@ typedef struct ControlFileData
 	bool		track_commit_timestamp;
 
 	/*
+	 * Indicates whether fsync was ever disabled while the server was
+	 * running. Tested and set at checkpoints, reset at startup.
+	 */
+	bool		fsync_disabled;
+
+	/*
 	 * This data is used to check for hardware-architecture compatibility of
 	 * the database and the backend executable.  We need not check endianness
 	 * explicitly, since the pg_control version will surely look wrong to a
-- 
1.9.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to