From b6a0267321314992cd0a355002610c0514ec0cc8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 29 May 2021 23:38:10 +1200
Subject: [PATCH] Support direct I/O on macOS.

Macs don't understand O_DIRECT, but they do know how to disable kernel
caching if you make a separate fcntl() call.  Extend the file opening
wrappers in fd.c to handle this.

For now, this affects only WAL data and even then only if you set:

  wal_level=minimal
  max_wal_senders=0

Later proposed patches will make greater use of direct I/O, and it'll be
useful for testing if people developing on Macs can test that.
---
 src/backend/access/transam/xlog.c     | 74 ++++++++++++++++++---------
 src/backend/storage/file/fd.c         | 54 +++++++++++++++++++
 src/bin/pg_test_fsync/pg_test_fsync.c | 35 +++++++++++--
 src/include/access/xlogdefs.h         | 15 ------
 src/include/storage/fd.h              |  2 +
 5 files changed, 138 insertions(+), 42 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 441a9124cd..1732ca944b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -961,6 +961,7 @@ static bool read_tablespace_map(List **tablespaces);
 
 static void rm_redo_error_callback(void *arg);
 static int	get_sync_bit(int method);
+static bool	get_direct_flag(int method);
 
 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 								XLogRecData *rdata,
@@ -3296,7 +3297,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	 */
 	if (*use_existent)
 	{
-		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+		fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+								 get_direct_flag(sync_method));
 		if (fd < 0)
 		{
 			if (errno != ENOENT)
@@ -3454,7 +3456,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	*use_existent = false;
 
 	/* Now open original target segment (might not be file I just made) */
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+	fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+							 get_direct_flag(sync_method));
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3693,7 +3696,8 @@ XLogFileOpen(XLogSegNo segno)
 
 	XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
 
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+	fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+							 get_direct_flag(sync_method));
 	if (fd < 0)
 		ereport(PANIC,
 				(errcode_for_file_access(),
@@ -5455,8 +5459,10 @@ readRecoverySignalFile(void)
 	{
 		int			fd;
 
-		fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
-							   S_IRUSR | S_IWUSR);
+		fd = BasicOpenFilePermDirect(STANDBY_SIGNAL_FILE,
+									 O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+									 S_IRUSR | S_IWUSR,
+									 get_direct_flag(sync_method));
 		if (fd >= 0)
 		{
 			(void) pg_fsync(fd);
@@ -5468,8 +5474,10 @@ readRecoverySignalFile(void)
 	{
 		int			fd;
 
-		fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
-							   S_IRUSR | S_IWUSR);
+		fd = BasicOpenFilePermDirect(RECOVERY_SIGNAL_FILE,
+									 O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+									 S_IRUSR | S_IWUSR,
+									 get_direct_flag(sync_method));
 		if (fd >= 0)
 		{
 			(void) pg_fsync(fd);
@@ -10505,12 +10513,40 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record)
 static int
 get_sync_bit(int method)
 {
-	int			o_direct_flag = 0;
-
 	/* If fsync is disabled, never open in sync mode */
 	if (!enableFsync)
 		return 0;
 
+	switch (method)
+	{
+			/*
+			 * enum values for all sync options are defined even if they are
+			 * not supported on the current platform.  But if not, they are
+			 * not included in the enum option array, and therefore will never
+			 * be seen here.
+			 */
+		case SYNC_METHOD_FSYNC:
+		case SYNC_METHOD_FSYNC_WRITETHROUGH:
+		case SYNC_METHOD_FDATASYNC:
+			return 0;
+#ifdef OPEN_SYNC_FLAG
+		case SYNC_METHOD_OPEN:
+			return OPEN_SYNC_FLAG;
+#endif
+#ifdef OPEN_DATASYNC_FLAG
+		case SYNC_METHOD_OPEN_DSYNC:
+			return OPEN_DATASYNC_FLAG;
+#endif
+		default:
+			/* can't happen (unless we are out of sync with option array) */
+			elog(ERROR, "unrecognized wal_sync_method: %d", method);
+			return 0;			/* silence warning */
+	}
+}
+
+static bool
+get_direct_flag(int method)
+{
 	/*
 	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
 	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
@@ -10525,33 +10561,25 @@ get_sync_bit(int method)
 	 * after it's written. Also, walreceiver performs unaligned writes, which
 	 * don't work with O_DIRECT, so it is required for correctness too.
 	 */
-	if (!XLogIsNeeded() && !AmWalReceiverProcess())
-		o_direct_flag = PG_O_DIRECT;
+	if (XLogIsNeeded() || AmWalReceiverProcess())
+		return false;
 
 	switch (method)
 	{
-			/*
-			 * enum values for all sync options are defined even if they are
-			 * not supported on the current platform.  But if not, they are
-			 * not included in the enum option array, and therefore will never
-			 * be seen here.
-			 */
 		case SYNC_METHOD_FSYNC:
 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
 		case SYNC_METHOD_FDATASYNC:
-			return 0;
+			return false;
 #ifdef OPEN_SYNC_FLAG
 		case SYNC_METHOD_OPEN:
-			return OPEN_SYNC_FLAG | o_direct_flag;
+			return true;
 #endif
 #ifdef OPEN_DATASYNC_FLAG
 		case SYNC_METHOD_OPEN_DSYNC:
-			return OPEN_DATASYNC_FLAG | o_direct_flag;
+			return true;
 #endif
 		default:
-			/* can't happen (unless we are out of sync with option array) */
-			elog(ERROR, "unrecognized wal_sync_method: %d", method);
-			return 0;			/* silence warning */
+			return false;
 	}
 }
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index e8cd7ef088..0fff5e3605 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1035,6 +1035,15 @@ BasicOpenFile(const char *fileName, int fileFlags)
 	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
 }
 
+/*
+ * As BasicOpenFile(), but optionally attempt to disable kernel buffering.
+ */
+int
+BasicOpenFileDirect(const char *fileName, int fileFlags, bool direct)
+{
+	return BasicOpenFilePermDirect(fileName, fileFlags, pg_file_create_mode, direct);
+}
+
 /*
  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
  *
@@ -1078,6 +1087,51 @@ tryAgain:
 	return -1;					/* failure */
 }
 
+/*
+ * BasicOpenDirectFilePerm --- open a file, optionally without kernel buffering
+ *
+ * On systems that have a way to request it, open the file in direct mode if
+ * requested.
+ */
+int
+BasicOpenFilePermDirect(const char *fileName, int fileFlags, mode_t fileMode,
+						bool direct)
+{
+	int			fd;
+
+#ifdef O_DIRECT
+	if (direct)
+	{
+		/*
+		 * Almost every Unix system copied IRIX's O_DIRECT flag, and our wrapper
+		 * in src/port/open.c simulates that for Windows.
+		 */
+		fileFlags |= O_DIRECT;
+	}
+#endif
+
+	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+	if (fd < 0)
+		return fd;
+
+#if !defined(O_DIRECT) && defined(F_NOCACHE)
+	/* macOS requires an extra step. */
+	if (direct && fcntl(fd, F_NOCACHE, 1) < 0)
+	{
+		int			save_errno = errno;
+
+		close(fd);
+		errno = save_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not disable kernel file caching for file \"%s\": %m",
+						fileName)));
+	}
+#endif
+
+	return fd;
+}
+
 /*
  * AcquireExternalFD - attempt to reserve an external file descriptor
  *
diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c
index 78dab5096c..fef31844fa 100644
--- a/src/bin/pg_test_fsync/pg_test_fsync.c
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -217,8 +217,10 @@ handle_args(int argc, char *argv[])
 					"%u seconds per test\n",
 					secs_per_test),
 		   secs_per_test);
-#if PG_O_DIRECT != 0
+#if defined(O_DIRECT)
 	printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
+#elif defined(F_NOCACHE)
+	printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n"));
 #else
 	printf(_("Direct I/O is not supported on this platform.\n"));
 #endif
@@ -258,6 +260,31 @@ test_open(void)
 	close(tmpfile);
 }
 
+static int
+open_direct(const char *path, int flags, mode_t mode)
+{
+	int			fd;
+
+#ifdef O_DIRECT
+	flags |= O_DIRECT;
+#endif
+
+	fd = open(path, flags, mode);
+
+#if !defined(O_DIRECT) && defined(F_NOCACHE)
+	if (fd >= 0 && fcntl(fd, F_NOCACHE, 1) < 0)
+	{
+		int			save_errno = errno;
+
+		close(fd);
+		errno = save_errno;
+		return -1;
+	}
+#endif
+
+	return fd;
+}
+
 static void
 test_sync(int writes_per_op)
 {
@@ -279,7 +306,7 @@ test_sync(int writes_per_op)
 	fflush(stdout);
 
 #ifdef OPEN_DATASYNC_FLAG
-	if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT | PG_BINARY, 0)) == -1)
+	if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
 		fs_warning = true;
@@ -386,7 +413,7 @@ test_sync(int writes_per_op)
 	fflush(stdout);
 
 #ifdef OPEN_SYNC_FLAG
-	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
+	if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
 		fs_warning = true;
@@ -454,7 +481,7 @@ test_open_sync(const char *msg, int writes_size)
 	fflush(stdout);
 
 #ifdef OPEN_SYNC_FLAG
-	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
+	if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
 		printf(NA_FORMAT, _("n/a*"));
 	else
 	{
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index 0940b64ca6..60348d1850 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -64,21 +64,6 @@ typedef uint32 TimeLineID;
  */
 typedef uint16 RepOriginId;
 
-/*
- *	Because O_DIRECT bypasses the kernel buffers, and because we never
- *	read those buffers except during crash recovery or if wal_level != minimal,
- *	it is a win to use it in all cases where we sync on each write().  We could
- *	allow O_DIRECT with fsync(), but it is unclear if fsync() could process
- *	writes not buffered in the kernel.  Also, O_DIRECT is never enough to force
- *	data to the drives, it merely tries to bypass the kernel cache, so we still
- *	need O_SYNC/O_DSYNC.
- */
-#ifdef O_DIRECT
-#define PG_O_DIRECT				O_DIRECT
-#else
-#define PG_O_DIRECT				0
-#endif
-
 /*
  * This chunk of hackery attempts to determine which file sync methods
  * are available on the current platform, and to choose an appropriate
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5b3c280dd7..65dfe99ba0 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -130,7 +130,9 @@ extern int	CloseTransientFile(int fd);
 
 /* If you've really really gotta have a plain kernel FD, use this */
 extern int	BasicOpenFile(const char *fileName, int fileFlags);
+extern int	BasicOpenFileDirect(const char *fileName, int fileFlags, bool direct);
 extern int	BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
+extern int	BasicOpenFilePermDirect(const char *fileName, int fileFlags, mode_t fileMode, bool direct);
 
 /* Use these for other cases, and also for long-lived BasicOpenFile FDs */
 extern bool AcquireExternalFD(void);
-- 
2.24.3 (Apple Git-128)

