From 093e6c03c8413f2d36f3d28be89b5a93647795ba Mon Sep 17 00:00:00 2001
From: Takashi Menjo <takashi.menjou.vg@hco.ntt.co.jp>
Date: Tue, 4 Aug 2020 13:02:14 +0900
Subject: [PATCH v4 2/3] Read write WAL files using PMDK

Author: Yoshimi Ichiyanagi <ichiyanagi.yoshimi@lab.ntt.co.jp>
---
 src/backend/access/transam/xlog.c             | 461 ++++++++++++------
 src/backend/storage/file/Makefile             |   3 +-
 src/backend/storage/file/fd.c                 | 121 +++++
 src/backend/storage/file/pmem.c               | 188 +++++++
 src/backend/utils/misc/guc.c                  |   2 +-
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/xlog.h                     |   8 +-
 src/include/storage/fd.h                      |  13 +
 src/include/storage/pmem.h                    |  32 ++
 src/include/utils/guc.h                       |   1 +
 10 files changed, 685 insertions(+), 145 deletions(-)
 create mode 100644 src/backend/storage/file/pmem.c
 create mode 100644 src/include/storage/pmem.h

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 184c6672f3..ad50012138 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -63,6 +63,7 @@
 #include "storage/ipc.h"
 #include "storage/large_object.h"
 #include "storage/latch.h"
+#include "storage/pmem.h"
 #include "storage/pmsignal.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -148,6 +149,9 @@ const struct config_enum_entry sync_method_options[] = {
 #endif
 #ifdef OPEN_DATASYNC_FLAG
 	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
+#endif
+#ifdef USE_LIBPMEM
+	{"pmem_drain", SYNC_METHOD_PMEM_DRAIN, false},
 #endif
 	{NULL, 0, false}
 };
@@ -799,6 +803,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
  */
 static int	openLogFile = -1;
 static XLogSegNo openLogSegNo = 0;
+static void *mappedLogFileAddr = NULL;
 
 /*
  * These variables are used similarly to the ones above, but for reading
@@ -816,6 +821,7 @@ static XLogSegNo readSegNo = 0;
 static uint32 readOff = 0;
 static uint32 readLen = 0;
 static XLogSource readSource = XLOG_FROM_ANY;
+static void *mappedReadFileAddr = NULL;
 
 /*
  * Keeps track of which source we're currently reading from. This is
@@ -905,13 +911,15 @@ static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 
 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
+static int	do_XLogFileOpen(char *pathname, int flags, void **addr);
 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 								   bool find_free, XLogSegNo max_segno,
 								   bool use_lock);
 static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
-						 XLogSource source, bool notfoundOk);
-static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
+						 XLogSource source, bool notfoundOk, void **addr);
+static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source,
+							   void **addr);
 static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
@@ -2399,6 +2407,31 @@ XLogCheckpointNeeded(XLogSegNo new_segno)
 	return false;
 }
 
+/*
+ * Open a WAL segment file for XLOG use.
+ *
+ * When wal_sync_method is pmem_drain the file is opened with PmemFileOpen(),
+ * which is handed addr; otherwise it is opened with BasicOpenFile(), which
+ * does not look at addr at all.
+ *
+ * *addr is reset to NULL before opening.  Callers in this file detect failure
+ * with "fd < 0 && *addr == NULL", so a stale pointer left over from an
+ * earlier, already-closed segment must never survive this call.
+ *
+ * Returns the value of PmemFileOpen() or BasicOpenFile() unchanged.
+ */
+static int
+do_XLogFileOpen(char *pathname, int flags, void **addr)
+{
+	if (addr != NULL)
+		*addr = NULL;
+
+	if (sync_method == SYNC_METHOD_PMEM_DRAIN)
+		return PmemFileOpen(pathname, flags, wal_segment_size, addr);
+
+	return BasicOpenFile(pathname, flags);
+}
+
 /*
  * Write and/or fsync the log at least as far as WriteRqst indicates.
  *
@@ -2478,24 +2495,27 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			 * pages here (since we dump what we have at segment end).
 			 */
 			Assert(npages == 0);
-			if (openLogFile >= 0)
+			if (openLogFile >= 0 || mappedLogFileAddr != NULL)
 				XLogFileClose();
 			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 							wal_segment_size);
 
 			/* create/use new log file */
 			use_existent = true;
-			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
-			ReserveExternalFD();
+			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true,
+									   &mappedLogFileAddr);
+			if (openLogFile >= 0)
+				ReserveExternalFD();
 		}
 
 		/* Make sure we have the current logfile open */
-		if (openLogFile < 0)
+		if (openLogFile < 0 && mappedLogFileAddr == NULL)
 		{
 			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 							wal_segment_size);
-			openLogFile = XLogFileOpen(openLogSegNo);
-			ReserveExternalFD();
+			openLogFile = XLogFileOpen(openLogSegNo, &mappedLogFileAddr);
+			if (openLogFile >= 0)
+				ReserveExternalFD();
 		}
 
 		/* Add current page to the set of pending pages-to-dump */
@@ -2531,35 +2551,49 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			/* OK to write the page(s) */
 			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
 			nbytes = npages * (Size) XLOG_BLCKSZ;
-			nleft = nbytes;
-			do
+
+			if (mappedLogFileAddr != NULL)
 			{
-				errno = 0;
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
-				written = pg_pwrite(openLogFile, from, nleft, startoffset);
+				PmemFileWrite((char *) mappedLogFileAddr + startoffset, from, nbytes);
 				pgstat_report_wait_end();
-				if (written <= 0)
+
+				written = nbytes;
+				nleft = 0;
+				from += nbytes;
+			}
+			else
+			{
+				nleft = nbytes;
+				do
 				{
-					char		xlogfname[MAXFNAMELEN];
-					int			save_errno;
+					errno = 0;
+					pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
+					written = pg_pwrite(openLogFile, from, nleft, startoffset);
+					pgstat_report_wait_end();
+					if (written <= 0)
+					{
+						char		xlogfname[MAXFNAMELEN];
+						int			save_errno;
 
-					if (errno == EINTR)
-						continue;
+						if (errno == EINTR)
+							continue;
 
-					save_errno = errno;
-					XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
-								 wal_segment_size);
-					errno = save_errno;
-					ereport(PANIC,
-							(errcode_for_file_access(),
-							 errmsg("could not write to log file %s "
-									"at offset %u, length %zu: %m",
-									xlogfname, startoffset, nleft)));
-				}
-				nleft -= written;
-				from += written;
-				startoffset += written;
-			} while (nleft > 0);
+						save_errno = errno;
+						XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
+									 wal_segment_size);
+						errno = save_errno;
+						ereport(PANIC,
+								(errcode_for_file_access(),
+								 errmsg("could not write to log file %s "
+										"at offset %u, length %zu: %m",
+										xlogfname, startoffset, nleft)));
+					}
+					nleft -= written;
+					from += written;
+					startoffset += written;
+				} while (nleft > 0);
+			}
 
 			npages = 0;
 
@@ -2637,16 +2671,17 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 		if (sync_method != SYNC_METHOD_OPEN &&
 			sync_method != SYNC_METHOD_OPEN_DSYNC)
 		{
-			if (openLogFile >= 0 &&
+			if ((openLogFile >= 0 || mappedLogFileAddr != NULL) &&
 				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
 								 wal_segment_size))
 				XLogFileClose();
-			if (openLogFile < 0)
+			if (openLogFile < 0 && mappedLogFileAddr == NULL)
 			{
 				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 								wal_segment_size);
-				openLogFile = XLogFileOpen(openLogSegNo);
-				ReserveExternalFD();
+				openLogFile = XLogFileOpen(openLogSegNo, &mappedLogFileAddr);
+				if (openLogFile >= 0)
+					ReserveExternalFD();
 			}
 
 			issue_xlog_fsync(openLogFile, openLogSegNo);
@@ -3070,7 +3105,7 @@ XLogBackgroundFlush(void)
 	 */
 	if (WriteRqst.Write <= LogwrtResult.Flush)
 	{
-		if (openLogFile >= 0)
+		if (openLogFile >= 0 || mappedLogFileAddr != NULL)
 		{
 			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
 								 wal_segment_size))
@@ -3251,7 +3286,8 @@ XLogNeedsFlush(XLogRecPtr record)
  * in a critical section.
  */
 int
-XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
+XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock,
+			 void **addr)
 {
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
@@ -3261,6 +3297,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	int			fd;
 	int			nbytes;
 	int			save_errno;
+	void	   *tmpaddr = NULL;
 
 	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
 
@@ -3269,8 +3306,10 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	 */
 	if (*use_existent)
 	{
-		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
-		if (fd < 0)
+		fd = do_XLogFileOpen(path,
+							 O_RDWR | PG_BINARY | get_sync_bit(sync_method),
+							 &tmpaddr);
+		if (fd < 0 && tmpaddr == NULL)
 		{
 			if (errno != ENOENT)
 				ereport(ERROR,
@@ -3278,7 +3317,10 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 						 errmsg("could not open file \"%s\": %m", path)));
 		}
 		else
+		{
+			*addr = tmpaddr;
 			return fd;
+		}
 	}
 
 	/*
@@ -3294,8 +3336,9 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	unlink(tmppath);
 
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
-	if (fd < 0)
+	fd = do_XLogFileOpen(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						 &tmpaddr);
+	if (fd < 0 && tmpaddr == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
@@ -3316,29 +3359,41 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
 		for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
+		{
+			if (tmpaddr != NULL)
+				PmemFileWrite((char *) tmpaddr + nbytes, zbuffer.data,
+							  XLOG_BLCKSZ);
+			else
+			{
+				errno = 0;
+				if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+				{
+					/* if write didn't set errno, assume no disk space */
+					save_errno = errno ? errno : ENOSPC;
+					break;
+				}
+			}
+		}
+	}
+	else
+	{
+		/*
+		 * Otherwise, seeking to the end and writing a solitary byte is
+		 * enough.
+		 */
+		if (tmpaddr != NULL)
+			PmemFileWrite((char *) tmpaddr + wal_segment_size - 1,
+						  zbuffer.data, 1);
+		else
 		{
 			errno = 0;
-			if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+			if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
 			{
 				/* if write didn't set errno, assume no disk space */
 				save_errno = errno ? errno : ENOSPC;
-				break;
 			}
 		}
 	}
-	else
-	{
-		/*
-		 * Otherwise, seeking to the end and writing a solitary byte is
-		 * enough.
-		 */
-		errno = 0;
-		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
-		{
-			/* if write didn't set errno, assume no disk space */
-			save_errno = errno ? errno : ENOSPC;
-		}
-	}
 	pgstat_report_wait_end();
 
 	if (save_errno)
@@ -3358,11 +3413,11 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	}
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
-	if (pg_fsync(fd) != 0)
+	if (xlog_fsync(fd, tmpaddr) != 0)
 	{
 		int			save_errno = errno;
 
-		close(fd);
+		do_XLogFileClose(fd, tmpaddr);
 		errno = save_errno;
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3370,7 +3425,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	}
 	pgstat_report_wait_end();
 
-	if (close(fd) != 0)
+	if (do_XLogFileClose(fd, tmpaddr))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", tmppath)));
@@ -3411,8 +3466,9 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	*use_existent = false;
 
 	/* Now open original target segment (might not be file I just made) */
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
-	if (fd < 0)
+	fd = do_XLogFileOpen(path,
+						 O_RDWR | PG_BINARY | get_sync_bit(sync_method), addr);
+	if (fd < 0 && *addr == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m", path)));
@@ -3447,13 +3503,20 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 	int			srcfd;
 	int			fd;
 	int			nbytes;
+	void	   *src_addr = NULL;
+	void	   *dst_addr = NULL;
 
 	/*
 	 * Open the source file
 	 */
 	XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
-	srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
-	if (srcfd < 0)
+	srcfd = -1;
+	if (sync_method == SYNC_METHOD_PMEM_DRAIN)
+		srcfd = MapTransientFile(path, O_RDONLY | PG_BINARY,
+								 wal_segment_size, &src_addr);
+	if (src_addr == NULL)
+		srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+	if (srcfd < 0 && src_addr == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m", path)));
@@ -3466,8 +3529,15 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 	unlink(tmppath);
 
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
-	if (fd < 0)
+	if (src_addr != NULL && sync_method == SYNC_METHOD_PMEM_DRAIN)
+		fd = MapTransientFile(tmppath,
+							  O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							  wal_segment_size, &dst_addr);
+	else
+		fd = OpenTransientFile(tmppath,
+							   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+
+	if (fd < 0 && dst_addr == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
@@ -3475,6 +3545,15 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 	/*
 	 * Do the data copying.
 	 */
+	if (src_addr && dst_addr)
+	{
+		pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
+		PmemFileWrite(dst_addr, src_addr, wal_segment_size);
+		pgstat_report_wait_end();
+
+		goto done_copy;
+	}
+
 	for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
 	{
 		int			nread;
@@ -3531,14 +3610,22 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 		pgstat_report_wait_end();
 	}
 
+done_copy:
 	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
-	if (pg_fsync(fd) != 0)
+	if (xlog_fsync(fd, dst_addr) != 0)
 		ereport(data_sync_elevel(ERROR),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 	pgstat_report_wait_end();
 
-	if (CloseTransientFile(fd) != 0)
+	if (dst_addr)
+	{
+		if (UnmapTransientFile(dst_addr, wal_segment_size))
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not unmap file \"%s\": %m", tmppath)));
+	}
+	else if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", tmppath)));
@@ -3546,7 +3633,18 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
-	if (CloseTransientFile(srcfd) != 0)
+	/*
+	 * Release the source exactly once: drop the mapping if it was mapped,
+	 * otherwise close the descriptor obtained from OpenTransientFile().
+	 */
+	if (src_addr)
+	{
+		if (UnmapTransientFile(src_addr, wal_segment_size))
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not unmap file \"%s\": %m", path)));
+	}
+	else if (CloseTransientFile(srcfd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", path)));
 
 	/*
 	 * Now move the segment into place with its final name.
@@ -3643,15 +3737,16 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
  * Open a pre-existing logfile segment for writing.
  */
 int
-XLogFileOpen(XLogSegNo segno)
+XLogFileOpen(XLogSegNo segno, void **addr)
 {
 	char		path[MAXPGPATH];
 	int			fd;
 
 	XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
 
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
-	if (fd < 0)
+	fd = do_XLogFileOpen(path,
+						 O_RDWR | PG_BINARY | get_sync_bit(sync_method), addr);
+	if (fd < 0 && *addr == NULL)
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m", path)));
@@ -3667,7 +3762,7 @@ XLogFileOpen(XLogSegNo segno)
  */
 static int
 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
-			 XLogSource source, bool notfoundOk)
+			 XLogSource source, bool notfoundOk, void **addr)
 {
 	char		xlogfname[MAXFNAMELEN];
 	char		activitymsg[MAXFNAMELEN + 16];
@@ -3716,8 +3811,8 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
 	}
 
-	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (fd >= 0)
+	fd = do_XLogFileOpen(path, O_RDONLY | PG_BINARY, addr);
+	if (fd >= 0 || *addr != NULL)
 	{
 		/* Success! */
 		curFileTLI = tli;
@@ -3749,7 +3844,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
  * This version searches for the segment with any TLI listed in expectedTLEs.
  */
 static int
-XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
+XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source, void **addr)
 {
 	char		path[MAXPGPATH];
 	ListCell   *cell;
@@ -3814,8 +3909,8 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
 		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
 		{
 			fd = XLogFileRead(segno, emode, tli,
-							  XLOG_FROM_ARCHIVE, true);
-			if (fd != -1)
+							  XLOG_FROM_ARCHIVE, true, addr);
+			if (fd != -1 || *addr != NULL)
 			{
 				elog(DEBUG1, "got WAL segment from archive");
 				if (!expectedTLEs)
@@ -3827,8 +3922,8 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
 		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
 		{
 			fd = XLogFileRead(segno, emode, tli,
-							  XLOG_FROM_PG_WAL, true);
-			if (fd != -1)
+							  XLOG_FROM_PG_WAL, true, addr);
+			if (fd != -1 || *addr != NULL)
 			{
 				if (!expectedTLEs)
 					expectedTLEs = tles;
@@ -3846,13 +3941,22 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
 	return -1;
 }
 
+int
+do_XLogFileClose(int fd, void *addr)
+{
+	/* With an address, hand it to PmemFileClose(); otherwise close() the fd. */
+	if (addr != NULL)
+		return PmemFileClose(addr, wal_segment_size);
+	return close(fd);
+}
+
 /*
  * Close the current logfile segment for writing.
  */
 static void
 XLogFileClose(void)
 {
-	Assert(openLogFile >= 0);
+	Assert(openLogFile >= 0 || mappedLogFileAddr != NULL);
 
 	/*
 	 * WAL segment files will not be re-read in normal operation, so we advise
@@ -3861,11 +3965,11 @@ XLogFileClose(void)
 	 * use the cache to read the WAL segment.
 	 */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	if (!XLogIsNeeded())
+	if (!XLogIsNeeded() && openLogFile >= 0)	/* fd 0 is a valid descriptor */
 		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
-	if (close(openLogFile) != 0)
+	if (do_XLogFileClose(openLogFile, mappedLogFileAddr))
 	{
 		char		xlogfname[MAXFNAMELEN];
 		int			save_errno = errno;
@@ -3877,8 +3981,12 @@ XLogFileClose(void)
 				 errmsg("could not close file \"%s\": %m", xlogfname)));
 	}
 
-	openLogFile = -1;
-	ReleaseExternalFD();
+	mappedLogFileAddr = NULL;
+	if (openLogFile >= 0)
+	{
+		openLogFile = -1;
+		ReleaseExternalFD();
+	}
 }
 
 /*
@@ -3897,6 +4005,7 @@ PreallocXlogFiles(XLogRecPtr endptr)
 	XLogSegNo	_logSegNo;
 	int			lf;
 	bool		use_existent;
+	void	   *laddr = NULL;
 	uint64		offset;
 
 	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
@@ -3905,8 +4014,8 @@ PreallocXlogFiles(XLogRecPtr endptr)
 	{
 		_logSegNo++;
 		use_existent = true;
-		lf = XLogFileInit(_logSegNo, &use_existent, true);
-		close(lf);
+		lf = XLogFileInit(_logSegNo, &use_existent, true, &laddr);
+		do_XLogFileClose(lf, laddr);
 		if (!use_existent)
 			CheckpointStats.ckpt_segs_added++;
 	}
@@ -4349,9 +4458,10 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 		EndRecPtr = xlogreader->EndRecPtr;
 		if (record == NULL)
 		{
-			if (readFile >= 0)
+			if (readFile >= 0 || mappedReadFileAddr != NULL)
 			{
-				close(readFile);
+				do_XLogFileClose(readFile, mappedReadFileAddr);
+				mappedReadFileAddr = NULL;
 				readFile = -1;
 			}
 
@@ -5299,7 +5409,7 @@ BootStrapXLOG(void)
 
 	/* Create first XLOG segment file */
 	use_existent = false;
-	openLogFile = XLogFileInit(1, &use_existent, false);
+	openLogFile = XLogFileInit(1, &use_existent, false, &mappedLogFileAddr);
 
 	/*
 	 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
@@ -5308,30 +5418,39 @@ BootStrapXLOG(void)
 
 	/* Write the first page with the initial record */
 	errno = 0;
-	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
-	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+	if (mappedLogFileAddr != NULL)
 	{
-		/* if write didn't set errno, assume problem is no disk space */
-		if (errno == 0)
-			errno = ENOSPC;
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not write bootstrap write-ahead log file: %m")));
+		pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
+		PmemFileWrite(mappedLogFileAddr, page, XLOG_BLCKSZ);
+	}
+	else
+	{
+		pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
+		if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+		{
+			/* if write didn't set errno, assume problem is no disk space */
+			if (errno == 0)
+				errno = ENOSPC;
+			ereport(PANIC,
+					(errcode_for_file_access(),
+					 errmsg("could not write bootstrap write-ahead log file: %m")));
+		}
 	}
 	pgstat_report_wait_end();
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
-	if (pg_fsync(openLogFile) != 0)
+	if (xlog_fsync(openLogFile, (void *) mappedLogFileAddr) != 0)
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not fsync bootstrap write-ahead log file: %m")));
 	pgstat_report_wait_end();
 
-	if (close(openLogFile) != 0)
+	if (do_XLogFileClose(openLogFile, mappedLogFileAddr))
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not close bootstrap write-ahead log file: %m")));
 
+	mappedLogFileAddr = NULL;
 	openLogFile = -1;
 
 	/* Now create pg_control */
@@ -5566,9 +5685,10 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
 	 * If the ending log segment is still open, close it (to avoid problems on
 	 * Windows with trying to rename or delete an open file).
 	 */
-	if (readFile >= 0)
+	if (readFile >= 0 || mappedReadFileAddr != NULL)
 	{
-		close(readFile);
+		do_XLogFileClose(readFile, mappedReadFileAddr);
+		mappedReadFileAddr = NULL;
 		readFile = -1;
 	}
 
@@ -5607,10 +5727,11 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
 		 */
 		bool		use_existent = true;
 		int			fd;
+		void	   *tmpaddr = NULL;
 
-		fd = XLogFileInit(startLogSegNo, &use_existent, true);
+		fd = XLogFileInit(startLogSegNo, &use_existent, true, &tmpaddr);
 
-		if (close(fd) != 0)
+		if (do_XLogFileClose(fd, tmpaddr))
 		{
 			char		xlogfname[MAXFNAMELEN];
 			int			save_errno = errno;
@@ -7899,9 +8020,10 @@ StartupXLOG(void)
 		ShutdownRecoveryTransactionEnvironment();
 
 	/* Shut down xlogreader */
-	if (readFile >= 0)
+	if (readFile >= 0 || mappedReadFileAddr != NULL)
 	{
-		close(readFile);
+		do_XLogFileClose(readFile, mappedReadFileAddr);
+		mappedReadFileAddr = NULL;
 		readFile = -1;
 	}
 	XLogReaderFree(xlogreader);
@@ -10341,6 +10463,9 @@ get_sync_bit(int method)
 		case SYNC_METHOD_FSYNC:
 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
 		case SYNC_METHOD_FDATASYNC:
+#ifdef USE_LIBPMEM
+		case SYNC_METHOD_PMEM_DRAIN:
+#endif
 			return 0;
 #ifdef OPEN_SYNC_FLAG
 		case SYNC_METHOD_OPEN:
@@ -10358,7 +10483,36 @@ get_sync_bit(int method)
 }
 
 /*
- * GUC support
+ * GUC check_hook for xlog_sync_method
+ */
+bool
+check_xlog_sync_method(int *newval, void **extra, GucSource source)
+{
+	char		tmppath[MAXPGPATH];
+	int			val = newval ? *newval : sync_method;
+
+	/* Every method except pmem_drain is accepted without further checks. */
+	if (val != SYNC_METHOD_PMEM_DRAIN)
+		return true;
+
+	/* Hand CheckPmem() a per-process path inside the WAL directory. */
+	snprintf(tmppath, MAXPGPATH, "%s/" XLOGDIR "/pmem.tmp.%d", DataDir, (int) getpid());
+
+	if (CheckPmem(tmppath))
+		return true;
+
+	/* Use errdetail for the explanation: a second errmsg replaces the first. */
+	GUC_check_errcode(ERRCODE_INVALID_PARAMETER_VALUE);
+	GUC_check_errmsg("invalid value for parameter \"wal_sync_method\": \"pmem_drain\"");
+	GUC_check_errdetail("%s isn't stored on persistent memory(pmem_is_pmem() returned false).",
+						XLOGDIR);
+	GUC_check_errhint("Please see also ENVIRONMENT VARIABLES section in man libpmem.");
+
+	return false;
+}
+
+/*
+ * GUC assign_hook for xlog_sync_method
  */
 void
 assign_xlog_sync_method(int new_sync_method, void *extra)
@@ -10371,10 +10525,10 @@ assign_xlog_sync_method(int new_sync_method, void *extra)
 		 * changing, close the log file so it will be reopened (with new flag
 		 * bit) at next use.
 		 */
-		if (openLogFile >= 0)
+		if (openLogFile >= 0 || mappedLogFileAddr != NULL)
 		{
 			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
-			if (pg_fsync(openLogFile) != 0)
+			if (xlog_fsync(openLogFile, (void *) mappedLogFileAddr) != 0)
 			{
 				char		xlogfname[MAXFNAMELEN];
 				int			save_errno;
@@ -10425,6 +10579,11 @@ issue_xlog_fsync(int fd, XLogSegNo segno)
 			if (pg_fdatasync(fd) != 0)
 				msg = _("could not fdatasync file \"%s\": %m");
 			break;
+#endif
+#ifdef USE_LIBPMEM
+		case SYNC_METHOD_PMEM_DRAIN:
+			PmemFileSync();
+			break;
 #endif
 		case SYNC_METHOD_OPEN:
 		case SYNC_METHOD_OPEN_DSYNC:
@@ -10452,6 +10611,16 @@ issue_xlog_fsync(int fd, XLogSegNo segno)
 	pgstat_report_wait_end();
 }
 
+int
+xlog_fsync(int fd, void *addr)
+{
+	/* pg_fsync() the fd when addr is NULL; else PmemFileSync() and report 0. */
+	if (addr == NULL)
+		return pg_fsync(fd);
+	PmemFileSync();
+	return 0;
+}
+
 /*
  * do_pg_start_backup
  *
@@ -11887,7 +12056,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	 * See if we need to switch to a new segment because the requested record
 	 * is not in the currently open one.
 	 */
-	if (readFile >= 0 &&
+	if ((readFile >= 0 || mappedReadFileAddr != NULL) &&
 		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
 	{
 		/*
@@ -11904,7 +12073,8 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 			}
 		}
 
-		close(readFile);
+		do_XLogFileClose(readFile, mappedReadFileAddr);
+		mappedReadFileAddr = NULL;
 		readFile = -1;
 		readSource = XLOG_FROM_ANY;
 	}
@@ -11913,7 +12083,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 
 retry:
 	/* See if we need to retrieve more data */
-	if (readFile < 0 ||
+	if ((readFile < 0 && mappedReadFileAddr == NULL) ||
 		(readSource == XLOG_FROM_STREAM &&
 		 flushedUpto < targetPagePtr + reqLen))
 	{
@@ -11922,8 +12092,9 @@ retry:
 										 private->fetching_ckpt,
 										 targetRecPtr))
 		{
-			if (readFile >= 0)
-				close(readFile);
+			if (readFile >= 0 || mappedReadFileAddr != NULL)
+				do_XLogFileClose(readFile, mappedReadFileAddr);
+			mappedReadFileAddr = NULL;
 			readFile = -1;
 			readLen = 0;
 			readSource = XLOG_FROM_ANY;
@@ -11936,7 +12107,7 @@ retry:
 	 * At this point, we have the right segment open and if we're streaming we
 	 * know the requested record is in it.
 	 */
-	Assert(readFile != -1);
+	Assert(readFile != -1 || mappedReadFileAddr != NULL);
 
 	/*
 	 * If the current segment is being streamed from the primary, calculate how
@@ -11959,28 +12130,33 @@ retry:
 	readOff = targetPageOff;
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
-	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
-	if (r != XLOG_BLCKSZ)
+	if (mappedReadFileAddr)
+		PmemFileRead((char *) mappedReadFileAddr + readOff, readBuf, XLOG_BLCKSZ);
+	else
 	{
-		char		fname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		pgstat_report_wait_end();
-		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
-		if (r < 0)
+		r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+		if (r != XLOG_BLCKSZ)
 		{
-			errno = save_errno;
-			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
-					(errcode_for_file_access(),
-					 errmsg("could not read from log segment %s, offset %u: %m",
-							fname, readOff)));
+			char		fname[MAXFNAMELEN];
+			int			save_errno = errno;
+
+			pgstat_report_wait_end();
+			XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+			if (r < 0)
+			{
+				errno = save_errno;
+				ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+						(errcode_for_file_access(),
+						 errmsg("could not read from log segment %s, offset %u: %m",
+								fname, readOff)));
+			}
+			else
+				ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+								fname, readOff, r, (Size) XLOG_BLCKSZ)));
+			goto next_record_is_invalid;
 		}
-		else
-			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
-					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
-							fname, readOff, r, (Size) XLOG_BLCKSZ)));
-		goto next_record_is_invalid;
 	}
 	pgstat_report_wait_end();
 
@@ -12028,8 +12204,9 @@ retry:
 next_record_is_invalid:
 	lastSourceFailed = true;
 
-	if (readFile >= 0)
-		close(readFile);
+	if (readFile >= 0 || mappedReadFileAddr != NULL)
+		do_XLogFileClose(readFile, mappedReadFileAddr);
+	mappedReadFileAddr = NULL;
 	readFile = -1;
 	readLen = 0;
 	readSource = XLOG_FROM_ANY;
@@ -12269,9 +12446,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 				Assert(!WalRcvStreaming());
 
 				/* Close any old file we might have open. */
-				if (readFile >= 0)
+				if (readFile >= 0 || mappedReadFileAddr != NULL)
 				{
-					close(readFile);
+					do_XLogFileClose(readFile,
+									 mappedReadFileAddr);
+					mappedReadFileAddr = NULL;
 					readFile = -1;
 				}
 				/* Reset curFileTLI if random fetch. */
@@ -12284,8 +12463,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 				 */
 				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
 											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
-											  currentSource);
-				if (readFile >= 0)
+											  currentSource, &mappedReadFileAddr);
+				if (readFile >= 0 || mappedReadFileAddr != NULL)
 					return true;	/* success! */
 
 				/*
@@ -12419,14 +12598,14 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						 * info is set correctly and XLogReceiptTime isn't
 						 * changed.
 						 */
-						if (readFile < 0)
+						if (readFile < 0 && mappedReadFileAddr == NULL)
 						{
 							if (!expectedTLEs)
 								expectedTLEs = readTimeLineHistory(receiveTLI);
 							readFile = XLogFileRead(readSegNo, PANIC,
 													receiveTLI,
-													XLOG_FROM_STREAM, false);
-							Assert(readFile >= 0);
+													XLOG_FROM_STREAM, false, &mappedReadFileAddr);
+							Assert(readFile >= 0 || mappedReadFileAddr != NULL);
 						}
 						else
 						{
diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile
index 5e1291bf2d..462c71bb03 100644
--- a/src/backend/storage/file/Makefile
+++ b/src/backend/storage/file/Makefile
@@ -17,6 +17,7 @@ OBJS = \
 	copydir.o \
 	fd.o \
 	reinit.o \
-	sharedfileset.o
+	sharedfileset.o \
+	pmem.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 5f6420efb2..3281cf146f 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -94,6 +94,7 @@
 #include "portability/mem.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/pmem.h"
 #include "utils/guc.h"
 #include "utils/resowner_private.h"
 
@@ -231,6 +232,9 @@ static uint64 temporary_files_size = 0;
 typedef enum
 {
 	AllocateDescFile,
+#ifdef USE_LIBPMEM
+	AllocateDescMap,
+#endif
 	AllocateDescPipe,
 	AllocateDescDir,
 	AllocateDescRawFD
@@ -245,6 +249,20 @@ typedef struct
 		FILE	   *file;
 		DIR		   *dir;
 		int			fd;
+#ifdef USE_LIBPMEM
+		/*
+		 * fsize and addr are both needed at the same time, so they are
+		 * wrapped in one (anonymous) struct rather than listed as separate
+		 * members.  NOTE(review): this assumes desc is a union whose opening
+		 * lies above this hunk; as plain union members the two would share
+		 * storage and the fsize store would clobber addr.
+		 */
+		struct
+		{
+			size_t		fsize;
+			void	   *addr;
+		};
+#endif
 	}			desc;
 } AllocateDesc;
 
@@ -1695,6 +1703,78 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
 	return file;
 }
 
+#ifdef USE_LIBPMEM
+/*
+ * Map fileName via MapTransientFilePerm(), using PG_FILE_MODE_DEFAULT as mode.
+ */
+int
+MapTransientFile(const char *fileName, int fileFlags, size_t fsize, void **addr)
+{
+	int			mode = PG_FILE_MODE_DEFAULT;
+
+	return MapTransientFilePerm(fileName, fileFlags, mode, fsize, addr);
+}
+
+/*
+ * Map a file through PmemFileOpenPerm() and record the mapping in
+ * allocatedDescs as an AllocateDescMap entry.  On success *addr receives the
+ * mapped address and PmemFileOpenPerm()'s result is returned; if addr is NULL
+ * or nothing was mapped, returns -1 without recording anything.
+ * NOTE(review): addr and fsize are both stored through desc->desc; if that
+ * member is a union they share storage -- confirm against AllocateDesc.
+ */
+int
+MapTransientFilePerm(const char *fileName, int fileFlags, int fileMode,
+					 size_t fsize, void **addr)
+{
+	int			result;
+	void	   *mapped = NULL;
+	AllocateDesc *slot;
+
+	DO_DB(elog(LOG, "MapTransientFilePerm: Allocated %d (%s)",
+			   numAllocatedDescs, fileName));
+
+	/* Can we allocate another non-virtual FD? */
+	if (!reserveAllocatedDesc())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+						maxAllocatedDescs, fileName)));
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+	if (addr == NULL)
+		return -1;
+
+	result = PmemFileOpenPerm(fileName, fileFlags, fileMode, fsize, &mapped);
+	if (mapped == NULL)
+		return -1;				/* failure */
+
+	*addr = mapped;
+	slot = &allocatedDescs[numAllocatedDescs];
+	slot->kind = AllocateDescMap;
+	slot->desc.addr = mapped;
+	slot->desc.fsize = fsize;
+	slot->create_subid = GetCurrentSubTransactionId();
+	numAllocatedDescs++;
+
+	return result;
+}
+#else
+int
+MapTransientFile(const char *fileName, int fileFlags, size_t fsize, void **addr)
+{
+	return -1;					/* built without USE_LIBPMEM: nothing is mapped */
+}
+
+int
+MapTransientFilePerm(const char *fileName, int fileFlags, int fileMode,
+					 size_t fsize, void **addr)
+{
+	return -1;					/* built without USE_LIBPMEM: nothing is mapped */
+}
+#endif
 
 /*
  * Create a new file.  The directory containing it must already exist.  Files
@@ -2498,6 +2578,11 @@ FreeDesc(AllocateDesc *desc)
 		case AllocateDescRawFD:
 			result = close(desc->desc.fd);
 			break;
+#ifdef USE_LIBPMEM
+		case AllocateDescMap:
+			result = PmemFileClose(desc->desc.addr, desc->desc.fsize);
+			break;
+#endif
 		default:
 			elog(ERROR, "AllocateDesc kind not recognized");
 			result = 0;			/* keep compiler quiet */
@@ -2539,6 +2624,42 @@ FreeFile(FILE *file)
 	return fclose(file);
 }
 
+#ifdef USE_LIBPMEM
+/*
+ * Unmap a file mapped by MapTransientFile.
+ *
+ * The unmap result is passed back unchecked --- it is up to the caller to
+ * handle unmap errors.
+ */
+int
+UnmapTransientFile(void *addr, size_t fsize)
+{
+	int			i;
+
+	DO_DB(elog(LOG, "UnmapTransientFile: Allocated %d", numAllocatedDescs));
+
+	/* Release the matching entry in allocatedDescs, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescMap && desc->desc.addr == addr)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a mapping not in allocatedDescs */
+	elog(WARNING, "address passed to UnmapTransientFile was not obtained from MapTransientFile");
+
+	return PmemFileClose(addr, fsize);
+}
+#else
+int
+UnmapTransientFile(void *addr, size_t fsize)
+{
+	return -1;					/* built without USE_LIBPMEM: nothing to unmap */
+}
+#endif
+
 /*
  * Close a file returned by OpenTransientFile.
  *
diff --git a/src/backend/storage/file/pmem.c b/src/backend/storage/file/pmem.c
new file mode 100644
index 0000000000..b214b6b18e
--- /dev/null
+++ b/src/backend/storage/file/pmem.c
@@ -0,0 +1,188 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmem.c
+ *	  Persistent memory (PMDK) mapped-file routines.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/pmem.c
+ *
+ * NOTES:
+ *
+ * This code manages a memory-mapped file on a DAX-mounted filesystem on a
+ * persistent memory device, using the Persistent Memory Development Kit
+ * (http://pmem.io/pmdk/).
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "storage/pmem.h"
+#include "storage/fd.h"
+
+#ifdef USE_LIBPMEM
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libpmem.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#define PmemFileSize 32
+
+/*
+ * Report whether 'path' is stored on persistent memory.
+ *
+ * 'path' is mapped with pmem_map_file() using PMEM_FILE_CREATE and a length
+ * of PmemFileSize; if that succeeds the mapping is removed and 'path' is
+ * unlinked.  Returns true only if pmem_map_file() set its is_pmem output.
+ */
+bool
+CheckPmem(const char *path)
+{
+	int			on_pmem = 0;	/* stays 0 unless pmem_map_file() sets it */
+	size_t		probe_len = 0;
+	void	   *probe;
+
+	probe = pmem_map_file(path, PmemFileSize, PMEM_FILE_CREATE,
+						  PG_FILE_MODE_DEFAULT, &probe_len, &on_pmem);
+
+	/* The mapping is only needed for the check itself; drop it again. */
+	if (probe)
+	{
+		pmem_unmap(probe, probe_len);
+		unlink(path);
+	}
+
+	if (!on_pmem)
+		return false;
+
+	elog(LOG, "%s is stored on persistent memory.", path);
+
+	return true;
+}
+/* Same as PmemFileOpenPerm(), passing PG_FILE_MODE_DEFAULT as the mode. */
+int
+PmemFileOpen(const char *pathname, int flags, size_t fsize, void **addr)
+{
+	return PmemFileOpenPerm(pathname, flags, PG_FILE_MODE_DEFAULT, fsize, addr);
+}
+
+/*
+ * Map 'pathname' with pmem_map_file() and pass the mapping back via *addr.
+ * Returns NO_FD_FOR_MAPPED_FILE, or -1 if mapping failed or the mapped length
+ * is not fsize.  With addr == NULL, opens a plain fd honoring 'mode' instead.
+ */
+int
+PmemFileOpenPerm(const char *pathname, int flags, int mode, size_t fsize,
+				 void **addr)
+{
+	int			mapped_flag = 0;
+	size_t		mapped_len = 0;
+	size_t		size = 0;
+	void	   *ret_addr;
+
+	if (addr == NULL)
+		return BasicOpenFilePerm(pathname, flags, mode);
+
+	/* non-zero 'len' not allowed without PMEM_FILE_CREATE */
+	if (flags & O_CREAT)
+	{
+		mapped_flag = PMEM_FILE_CREATE;
+		size = fsize;
+	}
+	if (flags & O_EXCL)
+		mapped_flag |= PMEM_FILE_EXCL;
+
+	ret_addr = pmem_map_file(pathname, size, mapped_flag, mode, &mapped_len,
+							 NULL);
+	if (ret_addr == NULL || fsize != mapped_len)
+	{
+		if (ret_addr != NULL)
+			pmem_unmap(ret_addr, mapped_len);
+		return -1;
+	}
+
+	if ((mapped_flag & PMEM_FILE_CREATE) &&
+		msync(ret_addr, mapped_len, MS_SYNC) != 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not msync log file %s: %m", pathname)));
+	*addr = ret_addr;
+	return NO_FD_FOR_MAPPED_FILE;
+}
+
+void
+PmemFileWrite(void *dest, void *src, size_t len)
+{
+	pmem_memcpy_nodrain(dest, src, len);	/* no drain here; PmemFileSync() drains */
+}
+
+void
+PmemFileRead(void *map_addr, void *buf, size_t len)
+{
+	memcpy(buf, map_addr, len);	/* copy len bytes from map_addr into buf */
+}
+
+void
+PmemFileSync(void)
+{
+	pmem_drain();				/* void function: call it, don't "return" it */
+}
+
+int
+PmemFileClose(void *addr, size_t fsize)
+{
+	return pmem_unmap(addr, fsize);	/* pmem_unmap()'s result goes to the caller */
+}
+
+
+#else
+bool
+CheckPmem(const char *path)
+{
+	return true;				/* NOTE(review): unconditional without USE_LIBPMEM -- confirm callers */
+}
+
+int
+PmemFileOpen(const char *pathname, int flags, size_t fsize, void **addr)
+{
+	return BasicOpenFile(pathname, flags);	/* fsize and addr are not used here */
+}
+
+int
+PmemFileOpenPerm(const char *pathname, int flags, int mode, size_t fsize,
+				 void **addr)
+{
+	return BasicOpenFilePerm(pathname, flags, mode);	/* fsize and addr are not used here */
+}
+
+void
+PmemFileWrite(void *dest, void *src, size_t len)
+{
+	ereport(PANIC, (errmsg("don't have the pmem device")));	/* built without USE_LIBPMEM */
+}
+
+void
+PmemFileRead(void *map_addr, void *buf, size_t len)
+{
+	ereport(PANIC, (errmsg("don't have the pmem device")));	/* built without USE_LIBPMEM */
+}
+
+void
+PmemFileSync(void)
+{
+	ereport(PANIC, (errmsg("don't have the pmem device")));	/* built without USE_LIBPMEM */
+}
+
+int
+PmemFileClose(void *addr, size_t fsize)
+{
+	ereport(PANIC, (errmsg("don't have the pmem device")));	/* built without USE_LIBPMEM */
+	return -1;					/* presumably only to keep the compiler quiet */
+}
+#endif
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6f603cbbe8..47eb89f885 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -4680,7 +4680,7 @@ static struct config_enum ConfigureNamesEnum[] =
 		},
 		&sync_method,
 		DEFAULT_SYNC_METHOD, sync_method_options,
-		NULL, assign_xlog_sync_method, NULL
+		check_xlog_sync_method, assign_xlog_sync_method, NULL
 	},
 
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 5a0b8e9821..eeb5ba3a0e 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -208,6 +208,7 @@
 					#   fsync
 					#   fsync_writethrough
 					#   open_sync
+					#   pmem_drain
 #full_page_writes = on			# recover from partial page writes
 #wal_compression = off			# enable compression of full-page writes
 #wal_log_hints = off			# also do full page writes of non-critical updates
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 219a7299e1..278a4a1dcf 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -27,6 +27,7 @@
 #define SYNC_METHOD_OPEN		2	/* for O_SYNC */
 #define SYNC_METHOD_FSYNC_WRITETHROUGH	3
 #define SYNC_METHOD_OPEN_DSYNC	4	/* for O_DSYNC */
+#define SYNC_METHOD_PMEM_DRAIN	5	/* for Persistent Memory Development Kit */
 extern int	sync_method;
 
 extern PGDLLIMPORT TimeLineID ThisTimeLineID;	/* current TLI */
@@ -287,8 +288,10 @@ extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata,
 extern void XLogFlush(XLogRecPtr RecPtr);
 extern bool XLogBackgroundFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
-extern int	XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
-extern int	XLogFileOpen(XLogSegNo segno);
+extern int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock,
+						void **addr);
+extern int	XLogFileOpen(XLogSegNo segno, void **addr);
+extern int	do_XLogFileClose(int fd, void *addr);
 
 extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
 extern XLogSegNo XLogGetLastRemovedSegno(void);
@@ -300,6 +303,7 @@ extern void xlog_desc(StringInfo buf, XLogReaderState *record);
 extern const char *xlog_identify(uint8 info);
 
 extern void issue_xlog_fsync(int fd, XLogSegNo segno);
+extern int	xlog_fsync(int fd, void *addr);
 
 extern bool RecoveryInProgress(void);
 extern RecoveryState GetRecoveryState(void);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 8cd125d7df..c3ec6ecbb3 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -49,6 +49,12 @@
 typedef int File;
 
 
+/*
+ * Default mode for created files, unless something else is specified using
+ * the *Perm() function variants.
+ */
+#define PG_FILE_MODE_DEFAULT	(S_IRUSR | S_IWUSR)
+
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
@@ -120,6 +126,13 @@ extern int	OpenTransientFile(const char *fileName, int fileFlags);
 extern int	OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
 extern int	CloseTransientFile(int fd);
 
+/* Operations to allow use of a memory-mapped file */
+extern int MapTransientFile(const char *fileName, int fileFlags, size_t fsize,
+				 void **addr);
+extern int MapTransientFilePerm(const char *fileName, int fileFlags, int fileMode,
+					 size_t fsize, void **addr);
+extern int	UnmapTransientFile(void *addr, size_t fsize);
+
 /* If you've really really gotta have a plain kernel FD, use this */
 extern int	BasicOpenFile(const char *fileName, int fileFlags);
 extern int	BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
diff --git a/src/include/storage/pmem.h b/src/include/storage/pmem.h
new file mode 100644
index 0000000000..b9b9156c91
--- /dev/null
+++ b/src/include/storage/pmem.h
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmem.h
+ *		Declarations for persistent memory mapped-file routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pmem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PMEM_H
+#define PMEM_H
+
+#include "postgres.h"
+
+#define NO_FD_FOR_MAPPED_FILE -2
+
+extern bool CheckPmem(const char *path);
+extern int PmemFileOpen(const char *pathname, int flags, size_t fsize,
+			 void **addr);
+extern int PmemFileOpenPerm(const char *pathname, int flags, int mode,
+				 size_t fsize, void **addr);
+extern void PmemFileWrite(void *dest, void *src, size_t len);
+extern void PmemFileRead(void *map_addr, void *buf, size_t len);
+extern void PmemFileSync(void);
+extern int	PmemFileClose(void *addr, size_t fsize);
+
+#endif							/* PMEM_H */
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 2819282181..802d281245 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -438,6 +438,7 @@ extern void assign_search_path(const char *newval, void *extra);
 
 /* in access/transam/xlog.c */
 extern bool check_wal_buffers(int *newval, void **extra, GucSource source);
+extern bool check_xlog_sync_method(int *newval, void **extra, GucSource source);
 extern void assign_xlog_sync_method(int new_sync_method, void *extra);
 
 #endif							/* GUC_H */
-- 
2.25.1

