At Wed, 1 Mar 2023 14:56:25 -0500, "Gregory Stark (as CFM)" 
<stark....@gmail.com> wrote in 
> On Mon, 6 Feb 2023 at 23:48, Kyotaro Horiguchi <horikyota....@gmail.com> 
> wrote:
> >
> > Thank you for the comment!
> >
> > At Fri, 3 Feb 2023 08:42:52 +0100, Heikki Linnakangas <hlinn...@iki.fi> 
> > wrote in
> > > I want to call out this part of this patch:
> 
> Looks like this patch has received some solid feedback from Heikki and
> you have a path forward. It's not currently building in the build farm
> either.
> 
> I'll set the patch to Waiting on Author for now.

To be precise, the patch set consists of three parts.

The attached patch is the first part -
the storage mark files, which are used to identify storage files that
have not been committed and should be removed during the next
startup. This feature resolves the issue of orphaned storage files
that may result from a crash occurring during the execution of a
transaction involving the creation of a new table.

I'll post all of the three parts shortly.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 1665e3428b9d777989864ea302eef8368a739e7e Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota....@gmail.com>
Date: Thu, 2 Mar 2023 17:25:12 +0900
Subject: [PATCH v26] Storage mark files

In certain situations, specific operations followed by a crash-restart
can result in orphaned storage files.  These files cannot be removed
through standard methods.  To address this issue, this commit
implements 'mark files' that convey information about the storage
file. Specifically, the "UNCOMMITTED" mark file is introduced to denote
files that have not been committed and should be removed during the
next startup.
---
 src/backend/access/rmgrdesc/smgrdesc.c    |  37 +++
 src/backend/access/transam/README         |  10 +
 src/backend/access/transam/xact.c         |   7 +
 src/backend/access/transam/xlogrecovery.c |  18 ++
 src/backend/backup/basebackup.c           |   9 +-
 src/backend/catalog/storage.c             | 270 ++++++++++++++++++-
 src/backend/storage/file/fd.c             |   4 +-
 src/backend/storage/file/reinit.c         | 313 +++++++++++++++-------
 src/backend/storage/smgr/md.c             |  95 ++++++-
 src/backend/storage/smgr/smgr.c           |  32 +++
 src/backend/storage/sync/sync.c           |  21 +-
 src/bin/pg_rewind/parsexlog.c             |  16 ++
 src/common/relpath.c                      |  47 ++--
 src/include/catalog/storage.h             |   3 +
 src/include/catalog/storage_xlog.h        |  35 ++-
 src/include/common/relpath.h              |   9 +-
 src/include/storage/fd.h                  |   1 +
 src/include/storage/md.h                  |   8 +-
 src/include/storage/reinit.h              |   8 +-
 src/include/storage/smgr.h                |  17 ++
 src/test/recovery/t/013_crash_restart.pl  |  21 ++
 src/tools/pgindent/typedefs.list          |   6 +
 22 files changed, 843 insertions(+), 144 deletions(-)

diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c
index bd841b96e8..f8187385c4 100644
--- a/src/backend/access/rmgrdesc/smgrdesc.c
+++ b/src/backend/access/rmgrdesc/smgrdesc.c
@@ -40,6 +40,37 @@ smgr_desc(StringInfo buf, XLogReaderState *record)
 						 xlrec->blkno, xlrec->flags);
 		pfree(path);
 	}
+	else if (info == XLOG_SMGR_UNLINK)
+	{
+		xl_smgr_unlink *xlrec = (xl_smgr_unlink *) rec;
+		char	   *path = relpathperm(xlrec->rlocator, xlrec->forkNum);
+
+		appendStringInfoString(buf, path);
+		pfree(path);
+	}
+	else if (info == XLOG_SMGR_MARK)
+	{
+		xl_smgr_mark *xlrec = (xl_smgr_mark *) rec;
+		char	   *path = GetRelationPath(xlrec->rlocator.dbOid,
+										   xlrec->rlocator.spcOid,
+										   xlrec->rlocator.relNumber,
+										   InvalidBackendId,
+										   xlrec->forkNum, xlrec->mark);
+		char	   *action = "<none>";
+
+		switch (xlrec->action)
+		{
+			case XLOG_SMGR_MARK_CREATE:
+				action = "CREATE";
+				break;
+			case XLOG_SMGR_MARK_UNLINK:
+				action = "DELETE";
+				break;
+		}
+
+		appendStringInfo(buf, "%s %s", action, path);
+		pfree(path);
+	}
 }
 
 const char *
@@ -55,6 +86,12 @@ smgr_identify(uint8 info)
 		case XLOG_SMGR_TRUNCATE:
 			id = "TRUNCATE";
 			break;
+		case XLOG_SMGR_UNLINK:
+			id = "UNLINK";
+			break;
+		case XLOG_SMGR_MARK:
+			id = "MARK";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 22c8ae9755..bf83d19abd 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -741,6 +741,16 @@ we must panic and abort recovery.  The DBA will have to manually clean up
 then restart recovery.  This is part of the reason for not writing a WAL
 entry until we've successfully done the original action.
 
+================================
+Smgr MARK files
+--------------------------------
+
+An smgr mark file is an empty file that is created alongside a new
+relation storage file to signal that the storage file must be cleaned
+up during recovery.  In contrast to the four actions above, failing to
+remove these files will result in data loss, in which case the
+server will shut down.
+
 
 Skipping WAL for New RelFileLocator
 --------------------------------
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index b876401260..acbf8f1b12 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2227,6 +2227,9 @@ CommitTransaction(void)
 	 */
 	smgrDoPendingSyncs(true, is_parallel_worker);
 
+	/* Likewise delete mark files for files created during this transaction. */
+	smgrDoPendingCleanups(true);
+
 	/* close large objects before lower-level cleanup */
 	AtEOXact_LargeObject(true);
 
@@ -2478,6 +2481,9 @@ PrepareTransaction(void)
 	 */
 	smgrDoPendingSyncs(true, false);
 
+	/* Likewise delete mark files for files created during this transaction. */
+	smgrDoPendingCleanups(true);
+
 	/* close large objects before lower-level cleanup */
 	AtEOXact_LargeObject(true);
 
@@ -2806,6 +2812,7 @@ AbortTransaction(void)
 	AfterTriggerEndXact(false); /* 'false' means it's abort */
 	AtAbort_Portals();
 	smgrDoPendingSyncs(false, is_parallel_worker);
+	smgrDoPendingCleanups(false);
 	AtEOXact_LargeObject(false);
 	AtAbort_Notify();
 	AtEOXact_RelationMap(false, is_parallel_worker);
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index dbe9394762..4d28635f64 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -42,6 +42,7 @@
 #include "access/xlogutils.h"
 #include "backup/basebackup.h"
 #include "catalog/pg_control.h"
+#include "catalog/storage.h"
 #include "commands/tablespace.h"
 #include "common/file_utils.h"
 #include "miscadmin.h"
@@ -56,6 +57,7 @@
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "storage/reinit.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
 #include "utils/datetime.h"
@@ -1795,6 +1797,14 @@ PerformWalRecovery(void)
 
 		RmgrCleanup();
 
+		/* cleanup garbage files left during crash recovery */
+		if (!InArchiveRecovery)
+			ResetUnloggedRelations(UNLOGGED_RELATION_DROP_BUFFER |
+								   UNLOGGED_RELATION_CLEANUP);
+
+		/* run rollback cleanup if any */
+		smgrDoPendingDeletes(false);
+
 		ereport(LOG,
 				(errmsg("redo done at %X/%X system usage: %s",
 						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
@@ -3134,6 +3144,14 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 			{
 				ereport(DEBUG1,
 						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
+
+				/* cleanup garbage files left during crash recovery */
+				ResetUnloggedRelations(UNLOGGED_RELATION_DROP_BUFFER |
+									   UNLOGGED_RELATION_CLEANUP);
+
+				/* run rollback cleanup if any */
+				smgrDoPendingDeletes(false);
+
 				InArchiveRecovery = true;
 				if (StandbyModeRequested)
 					EnableStandbyMode();
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 3fb9451643..1b9f909dbc 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1191,6 +1191,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
 		ForkNumber	relForkNum; /* Type of fork if file is a relation */
 		int			relnumchars;	/* Chars in filename that are the
 									 * relnumber */
+		StorageMarks mark;		/* marker file sign */
 
 		/* Skip special stuff */
 		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
@@ -1241,7 +1242,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
 		/* Exclude all forks for unlogged tables except the init fork */
 		if (isDbDir &&
 			parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
-												&relForkNum))
+												&relForkNum, &mark))
 		{
 			/* Never exclude init forks */
 			if (relForkNum != INIT_FORKNUM)
@@ -1448,6 +1449,7 @@ is_checksummed_file(const char *fullpath, const char *filename)
 		strncmp(fullpath, "/", 1) == 0)
 	{
 		int			excludeIdx;
+		char	   *p;
 
 		/* Compare file against noChecksumFiles skip list */
 		for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++)
@@ -1461,6 +1463,11 @@ is_checksummed_file(const char *fullpath, const char *filename)
 				return false;
 		}
 
+		/* exclude mark files */
+		p = strchr(filename, '.');
+		if (p && isalpha(p[1]) && p[2] == 0)
+			return false;
+
 		return true;
 	}
 	else
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index af1491aa1d..03e06246be 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,6 +19,7 @@
 
 #include "postgres.h"
 
+#include "access/amapi.h"
 #include "access/parallel.h"
 #include "access/visibilitymap.h"
 #include "access/xact.h"
@@ -66,6 +67,21 @@ typedef struct PendingRelDelete
 	struct PendingRelDelete *next;	/* linked-list link */
 } PendingRelDelete;
 
+#define	PCOP_UNLINK_FORK		(1 << 0)
+#define	PCOP_UNLINK_MARK		(1 << 1)
+
+typedef struct PendingCleanup
+{
+	RelFileLocator rlocator;	/* relation that need a cleanup */
+	int			op;				/* operation mask */
+	ForkNumber	unlink_forknum; /* forknum to unlink */
+	StorageMarks unlink_mark;	/* mark to unlink */
+	BackendId	backend;		/* InvalidBackendId if not a temp rel */
+	bool		atCommit;		/* T=delete at commit; F=delete at abort */
+	int			nestLevel;		/* xact nesting level of request */
+	struct PendingCleanup *next;	/* linked-list link */
+}			PendingCleanup;
+
 typedef struct PendingRelSync
 {
 	RelFileLocator rlocator;
@@ -73,6 +89,7 @@ typedef struct PendingRelSync
 } PendingRelSync;
 
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
+static PendingCleanup * pendingCleanups = NULL; /* head of linked list */
 static HTAB *pendingSyncHash = NULL;
 
 
@@ -123,6 +140,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
 	SMgrRelation srel;
 	BackendId	backend;
 	bool		needs_wal;
+	PendingCleanup *pendingclean;
 
 	Assert(!IsInParallelMode());	/* couldn't update pendingSyncHash */
 
@@ -145,9 +163,23 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
 			return NULL;		/* placate compiler */
 	}
 
+	/*
+	 * We are going to create a new storage file. If server crashes before the
+	 * current transaction ends the file needs to be cleaned up. The
+	 * SMGR_MARK_UNCOMMITTED mark file prompts that work at the next startup.
+	 * We don't need this during WAL-logged CREATE DATABASE. See
+	 * CreateAndCopyRelationData for detail.
+	 */
 	srel = smgropen(rlocator, backend);
+
+	if (register_delete)
+	{
+		log_smgrcreatemark(&rlocator, MAIN_FORKNUM, SMGR_MARK_UNCOMMITTED);
+		smgrcreatemark(srel, MAIN_FORKNUM, SMGR_MARK_UNCOMMITTED, false);
+	}
+
 	smgrcreate(srel, MAIN_FORKNUM, false);
-
+	
 	if (needs_wal)
 		log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
 
@@ -157,16 +189,29 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
 	 */
 	if (register_delete)
 	{
-		PendingRelDelete *pending;
+		PendingRelDelete *pendingdel;
 
-		pending = (PendingRelDelete *)
+		pendingdel = (PendingRelDelete *)
 			MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
-		pending->rlocator = rlocator;
-		pending->backend = backend;
-		pending->atCommit = false;	/* delete if abort */
-		pending->nestLevel = GetCurrentTransactionNestLevel();
-		pending->next = pendingDeletes;
-		pendingDeletes = pending;
+		pendingdel->rlocator = rlocator;
+		pendingdel->backend = backend;
+		pendingdel->atCommit = false;	/* delete if abort */
+		pendingdel->nestLevel = GetCurrentTransactionNestLevel();
+		pendingdel->next = pendingDeletes;
+		pendingDeletes = pendingdel;
+
+		/* drop mark files at commit */
+		pendingclean = (PendingCleanup *)
+			MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup));
+		pendingclean->rlocator = rlocator;
+		pendingclean->op = PCOP_UNLINK_MARK;
+		pendingclean->unlink_forknum = MAIN_FORKNUM;
+		pendingclean->unlink_mark = SMGR_MARK_UNCOMMITTED;
+		pendingclean->backend = backend;
+		pendingclean->atCommit = true;
+		pendingclean->nestLevel = GetCurrentTransactionNestLevel();
+		pendingclean->next = pendingCleanups;
+		pendingCleanups = pendingclean;
 	}
 
 	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
@@ -197,6 +242,69 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
 	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
 }
 
+/*
+ * Write an XLOG_SMGR_UNLINK WAL record for fork forkNum of *rlocator.
+ */
+void
+log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum)
+{
+	xl_smgr_unlink xlrec;
+
+	/*
+	 * Fill in the record payload: the relation locator and the fork to unlink.
+	 */
+	xlrec.rlocator = *rlocator;
+	xlrec.forkNum = forkNum;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+	XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE);
+}
+
+/*
+ * Write an XLOG_SMGR_MARK record with action XLOG_SMGR_MARK_CREATE to WAL.
+ */
+void
+log_smgrcreatemark(const RelFileLocator *rlocator, ForkNumber forkNum,
+				   StorageMarks mark)
+{
+	xl_smgr_mark xlrec;
+
+	/*
+	 * Make an XLOG entry reporting the mark file creation.
+	 */
+	xlrec.rlocator = *rlocator;
+	xlrec.forkNum = forkNum;
+	xlrec.mark = mark;
+	xlrec.action = XLOG_SMGR_MARK_CREATE;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+	XLogInsert(RM_SMGR_ID, XLOG_SMGR_MARK | XLR_SPECIAL_REL_UPDATE);
+}
+
+/*
+ * Write an XLOG_SMGR_MARK record with action XLOG_SMGR_MARK_UNLINK to WAL.
+ */
+void
+log_smgrunlinkmark(const RelFileLocator *rlocator, ForkNumber forkNum,
+				   StorageMarks mark)
+{
+	xl_smgr_mark xlrec;
+
+	/*
+	 * Make an XLOG entry reporting the mark file removal.
+	 */
+	xlrec.rlocator = *rlocator;
+	xlrec.forkNum = forkNum;
+	xlrec.mark = mark;
+	xlrec.action = XLOG_SMGR_MARK_UNLINK;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+	XLogInsert(RM_SMGR_ID, XLOG_SMGR_MARK | XLR_SPECIAL_REL_UPDATE);
+}
+
 /*
  * RelationDropStorage
  *		Schedule unlinking of physical storage at transaction commit.
@@ -711,6 +819,76 @@ smgrDoPendingDeletes(bool isCommit)
 	}
 }
 
+/*
+ *	smgrDoPendingCleanups() -- Clean up work that emits WAL records
+ *
+ *  The operations handled in this function emit WAL records, which must be
+ *  part of the current transaction.
+ */
+void
+smgrDoPendingCleanups(bool isCommit)
+{
+	int			nestLevel = GetCurrentTransactionNestLevel();
+	PendingCleanup *pending;
+	PendingCleanup *prev;
+	PendingCleanup *next;
+
+	prev = NULL;
+	for (pending = pendingCleanups; pending != NULL; pending = next)
+	{
+		next = pending->next;
+		if (pending->nestLevel < nestLevel)
+		{
+			/* outer-level entries should not be processed yet */
+			prev = pending;
+		}
+		else
+		{
+			/* unlink list entry first, so we don't retry on failure */
+			if (prev)
+				prev->next = next;
+			else
+				pendingCleanups = next;
+
+			/* act only on entries registered for this outcome (commit/abort) */
+			if (pending->atCommit == isCommit)
+			{
+				SMgrRelation srel;
+
+				srel = smgropen(pending->rlocator, pending->backend);
+
+				Assert((pending->op &
+						~(PCOP_UNLINK_FORK | PCOP_UNLINK_MARK)) == 0);
+
+				if (pending->op & PCOP_UNLINK_FORK)
+				{
+					/* Don't emit WAL during recovery. */
+					if (!InRecovery)
+						log_smgrunlink(&pending->rlocator,
+									   pending->unlink_forknum);
+					smgrunlink(srel, pending->unlink_forknum, false);
+				}
+
+				if (pending->op & PCOP_UNLINK_MARK)
+				{
+					if (!InRecovery)
+						log_smgrunlinkmark(&pending->rlocator,
+										   pending->unlink_forknum,
+										   pending->unlink_mark);
+
+					smgrunlinkmark(srel, pending->unlink_forknum,
+								   pending->unlink_mark, InRecovery);
+					smgrclose(srel);
+				}
+			}
+
+			/* must explicitly free the list entry */
+			pfree(pending);
+			/* prev does not change */
+		}
+	}
+}
+
 /*
  *	smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
  */
@@ -971,6 +1149,15 @@ smgr_redo(XLogReaderState *record)
 		reln = smgropen(xlrec->rlocator, InvalidBackendId);
 		smgrcreate(reln, xlrec->forkNum, true);
 	}
+	else if (info == XLOG_SMGR_UNLINK)
+	{
+		xl_smgr_unlink *xlrec = (xl_smgr_unlink *) XLogRecGetData(record);
+		SMgrRelation reln;
+
+		reln = smgropen(xlrec->rlocator, InvalidBackendId);
+		smgrunlink(reln, xlrec->forkNum, true);
+		smgrclose(reln);
+	}
 	else if (info == XLOG_SMGR_TRUNCATE)
 	{
 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
@@ -1059,6 +1246,71 @@ smgr_redo(XLogReaderState *record)
 
 		FreeFakeRelcacheEntry(rel);
 	}
+	else if (info == XLOG_SMGR_MARK)
+	{
+		xl_smgr_mark *xlrec = (xl_smgr_mark *) XLogRecGetData(record);
+		SMgrRelation reln;
+		PendingCleanup *pending;
+		bool		created = false;
+
+		reln = smgropen(xlrec->rlocator, InvalidBackendId);
+
+		switch (xlrec->action)
+		{
+			case XLOG_SMGR_MARK_CREATE:
+				smgrcreatemark(reln, xlrec->forkNum, xlrec->mark, true);
+				created = true;
+				break;
+			case XLOG_SMGR_MARK_UNLINK:
+				smgrunlinkmark(reln, xlrec->forkNum, xlrec->mark, true);
+				break;
+			default:
+				elog(ERROR, "unknown smgr_mark action \"%c\"", xlrec->action);
+		}
+
+		if (created)
+		{
+			/* register removal of the just-created mark file at abort */
+			pending = (PendingCleanup *)
+				MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup));
+			pending->rlocator = xlrec->rlocator;
+			pending->op = PCOP_UNLINK_MARK;
+			pending->unlink_forknum = xlrec->forkNum;
+			pending->unlink_mark = xlrec->mark;
+			pending->backend = InvalidBackendId;
+			pending->atCommit = false;
+			pending->nestLevel = GetCurrentTransactionNestLevel();
+			pending->next = pendingCleanups;
+			pendingCleanups = pending;
+		}
+		else
+		{
+			/*
+			 * The mark file was unlinked: drop the pending cleanup entry
+			 * for it, if any.  At most one entry is expected, so stop at it.
+			 */
+			PendingCleanup *prev = NULL;
+
+			for (pending = pendingCleanups; pending != NULL;
+				 pending = pending->next)
+			{
+				if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) &&
+					pending->unlink_forknum == xlrec->forkNum &&
+					(pending->op & PCOP_UNLINK_MARK) != 0)
+				{
+					if (prev)
+						prev->next = pending->next;
+					else
+						pendingCleanups = pending->next;
+
+					pfree(pending);
+					break;
+				}
+
+				prev = pending;
+			}
+		}
+	}
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
 }
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 9fd8444ed4..1b77347978 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -346,8 +346,6 @@ static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
 
-static int	fsync_parent_path(const char *fname, int elevel);
-
 
 /*
  * pg_fsync --- do fsync with or without writethrough
@@ -3670,7 +3668,7 @@ fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
  * This is aimed at making file operations persistent on disk in case of
  * an OS crash or power failure.
  */
-static int
+int
 fsync_parent_path(const char *fname, int elevel)
 {
 	char		parentpath[MAXPGPATH];
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index fb55371b1b..250cfe9e44 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -16,29 +16,45 @@
 
 #include <unistd.h>
 
+#include "access/xlogrecovery.h"
+#include "catalog/pg_tablespace_d.h"
 #include "common/relpath.h"
 #include "postmaster/startup.h"
+#include "storage/bufmgr.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
+#include "storage/md.h"
 #include "storage/reinit.h"
+#include "storage/smgr.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 
 static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
-												  int op);
+												  Oid tspid, int op);
 static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
-											   int op);
+											   Oid tspid, Oid dbid, int op);
 
 typedef struct
 {
-	Oid			reloid;			/* hash key */
-} unlogged_relation_entry;
+ 	RelFileNumber	relNumber;		/* hash key */
+	bool			has_init;		/* has INIT fork */
+	bool			dirty_all;		/* needs to remove all forks */
+}  relfile_entry;
 
 /*
- * Reset unlogged relations from before the last restart.
+ * Clean up and reset relation files from before the last restart.
  *
- * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
- * relation with an "init" fork, except for the "init" fork itself.
+ * If op includes UNLOGGED_RELATION_CLEANUP, we perform different operations
+ * depending on the existence of mark files.
+ *
+ * If SMGR_MARK_UNCOMMITTED mark file for main fork is present we remove the
+ * whole relation along with the mark file.
+ *
+ * Otherwise, if the "init" fork is found, we remove all forks of any relation
+ * with the "init" fork, except for the "init" fork itself.
+ *
+ * If op includes UNLOGGED_RELATION_DROP_BUFFER, we drop all buffers for all
+ * relations that are to be cleaned up.
  *
  * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
  * fork.
@@ -72,7 +88,7 @@ ResetUnloggedRelations(int op)
 	/*
 	 * First process unlogged files in pg_default ($PGDATA/base)
 	 */
-	ResetUnloggedRelationsInTablespaceDir("base", op);
+	ResetUnloggedRelationsInTablespaceDir("base", DEFAULTTABLESPACE_OID, op);
 
 	/*
 	 * Cycle through directories for all non-default tablespaces.
@@ -81,13 +97,19 @@ ResetUnloggedRelations(int op)
 
 	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
 	{
+		Oid			tspid;
+
 		if (strcmp(spc_de->d_name, ".") == 0 ||
 			strcmp(spc_de->d_name, "..") == 0)
 			continue;
 
 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
-		ResetUnloggedRelationsInTablespaceDir(temp_path, op);
+
+		tspid = atooid(spc_de->d_name);
+
+		Assert(tspid != 0);
+		ResetUnloggedRelationsInTablespaceDir(temp_path, tspid, op);
 	}
 
 	FreeDir(spc_dir);
@@ -103,7 +125,8 @@ ResetUnloggedRelations(int op)
  * Process one tablespace directory for ResetUnloggedRelations
  */
 static void
-ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
+ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
+									  Oid tspid, int op)
 {
 	DIR		   *ts_dir;
 	struct dirent *de;
@@ -130,6 +153,8 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
 
 	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
 	{
+		Oid			dbid;
+
 		/*
 		 * We're only interested in the per-database directories, which have
 		 * numeric names.  Note that this code will also (properly) ignore "."
@@ -148,7 +173,10 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
 			ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
 									 dbspace_path);
 
-		ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
+		dbid = atooid(de->d_name);
+		Assert(dbid != 0);
+
+		ResetUnloggedRelationsInDbspaceDir(dbspace_path, tspid, dbid, op);
 	}
 
 	FreeDir(ts_dir);
@@ -158,125 +186,200 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
  * Process one per-dbspace directory for ResetUnloggedRelations
  */
 static void
-ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
+ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
+								   Oid tspid, Oid dbid, int op)
 {
 	DIR		   *dbspace_dir;
 	struct dirent *de;
 	char		rm_path[MAXPGPATH * 2];
+	HTAB	   *hash;
+	HASHCTL		ctl;
 
 	/* Caller must specify at least one operation. */
-	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
+	Assert((op & (UNLOGGED_RELATION_CLEANUP |
+				  UNLOGGED_RELATION_DROP_BUFFER |
+				  UNLOGGED_RELATION_INIT)) != 0);
 
 	/*
 	 * Cleanup is a two-pass operation.  First, we go through and identify all
 	 * the files with init forks.  Then, we go through again and nuke
 	 * everything with the same OID except the init fork.
 	 */
+
+	/*
+	 * It's possible that someone could create tons of unlogged relations in
+	 * the same database & tablespace, so we'd better use a hash table rather
+	 * than an array or linked list to keep track of which files need to be
+	 * reset.  Otherwise, this cleanup operation would be O(n^2).
+	 */
+	memset(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(RelFileNumber);
+	ctl.entrysize = sizeof(relfile_entry);
+	hash = hash_create("unlogged relation RelFileNumbers",
+					   32, &ctl, HASH_ELEM | HASH_BLOBS);
+
+	/* Collect INIT fork and mark files in the directory. */
+	dbspace_dir = AllocateDir(dbspacedirname);
+	while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+	{
+		ForkNumber	forkNum;
+		int			relnumchars;
+		StorageMarks mark;
+
+		/* Skip anything that doesn't look like a relation data file. */
+		if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+												 &forkNum, &mark))
+			continue;
+
+		if (forkNum == INIT_FORKNUM || mark == SMGR_MARK_UNCOMMITTED)
+		{
+			RelFileNumber key;
+			relfile_entry *ent;
+			bool		found;
+
+			/*
+			 * Put the OID portion of the name into the hash table,
+			 * if it isn't already.  If it has SMGR_MARK_UNCOMMITTED mark
+			 * files, the storage file is in dirty state, where clean up is
+			 * needed.
+			 */
+			key = atooid(de->d_name);
+			ent = hash_search(hash, &key, HASH_ENTER, &found);
+
+			if (!found)
+			{
+				ent->has_init = false;
+				ent->dirty_all = false;
+			}
+
+			if (forkNum == MAIN_FORKNUM && mark == SMGR_MARK_UNCOMMITTED)
+				ent->dirty_all = true;
+			else
+			{
+				Assert(forkNum == INIT_FORKNUM);
+				ent->has_init = true;
+			}
+		}
+	}
+
+	/* Done with the first pass. */
+	FreeDir(dbspace_dir);
+
+	/* nothing to do if we don't have init nor cleanup forks */
+	if (hash_get_num_entries(hash) < 1)
+	{
+		hash_destroy(hash);
+		return;
+	}
+
+	if ((op & UNLOGGED_RELATION_DROP_BUFFER) != 0)
+	{
+		/*
+		 * When we come here after recovery, smgr object for this file might
+		 * have been created. In that case we need to drop all buffers then the
+		 * smgr object.  Otherwise checkpointer wrongly tries to flush buffers
+		 * for nonexistent relation storage. This is safe as far as no other
+		 * backends have accessed the relation before starting archive
+		 * recovery.
+		 */
+		HASH_SEQ_STATUS status;
+		relfile_entry *ent;
+		SMgrRelation *srels = palloc(sizeof(SMgrRelation) * 8);
+		int			maxrels = 8;
+		int			nrels = 0;
+		int			i;
+
+		Assert(!HotStandbyActive());
+
+		hash_seq_init(&status, hash);
+		while ((ent = (relfile_entry *) hash_seq_search(&status)) != NULL)
+		{
+			RelFileLocatorBackend rel;
+
+			if (maxrels <= nrels)
+			{
+				maxrels *= 2;
+				srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+			}
+
+			rel.backend = InvalidBackendId;
+			rel.locator.spcOid = tspid;
+			rel.locator.dbOid = dbid;
+			rel.locator.relNumber = ent->relNumber;
+
+			srels[nrels++] = smgropen(rel.locator, InvalidBackendId);
+		}
+
+		DropRelationsAllBuffers(srels, nrels);
+
+		for (i = 0; i < nrels; i++)
+			smgrclose(srels[i]);
+	}
+
+	/*
+	 * Now, make a second pass and remove anything that matches.
+	 */
 	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
 	{
-		HTAB	   *hash;
-		HASHCTL		ctl;
-
-		/*
-		 * It's possible that someone could create a ton of unlogged relations
-		 * in the same database & tablespace, so we'd better use a hash table
-		 * rather than an array or linked list to keep track of which files
-		 * need to be reset.  Otherwise, this cleanup operation would be
-		 * O(n^2).
-		 */
-		ctl.keysize = sizeof(Oid);
-		ctl.entrysize = sizeof(unlogged_relation_entry);
-		ctl.hcxt = CurrentMemoryContext;
-		hash = hash_create("unlogged relation OIDs", 32, &ctl,
-						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
-		/* Scan the directory. */
 		dbspace_dir = AllocateDir(dbspacedirname);
 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
 		{
 			ForkNumber	forkNum;
+			StorageMarks mark;
 			int			relnumchars;
-			unlogged_relation_entry ent;
+			RelFileNumber key;
+			relfile_entry *ent;
+			RelFileLocatorBackend rel;
 
 			/* Skip anything that doesn't look like a relation data file. */
 			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
-													 &forkNum))
-				continue;
-
-			/* Also skip it unless this is the init fork. */
-			if (forkNum != INIT_FORKNUM)
-				continue;
-
-			/*
-			 * Put the OID portion of the name into the hash table, if it
-			 * isn't already.
-			 */
-			ent.reloid = atooid(de->d_name);
-			(void) hash_search(hash, &ent, HASH_ENTER, NULL);
-		}
-
-		/* Done with the first pass. */
-		FreeDir(dbspace_dir);
-
-		/*
-		 * If we didn't find any init forks, there's no point in continuing;
-		 * we can bail out now.
-		 */
-		if (hash_get_num_entries(hash) == 0)
-		{
-			hash_destroy(hash);
-			return;
-		}
-
-		/*
-		 * Now, make a second pass and remove anything that matches.
-		 */
-		dbspace_dir = AllocateDir(dbspacedirname);
-		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
-		{
-			ForkNumber	forkNum;
-			int			relnumchars;
-			unlogged_relation_entry ent;
-
-			/* Skip anything that doesn't look like a relation data file. */
-			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
-													 &forkNum))
-				continue;
-
-			/* We never remove the init fork. */
-			if (forkNum == INIT_FORKNUM)
+													 &forkNum, &mark))
 				continue;
 
 			/*
 			 * See whether the OID portion of the name shows up in the hash
 			 * table.  If so, nuke it!
 			 */
-			ent.reloid = atooid(de->d_name);
-			if (hash_search(hash, &ent, HASH_FIND, NULL))
+			key = atooid(de->d_name);
+			ent = hash_search(hash, &key, HASH_FIND, NULL);
+
+			if (!ent)
+				continue;
+
+			if (!ent->dirty_all)
 			{
-				snprintf(rm_path, sizeof(rm_path), "%s/%s",
-						 dbspacedirname, de->d_name);
-				if (unlink(rm_path) < 0)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not remove file \"%s\": %m",
-									rm_path)));
-				else
-					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
+				/* clean permanent relations don't need cleanup */
+				if (!ent->has_init)
+					continue;
+
+				if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_NONE)
+					continue;
 			}
+
+			/* so, nuke it! */
+			snprintf(rm_path, sizeof(rm_path), "%s/%s",
+					 dbspacedirname, de->d_name);
+			if (unlink(rm_path) < 0)
+				ereport(ERROR,
+						errcode_for_file_access(),
+						errmsg("could not remove file \"%s\": %m",
+							   rm_path));
+
+			rel.backend = InvalidBackendId;
+			rel.locator.spcOid = tspid;
+			rel.locator.dbOid = dbid;
+			rel.locator.relNumber = atooid(de->d_name);
+
+			ForgetRelationForkSyncRequests(rel, forkNum);
 		}
 
 		/* Cleanup is complete. */
 		FreeDir(dbspace_dir);
-		hash_destroy(hash);
 	}
 
 	/*
 	 * Initialization happens after cleanup is complete: we copy each init
-	 * fork file to the corresponding main fork file.  Note that if we are
-	 * asked to do both cleanup and init, we may never get here: if the
-	 * cleanup code determines that there are no init forks in this dbspace,
-	 * it will return before we get to this point.
+	 * fork file to the corresponding main fork file.
 	 */
 	if ((op & UNLOGGED_RELATION_INIT) != 0)
 	{
@@ -285,6 +388,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
 		{
 			ForkNumber	forkNum;
+			StorageMarks mark;
 			int			relnumchars;
 			char		relnumbuf[OIDCHARS + 1];
 			char		srcpath[MAXPGPATH * 2];
@@ -292,9 +396,11 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 
 			/* Skip anything that doesn't look like a relation data file. */
 			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
-													 &forkNum))
+													 &forkNum, &mark))
 				continue;
 
+			Assert(mark == SMGR_MARK_NONE);
+
 			/* Also skip it unless this is the init fork. */
 			if (forkNum != INIT_FORKNUM)
 				continue;
@@ -328,15 +434,18 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
 		{
 			ForkNumber	forkNum;
+			StorageMarks mark;
 			int			relnumchars;
 			char		relnumbuf[OIDCHARS + 1];
 			char		mainpath[MAXPGPATH];
 
 			/* Skip anything that doesn't look like a relation data file. */
 			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
-													 &forkNum))
+													 &forkNum, &mark))
 				continue;
 
+			Assert(mark == SMGR_MARK_NONE);
+
 			/* Also skip it unless this is the init fork. */
 			if (forkNum != INIT_FORKNUM)
 				continue;
@@ -379,7 +488,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
  */
 bool
 parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
-									ForkNumber *fork)
+									ForkNumber *fork, StorageMarks *mark)
 {
 	int			pos;
 
@@ -410,11 +519,19 @@ parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
 
 		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
 			;
-		if (segchar <= 1)
-			return false;
-		pos += segchar;
+		if (segchar > 1)
+			pos += segchar;
 	}
 
+	/* mark file? */
+	if (name[pos] == '.' && name[pos + 1] != 0)
+	{
+		*mark = name[pos + 1];
+		pos += 2;
+	}
+	else
+		*mark = SMGR_MARK_NONE;
+
 	/* Now we should be at the end. */
 	if (name[pos] != '\0')
 		return false;
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 352958e1fe..0b64635fb8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -141,7 +141,8 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
 							 BlockNumber blkno, bool skipFsync, int behavior);
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 							  MdfdVec *seg);
-
+static bool mdmarkexists(SMgrRelation reln, ForkNumber forkNum,
+						 StorageMarks mark);
 
 /*
  *	mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -173,6 +174,82 @@ mdexists(SMgrRelation reln, ForkNumber forknum)
 	return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
 }
 
+/*
+ *  mdcreatemark() -- Create the mark file "mark" for a fork and fsync it.
+ *
+ * If isRedo is true, it's okay for the file to exist already.
+ */
+void
+mdcreatemark(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark,
+			 bool isRedo)
+{
+	char	   *path = markpath(reln->smgr_rlocator, forkNum, mark);
+	int			fd;
+
+	/* See mdcreate for details. */
+	TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
+							reln->smgr_rlocator.locator.dbOid, isRedo);
+
+	fd = BasicOpenFile(path, O_WRONLY | O_CREAT | O_EXCL);
+	if (fd < 0 && (!isRedo || errno != EEXIST))
+		ereport(ERROR,
+				errcode_for_file_access(),
+				errmsg("could not create mark file \"%s\": %m", path));
+
+	/* fd is invalid if the file already existed during redo; skip it then */
+	if (fd >= 0)
+	{
+		pg_fsync(fd);
+		close(fd);
+	}
+
+	/* fsync the parent directory so that the file's creation is persistent */
+	fsync_parent_path(path, ERROR);
+
+	pfree(path);
+}
+
+
+/*
+ *  mdunlinkmark()  -- Durably delete the mark file "mark" of a fork.
+ *
+ * If isRedo is true, it's okay for the file not to exist.
+ */
+void
+mdunlinkmark(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark,
+			 bool isRedo)
+{
+	char	   *path = markpath(reln->smgr_rlocator, forkNum, mark);
+
+	if (!isRedo || mdmarkexists(reln, forkNum, mark))
+		durable_unlink(path, ERROR);
+
+	pfree(path);
+}
+
+/*
+ *  mdmarkexists()  -- Check if the mark file exists; ERROR unless ENOENT.
+ */
+static bool
+mdmarkexists(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark)
+{
+	char	   *path = markpath(reln->smgr_rlocator, forkNum, mark);
+	int			fd;
+
+	fd = BasicOpenFile(path, O_RDONLY);
+	if (fd < 0 && errno != ENOENT)
+		ereport(ERROR,
+				errcode_for_file_access(),
+				errmsg("could not access mark file \"%s\": %m", path));
+	pfree(path);
+
+	if (fd < 0)
+		return false;
+
+	close(fd);
+	return true;
+}
+
 /*
  *	mdcreate() -- Create a new relation on magnetic disk.
  *
@@ -1085,6 +1162,16 @@ register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
 	RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
 }
 
+/*
+ * ForgetRelationForkSyncRequests -- forget any fsyncs and unlinks for a fork
+ */
+void
+ForgetRelationForkSyncRequests(RelFileLocatorBackend rlocator,
+							   ForkNumber forknum)
+{
+	register_forget_request(rlocator, forknum, 0);	/* XXX: segment 0 only? */
+}
+
 /*
  * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
  */
@@ -1445,12 +1532,14 @@ mdsyncfiletag(const FileTag *ftag, char *path)
  * Return 0 on success, -1 on failure, with errno set.
  */
 int
-mdunlinkfiletag(const FileTag *ftag, char *path)
+mdunlinkfiletag(const FileTag *ftag, char *path, StorageMarks mark)
 {
 	char	   *p;
 
 	/* Compute the path. */
-	p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
+	p = GetRelationPath(ftag->rlocator.dbOid, ftag->rlocator.spcOid,
+						ftag->rlocator.relNumber,InvalidBackendId,
+						MAIN_FORKNUM, mark);
 	strlcpy(path, p, MAXPGPATH);
 	pfree(p);
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dc466e5414..9969d84209 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -62,6 +62,10 @@ typedef struct f_smgr
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_createmark) (SMgrRelation reln, ForkNumber forknum,
+									StorageMarks mark, bool isRedo);
+	void		(*smgr_unlinkmark) (SMgrRelation reln, ForkNumber forknum,
+									StorageMarks mark, bool isRedo);
 } f_smgr;
 
 static const f_smgr smgrsw[] = {
@@ -82,6 +86,8 @@ static const f_smgr smgrsw[] = {
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
 		.smgr_immedsync = mdimmedsync,
+		.smgr_createmark = mdcreatemark,
+		.smgr_unlinkmark = mdunlinkmark,
 	}
 };
 
@@ -371,6 +377,26 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 	smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
 }
 
+/*
+ *	smgrcreatemark() -- Create a fork's mark file via the storage manager
+ */
+void
+smgrcreatemark(SMgrRelation reln, ForkNumber forknum, StorageMarks mark,
+			   bool isRedo)
+{
+	smgrsw[reln->smgr_which].smgr_createmark(reln, forknum, mark, isRedo);
+}
+
+/*
+ *	smgrunlinkmark() -- Delete a fork's mark file via the storage manager
+ */
+void
+smgrunlinkmark(SMgrRelation reln, ForkNumber forknum, StorageMarks mark,
+			   bool isRedo)
+{
+	smgrsw[reln->smgr_which].smgr_unlinkmark(reln, forknum, mark, isRedo);
+}
+
 /*
  *	smgrdosyncall() -- Immediately sync all forks of all given relations
  *
@@ -693,6 +719,15 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
 	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
 }
 
+/*
+ *	smgrunlink() -- Unlink one fork of a relation via the storage manager
+ */
+void
+smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+	smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rlocator, forknum, isRedo);
+}
+
 /*
  * AtEOXact_SMgr
  *
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 768d1dbfc4..9d99cb8fef 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -91,7 +91,8 @@ static CycleCtr checkpoint_cycle_ctr = 0;
 typedef struct SyncOps
 {
 	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
-	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path,
+									   StorageMarks mark);
 	bool		(*sync_filetagmatches) (const FileTag *ftag,
 										const FileTag *candidate);
 } SyncOps;
@@ -235,7 +236,8 @@ SyncPostCheckpoint(void)
 
 		/* Unlink the file */
 		if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
-														  path) < 0)
+														  path,
+														  SMGR_MARK_NONE) < 0)
 		{
 			/*
 			 * There's a race condition, when the database is dropped at the
@@ -244,6 +246,21 @@ SyncPostCheckpoint(void)
 			 * here. rmtree() also has to ignore ENOENT errors, to deal with
 			 * the possibility that we delete the file first.
 			 */
+			if (errno != ENOENT)
+				ereport(WARNING,
+						errcode_for_file_access(),
+						errmsg("could not remove file \"%s\": %m", path));
+		}
+		else if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+															   path,
+															   SMGR_MARK_UNCOMMITTED)
+				 < 0)
+		{
+			/*
+			 * The fork file has been removed successfully, so also remove
+			 * its SMGR_MARK_UNCOMMITTED mark file, if any.  It's ok if the
+			 * mark file does not exist.
+			 */
 			if (errno != ENOENT)
 				ereport(WARNING,
 						(errcode_for_file_access(),
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 27782237d0..e9e4bafb01 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -418,6 +418,22 @@ extractPageInfo(XLogReaderState *record)
 		 * source system.
 		 */
 	}
+	else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_UNLINK)
+	{
+		/*
+		 * We can safely ignore these.  We'll see that the files don't exist
+		 * in the target data dir, and copy them in from the source system.
+		 * No need to do anything special here.
+		 */
+	}
+	else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_MARK)
+	{
+		/*
+		 * We can safely ignore these.  The file will be removed from the
+		 * target if it doesn't exist in the source system.  The files are
+		 * empty, so we don't need to bother with their contents.
+		 */
+	}
 	else if (rmid == RM_XACT_ID &&
 			 ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
 			  (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED ||
diff --git a/src/common/relpath.c b/src/common/relpath.c
index 87de5f6c96..b1f6832cfa 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -139,9 +139,15 @@ GetDatabasePath(Oid dbOid, Oid spcOid)
  */
 char *
 GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
-				int backendId, ForkNumber forkNumber)
+				int backendId, ForkNumber forkNumber, char mark)
 {
 	char	   *path;
+	char		markstr[4];
+
+	if (mark == 0)
+		markstr[0] = 0;
+	else
+		snprintf(markstr, sizeof(markstr), ".%c", mark);
 
 	if (spcOid == GLOBALTABLESPACE_OID)
 	{
@@ -149,10 +155,10 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
 		Assert(dbOid == 0);
 		Assert(backendId == InvalidBackendId);
 		if (forkNumber != MAIN_FORKNUM)
-			path = psprintf("global/%u_%s",
-							relNumber, forkNames[forkNumber]);
+			path = psprintf("global/%u_%s%s",
+							relNumber, forkNames[forkNumber], markstr);
 		else
-			path = psprintf("global/%u", relNumber);
+			path = psprintf("global/%u%s", relNumber, markstr);
 	}
 	else if (spcOid == DEFAULTTABLESPACE_OID)
 	{
@@ -160,22 +166,22 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
 		if (backendId == InvalidBackendId)
 		{
 			if (forkNumber != MAIN_FORKNUM)
-				path = psprintf("base/%u/%u_%s",
+				path = psprintf("base/%u/%u_%s%s",
 								dbOid, relNumber,
-								forkNames[forkNumber]);
+								forkNames[forkNumber], markstr);
 			else
-				path = psprintf("base/%u/%u",
-								dbOid, relNumber);
+				path = psprintf("base/%u/%u%s",
+								dbOid, relNumber, markstr);
 		}
 		else
 		{
 			if (forkNumber != MAIN_FORKNUM)
-				path = psprintf("base/%u/t%d_%u_%s",
+				path = psprintf("base/%u/t%d_%u_%s%s",
 								dbOid, backendId, relNumber,
-								forkNames[forkNumber]);
+								forkNames[forkNumber], markstr);
 			else
-				path = psprintf("base/%u/t%d_%u",
-								dbOid, backendId, relNumber);
+				path = psprintf("base/%u/t%d_%u%s",
+								dbOid, backendId, relNumber, markstr);
 		}
 	}
 	else
@@ -184,27 +190,28 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
 		if (backendId == InvalidBackendId)
 		{
 			if (forkNumber != MAIN_FORKNUM)
-				path = psprintf("pg_tblspc/%u/%s/%u/%u_%s",
+				path = psprintf("pg_tblspc/%u/%s/%u/%u_%s%s",
 								spcOid, TABLESPACE_VERSION_DIRECTORY,
 								dbOid, relNumber,
-								forkNames[forkNumber]);
+								forkNames[forkNumber], markstr);
 			else
-				path = psprintf("pg_tblspc/%u/%s/%u/%u",
+				path = psprintf("pg_tblspc/%u/%s/%u/%u%s",
 								spcOid, TABLESPACE_VERSION_DIRECTORY,
-								dbOid, relNumber);
+								dbOid, relNumber, markstr);
 		}
 		else
 		{
 			if (forkNumber != MAIN_FORKNUM)
-				path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s",
+				path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s%s",
 								spcOid, TABLESPACE_VERSION_DIRECTORY,
 								dbOid, backendId, relNumber,
-								forkNames[forkNumber]);
+								forkNames[forkNumber], markstr);
 			else
-				path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u",
+				path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u%s",
 								spcOid, TABLESPACE_VERSION_DIRECTORY,
-								dbOid, backendId, relNumber);
+								dbOid, backendId, relNumber, markstr);
 		}
 	}
+
 	return path;
 }
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 45a3c7835c..0b39c6ef56 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -25,6 +25,8 @@ extern PGDLLIMPORT int wal_skip_threshold;
 extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator,
 										  char relpersistence,
 										  bool register_delete);
+extern void RelationCreateInitFork(Relation rel);
+extern void RelationDropInitFork(Relation rel);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit);
 extern void RelationPreTruncate(Relation rel);
@@ -43,6 +45,7 @@ extern void RestorePendingSyncs(char *startAddress);
 extern void smgrDoPendingDeletes(bool isCommit);
 extern void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker);
 extern int	smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr);
+extern void smgrDoPendingCleanups(bool isCommit);
 extern void AtSubCommit_smgr(void);
 extern void AtSubAbort_smgr(void);
 extern void PostPrepare_smgr(void);
diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h
index 6b0a7aa3df..a36646c6ee 100644
--- a/src/include/catalog/storage_xlog.h
+++ b/src/include/catalog/storage_xlog.h
@@ -18,17 +18,23 @@
 #include "lib/stringinfo.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "storage/smgr.h"
 
 /*
  * Declarations for smgr-related XLOG records
  *
- * Note: we log file creation and truncation here, but logging of deletion
- * actions is handled by xact.c, because it is part of transaction commit.
+ * Note: we log file creation, truncation and buffer persistence change here,
+ * but logging of deletion actions is handled mainly by xact.c, because it is
+ * part of transaction commit in most cases.  However, there's a case where
+ * init forks are deleted outside control of transaction.
  */
 
 /* XLOG gives us high 4 bits */
 #define XLOG_SMGR_CREATE	0x10
 #define XLOG_SMGR_TRUNCATE	0x20
+#define XLOG_SMGR_UNLINK	0x30
+#define XLOG_SMGR_MARK		0x40
+#define XLOG_SMGR_BUFPERSISTENCE	0x50
 
 typedef struct xl_smgr_create
 {
@@ -36,6 +42,26 @@ typedef struct xl_smgr_create
 	ForkNumber	forkNum;
 } xl_smgr_create;
 
+typedef struct xl_smgr_unlink
+{
+	RelFileLocator rlocator;	/* relation the record refers to */
+	ForkNumber	forkNum;		/* fork the record refers to */
+} xl_smgr_unlink;
+
+typedef enum smgr_mark_action
+{
+	XLOG_SMGR_MARK_CREATE = 'c',	/* mark creation, cf. log_smgrcreatemark */
+	XLOG_SMGR_MARK_UNLINK = 'u' /* mark removal, cf. log_smgrunlinkmark */
+} smgr_mark_action;
+
+typedef struct xl_smgr_mark
+{
+	RelFileLocator rlocator;	/* relation the mark file belongs to */
+	ForkNumber	forkNum;		/* fork the mark file belongs to */
+	StorageMarks mark;			/* which kind of mark file */
+	smgr_mark_action action;	/* XLOG_SMGR_MARK_CREATE or _UNLINK */
+} xl_smgr_mark;
+
 /* flags for xl_smgr_truncate */
 #define SMGR_TRUNCATE_HEAP		0x0001
 #define SMGR_TRUNCATE_VM		0x0002
@@ -51,6 +77,11 @@ typedef struct xl_smgr_truncate
 } xl_smgr_truncate;
 
 extern void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum);
+extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum);
+extern void log_smgrcreatemark(const RelFileLocator *rlocator,
+							   ForkNumber forkNum, StorageMarks mark);
+extern void log_smgrunlinkmark(const RelFileLocator *rlocator,
+							   ForkNumber forkNum, StorageMarks mark);
 
 extern void smgr_redo(XLogReaderState *record);
 extern void smgr_desc(StringInfo buf, XLogReaderState *record);
diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h
index 511c21682e..28c9dbcd13 100644
--- a/src/include/common/relpath.h
+++ b/src/include/common/relpath.h
@@ -74,7 +74,7 @@ extern int	forkname_chars(const char *str, ForkNumber *fork);
 extern char *GetDatabasePath(Oid dbOid, Oid spcOid);
 
 extern char *GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
-							 int backendId, ForkNumber forkNumber);
+							 int backendId, ForkNumber forkNumber, char mark);
 
 /*
  * Wrapper macros for GetRelationPath.  Beware of multiple
@@ -84,7 +84,7 @@ extern char *GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
 /* First argument is a RelFileLocator */
 #define relpathbackend(rlocator, backend, forknum) \
 	GetRelationPath((rlocator).dbOid, (rlocator).spcOid, (rlocator).relNumber, \
-					backend, forknum)
+					backend, forknum, 0)
 
 /* First argument is a RelFileLocator */
 #define relpathperm(rlocator, forknum) \
@@ -94,4 +94,9 @@ extern char *GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
 #define relpath(rlocator, forknum) \
 	relpathbackend((rlocator).locator, (rlocator).backend, forknum)
 
+/* First argument is a RelFileLocatorBackend */
+#define markpath(rlocator, forknum, mark)								\
+	GetRelationPath((rlocator).locator.dbOid, (rlocator).locator.spcOid, \
+					(rlocator).locator.relNumber,						\
+					(rlocator).backend, forknum, mark)
 #endif							/* RELPATH_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index f85de97d08..91612f2e42 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -179,6 +179,7 @@ extern void pg_flush_data(int fd, off_t offset, off_t nbytes);
 extern int	pg_truncate(const char *path, off_t length);
 extern void fsync_fname(const char *fname, bool isdir);
 extern int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
+extern int	fsync_parent_path(const char *fname, int elevel);
 extern int	durable_rename(const char *oldfile, const char *newfile, int elevel);
 extern int	durable_unlink(const char *fname, int elevel);
 extern void SyncDataDirectory(void);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 8f32af9ef3..37de1a0d7b 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -23,6 +23,10 @@
 extern void mdinit(void);
 extern void mdopen(SMgrRelation reln);
 extern void mdclose(SMgrRelation reln, ForkNumber forknum);
+extern void mdcreatemark(SMgrRelation reln, ForkNumber forknum,
+						 StorageMarks mark, bool isRedo);
+extern void mdunlinkmark(SMgrRelation reln, ForkNumber forknum,
+						 StorageMarks mark, bool isRedo);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
@@ -41,12 +45,14 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
 
+extern void ForgetRelationForkSyncRequests(RelFileLocatorBackend rlocator,
+										   ForkNumber forknum);
 extern void ForgetDatabaseSyncRequests(Oid dbid);
 extern void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo);
 
 /* md sync callbacks */
 extern int	mdsyncfiletag(const FileTag *ftag, char *path);
-extern int	mdunlinkfiletag(const FileTag *ftag, char *path);
+extern int	mdunlinkfiletag(const FileTag *ftag, char *path, StorageMarks mark);
 extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
 
 #endif							/* MD_H */
diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h
index e2bbb5abe9..119dac1505 100644
--- a/src/include/storage/reinit.h
+++ b/src/include/storage/reinit.h
@@ -16,14 +16,16 @@
 #define REINIT_H
 
 #include "common/relpath.h"
-
+#include "storage/smgr.h"
 
 extern void ResetUnloggedRelations(int op);
 extern bool parse_filename_for_nontemp_relation(const char *name,
 												int *relnumchars,
-												ForkNumber *fork);
+												ForkNumber *fork,
+												StorageMarks *mark);
 
 #define UNLOGGED_RELATION_CLEANUP		0x0001
-#define UNLOGGED_RELATION_INIT			0x0002
+#define UNLOGGED_RELATION_DROP_BUFFER	0x0002
+#define UNLOGGED_RELATION_INIT			0x0004
 
 #endif							/* REINIT_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 0935144f42..da6e0f3d64 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -18,6 +18,18 @@
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
 
+/*
+ * A storage mark is a file whose existence tells us something about a
+ * storage file. Such a file is named "<filename>.<mark>", where the mark is
+ * one of the values of StorageMarks. Because ".<digit>" denotes a segment
+ * file, don't use digits for the mark character.
+ */
+typedef enum StorageMarks
+{
+	SMGR_MARK_NONE = 0,
+	SMGR_MARK_UNCOMMITTED = 'u' /* the file is not committed yet */
+} StorageMarks;
+
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
  * cached file handles.  An SMgrRelation is created (if not already present)
@@ -87,7 +99,12 @@ extern void smgrcloseall(void);
 extern void smgrcloserellocator(RelFileLocatorBackend rlocator);
 extern void smgrrelease(SMgrRelation reln);
 extern void smgrreleaseall(void);
+extern void smgrcreatemark(SMgrRelation reln, ForkNumber forknum,
+						   StorageMarks mark, bool isRedo);
+extern void smgrunlinkmark(SMgrRelation reln, ForkNumber forknum,
+						   StorageMarks mark, bool isRedo);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl
index 92e7b367df..9def8d2062 100644
--- a/src/test/recovery/t/013_crash_restart.pl
+++ b/src/test/recovery/t/013_crash_restart.pl
@@ -86,6 +86,24 @@ ok( pump_until(
 $killme_stdout = '';
 $killme_stderr = '';
 
+# Create a table in a transaction that is left open, so that it does *not*
+# survive the crash.  It has rows because the table's contents are required
+# to cause access to the storage file after a restart.
+$killme_stdin .= q[
+BEGIN;
+CREATE TABLE not_alive AS SELECT 1 as a;
+SELECT pg_relation_filepath('not_alive');
+];
+ok( pump_until(
+		$killme,         $psql_timeout,
+		\$killme_stdout, qr/[[:alnum:]\/]+[\r\n]$/m),
+	'added in-creation table');
+my $not_alive_relfile = $node->data_dir . "/" . $killme_stdout;
+$not_alive_relfile =~ s/[\r\n]+\z//;    # chomp would leave a "\r" behind
+($killme_stdout, $killme_stderr) = ('', '');
+
+# The relfile must exist now
+ok ( -e $not_alive_relfile, 'relfile for in-creation table');
 
 # Start longrunning query in second session; its failure will signal that
 # crash-restart has occurred.  The initial wait for the trivial select is to
@@ -144,6 +162,9 @@ $killme->run();
 ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
 $monitor->run();
 
+# The relfile must have been removed due to the recent restart.
+ok ( ! -e $not_alive_relfile,
+	 'relfile for the in-creation table should be removed after restart');
 
 # Acquire pid of new backend
 $killme_stdin .= q[
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 86a9303bf5..e7ba5d2dc8 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1986,6 +1986,7 @@ PatternInfoArray
 Pattern_Prefix_Status
 Pattern_Type
 PendingFsyncEntry
+PendingMarkCleanup
 PendingRelDelete
 PendingRelSync
 PendingUnlinkEntry
@@ -2618,6 +2619,7 @@ StdRdOptIndexCleanup
 StdRdOptions
 Step
 StopList
+StorageMarks
 StrategyNumber
 StreamCtl
 String
@@ -3630,6 +3632,7 @@ registered_buffer
 regmatch_t
 regoff_t
 regproc
+relfile_entry
 relopt_bool
 relopt_enum
 relopt_enum_elt_def
@@ -3683,6 +3686,7 @@ slist_iter
 slist_mutable_iter
 slist_node
 slock_t
+smgr_mark_action
 socket_set
 socklen_t
 spgBulkDeleteState
@@ -3884,7 +3888,9 @@ xl_restore_point
 xl_running_xacts
 xl_seq_rec
 xl_smgr_create
+xl_smgr_mark
 xl_smgr_truncate
+xl_smgr_unlink
 xl_standby_lock
 xl_standby_locks
 xl_tblspc_create_rec
-- 
2.31.1

Reply via email to