On Wed, Feb 12, 2020 at 9:54 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> In commit 3eb77eba we made it possible for any subsystem that wants a
> file to be flushed as part of the next checkpoint to ask the
> checkpointer to do that, as previously only md.c could do.

Hello,

While working on recovery performance, I found my way back to this
idea and rebased the patch.

Problem statement:

Every time we have to write out a page of pg_commit_ts, pg_multixact
or pg_xact due to cache pressure, we immediately call fsync().  This
runs serially in the recovery process, and it's quite bad for
pg_commit_ts, because we need to dump out a page for every ~800
transactions (track_commit_timestamps is not enabled by default).  If
we ask the checkpointer to do it, it collapses the 2048 fsync calls
for each SLRU segment into one, and the kernel can write out the data
with larger I/Os, maybe even ahead of time, and update the inode only
once.

Experiment:

Run crash recovery for 1 million pgbench transactions:

  postgres -D pgdata \
    -c synchronous_commit=off \
    -c enable_commit_timestamps=on \
    -c max_wal_size=10GB \
    -c checkpoint_timeout=60min

  # in another shell
  pgbench -i -s10 postgres
  psql postgres -c checkpoint
  pgbench -t1000000 -Mprepared postgres
  killall -9 postgres

  # save the crashed pgdata dir for repeated experiments
  mv pgdata pgdata-save

  # now run experiments like this and see how long recovery takes
  rm -fr pgdata
  cp -r pgdata-save pgdata
  postgres -D pgdata

What I see on a system that has around 2.5ms latency for fsync:

  master: 16.83 seconds
  patched: 4.00 seconds

It's harder to see it without commit timestamps enabled since we only
need to flush a pg_xact page every 32k transactions (and multixacts
are more complicated to test), but you can still see the effect.  With
8x more transactions to make it clearer what's going on, I could
measure a speedup of around 6% from this patch, which I suppose scales
up fairly obviously with storage latency (every million transaction =
at least 30 fsyncs stalls, so you can multiply that by your fsync
latency and work out how much time your recovery process will be
asleep at the wheel instead of applying your records).

>From a syscall overhead point of view, it's a bit unfortunate that we
open and close SLRU segments every time we write, but it's probably
not really enough to complain about... except for the (small) risk of
an inode dropping out of kernel caches in the time between closing it
and the checkpointer opening it.  Hmm.
From 0fe8767316d5a973f43553080c3973759d83038d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Tue, 4 Aug 2020 17:57:18 +1200
Subject: [PATCH v2] Ask the checkpointer to flush SLRU files.

Previously, we called fsync() after writing out pg_xact, multixact and
commit_ts pages, leading to an I/O stall in user backends and recovery.
Ask the checkpointer to collapse requests for the same file into a
single system call as part of the next checkpoint, as we do for relation
files.

Discussion: https://postgr.es/m/CA+hUKGLJ=84yt+nvhkeedauutvhmfq9i-n7k_o50jmq6rpj...@mail.gmail.com
---
 src/backend/access/transam/clog.c      |  13 +++-
 src/backend/access/transam/commit_ts.c |  12 ++-
 src/backend/access/transam/multixact.c |  24 +++++-
 src/backend/access/transam/slru.c      | 101 +++++++++++++++++++------
 src/backend/access/transam/subtrans.c  |   4 +-
 src/backend/commands/async.c           |   5 +-
 src/backend/storage/lmgr/predicate.c   |   4 +-
 src/backend/storage/sync/sync.c        |  24 +++++-
 src/include/access/clog.h              |   3 +
 src/include/access/commit_ts.h         |   3 +
 src/include/access/multixact.h         |   4 +
 src/include/access/slru.h              |  12 ++-
 src/include/storage/sync.h             |   7 +-
 13 files changed, 172 insertions(+), 44 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index f3da40ae01..0c5b7a525e 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "storage/proc.h"
+#include "storage/sync.h"
 
 /*
  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
@@ -692,7 +693,8 @@ CLOGShmemInit(void)
 {
 	XactCtl->PagePrecedes = CLOGPagePrecedes;
 	SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
-				  XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER);
+				  XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
+				  SYNC_HANDLER_CLOG);
 }
 
 /*
@@ -1034,3 +1036,12 @@ clog_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "clog_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync clog files.
+ */
+int
+clogsyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(XactCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 903280ae92..b4edbdb4e3 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -555,7 +555,8 @@ CommitTsShmemInit(void)
 	CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
 	SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
 				  CommitTsSLRULock, "pg_commit_ts",
-				  LWTRANCHE_COMMITTS_BUFFER);
+				  LWTRANCHE_COMMITTS_BUFFER,
+				  SYNC_HANDLER_COMMIT_TS);
 
 	commitTsShared = ShmemInitStruct("CommitTs shared",
 									 sizeof(CommitTimestampShared),
@@ -1083,3 +1084,12 @@ commit_ts_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "commit_ts_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync commit_ts files.
+ */
+int
+committssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(CommitTsCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 475f5ed861..27ae2edbdc 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1831,11 +1831,13 @@ MultiXactShmemInit(void)
 	SimpleLruInit(MultiXactOffsetCtl,
 				  "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
 				  MultiXactOffsetSLRULock, "pg_multixact/offsets",
-				  LWTRANCHE_MULTIXACTOFFSET_BUFFER);
+				  LWTRANCHE_MULTIXACTOFFSET_BUFFER,
+				  SYNC_HANDLER_MULTIXACT_OFFSET);
 	SimpleLruInit(MultiXactMemberCtl,
 				  "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
 				  MultiXactMemberSLRULock, "pg_multixact/members",
-				  LWTRANCHE_MULTIXACTMEMBER_BUFFER);
+				  LWTRANCHE_MULTIXACTMEMBER_BUFFER,
+				  SYNC_HANDLER_MULTIXACT_MEMBER);
 
 	/* Initialize our shared state struct */
 	MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -3386,3 +3388,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS)
 
 	SRF_RETURN_DONE(funccxt);
 }
+
+/*
+ * Entrypoint for sync.c to sync offsets files.
+ */
+int
+multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(MultiXactOffsetCtl, ftag, path);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+multixactmemberssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(MultiXactMemberCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 9e145f1c36..db1243772f 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -80,6 +80,18 @@ typedef struct SlruFlushData
 
 typedef struct SlruFlushData *SlruFlush;
 
+/*
+ * Populate a file tag describing a segment file.  We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
+( \
+	memset(&(a), 0, sizeof(FileTag)), \
+	(a).handler = (xx_handler), \
+	(a).segno = (xx_segno) \
+)
+
 /*
  * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
  * of arguments!
@@ -173,7 +185,8 @@ SimpleLruShmemSize(int nslots, int nlsns)
  */
 void
 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-			  LWLock *ctllock, const char *subdir, int tranche_id)
+			  LWLock *ctllock, const char *subdir, int tranche_id,
+			  SyncRequestHandler sync_handler)
 {
 	SlruShared	shared;
 	bool		found;
@@ -251,7 +264,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 	 * assume caller set PagePrecedes.
 	 */
 	ctl->shared = shared;
-	ctl->do_fsync = true;		/* default behavior */
+	ctl->sync_handler = sync_handler;
 	StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir));
 }
 
@@ -870,23 +883,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	}
 	pgstat_report_wait_end();
 
-	/*
-	 * If not part of Flush, need to fsync now.  We assume this happens
-	 * infrequently enough that it's not a performance issue.
-	 */
-	if (!fdata)
+	/* Queue up a sync request for the checkpointer. */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
-		if (ctl->do_fsync && pg_fsync(fd) != 0)
+		FileTag		tag;
+
+		INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+		if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
 		{
+			/* No space to enqueue sync request.  Do it synchronously. */
+			pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
+			if (pg_fsync(fd) != 0)
+			{
+				pgstat_report_wait_end();
+				slru_errcause = SLRU_FSYNC_FAILED;
+				slru_errno = errno;
+				CloseTransientFile(fd);
+				return false;
+			}
 			pgstat_report_wait_end();
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			CloseTransientFile(fd);
-			return false;
 		}
-		pgstat_report_wait_end();
+	}
 
+	/* Close file, unless part of flush request. */
+	if (!fdata)
+	{
 		if (CloseTransientFile(fd) != 0)
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1162,21 +1183,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 	LWLockRelease(shared->ControlLock);
 
 	/*
-	 * Now fsync and close any files that were open
+	 * Now close any files that were open
 	 */
 	ok = true;
 	for (i = 0; i < fdata.num_files; i++)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
-		if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0)
-		{
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
-			ok = false;
-		}
-		pgstat_report_wait_end();
-
 		if (CloseTransientFile(fdata.fd[i]) != 0)
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1295,6 +1306,7 @@ SlruDeleteSegment(SlruCtl ctl, int segno)
 	int			slotno;
 	char		path[MAXPGPATH];
 	bool		did_write;
+	FileTag		tag;
 
 	/* Clean out any possibly existing references to the segment. */
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -1338,6 +1350,17 @@ restart:
 	snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
 	ereport(DEBUG2,
 			(errmsg("removing file \"%s\"", path)));
+
+	/*
+	 * Tell the checkpointer to forget any sync requests, before we unlink the
+	 * file.
+	 */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
+	{
+		INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+		RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+	}
+
 	unlink(path);
 
 	LWLockRelease(shared->ControlLock);
@@ -1436,3 +1459,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 
 	return retval;
 }
+
+/*
+ * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
+ * that they can provide the correct "SlruCtl" (otherwise we don't know how to
+ * build the path), but they just forward to this common implementation that
+ * performs the fsync.
+ */
+int
+slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path)
+{
+	int			fd;
+	int			save_errno;
+	int			result;
+
+	SlruFileName(ctl, path, ftag->segno);
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd < 0)
+		return -1;
+
+	result = pg_fsync(fd);
+	save_errno = errno;
+
+	CloseTransientFile(fd);
+
+	errno = save_errno;
+	return result;
+}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index f33ae407a6..68568f18dc 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -193,9 +193,7 @@ SUBTRANSShmemInit(void)
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
 	SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
 				  SubtransSLRULock, "pg_subtrans",
-				  LWTRANCHE_SUBTRANS_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	SubTransCtl->do_fsync = false;
+				  LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
 }
 
 /*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 71b7577afc..d0a40b0556 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -554,9 +554,8 @@ AsyncShmemInit(void)
 	 */
 	NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
 	SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
-				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	NotifyCtl->do_fsync = false;
+				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
+				  SYNC_HANDLER_NONE);
 
 	if (!found)
 	{
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d24919f76b..579201b10d 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -821,9 +821,7 @@ SerialInit(void)
 	SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
 	SimpleLruInit(SerialSlruCtl, "Serial",
 				  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
-				  LWTRANCHE_SERIAL_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	SerialSlruCtl->do_fsync = false;
+				  LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
 
 	/*
 	 * Create or attach to the SerialControl structure.
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 3ded2cdd71..64107fd732 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -18,6 +18,9 @@
 #include <fcntl.h>
 #include <sys/file.h>
 
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
@@ -90,13 +93,32 @@ typedef struct SyncOps
 										const FileTag *candidate);
 } SyncOps;
 
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
 static const SyncOps syncsw[] = {
 	/* magnetic disk */
 	{
 		.sync_syncfiletag = mdsyncfiletag,
 		.sync_unlinkfiletag = mdunlinkfiletag,
 		.sync_filetagmatches = mdfiletagmatches
-	}
+	},
+	/* pg_xact */
+	{
+		.sync_syncfiletag = clogsyncfiletag
+	},
+	/* pg_commit_ts */
+	{
+		.sync_syncfiletag = committssyncfiletag
+	},
+	/* pg_multixact/offsets */
+	{
+		.sync_syncfiletag = multixactoffsetssyncfiletag
+	},
+	/* pg_multixact/members */
+	{
+		.sync_syncfiletag = multixactmemberssyncfiletag
+	},
 };
 
 /*
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 2db8acb189..d97b9042dc 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -12,6 +12,7 @@
 #define CLOG_H
 
 #include "access/xlogreader.h"
+#include "storage/sync.h"
 #include "lib/stringinfo.h"
 
 /*
@@ -50,6 +51,8 @@ extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
 
+extern int clogsyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define CLOG_ZEROPAGE		0x00
 #define CLOG_TRUNCATE		0x10
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index 2740c02a84..27900ce430 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -14,6 +14,7 @@
 #include "access/xlog.h"
 #include "datatype/timestamp.h"
 #include "replication/origin.h"
+#include "storage/sync.h"
 #include "utils/guc.h"
 
 
@@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact,
 							 TransactionId newestXact);
 extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
 
+extern int committssyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define COMMIT_TS_ZEROPAGE		0x00
 #define COMMIT_TS_TRUNCATE		0x10
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 6d729008c6..71d6e78063 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -13,6 +13,7 @@
 
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
+#include "storage/sync.h"
 
 
 /*
@@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
 										MultiXactId multi2);
 
+extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
+extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path);
+
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
 extern void PostPrepare_MultiXact(TransactionId xid);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 61fbc80ef0..2720284157 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -15,6 +15,7 @@
 
 #include "access/xlogdefs.h"
 #include "storage/lwlock.h"
+#include "storage/sync.h"
 
 
 /*
@@ -111,10 +112,10 @@ typedef struct SlruCtlData
 	SlruShared	shared;
 
 	/*
-	 * This flag tells whether to fsync writes (true for pg_xact and multixact
-	 * stuff, false for pg_subtrans and pg_notify).
+	 * Which sync handler function to use when handing sync requests over to
+	 * the checkpointer.  SYNC_HANDLER_NONE to disable fsync (eg pg_notify).
 	 */
-	bool		do_fsync;
+	SyncRequestHandler sync_handler;
 
 	/*
 	 * Decide which of two page numbers is "older" for truncation purposes. We
@@ -135,7 +136,8 @@ typedef SlruCtlData *SlruCtl;
 
 extern Size SimpleLruShmemSize(int nslots, int nlsns);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-						  LWLock *ctllock, const char *subdir, int tranche_id);
+						  LWLock *ctllock, const char *subdir, int tranche_id,
+						  SyncRequestHandler sync_handler);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
 extern int	SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 							  TransactionId xid);
@@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
 extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
 extern void SlruDeleteSegment(SlruCtl ctl, int segno);
 
+extern int slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path);
+
 /* SlruScanDirectory public callbacks */
 extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
 										int segpage, void *data);
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index e16ab8e711..f32e412e75 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -34,7 +34,12 @@ typedef enum SyncRequestType
  */
 typedef enum SyncRequestHandler
 {
-	SYNC_HANDLER_MD = 0			/* md smgr */
+	SYNC_HANDLER_MD = 0,
+	SYNC_HANDLER_CLOG,
+	SYNC_HANDLER_COMMIT_TS,
+	SYNC_HANDLER_MULTIXACT_OFFSET,
+	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
 /*
-- 
2.20.1

Reply via email to