On 2014-08-27 11:14:46 +0200, Andres Freund wrote:
> On 2014-08-27 11:05:52 +0200, Fabien COELHO wrote:
> > I can test a couple of patches. I already did one on someone's advice (make
> > bgwriter round all stuff in 1s instead of 120s), without positive effect.
> 
> I've quickly cobbled together the attached patch (which at least doesn't
> seem to crash & burn). It tries to get pages flushed out during the
> paced phase of checkpoints instead of during the fsync phase. The
> sync_on_checkpoint_flush setting can be used to enable/disable that behaviour.
> 
> I'd be interested to hear whether that improves your latency numbers. I
> unfortunately don't have more time to spend on this right now :(.

And now actually attached. Note that it's Linux-only...

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 4576970..eab30a6 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -78,6 +78,7 @@ bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
 double		bgwriter_lru_multiplier = 2.0;
 bool		track_io_timing = false;
+bool		sync_on_checkpoint_flush = false;
 
 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
@@ -356,7 +357,7 @@ static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(volatile BufferDesc *buf);
 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
-static int	SyncOneBuffer(int buf_id, bool skip_recently_used);
+static int	SyncOneBuffer(int buf_id, bool skip_recently_used, bool flush_to_disk);
 static void WaitIO(volatile BufferDesc *buf);
 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
@@ -369,7 +370,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
 			BlockNumber blockNum,
 			BufferAccessStrategy strategy,
 			bool *foundPtr);
-static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
+static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln, bool flush_to_disk);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int	rnode_comparator(const void *p1, const void *p2);
@@ -947,7 +948,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 												smgr->smgr_rnode.node.dbNode,
 											  smgr->smgr_rnode.node.relNode);
 
-				FlushBuffer(buf, NULL);
+				FlushBuffer(buf, NULL, false);
 				LWLockRelease(buf->content_lock);
 
 				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
@@ -1573,7 +1574,7 @@ BufferSync(int flags)
 		 */
 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
 		{
-			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
+			if (SyncOneBuffer(buf_id, false, true) & BUF_WRITTEN)
 			{
 				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
 				BgWriterStats.m_buf_written_checkpoints++;
@@ -1850,7 +1851,7 @@ BgBufferSync(void)
 	/* Execute the LRU scan */
 	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
 	{
-		int			buffer_state = SyncOneBuffer(next_to_clean, true);
+		int			buffer_state = SyncOneBuffer(next_to_clean, true, false);
 
 		if (++next_to_clean >= NBuffers)
 		{
@@ -1927,7 +1928,7 @@ BgBufferSync(void)
  * Note: caller must have done ResourceOwnerEnlargeBuffers.
  */
 static int
-SyncOneBuffer(int buf_id, bool skip_recently_used)
+SyncOneBuffer(int buf_id, bool skip_recently_used, bool flush_to_disk)
 {
 	volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
 	int			result = 0;
@@ -1966,7 +1967,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	PinBuffer_Locked(bufHdr);
 	LWLockAcquire(bufHdr->content_lock, LW_SHARED);
 
-	FlushBuffer(bufHdr, NULL);
+	FlushBuffer(bufHdr, NULL, flush_to_disk);
 
 	LWLockRelease(bufHdr->content_lock);
 	UnpinBuffer(bufHdr, true);
@@ -2230,7 +2231,7 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
  * as the second parameter.  If not, pass NULL.
  */
 static void
-FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
+FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln, bool flush_to_disk)
 {
 	XLogRecPtr	recptr;
 	ErrorContextCallback errcallback;
@@ -2342,6 +2343,10 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 									   reln->smgr_rnode.node.spcNode,
 									   reln->smgr_rnode.node.dbNode,
 									   reln->smgr_rnode.node.relNode);
+	if (flush_to_disk && sync_on_checkpoint_flush)
+		smgrflush(reln,
+				  buf->tag.forkNum,
+				  buf->tag.blockNum);
 
 	/* Pop the error context stack */
 	error_context_stack = errcallback.previous;
@@ -2769,7 +2774,7 @@ FlushRelationBuffers(Relation rel)
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
-			FlushBuffer(bufHdr, rel->rd_smgr);
+			FlushBuffer(bufHdr, rel->rd_smgr, false);
 			LWLockRelease(bufHdr->content_lock);
 			UnpinBuffer(bufHdr, true);
 		}
@@ -2819,7 +2824,7 @@ FlushDatabaseBuffers(Oid dbid)
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
-			FlushBuffer(bufHdr, NULL);
+			FlushBuffer(bufHdr, NULL, false);
 			LWLockRelease(bufHdr->content_lock);
 			UnpinBuffer(bufHdr, true);
 		}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1f69c9e..fead234 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1417,6 +1417,32 @@ retry:
 }
 
 int
+FileFlush(File file, off_t offset, int amount)	/* ask for "amount" bytes at "offset" of "file" to be written out */
+{
+#ifdef __linux__				/* sync_file_range() is only used when building on Linux */
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileFlush: %d (%s) " INT64_FORMAT " %d",
+			   file, VfdCache[file].fileName,
+			   (int64) offset, amount));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;		/* negative FileAccess() result is handed back unchanged */
+
+	returnCode = sync_file_range(VfdCache[file].fd, offset, amount,
+								 SYNC_FILE_RANGE_WRITE);	/* only the WRITE flag is passed: no wait flags */
+
+	return returnCode;			/* caller sees the raw sync_file_range() result */
+#else							/* every other platform: nothing is flushed, success is reported */
+	Assert(FileIsValid(file));
+	return 0;
+#endif
+}
+
+int
 FileSync(File file)
 {
 	int			returnCode;
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 167d61c..54cefea 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -800,6 +800,28 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
+ *	mdflush() -- Request write-out of one block (BLCKSZ bytes) of a relation fork via FileFlush()
+ */
+void
+mdflush(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	off_t		seekpos;
+	MdfdVec    *v;
+
+	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);	/* NOTE(review): presumably the segment holding blocknum -- confirm */
+
+	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));	/* byte offset of the block inside that segment file */
+
+	if (FileFlush(v->mdfd_vfd, seekpos, BLCKSZ) < 0)	/* negative return is treated as failure */
+	{
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not flush block %u in file \"%s\": %m",
+						blocknum, FilePathName(v->mdfd_vfd))));
+	}
+}
+
+/*
  *	mdnblocks() -- Get the number of blocks stored in a relation.
  *
  *		Important side effect: all active segments of the relation are opened
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d16f559..bf6bc70 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -53,6 +53,8 @@ typedef struct f_smgr
 										  BlockNumber blocknum, char *buffer);
 	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, char *buffer, bool skipFsync);
+	void		(*smgr_flush) (SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum);
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 											  BlockNumber nblocks);
@@ -66,7 +68,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
-		mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
+		mdprefetch, mdread, mdwrite, mdflush, mdnblocks, mdtruncate, mdimmedsync,
 		mdpreckpt, mdsync, mdpostckpt
 	}
 };
@@ -651,6 +653,16 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
+ *	smgrflush() -- Forward a flush request for one block to the relation's storage manager
+ */
+void
+smgrflush(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	(*(smgrsw[reln->smgr_which].smgr_flush)) (reln, forknum, blocknum);	/* dispatch through the smgrsw table entry selected by smgr_which */
+}
+
+
+/*
  *	smgrnblocks() -- Calculate the number of blocks in the
  *					 supplied relation.
  */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a8a17c2..0316b52 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1083,6 +1083,16 @@ static struct config_bool ConfigureNamesBool[] =
 	},
 
 	{
+		{"sync_on_checkpoint_flush", PGC_POSTMASTER, RESOURCES_ASYNCHRONOUS,
+			gettext_noop("preflush data during checkpoints."),
+			NULL
+		},
+		&sync_on_checkpoint_flush,
+		false,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"update_process_title", PGC_SUSET, STATS_COLLECTOR,
 			gettext_noop("Updates the process title to show the active SQL command."),
 			gettext_noop("Enables updating of the process title every time a new SQL command is received by the server.")
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 42d9120..ba6a4b1 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -52,6 +52,7 @@ extern int	bgwriter_lru_maxpages;
 extern double bgwriter_lru_multiplier;
 extern bool track_io_timing;
 extern int	target_prefetch_pages;
+extern bool	sync_on_checkpoint_flush;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index a6df8fb..f7dff39 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -71,6 +71,7 @@ extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, int amount);
 extern int	FileRead(File file, char *buffer, int amount);
 extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileFlush(File file, off_t offset, int amount);
 extern int	FileSync(File file);
 extern off_t FileSeek(File file, off_t offset, int whence);
 extern int	FileTruncate(File file, off_t offset);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index ba7c909..6cd71ac 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -96,6 +96,9 @@ extern void smgrread(SMgrRelation reln, ForkNumber forknum,
 		 BlockNumber blocknum, char *buffer);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 		  BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrflush(SMgrRelation reln, ForkNumber forknum,
+		  BlockNumber blocknum);
+
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
@@ -122,6 +125,8 @@ extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	   char *buffer);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
 		BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdflush(SMgrRelation reln, ForkNumber forknum,
+		BlockNumber blocknum);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to