I have created the fsync v2 patch. There is not much time left, so I will try to focus on the fsync patch in this CommitFest, as advised by Heikki. I am sorry for diverging from the main discussion in this CommitFest... Of course, I will continue to work on other improvements as well.

* Changes
- Added a ckpt_flags argument to mdsync() etc., with reference to Heikki's patch. This makes mdsync() more controllable during a checkpoint.
- Too long a sleep in the fsync phase is not good for the checkpoint schedule, so I limited the sleep time to at most 10 seconds (MAX_FSYNC_SLEEP). I think a 10-second limit is a suitable value in a variety of situations. I also considered limiting the sleep time according to checkpoint progress; however, I think md.c should stay simple and robust, so I kept it simple.
- The maximum of checkpointer_fsync_delay_ratio in guc.c is changed from 1 to 2. Because the sleep time is now limited to 10 seconds, we can change it more flexibly and more safely.

I also considered whether any of the parameters in my fsync patch could be omitted.
 * checkpointer_fsync_delay_threshold
   In general, I think about 1 second is suitable in a variety of environments.
   If we want to adjust the sleep time in the fsync phase, we can change checkpointer_fsync_delay_ratio instead.

 * checkpointer_fsync_delay_ratio
   I don't want to omit this parameter, because it is the only parameter that can regulate the sleep time in the fsync phase and the checkpoint time.


* Benchmark Result(DBT-2)
                         | NOTPM    Average  90%tile  Maximum
 ------------------------+----------------------------------------
 original_0.7 (baseline) | 3610.42  4.556    10.9180  23.1326
 fsync v1                | 3685.51  4.036     9.2017  17.5594
 fsync v2                | 3748.80  3.562     8.1871  17.5101

I'm not sure about this result; the fsync v2 patch looks too good. Of course, I did not do anything special when running the benchmark. Please see checkpoint_time.txt, which records the details of each checkpoint. With the fsync v2 patch, each checkpoint time seems to be shorter.


* Benchmark Setting
 [postgresql.conf]
  archive_mode = on
  archive_command = '/bin/cp %p /pgdata/pgarch/arc_dbt2/%f'
  synchronous_commit = on
  max_connections = 300
  shared_buffers = 2458MB
  work_mem = 1MB
  fsync = on
  wal_sync_method = fdatasync
  full_page_writes = on
  checkpoint_segments = 300
  checkpoint_timeout = 15min
  checkpoint_completion_target = 0.7
  segsize=1GB(default)

 [patched postgresql.conf (add)]
  checkpointer_fsync_delay_ratio = 1
  checkpointer_fsync_delay_threshold = 1000ms

 [DBT-2 driver settings]
  SESSION:250
  WH:340
  TPW:10
  PRETEST_DURATION: 1800
  TEST_DURATION: 1800


* Test Server
  Server: HP Proliant DL360 G7
  CPU:    Xeon E5640 2.66GHz (1P/4C)
  Memory: 18GB(PC3-10600R-9)
  Disk:   146GB(15k)*4 RAID1+0
  RAID controller: P410i/256MB
  (Added) Turned off the energy-efficiency functions in the BIOS and OS.

Best regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..2b223e9 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -143,14 +143,16 @@ static CheckpointerShmemStruct *CheckpointerShmem;
  */
 int			CheckPointTimeout = 300;
 int			CheckPointWarning = 30;
+int			CheckPointerFsyncDelayThreshold = -1;
 double		CheckPointCompletionTarget = 0.5;
+double		CheckPointerFsyncDelayRatio = 0.0;
 
 /*
  * Flags set by interrupt handlers for later service in the main loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
-static volatile sig_atomic_t shutdown_requested = false;
+extern volatile sig_atomic_t checkpoint_requested = false;
+extern volatile sig_atomic_t shutdown_requested = false;
 
 /*
  * Private state
@@ -169,7 +171,6 @@ static pg_time_t last_xlog_switch_time;
 
 static void CheckArchiveTimeout(void);
 static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
 
@@ -643,7 +644,7 @@ CheckArchiveTimeout(void)
  * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
  * there is one pending behind it.)
  */
-static bool
+extern bool
 ImmediateCheckpointRequested(void)
 {
 	if (checkpoint_requested)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8079226..3f02d0b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1828,7 +1828,7 @@ CheckPointBuffers(int flags)
 	BufferSync(flags);
 	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-	smgrsync();
+	smgrsync(flags);
 	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e629181..d762511 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -21,6 +21,7 @@
  */
 #include "postgres.h"
 
+#include <signal.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/file.h>
@@ -44,6 +45,9 @@
 #define FSYNCS_PER_ABSORB		10
 #define UNLINKS_PER_ABSORB		10
 
+/* Protect too long sleep in each file fsync. */
+#define MAX_FSYNC_SLEEP		10000
+
 /*
  * Special values for the segno arg to RememberFsyncRequest.
  *
@@ -162,6 +166,8 @@ static List *pendingUnlinks = NIL;
 static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
 
+extern volatile sig_atomic_t checkpoint_requested;
+extern volatile sig_atomic_t shutdown_requested;
 
 typedef enum					/* behavior for mdopen & _mdfd_getseg */
 {
@@ -235,7 +241,7 @@ SetForwardFsyncRequests(void)
 	/* Perform any pending fsyncs we may have queued up, then drop table */
 	if (pendingOpsTable)
 	{
-		mdsync();
+		mdsync(CHECKPOINT_IMMEDIATE);
 		hash_destroy(pendingOpsTable);
 	}
 	pendingOpsTable = NULL;
@@ -974,7 +980,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
  *	mdsync() -- Sync previous writes to stable storage.
  */
 void
-mdsync(void)
+mdsync(int ckpt_flags)
 {
 	static bool mdsync_in_progress = false;
 
@@ -1171,6 +1177,28 @@ mdsync(void)
 								 FilePathName(seg->mdfd_vfd),
 								 (double) elapsed / 1000);
 
+						/*
+						 * If this fsync has long time, we sleep 'fsync-time * checkpoint_fsync_delay_ratio'
+						 * for giving priority to executing transaction.
+						 */
+						if(CheckPointerFsyncDelayThreshold >= 0 &&
+							CheckPointerFsyncDelayRatio > 0 &&
+							!shutdown_requested &&
+							!ImmediateCheckpointRequested() &&
+							!(ckpt_flags & CHECKPOINT_FORCE) &&
+							!(ckpt_flags & CHECKPOINT_END_OF_RECOVERY) &&
+							(elapsed / 1000 > CheckPointerFsyncDelayThreshold))
+						{
+							double fsync_sleep = (elapsed / 1000) * CheckPointerFsyncDelayRatio;
+
+							/* Too long sleep is not good for checkpoint scheduler */
+							if(fsync_sleep > MAX_FSYNC_SLEEP)
+								fsync_sleep = MAX_FSYNC_SLEEP;
+							pg_usleep(fsync_sleep * 1000L);
+							if(log_checkpoints)
+								elog(DEBUG1, "checkpoint sync sleep: time=%.3f msec",
+									fsync_sleep);
+						}
 						break;	/* out of retry loop */
 					}
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index f7f1437..bc07b03 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -58,7 +58,7 @@ typedef struct f_smgr
 											  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_pre_ckpt) (void);		/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
+	void		(*smgr_sync) (int ckpt_flags);	/* may be NULL */
 	void		(*smgr_post_ckpt) (void);		/* may be NULL */
 } f_smgr;
 
@@ -708,14 +708,18 @@ smgrpreckpt(void)
  *	smgrsync() -- Sync files to disk during checkpoint.
  */
 void
-smgrsync(void)
+smgrsync(int ckpt_flags)
 {
 	int			i;
 
+	/*
+	 * XXX: If we ever have more than one smgr, the remaining progress
+	 * should somehow be divided among all smgrs.
+	 */
 	for (i = 0; i < NSmgr; i++)
 	{
 		if (smgrsw[i].smgr_sync)
-			(*(smgrsw[i].smgr_sync)) ();
+			(*(smgrsw[i].smgr_sync)) (ckpt_flags);
 	}
 }
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..a240c43 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"checkpointer_fsync_delay_threshold", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+			gettext_noop("If a file fsync time over this threshold, checkpointer sleep file_fsync_time * checkpointer_fsync_delay_ratio."),
+			NULL,
+			GUC_UNIT_MS
+		},
+		&CheckPointerFsyncDelayThreshold,
+		-1, -1, 1000000,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
 			gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
 			NULL,
@@ -2551,6 +2562,16 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"checkpointer_fsync_delay_ratio", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+		gettext_noop("checkpointer sleep time during file fsync in checkpoint."),
+		NULL
+		},
+		&CheckPointerFsyncDelayRatio,
+		0.0, 0.0, 2.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..707b433 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,6 +186,8 @@
 #checkpoint_timeout = 5min		# range 30s-1h
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
 #checkpoint_warning = 30s		# 0 disables
+#checkpointer_fsync_delay_ratio = 0.0	# range 0.0 - 2.0
+#checkpointer_fsync_delay_threshold = -1 	# range 0 - 1000000 milliseconds; -1 disables
 
 # - Archiving -
 
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..a02ba1f 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -23,7 +23,9 @@
 extern int	BgWriterDelay;
 extern int	CheckPointTimeout;
 extern int	CheckPointWarning;
+extern int	CheckPointerFsyncDelayThreshold;
 extern double CheckPointCompletionTarget;
+extern double CheckPointerFsyncDelayRatio;
 
 extern void BackgroundWriterMain(void) __attribute__((noreturn));
 extern void CheckpointerMain(void) __attribute__((noreturn));
@@ -31,6 +33,7 @@ extern void CheckpointerMain(void) __attribute__((noreturn));
 extern void RequestCheckpoint(int flags);
 extern void CheckpointWriteDelay(int flags, double progress);
 
+extern bool ImmediateCheckpointRequested(void);
 extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
 					BlockNumber segno);
 extern void AbsorbFsyncRequests(void);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 98b6f13..f796ab7 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -100,7 +100,7 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void smgrpreckpt(void);
-extern void smgrsync(void);
+extern void smgrsync(int ckpt_flags);
 extern void smgrpostckpt(void);
 extern void AtEOXact_SMgr(void);
 
@@ -126,7 +126,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void mdpreckpt(void);
-extern void mdsync(void);
+extern void mdsync(int ckpt_flags);
 extern void mdpostckpt(void);
 
 extern void SetForwardFsyncRequests(void);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
 	RESOURCES_KERNEL,
 	RESOURCES_VACUUM_DELAY,
 	RESOURCES_BGWRITER,
+	RESOURCES_CHECKPOINTER,
 	RESOURCES_ASYNCHRONOUS,
 	WAL,
 	WAL_SETTINGS,
[normal(baseline)]
 instid |           start            | flags | num_buffers | xlog_added | 
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-------+-------------+------------+--------------+---------------+----------------+---------------+----------------
     70 | 2013-07-08 11:27:40.542+09 | xlog  |         280 |          0 |       
     0 |             0 |         28.257 |         2.392 |         31.091
     70 | 2013-07-08 11:29:39.612+09 | xlog  |         192 |          0 |       
     0 |           300 |         19.284 |          3.55 |         22.898
     70 | 2013-07-08 11:31:09.955+09 | xlog  |         180 |          0 |       
     0 |           300 |          18.83 |         1.694 |         20.778
     70 | 2013-07-08 11:32:36.318+09 | xlog  |         176 |          0 |       
     0 |           300 |           17.7 |         2.456 |          20.31
     70 | 2013-07-08 11:34:13.438+09 | xlog  |         305 |          0 |       
     0 |           300 |         30.931 |         0.202 |         31.265
     70 | 2013-07-08 11:42:13.719+09 | xlog  |       18835 |          0 |       
     0 |           300 |        317.425 |        125.49 |        452.053
     70 | 2013-07-08 11:57:13.853+09 | time  |       84425 |          0 |       
     0 |           300 |        254.573 |       168.103 |        432.488
     70 | 2013-07-08 12:12:14.743+09 | time  |       88956 |          0 |       
     0 |           254 |        230.688 |       229.129 |        468.879
     70 | 2013-07-08 12:27:13.559+09 | time  |       85990 |          0 |       
     0 |           291 |          255.5 |       194.442 |        452.612
     70 | 2013-07-08 12:42:13.563+09 | time  |      137667 |          0 |       
     0 |           294 |        500.112 |        36.581 |        537.435

[fsync v1]
 instid |           start            | flags | num_buffers | xlog_added | 
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-------+-------------+------------+--------------+---------------+----------------+---------------+----------------
     64 | 2013-07-05 18:19:36.447+09 | xlog  |         277 |          0 |       
     0 |             0 |          28.19 |        11.978 |         40.263
     64 | 2013-07-05 18:21:33.651+09 | xlog  |         177 |          0 |       
     0 |           300 |         17.894 |          0.29 |           18.2
     64 | 2013-07-05 18:23:12.687+09 | xlog  |         190 |          0 |       
     0 |           300 |         21.539 |         0.561 |         22.835
     64 | 2013-07-05 18:24:44.243+09 | xlog  |         176 |          0 |       
     0 |           300 |          17.81 |         8.655 |          26.53
     64 | 2013-07-05 18:26:27.906+09 | xlog  |         315 |          0 |       
     0 |           300 |         32.572 |          2.19 |         34.813
     64 | 2013-07-05 18:34:26.008+09 | xlog  |       17092 |          0 |       
     0 |           300 |        304.681 |       175.806 |        485.226
     64 | 2013-07-05 18:49:26.017+09 | time  |       82336 |          0 |       
     0 |           300 |         278.16 |       246.004 |        528.054
     64 | 2013-07-05 19:04:26.9+09   | time  |       85040 |          0 |       
     0 |           253 |        236.266 |        335.98 |        577.039
     64 | 2013-07-05 19:19:26.233+09 | time  |       84745 |          0 |       
     0 |           292 |        189.681 |       436.045 |        630.238
     64 | 2013-07-05 19:34:27.037+09 | time  |      137777 |          0 |       
     0 |           294 |        504.566 |        73.033 |        580.061

[fsync v2]
 instid |           start            |   flags   | num_buffers | xlog_added | 
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-----------+-------------+------------+--------------+---------------+----------------+---------------+----------------
     75 | 2013-07-08 15:50:41.166+09 | xlog      |         284 |          0 |   
         0 |             0 |         28.748 |         4.742 |         33.638
     75 | 2013-07-08 15:52:30.394+09 | xlog      |         176 |          0 |   
         0 |           300 |         17.935 |         0.917 |         18.898
     75 | 2013-07-08 15:54:06.255+09 | xlog      |         303 |          0 |   
         0 |           300 |         30.429 |         2.113 |         32.638
     75 | 2013-07-08 15:55:44.686+09 | xlog      |         216 |          0 |   
         0 |           300 |         22.043 |         4.063 |         26.165
     75 | 2013-07-08 15:57:12.866+09 | xlog      |         179 |          0 |   
         0 |           300 |           18.2 |         8.008 |         26.459
     75 | 2013-07-08 16:05:13.214+09 | xlog      |       16063 |          0 |   
         0 |           300 |        313.357 |       127.779 |        446.138
     75 | 2013-07-08 16:20:13.94+09  | time      |       78018 |          0 |   
         0 |           300 |        194.376 |        326.53 |        527.111
     75 | 2013-07-08 16:35:13.86+09  | time      |       92081 |          0 |   
         0 |           249 |         189.26 |       359.835 |        563.836
     75 | 2013-07-08 16:50:13.184+09 | xlog time |       87794 |          0 |   
         0 |           295 |        206.032 |       366.987 |        577.932
     75 | 2013-07-08 17:05:13.188+09 | time      |      136923 |          0 |   
         0 |           300 |        505.149 |        21.488 |        529.828
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to