(2013/06/12 23:07), Robert Haas wrote:
On Mon, Jun 10, 2013 at 3:48 PM, Simon Riggs <si...@2ndquadrant.com> wrote:
On 10 June 2013 11:51, KONDO Mitsumasa <kondo.mitsum...@lab.ntt.co.jp> wrote:
I create patch which is improvement of checkpoint IO scheduler for stable
transaction responses.

Looks like good results, with good measurements. Should be an
interesting discussion.

+1.

I suspect we want to poke at the algorithms a little here and maybe
see if we can do this without adding new GUCs.  Also, I think this is
probably two separate patches, in the end.  But the direction seems
good to me.
Thank you for your comments!

I have separated my patch into a checkpoint-write patch and a checkpoint-fsync
patch. As you say, my patch adds a lot of new GUCs. I don't think it is
impossible to determine these settings automatically. However, it is difficult
to make a checkpoint scheduler that suits all environments, such as virtual
servers, public cloud servers, embedded servers, etc. So I think the default
parameter settings should behave the same as before. Setting the parameters by hand
is primitive and difficult, but if they are set correctly, the scheduler suits many environments and will not behave in unintended ways.

I will try to think about a version with fewer GUCs. If you have a good idea, please share it so we can discuss it!

Best Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..0c0f215 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -141,9 +141,12 @@ static CheckpointerShmemStruct *CheckpointerShmem;
 /*
  * GUC parameters
  */
+int			CheckPointerWriteDelay = 200;
 int			CheckPointTimeout = 300;
 int			CheckPointWarning = 30;
 double		CheckPointCompletionTarget = 0.5;
+double		CheckPointSmoothTarget = 0.0;
+double		CheckPointSmoothMargin = 0.0;
 
 /*
  * Flags set by interrupt handlers for later service in the main loop.
@@ -715,7 +718,7 @@ CheckpointWriteDelay(int flags, double progress)
 		 * Checkpointer and bgwriter are no longer related so take the Big
 		 * Sleep.
 		 */
-		pg_usleep(100000L);
+		pg_usleep(CheckPointerWriteDelay * 1000L);
 	}
 	else if (--absorb_counter <= 0)
 	{
@@ -742,14 +745,36 @@ IsCheckpointOnSchedule(double progress)
 {
 	XLogRecPtr	recptr;
 	struct timeval now;
-	double		elapsed_xlogs,
+	double		original_progress,
+			elapsed_xlogs,
 				elapsed_time;
 
 	Assert(ckpt_active);
 
-	/* Scale progress according to checkpoint_completion_target. */
-	progress *= CheckPointCompletionTarget;
+	/* This variable is used by smooth checkpoint schedule.*/
+	original_progress = progress * CheckPointCompletionTarget;
 
+	/* Scale progress according to checkpoint_completion_target and checkpoint_smooth_target. */
+	if(progress >= CheckPointSmoothTarget)
+	{
+		/* Normal checkpoint schedule. */
+		progress *= CheckPointCompletionTarget;
+	}
+	else
+	{
+		/*
+		 * Smooth checkpoint schedule.
+		 *
+		 * At the start of a checkpoint, the IO load average tends to be high
+		 * and transactions execute slowly. This schedule reduces both and
+		 * improves IO response. As 'progress' approaches CheckPointSmoothTarget,
+		 * the schedule converges to the normal checkpoint schedule. For a
+		 * smoother checkpoint schedule, set a higher CheckPointSmoothTarget.
+		 */
+		progress *= ((CheckPointSmoothTarget - progress) / CheckPointSmoothTarget) *
+				(CheckPointSmoothMargin + 1 - CheckPointCompletionTarget) +
+				CheckPointCompletionTarget;
+	}
 	/*
 	 * Check against the cached value first. Only do the more expensive
 	 * calculations once we reach the target previously calculated. Since
@@ -779,6 +804,14 @@ IsCheckpointOnSchedule(double progress)
 			ckpt_cached_elapsed = elapsed_xlogs;
 			return false;
 		}
+		else if (original_progress < elapsed_xlogs)
+		{
+			ckpt_cached_elapsed = elapsed_xlogs;
+
+			/* smooth checkpoint write */
+			pg_usleep(CheckPointerWriteDelay * 1000L);
+			return false;
+		}
 	}
 
 	/*
@@ -793,6 +826,14 @@ IsCheckpointOnSchedule(double progress)
 		ckpt_cached_elapsed = elapsed_time;
 		return false;
 	}
+	else if (original_progress < elapsed_time)
+	{
+		ckpt_cached_elapsed = elapsed_time;
+
+		/* smooth checkpoint write */
+		pg_usleep(CheckPointerWriteDelay * 1000L);
+		return false;
+	}
 
 	/* It looks like we're on schedule. */
 	return true;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..d41dc17 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"checkpointer_write_delay", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+			gettext_noop("checkpointer sleep time during dirty buffers write in checkpoint."),
+			NULL,
+			GUC_UNIT_MS
+		},
+		&CheckPointerWriteDelay,
+		200, 10, 10000,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
 			gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
 			NULL,
@@ -2551,6 +2562,26 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"checkpoint_smooth_target", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+			gettext_noop("Smooth control IO load between starting checkpoint and this target parameter in progress of checkpoint."),
+			NULL
+		},
+		&CheckPointSmoothTarget,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"checkpoint_smooth_margin", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+		gettext_noop("More smooth control IO load between starting checkpoint and checkpoint_smooth_target."),
+		NULL
+		},
+		&CheckPointSmoothMargin,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..b4d83f2 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -185,7 +185,10 @@
 #checkpoint_segments = 3		# in logfile segments, min 1, 16MB each
 #checkpoint_timeout = 5min		# range 30s-1h
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
+#checkpoint_smooth_target = 0.0		# smooth checkpoint target, 0.0 - 1.0
+#checkpoint_smooth_margin = 0.0		# smooth checkpoint margin, 0.0 - 1.0
 #checkpoint_warning = 30s		# 0 disables
+#checkpointer_write_delay = 200ms	# 10-10000 milliseconds
 
 # - Archiving -
 
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..8a441bc 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -21,9 +21,12 @@
 
 /* GUC options */
 extern int	BgWriterDelay;
+extern int	CheckPointerWriteDelay;
 extern int	CheckPointTimeout;
 extern int	CheckPointWarning;
 extern double CheckPointCompletionTarget;
+extern double CheckPointSmoothTarget;
+extern double CheckPointSmoothMargin;
 
 extern void BackgroundWriterMain(void) __attribute__((noreturn));
 extern void CheckpointerMain(void) __attribute__((noreturn));
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
 	RESOURCES_KERNEL,
 	RESOURCES_VACUUM_DELAY,
 	RESOURCES_BGWRITER,
+	RESOURCES_CHECKPOINTER,
 	RESOURCES_ASYNCHRONOUS,
 	WAL,
 	WAL_SETTINGS,
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..2b223e9 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -143,14 +143,16 @@ static CheckpointerShmemStruct *CheckpointerShmem;
  */
 int			CheckPointTimeout = 300;
 int			CheckPointWarning = 30;
+int			CheckPointerFsyncDelayThreshold = -1;
 double		CheckPointCompletionTarget = 0.5;
+double		CheckPointerFsyncDelayRatio = 0.0;
 
 /*
  * Flags set by interrupt handlers for later service in the main loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
-static volatile sig_atomic_t shutdown_requested = false;
+extern volatile sig_atomic_t checkpoint_requested = false;
+extern volatile sig_atomic_t shutdown_requested = false;
 
 /*
  * Private state
@@ -169,7 +171,6 @@ static pg_time_t last_xlog_switch_time;
 
 static void CheckArchiveTimeout(void);
 static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
 
@@ -643,7 +644,7 @@ CheckArchiveTimeout(void)
  * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
  * there is one pending behind it.)
  */
-static bool
+extern bool
 ImmediateCheckpointRequested(void)
 {
 	if (checkpoint_requested)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e629181..99dac53 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -21,6 +21,7 @@
  */
 #include "postgres.h"
 
+#include <signal.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/file.h>
@@ -162,6 +163,8 @@ static List *pendingUnlinks = NIL;
 static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
 
+extern volatile sig_atomic_t checkpoint_requested;
+extern volatile sig_atomic_t shutdown_requested;
 
 typedef enum					/* behavior for mdopen & _mdfd_getseg */
 {
@@ -1171,6 +1174,20 @@ mdsync(void)
 								 FilePathName(seg->mdfd_vfd),
 								 (double) elapsed / 1000);
 
+						/*
+						 * If this fsync took a long time, sleep for 'fsync-time * checkpointer_fsync_delay_ratio'
+						 * to give priority to executing transactions.
+						 */
+						if( CheckPointerFsyncDelayThreshold >= 0 &&
+							!shutdown_requested &&
+							!ImmediateCheckpointRequested() &&
+							(elapsed / 1000 > CheckPointerFsyncDelayThreshold))
+						{
+							pg_usleep((elapsed / 1000) * CheckPointerFsyncDelayRatio * 1000L);
+							if(log_checkpoints)
+								elog(DEBUG1, "checkpoint sync sleep: time=%.3f msec",
+									(double) (elapsed / 1000) * CheckPointerFsyncDelayRatio);
+						}
 						break;	/* out of retry loop */
 					}
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..74051cb 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"checkpointer_fsync_delay_threshold", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+			gettext_noop("If a file fsync time over this threshold, checkpointer sleep file_fsync_time * checkpointer_fsync_delay_ratio."),
+			NULL,
+			GUC_UNIT_MS
+		},
+		&CheckPointerFsyncDelayThreshold,
+		-1, -1, 1000000,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
 			gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
 			NULL,
@@ -2551,6 +2562,16 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"checkpointer_fsync_delay_ratio", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+		gettext_noop("checkpointer sleep time during file fsync in checkpoint."),
+		NULL
+		},
+		&CheckPointerFsyncDelayRatio,
+		0.0, 0.0, 1.0,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..707b433 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,6 +186,8 @@
 #checkpoint_timeout = 5min		# range 30s-1h
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
 #checkpoint_warning = 30s		# 0 disables
+#checkpointer_fsync_delay_ratio = 0.0	# range 0.0 - 1.0
+#checkpointer_fsync_delay_threshold = -1 	# range 0 - 1000000 milliseconds; -1 disables
 
 # - Archiving -
 
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..a02ba1f 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -23,7 +23,9 @@
 extern int	BgWriterDelay;
 extern int	CheckPointTimeout;
 extern int	CheckPointWarning;
+extern int	CheckPointerFsyncDelayThreshold;
 extern double CheckPointCompletionTarget;
+extern double CheckPointerFsyncDelayRatio;
 
 extern void BackgroundWriterMain(void) __attribute__((noreturn));
 extern void CheckpointerMain(void) __attribute__((noreturn));
@@ -31,6 +33,7 @@ extern void CheckpointerMain(void) __attribute__((noreturn));
 extern void RequestCheckpoint(int flags);
 extern void CheckpointWriteDelay(int flags, double progress);
 
+extern bool ImmediateCheckpointRequested(void);
 extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
 					BlockNumber segno);
 extern void AbsorbFsyncRequests(void);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
 	RESOURCES_KERNEL,
 	RESOURCES_VACUUM_DELAY,
 	RESOURCES_BGWRITER,
+	RESOURCES_CHECKPOINTER,
 	RESOURCES_ASYNCHRONOUS,
 	WAL,
 	WAL_SETTINGS,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to