(2013/06/12 23:07), Robert Haas wrote:
On Mon, Jun 10, 2013 at 3:48 PM, Simon Riggs <si...@2ndquadrant.com> wrote:
On 10 June 2013 11:51, KONDO Mitsumasa <kondo.mitsum...@lab.ntt.co.jp> wrote:
I create patch which is improvement of checkpoint IO scheduler for stable
transaction responses.
Looks like good results, with good measurements. Should be an
interesting discussion.
+1.
I suspect we want to poke at the algorithms a little here and maybe
see if we can do this without adding new GUCs. Also, I think this is
probably two separate patches, in the end. But the direction seems
good to me.
Thank you for your comments!
I have separated my patch into a checkpoint-write patch and a checkpoint-fsync
patch. As you say, my patch adds a lot of new GUCs. I don't think it is
impossible to decide these values automatically. However, it is difficult to
make the checkpoint scheduler suitable for all environments, such as virtual
servers, public cloud servers, embedded servers, etc. So I think the default
parameter settings should work the same as before. Setting the parameters is
primitive and difficult, but if they are set correctly, the scheduler suits
many environments and will not cause unintended behavior.
I will try to think about a version with fewer GUCs. And if you
have a good idea, please share it so that we can discuss it!
Best Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..0c0f215 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -141,9 +141,12 @@ static CheckpointerShmemStruct *CheckpointerShmem;
/*
* GUC parameters
*/
+int CheckPointerWriteDelay = 200;
int CheckPointTimeout = 300;
int CheckPointWarning = 30;
double CheckPointCompletionTarget = 0.5;
+double CheckPointSmoothTarget = 0.0;
+double CheckPointSmoothMargin = 0.0;
/*
* Flags set by interrupt handlers for later service in the main loop.
@@ -715,7 +718,7 @@ CheckpointWriteDelay(int flags, double progress)
* Checkpointer and bgwriter are no longer related so take the Big
* Sleep.
*/
- pg_usleep(100000L);
+ pg_usleep(CheckPointerWriteDelay * 1000L);
}
else if (--absorb_counter <= 0)
{
@@ -742,14 +745,36 @@ IsCheckpointOnSchedule(double progress)
{
XLogRecPtr recptr;
struct timeval now;
- double elapsed_xlogs,
+ double original_progress,
+ elapsed_xlogs,
elapsed_time;
Assert(ckpt_active);
- /* Scale progress according to checkpoint_completion_target. */
- progress *= CheckPointCompletionTarget;
+ /* This variable is used by smooth checkpoint schedule.*/
+ original_progress = progress * CheckPointCompletionTarget;
+ /* Scale progress according to checkpoint_completion_target and checkpoint_smooth_target. */
+ if(progress >= CheckPointSmoothTarget)
+ {
+ /* Normal checkpoint schedule. */
+ progress *= CheckPointCompletionTarget;
+ }
+ else
+ {
+ /*
+ * Smooth checkpoint schedule.
+ *
+ * At the start of a checkpoint, the IO load average tends to be high
+ * and transactions tend to execute slowly. This schedule reduces both
+ * and improves IO response. As 'progress' approaches CheckPointSmoothTarget,
+ * it converges to the normal checkpoint schedule. If you want a
+ * smoother checkpoint schedule, set a higher CheckPointSmoothTarget.
+ */
+ progress *= ((CheckPointSmoothTarget - progress) / CheckPointSmoothTarget) *
+ (CheckPointSmoothMargin + 1 - CheckPointCompletionTarget) +
+ CheckPointCompletionTarget;
+ }
/*
* Check against the cached value first. Only do the more expensive
* calculations once we reach the target previously calculated. Since
@@ -779,6 +804,14 @@ IsCheckpointOnSchedule(double progress)
ckpt_cached_elapsed = elapsed_xlogs;
return false;
}
+ else if (original_progress < elapsed_xlogs)
+ {
+ ckpt_cached_elapsed = elapsed_xlogs;
+
+ /* smooth checkpoint write */
+ pg_usleep(CheckPointerWriteDelay * 1000L);
+ return false;
+ }
}
/*
@@ -793,6 +826,14 @@ IsCheckpointOnSchedule(double progress)
ckpt_cached_elapsed = elapsed_time;
return false;
}
+ else if (original_progress < elapsed_time)
+ {
+ ckpt_cached_elapsed = elapsed_time;
+
+ /* smooth checkpoint write */
+ pg_usleep(CheckPointerWriteDelay * 1000L);
+ return false;
+ }
/* It looks like we're on schedule. */
return true;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..d41dc17 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"checkpointer_write_delay", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("checkpointer sleep time during dirty buffers write in checkpoint."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &CheckPointerWriteDelay,
+ 200, 10, 10000,
+ NULL, NULL, NULL
+ },
+
+ {
{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
NULL,
@@ -2551,6 +2562,26 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+ {
+ {"checkpoint_smooth_target", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("Smooth control IO load between starting checkpoint and this target parameter in progress of checkpoint."),
+ NULL
+ },
+ &CheckPointSmoothTarget,
+ 0.0, 0.0, 1.0,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"checkpoint_smooth_margin", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("More smooth control IO load between starting checkpoint and checkpoint_smooth_target."),
+ NULL
+ },
+ &CheckPointSmoothMargin,
+ 0.0, 0.0, 1.0,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..b4d83f2 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -185,7 +185,10 @@
#checkpoint_segments = 3 # in logfile segments, min 1, 16MB each
#checkpoint_timeout = 5min # range 30s-1h
#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0
+#checkpoint_smooth_target = 0.0 # smooth checkpoint target, 0.0 - 1.0
+#checkpoint_smooth_margin = 0.0 # smooth checkpoint margin, 0.0 - 1.0
#checkpoint_warning = 30s # 0 disables
+#checkpointer_write_delay = 200ms # 10-10000 milliseconds
# - Archiving -
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..8a441bc 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -21,9 +21,12 @@
/* GUC options */
extern int BgWriterDelay;
+extern int CheckPointerWriteDelay;
extern int CheckPointTimeout;
extern int CheckPointWarning;
extern double CheckPointCompletionTarget;
+extern double CheckPointSmoothTarget;
+extern double CheckPointSmoothMargin;
extern void BackgroundWriterMain(void) __attribute__((noreturn));
extern void CheckpointerMain(void) __attribute__((noreturn));
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
RESOURCES_KERNEL,
RESOURCES_VACUUM_DELAY,
RESOURCES_BGWRITER,
+ RESOURCES_CHECKPOINTER,
RESOURCES_ASYNCHRONOUS,
WAL,
WAL_SETTINGS,
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..2b223e9 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -143,14 +143,16 @@ static CheckpointerShmemStruct *CheckpointerShmem;
*/
int CheckPointTimeout = 300;
int CheckPointWarning = 30;
+int CheckPointerFsyncDelayThreshold = -1;
double CheckPointCompletionTarget = 0.5;
+double CheckPointerFsyncDelayRatio = 0.0;
/*
* Flags set by interrupt handlers for later service in the main loop.
*/
static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
-static volatile sig_atomic_t shutdown_requested = false;
+extern volatile sig_atomic_t checkpoint_requested = false;
+extern volatile sig_atomic_t shutdown_requested = false;
/*
* Private state
@@ -169,7 +171,6 @@ static pg_time_t last_xlog_switch_time;
static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);
@@ -643,7 +644,7 @@ CheckArchiveTimeout(void)
* this does not check the *current* checkpoint's IMMEDIATE flag, but whether
* there is one pending behind it.)
*/
-static bool
+extern bool
ImmediateCheckpointRequested(void)
{
if (checkpoint_requested)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e629181..99dac53 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -21,6 +21,7 @@
*/
#include "postgres.h"
+#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
@@ -162,6 +163,8 @@ static List *pendingUnlinks = NIL;
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
+extern volatile sig_atomic_t checkpoint_requested;
+extern volatile sig_atomic_t shutdown_requested;
typedef enum /* behavior for mdopen & _mdfd_getseg */
{
@@ -1171,6 +1174,20 @@ mdsync(void)
FilePathName(seg->mdfd_vfd),
(double) elapsed / 1000);
+ /*
+ * If this fsync took a long time, sleep for 'fsync-time * checkpointer_fsync_delay_ratio'
+ * to give priority to executing transactions.
+ */
+ if( CheckPointerFsyncDelayThreshold >= 0 &&
+ !shutdown_requested &&
+ !ImmediateCheckpointRequested() &&
+ (elapsed / 1000 > CheckPointerFsyncDelayThreshold))
+ {
+ pg_usleep((elapsed / 1000) * CheckPointerFsyncDelayRatio * 1000L);
+ if(log_checkpoints)
+ elog(DEBUG1, "checkpoint sync sleep: time=%.3f msec",
+ (double) (elapsed / 1000) * CheckPointerFsyncDelayRatio);
+ }
break; /* out of retry loop */
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..74051cb 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"checkpointer_fsync_delay_threshold", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("If a file fsync time over this threshold, checkpointer sleep file_fsync_time * checkpointer_fsync_delay_ratio."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &CheckPointerFsyncDelayThreshold,
+ -1, -1, 1000000,
+ NULL, NULL, NULL
+ },
+
+ {
{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
NULL,
@@ -2551,6 +2562,16 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+ {
+ {"checkpointer_fsync_delay_ratio", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("checkpointer sleep time during file fsync in checkpoint."),
+ NULL
+ },
+ &CheckPointerFsyncDelayRatio,
+ 0.0, 0.0, 1.0,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..707b433 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,6 +186,8 @@
#checkpoint_timeout = 5min # range 30s-1h
#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0
#checkpoint_warning = 30s # 0 disables
+#checkpointer_fsync_delay_ratio = 0.0 # range 0.0 - 1.0
+#checkpointer_fsync_delay_threshold = -1 # range 0 - 1000000 milliseconds; -1 disables
# - Archiving -
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..a02ba1f 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -23,7 +23,9 @@
extern int BgWriterDelay;
extern int CheckPointTimeout;
extern int CheckPointWarning;
+extern int CheckPointerFsyncDelayThreshold;
extern double CheckPointCompletionTarget;
+extern double CheckPointerFsyncDelayRatio;
extern void BackgroundWriterMain(void) __attribute__((noreturn));
extern void CheckpointerMain(void) __attribute__((noreturn));
@@ -31,6 +33,7 @@ extern void CheckpointerMain(void) __attribute__((noreturn));
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
+extern bool ImmediateCheckpointRequested(void);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
RESOURCES_KERNEL,
RESOURCES_VACUUM_DELAY,
RESOURCES_BGWRITER,
+ RESOURCES_CHECKPOINTER,
RESOURCES_ASYNCHRONOUS,
WAL,
WAL_SETTINGS,
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers