I have created the fsync v2 patch. There isn't much time left, so I am trying to
focus on the fsync patch in this commitfest, as advised by Heikki. I'm sorry for
diverging from the main discussion in this commitfest... Of course, I will
continue to work on the other improvements as well.
* Changes
- Add a ckpt_flags argument to mdsync() etc., following Heikki's patch. This
makes mdsync() more controllable during a checkpoint.
- Sleeping too long in the fsync phase is not good for the checkpoint schedule,
so I cap the sleep time so that it is never more than 10 seconds (MAX_FSYNC_SLEEP).
I think that a 10-second cap is a suitable value in various situations.
I also considered limiting the sleep time according to checkpoint progress;
however, I thought md.c should stay simple and robust, so I have kept it
simple.
- The maximum of checkpointer_fsync_delay_ratio in guc.c is changed from 1 to 2,
because the sleep time is now capped at 10 seconds. We can tune it more flexibly
and more safely.
I also considered whether any of the parameters in my fsync patch could be omitted.
* checkpointer_fsync_delay_threshold
In general, I think that a value of about 1 second is suitable in various
environments.
If we want to adjust the sleep time in the fsync phase, we can change
checkpointer_fsync_delay_ratio instead.
* checkpointer_fsync_delay_ratio
I don't want to omit this parameter, because it is the only way to regulate the
sleep time in the fsync phase and, through it, the checkpoint time.
* Benchmark Result(DBT-2)
| NOTPM Average 90%tile Maximum
------------------------+----------------------------------------
original_0.7 (baseline) | 3610.42 4.556 10.9180 23.1326
fsync v1 | 3685.51 4.036 9.2017 17.5594
fsync v2 | 3748.80 3.562 8.1871 17.5101
I'm not sure about this result; the fsync v2 patch looks too good. Of course, I
didn't do anything special when running the benchmark.
Please see checkpoint_time.txt, which records the details of each
checkpoint. With the fsync v2 patch, each checkpoint seems to take less time.
* Benchmark Setting
[postgresql.conf]
archive_mode = on
archive_command = '/bin/cp %p /pgdata/pgarch/arc_dbt2/%f'
synchronous_commit = on
max_connections = 300
shared_buffers = 2458MB
work_mem = 1MB
fsync = on
wal_sync_method = fdatasync
full_page_writes = on
checkpoint_segments = 300
checkpoint_timeout = 15min
checkpoint_completion_target = 0.7
segsize=1GB(default)
[patched postgresql.conf (add)]
checkpointer_fsync_delay_ratio = 1
checkpointer_fsync_delay_threshold = 1000ms
[DBT-2 driver settings]
SESSION:250
WH:340
TPW:10
PRETEST_DURATION: 1800
TEST_DURATION: 1800
* Test Server
Server: HP Proliant DL360 G7
CPU: Xeon E5640 2.66GHz (1P/4C)
Memory: 18GB(PC3-10600R-9)
Disk: 146GB(15k)*4 RAID1+0
RAID controller: P410i/256MB
(Add) The energy-efficiency (power-saving) functions are turned off in the BIOS and OS.
Best regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fdf6625..2b223e9 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -143,14 +143,16 @@ static CheckpointerShmemStruct *CheckpointerShmem;
*/
int CheckPointTimeout = 300;
int CheckPointWarning = 30;
+int CheckPointerFsyncDelayThreshold = -1;
double CheckPointCompletionTarget = 0.5;
+double CheckPointerFsyncDelayRatio = 0.0;
/*
* Flags set by interrupt handlers for later service in the main loop.
*/
static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
-static volatile sig_atomic_t shutdown_requested = false;
+extern volatile sig_atomic_t checkpoint_requested = false;
+extern volatile sig_atomic_t shutdown_requested = false;
/*
* Private state
@@ -169,7 +171,6 @@ static pg_time_t last_xlog_switch_time;
static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);
@@ -643,7 +644,7 @@ CheckArchiveTimeout(void)
* this does not check the *current* checkpoint's IMMEDIATE flag, but whether
* there is one pending behind it.)
*/
-static bool
+extern bool
ImmediateCheckpointRequested(void)
{
if (checkpoint_requested)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8079226..3f02d0b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1828,7 +1828,7 @@ CheckPointBuffers(int flags)
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
- smgrsync();
+ smgrsync(flags);
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e629181..d762511 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -21,6 +21,7 @@
*/
#include "postgres.h"
+#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
@@ -44,6 +45,9 @@
#define FSYNCS_PER_ABSORB 10
#define UNLINKS_PER_ABSORB 10
+/* Protect too long sleep in each file fsync. */
+#define MAX_FSYNC_SLEEP 10000
+
/*
* Special values for the segno arg to RememberFsyncRequest.
*
@@ -162,6 +166,8 @@ static List *pendingUnlinks = NIL;
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
+extern volatile sig_atomic_t checkpoint_requested;
+extern volatile sig_atomic_t shutdown_requested;
typedef enum /* behavior for mdopen & _mdfd_getseg */
{
@@ -235,7 +241,7 @@ SetForwardFsyncRequests(void)
/* Perform any pending fsyncs we may have queued up, then drop table */
if (pendingOpsTable)
{
- mdsync();
+ mdsync(CHECKPOINT_IMMEDIATE);
hash_destroy(pendingOpsTable);
}
pendingOpsTable = NULL;
@@ -974,7 +980,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
* mdsync() -- Sync previous writes to stable storage.
*/
void
-mdsync(void)
+mdsync(int ckpt_flags)
{
static bool mdsync_in_progress = false;
@@ -1171,6 +1177,28 @@ mdsync(void)
FilePathName(seg->mdfd_vfd),
(double) elapsed / 1000);
+ /*
+ * If this fsync has long time, we sleep 'fsync-time * checkpoint_fsync_delay_ratio'
+ * for giving priority to executing transaction.
+ */
+ if(CheckPointerFsyncDelayThreshold >= 0 &&
+ CheckPointerFsyncDelayRatio > 0 &&
+ !shutdown_requested &&
+ !ImmediateCheckpointRequested() &&
+ !(ckpt_flags & CHECKPOINT_FORCE) &&
+ !(ckpt_flags & CHECKPOINT_END_OF_RECOVERY) &&
+ (elapsed / 1000 > CheckPointerFsyncDelayThreshold))
+ {
+ double fsync_sleep = (elapsed / 1000) * CheckPointerFsyncDelayRatio;
+
+ /* Too long sleep is not good for checkpoint scheduler */
+ if(fsync_sleep > MAX_FSYNC_SLEEP)
+ fsync_sleep = MAX_FSYNC_SLEEP;
+ pg_usleep(fsync_sleep * 1000L);
+ if(log_checkpoints)
+ elog(DEBUG1, "checkpoint sync sleep: time=%.3f msec",
+ fsync_sleep);
+ }
break; /* out of retry loop */
}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index f7f1437..bc07b03 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -58,7 +58,7 @@ typedef struct f_smgr
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_pre_ckpt) (void); /* may be NULL */
- void (*smgr_sync) (void); /* may be NULL */
+ void (*smgr_sync) (int ckpt_flags); /* may be NULL */
void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
@@ -708,14 +708,18 @@ smgrpreckpt(void)
* smgrsync() -- Sync files to disk during checkpoint.
*/
void
-smgrsync(void)
+smgrsync(int ckpt_flags)
{
int i;
+ /*
+ * XXX: If we ever have more than one smgr, the remaining progress
+ * should somehow be divided among all smgrs.
+ */
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_sync)
- (*(smgrsw[i].smgr_sync)) ();
+ (*(smgrsw[i].smgr_sync)) (ckpt_flags);
}
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ea16c64..a240c43 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2014,6 +2014,17 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"checkpointer_fsync_delay_threshold", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("If a file fsync time over this threshold, checkpointer sleep file_fsync_time * checkpointer_fsync_delay_ratio."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &CheckPointerFsyncDelayThreshold,
+ -1, -1, 1000000,
+ NULL, NULL, NULL
+ },
+
+ {
{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
NULL,
@@ -2551,6 +2562,16 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+ {
+ {"checkpointer_fsync_delay_ratio", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("checkpointer sleep time during file fsync in checkpoint."),
+ NULL
+ },
+ &CheckPointerFsyncDelayRatio,
+ 0.0, 0.0, 2.0,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0303ac7..707b433 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,6 +186,8 @@
#checkpoint_timeout = 5min # range 30s-1h
#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0
#checkpoint_warning = 30s # 0 disables
+#checkpointer_fsync_delay_ratio = 0.0 # range 0.0 - 2.0
+#checkpointer_fsync_delay_threshold = -1 # range 0 - 1000000 milliseconds; -1 disables
# - Archiving -
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 46d3c26..a02ba1f 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -23,7 +23,9 @@
extern int BgWriterDelay;
extern int CheckPointTimeout;
extern int CheckPointWarning;
+extern int CheckPointerFsyncDelayThreshold;
extern double CheckPointCompletionTarget;
+extern double CheckPointerFsyncDelayRatio;
extern void BackgroundWriterMain(void) __attribute__((noreturn));
extern void CheckpointerMain(void) __attribute__((noreturn));
@@ -31,6 +33,7 @@ extern void CheckpointerMain(void) __attribute__((noreturn));
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
+extern bool ImmediateCheckpointRequested(void);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 98b6f13..f796ab7 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -100,7 +100,7 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void smgrpreckpt(void);
-extern void smgrsync(void);
+extern void smgrsync(int ckpt_flags);
extern void smgrpostckpt(void);
extern void AtEOXact_SMgr(void);
@@ -126,7 +126,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void mdpreckpt(void);
-extern void mdsync(void);
+extern void mdsync(int ckpt_flags);
extern void mdpostckpt(void);
extern void SetForwardFsyncRequests(void);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 8dcdd4b..efc5ee4 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -63,6 +63,7 @@ enum config_group
RESOURCES_KERNEL,
RESOURCES_VACUUM_DELAY,
RESOURCES_BGWRITER,
+ RESOURCES_CHECKPOINTER,
RESOURCES_ASYNCHRONOUS,
WAL,
WAL_SETTINGS,
[normal(baseline)]
instid | start | flags | num_buffers | xlog_added |
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-------+-------------+------------+--------------+---------------+----------------+---------------+----------------
70 | 2013-07-08 11:27:40.542+09 | xlog | 280 | 0 |
0 | 0 | 28.257 | 2.392 | 31.091
70 | 2013-07-08 11:29:39.612+09 | xlog | 192 | 0 |
0 | 300 | 19.284 | 3.55 | 22.898
70 | 2013-07-08 11:31:09.955+09 | xlog | 180 | 0 |
0 | 300 | 18.83 | 1.694 | 20.778
70 | 2013-07-08 11:32:36.318+09 | xlog | 176 | 0 |
0 | 300 | 17.7 | 2.456 | 20.31
70 | 2013-07-08 11:34:13.438+09 | xlog | 305 | 0 |
0 | 300 | 30.931 | 0.202 | 31.265
70 | 2013-07-08 11:42:13.719+09 | xlog | 18835 | 0 |
0 | 300 | 317.425 | 125.49 | 452.053
70 | 2013-07-08 11:57:13.853+09 | time | 84425 | 0 |
0 | 300 | 254.573 | 168.103 | 432.488
70 | 2013-07-08 12:12:14.743+09 | time | 88956 | 0 |
0 | 254 | 230.688 | 229.129 | 468.879
70 | 2013-07-08 12:27:13.559+09 | time | 85990 | 0 |
0 | 291 | 255.5 | 194.442 | 452.612
70 | 2013-07-08 12:42:13.563+09 | time | 137667 | 0 |
0 | 294 | 500.112 | 36.581 | 537.435
[fsync v1]
instid | start | flags | num_buffers | xlog_added |
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-------+-------------+------------+--------------+---------------+----------------+---------------+----------------
64 | 2013-07-05 18:19:36.447+09 | xlog | 277 | 0 |
0 | 0 | 28.19 | 11.978 | 40.263
64 | 2013-07-05 18:21:33.651+09 | xlog | 177 | 0 |
0 | 300 | 17.894 | 0.29 | 18.2
64 | 2013-07-05 18:23:12.687+09 | xlog | 190 | 0 |
0 | 300 | 21.539 | 0.561 | 22.835
64 | 2013-07-05 18:24:44.243+09 | xlog | 176 | 0 |
0 | 300 | 17.81 | 8.655 | 26.53
64 | 2013-07-05 18:26:27.906+09 | xlog | 315 | 0 |
0 | 300 | 32.572 | 2.19 | 34.813
64 | 2013-07-05 18:34:26.008+09 | xlog | 17092 | 0 |
0 | 300 | 304.681 | 175.806 | 485.226
64 | 2013-07-05 18:49:26.017+09 | time | 82336 | 0 |
0 | 300 | 278.16 | 246.004 | 528.054
64 | 2013-07-05 19:04:26.9+09 | time | 85040 | 0 |
0 | 253 | 236.266 | 335.98 | 577.039
64 | 2013-07-05 19:19:26.233+09 | time | 84745 | 0 |
0 | 292 | 189.681 | 436.045 | 630.238
64 | 2013-07-05 19:34:27.037+09 | time | 137777 | 0 |
0 | 294 | 504.566 | 73.033 | 580.061
[fsync v2]
instid | start | flags | num_buffers | xlog_added |
xlog_removed | xlog_recycled | write_duration | sync_duration | total_duration
--------+----------------------------+-----------+-------------+------------+--------------+---------------+----------------+---------------+----------------
75 | 2013-07-08 15:50:41.166+09 | xlog | 284 | 0 |
0 | 0 | 28.748 | 4.742 | 33.638
75 | 2013-07-08 15:52:30.394+09 | xlog | 176 | 0 |
0 | 300 | 17.935 | 0.917 | 18.898
75 | 2013-07-08 15:54:06.255+09 | xlog | 303 | 0 |
0 | 300 | 30.429 | 2.113 | 32.638
75 | 2013-07-08 15:55:44.686+09 | xlog | 216 | 0 |
0 | 300 | 22.043 | 4.063 | 26.165
75 | 2013-07-08 15:57:12.866+09 | xlog | 179 | 0 |
0 | 300 | 18.2 | 8.008 | 26.459
75 | 2013-07-08 16:05:13.214+09 | xlog | 16063 | 0 |
0 | 300 | 313.357 | 127.779 | 446.138
75 | 2013-07-08 16:20:13.94+09 | time | 78018 | 0 |
0 | 300 | 194.376 | 326.53 | 527.111
75 | 2013-07-08 16:35:13.86+09 | time | 92081 | 0 |
0 | 249 | 189.26 | 359.835 | 563.836
75 | 2013-07-08 16:50:13.184+09 | xlog time | 87794 | 0 |
0 | 295 | 206.032 | 366.987 | 577.932
75 | 2013-07-08 17:05:13.188+09 | time | 136923 | 0 |
0 | 300 | 505.149 | 21.488 | 529.828
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers