Updated Branches: refs/heads/cassandra-1.1.0 3f8db24e4 -> ba7b0bddf
support trickling fsync() on writes patch by scode; reviewed by xedin for CASSANDRA-3950 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/ba7b0bdd Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/ba7b0bdd Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/ba7b0bdd Branch: refs/heads/cassandra-1.1.0 Commit: ba7b0bddfd381be2262111cb6c3351ee31b94c78 Parents: 3f8db24 Author: Peter Schuller <peter.schul...@infidyne.com> Authored: Wed Feb 29 01:50:44 2012 -0800 Committer: Peter Schuller <peter.schul...@infidyne.com> Committed: Wed Feb 29 01:50:44 2012 -0800 ---------------------------------------------------------------------- CHANGES.txt | 1 + conf/cassandra.yaml | 8 ++++ src/java/org/apache/cassandra/config/Config.java | 2 + .../cassandra/config/DatabaseDescriptor.java | 10 ++++++ .../apache/cassandra/io/util/SequentialWriter.java | 26 ++++++++++++++- 5 files changed, 46 insertions(+), 1 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index fec8f38..47a004c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -16,6 +16,7 @@ * reincarnate removed and deprecated KsDef/CfDef attributes (CASSANDRA-3953) * Fix race between writes and read for cache (CASSANDRA-3862) * perform static initialization of StorageProxy on start-up (CASSANDRA-3797) + * support trickling fsync() on writes (CASSANDRA-3950) Merged from 1.0: * remove the wait on hint future during write (CASSANDRA-3870) * (cqlsh) ignore missing CfDef opts (CASSANDRA-3933) http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/conf/cassandra.yaml ---------------------------------------------------------------------- diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 1e1c056..fa4ee9f 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -227,6 +227,14 @@ memtable_flush_queue_size: 4 # Increase this to the size of the column slices you typically perform sliced_buffer_size_in_kb: 64 +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSD:s; not +# necessarily on platters. +trickle_fsync: false +trickle_fsync_interval_in_kb: 10240 + # TCP port, for commands and data storage_port: 7000 http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/config/Config.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 35f6bf9..d875584 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -123,6 +123,8 @@ public class Config public boolean incremental_backups = false; public int memtable_flush_queue_size = 4; + public boolean trickle_fsync = false; + public int trickle_fsync_interval_in_kb = 10240; public int key_cache_size_in_mb = 2; public int key_cache_save_period = 14400; http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/config/DatabaseDescriptor.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 0c20160..db2669c 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -981,6 +981,16 @@ public class DatabaseDescriptor return conf.commitlog_total_space_in_mb; } + public static boolean getTrickleFsync() + { + return conf.trickle_fsync; + } + + public static int getTrickleFsyncIntervalInKb() + { + return conf.trickle_fsync_interval_in_kb; + } + public static int getKeyCacheSizeInMB() { return conf.key_cache_size_in_mb; http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/io/util/SequentialWriter.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java index 098fad4..596d72e 100644 --- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java +++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java @@ -23,6 +23,7 @@ import java.nio.channels.ClosedChannelException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.utils.CLibrary; public class SequentialWriter extends OutputStream @@ -51,6 +52,12 @@ public class SequentialWriter extends OutputStream // used if skip I/O cache was enabled private long ioCacheStartOffset = 0, bytesSinceCacheFlush = 0; + // whether to do trickling fsync() to avoid sudden bursts of dirty buffer flushing by kernel causing read + // latency spikes + private boolean trickleFsync; + private int trickleFsyncByteInterval; + private int bytesSinceTrickleFsync = 0; + public final DataOutputStream stream; private MessageDigest digest; @@ -62,6 +69,8 @@ public class SequentialWriter extends OutputStream buffer = new byte[bufferSize]; this.skipIOCache = skipIOCache; + this.trickleFsync = DatabaseDescriptor.getTrickleFsync(); + this.trickleFsyncByteInterval = DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024; fd = CLibrary.getfd(out.getFD()); directoryFD = CLibrary.tryOpenDirectory(file.getParent()); stream = new DataOutputStream(this); @@ -145,12 +154,17 @@ public class SequentialWriter extends OutputStream syncInternal(); } + protected void syncDataOnlyInternal() throws IOException + { + out.getFD().sync(); + } + protected void syncInternal() throws IOException { if (syncNeeded) { flushInternal(); - out.getFD().sync(); + syncDataOnlyInternal(); if (!directorySynced) { @@ -181,6 +195,16 @@ public class SequentialWriter extends OutputStream { flushData(); + if (trickleFsync) + { + bytesSinceTrickleFsync += validBufferBytes; + if (bytesSinceTrickleFsync >= trickleFsyncByteInterval) + { + syncDataOnlyInternal(); + bytesSinceTrickleFsync = 0; + } + } + if (skipIOCache) { // we don't know when the data reaches disk since we aren't