support trickling fsync() on writes

patch by scode; reviewed by xedin for CASSANDRA-3950


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/ba7b0bdd
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/ba7b0bdd
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/ba7b0bdd

Branch: refs/heads/cassandra-1.1
Commit: ba7b0bddfd381be2262111cb6c3351ee31b94c78
Parents: 3f8db24
Author: Peter Schuller <peter.schul...@infidyne.com>
Authored: Wed Feb 29 01:50:44 2012 -0800
Committer: Peter Schuller <peter.schul...@infidyne.com>
Committed: Wed Feb 29 01:50:44 2012 -0800

----------------------------------------------------------------------
 CHANGES.txt                                        |    1 +
 conf/cassandra.yaml                                |    8 ++++
 src/java/org/apache/cassandra/config/Config.java   |    2 +
 .../cassandra/config/DatabaseDescriptor.java       |   10 ++++++
 .../apache/cassandra/io/util/SequentialWriter.java |   26 ++++++++++++++-
 5 files changed, 46 insertions(+), 1 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index fec8f38..47a004c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,7 @@
  * reincarnate removed and deprecated KsDef/CfDef attributes (CASSANDRA-3953)
  * Fix race between writes and read for cache (CASSANDRA-3862)
  * perform static initialization of StorageProxy on start-up (CASSANDRA-3797)
+ * support trickling fsync() on writes (CASSANDRA-3950)
 Merged from 1.0:
  * remove the wait on hint future during write (CASSANDRA-3870)
  * (cqlsh) ignore missing CfDef opts (CASSANDRA-3933)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/conf/cassandra.yaml
----------------------------------------------------------------------
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 1e1c056..fa4ee9f 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -227,6 +227,14 @@ memtable_flush_queue_size: 4
 # Increase this to the size of the column slices you typically perform
 sliced_buffer_size_in_kb: 64
 
+# Whether to, when doing sequential writing, fsync() at intervals in
+# order to force the operating system to flush the dirty
+# buffers. Enable this to avoid sudden dirty buffer flushing from
+# impacting read latencies. Almost always a good idea on SSD:s; not
+# necessarily on platters.
+trickle_fsync: false
+trickle_fsync_interval_in_kb: 10240
+
 # TCP port, for commands and data
 storage_port: 7000
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/config/Config.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 35f6bf9..d875584 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -123,6 +123,8 @@ public class Config
 
     public boolean incremental_backups = false;
     public int memtable_flush_queue_size = 4;
+    public boolean trickle_fsync = false;
+    public int trickle_fsync_interval_in_kb = 10240;
 
     public int key_cache_size_in_mb = 2;
     public int key_cache_save_period = 14400;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 0c20160..db2669c 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -981,6 +981,16 @@ public class DatabaseDescriptor
         return conf.commitlog_total_space_in_mb;
     }
 
+    public static boolean getTrickleFsync()
+    {
+        return conf.trickle_fsync;
+    }
+
+    public static int getTrickleFsyncIntervalInKb()
+    {
+        return conf.trickle_fsync_interval_in_kb;
+    }
+
     public static int getKeyCacheSizeInMB()
     {
         return conf.key_cache_size_in_mb;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/ba7b0bdd/src/java/org/apache/cassandra/io/util/SequentialWriter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index 098fad4..596d72e 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
@@ -23,6 +23,7 @@ import java.nio.channels.ClosedChannelException;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.utils.CLibrary;
 
 public class SequentialWriter extends OutputStream
@@ -51,6 +52,12 @@ public class SequentialWriter extends OutputStream
     // used if skip I/O cache was enabled
     private long ioCacheStartOffset = 0, bytesSinceCacheFlush = 0;
 
+    // whether to do trickling fsync() to avoid sudden bursts of dirty buffer flushing by kernel causing read
+    // latency spikes
+    private boolean trickleFsync;
+    private int trickleFsyncByteInterval;
+    private int bytesSinceTrickleFsync = 0;
+
     public final DataOutputStream stream;
     private MessageDigest digest;
 
@@ -62,6 +69,8 @@ public class SequentialWriter extends OutputStream
 
         buffer = new byte[bufferSize];
         this.skipIOCache = skipIOCache;
+        this.trickleFsync = DatabaseDescriptor.getTrickleFsync();
+        this.trickleFsyncByteInterval = DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024;
         fd = CLibrary.getfd(out.getFD());
         directoryFD = CLibrary.tryOpenDirectory(file.getParent());
         stream = new DataOutputStream(this);
@@ -145,12 +154,17 @@ public class SequentialWriter extends OutputStream
         syncInternal();
     }
 
+    protected void syncDataOnlyInternal() throws IOException
+    {
+        out.getFD().sync();
+    }
+
     protected void syncInternal() throws IOException
     {
         if (syncNeeded)
         {
             flushInternal();
-            out.getFD().sync();
+            syncDataOnlyInternal();
 
             if (!directorySynced)
             {
@@ -181,6 +195,16 @@ public class SequentialWriter extends OutputStream
         {
             flushData();
 
+            if (trickleFsync)
+            {
+                bytesSinceTrickleFsync += validBufferBytes;
+                if (bytesSinceTrickleFsync >= trickleFsyncByteInterval)
+                {
+                    syncDataOnlyInternal();
+                    bytesSinceTrickleFsync = 0;
+                }
+            }
+
             if (skipIOCache)
             {
                 // we don't know when the data reaches disk since we aren't

Reply via email to