Repository: cassandra
Updated Branches:
  refs/heads/trunk 3cb00db3f -> 4047dd121
Failure detector detects and ignores local pauses

Patch by brandonwilliams, reviewed by Richard Low for CASSANDRA-9183

Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/4047dd12
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/4047dd12
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/4047dd12

Branch: refs/heads/trunk
Commit: 4047dd1213ed99b3d7bec253b551b4cae911990a
Parents: 3cb00db
Author: Brandon Williams <brandonwilli...@apache.org>
Authored: Thu May 7 14:15:51 2015 -0500
Committer: Brandon Williams <brandonwilli...@apache.org>
Committed: Thu May 7 14:16:47 2015 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |  1 +
 .../apache/cassandra/gms/FailureDetector.java   | 29 ++++++++++++++++++++
 2 files changed, 30 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/cassandra/blob/4047dd12/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 5a8ee93..9cfc772 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 3.0
+ * Failure detector detects and ignores local pauses (CASSANDRA-9183)
  * Remove Thrift dependencies in bundled tools (CASSANDRA-8358)
  * Disable memory mapping of hsperfdata file for JVM statistics (CASSANDRA-9242)
  * Add pre-startup checks to detect potential incompatibilities (CASSANDRA-8049)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/4047dd12/src/java/org/apache/cassandra/gms/FailureDetector.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/gms/FailureDetector.java b/src/java/org/apache/cassandra/gms/FailureDetector.java
index fe825b6..b8c20d7 100644
--- a/src/java/org/apache/cassandra/gms/FailureDetector.java
+++ b/src/java/org/apache/cassandra/gms/FailureDetector.java
@@ -48,6 +48,22 @@ public class FailureDetector implements IFailureDetector, FailureDetectorMBean
     public static final String MBEAN_NAME = "org.apache.cassandra.net:type=FailureDetector";
     private static final int SAMPLE_SIZE = 1000;
     protected static final long INITIAL_VALUE_NANOS = TimeUnit.NANOSECONDS.convert(getInitialValue(), TimeUnit.MILLISECONDS);
+    private static final long DEFAULT_MAX_PAUSE = 5000L * 1000000L; // 5 seconds
+    private static final long MAX_LOCAL_PAUSE_IN_NANOS = getMaxLocalPause();
+    private long lastInterpret = System.nanoTime();
+    private boolean wasPaused = false;
+
+    private static long getMaxLocalPause()
+    {
+        if (System.getProperty("cassandra.max_local_pause_in_ms") != null)
+        {
+            long pause = Long.parseLong(System.getProperty("cassandra.max_local_pause_in_ms"));
+            logger.warn("Overriding max local pause time to {}ms", pause);
+            return pause * 1000000L;
+        }
+        else
+            return DEFAULT_MAX_PAUSE;
+    }
 
     public static final IFailureDetector instance = new FailureDetector();
 
@@ -228,6 +244,19 @@ public class FailureDetector implements IFailureDetector, FailureDetectorMBean
             return;
         }
         long now = System.nanoTime();
+        long diff = now - lastInterpret;
+        lastInterpret = now;
+        if (diff > MAX_LOCAL_PAUSE_IN_NANOS)
+        {
+            logger.warn("Not marking nodes down due to local pause of {} > {}", diff, MAX_LOCAL_PAUSE_IN_NANOS);
+            wasPaused = true;
+            return;
+        }
+        if (wasPaused)
+        {
+            wasPaused = false;
+            return;
+        }
         double phi = hbWnd.phi(now);
         if (logger.isTraceEnabled())
             logger.trace("PHI for {} : {}", ep, phi);