Author: brandonwilliams Date: Mon Oct 3 20:36:35 2011 New Revision: 1178563
URL: http://svn.apache.org/viewvc?rev=1178563&view=rev Log: Fix bug where the FailureDetector can take a very long time to mark a host down. Patch by brandonwilliams, reviewed by Paul Cannon for CASSANDRA-3273 Modified: cassandra/branches/cassandra-0.8/CHANGES.txt cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/FailureDetector.java cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/Gossiper.java cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/IFailureDetector.java cassandra/branches/cassandra-1.0.0/CHANGES.txt cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/FailureDetector.java cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/Gossiper.java cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/IFailureDetector.java Modified: cassandra/branches/cassandra-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-0.8/CHANGES.txt?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-0.8/CHANGES.txt (original) +++ cassandra/branches/cassandra-0.8/CHANGES.txt Mon Oct 3 20:36:35 2011 @@ -18,6 +18,8 @@ * (Hadoop) allow wrapping ranges in queries (CASSANDRA-3137) * (Hadoop) check all interfaces for a match with split location before falling back to random replica (CASSANDRA-3211) + * Fix bug where the failure detector can take too long to mark a host + down (CASSANDRA-3273) 0.8.6 Modified: cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/FailureDetector.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/FailureDetector.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/FailureDetector.java (original) +++ cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/FailureDetector.java Mon Oct 3 20:36:35 2011 @@ -121,7 +121,14 @@ public class FailureDetector implements logger_.error("unknown endpoint " + ep); return epState != null && epState.isAlive(); } - + + public void clear(InetAddress ep) + { + ArrivalWindow heartbeatWindow = arrivalSamples_.get(ep); + if (heartbeatWindow != null) + heartbeatWindow.clear(); + } + public void report(InetAddress ep) { if (logger_.isTraceEnabled()) @@ -149,7 +156,9 @@ public class FailureDetector implements logger_.trace("PHI for " + ep + " : " + phi); if ( phi > phiConvictThreshold_ ) - { + { + logger_.trace("notifying listeners that {} is down", ep); + logger_.trace("intervals: {} mean: {}", hbWnd, hbWnd.mean()); for ( IFailureDetectionEventListener listener : fdEvntListeners_ ) { listener.convict(ep, phi); @@ -206,6 +215,11 @@ class ArrivalWindow // change. private final double PHI_FACTOR = 1.0 / Math.log(10.0); + // in the event of a long partition, never record an interval longer than the rpc timeout, + // since if a host is regularly experiencing connectivity problems lasting this long we'd + // rather mark it down quickly instead of adapting + private final double MAX_INTERVAL_IN_MS = DatabaseDescriptor.getRpcTimeout(); + ArrivalWindow(int size) { arrivalIntervals_ = new BoundedStatsDeque(size); @@ -216,14 +230,17 @@ class ArrivalWindow double interArrivalTime; if ( tLast_ > 0L ) { - interArrivalTime = (value - tLast_); + interArrivalTime = (value - tLast_); } else { interArrivalTime = Gossiper.intervalInMillis / 2; } - tLast_ = value; - arrivalIntervals_.add(interArrivalTime); + if (interArrivalTime <= MAX_INTERVAL_IN_MS) + arrivalIntervals_.add(interArrivalTime); + else + logger_.debug("Ignoring interval time of {}", interArrivalTime); + tLast_ = value; } synchronized double sum() Modified: cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/Gossiper.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/Gossiper.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/Gossiper.java (original) +++ cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/Gossiper.java Mon Oct 3 20:36:35 2011 @@ -665,6 +665,13 @@ public class Gossiper implements IFailur if ( remoteGeneration > localGeneration ) { localEndpointState.updateTimestamp(); + // this node was dead and the generation changed, this indicates a reboot, or possibly a takeover + // we will clean the fd intervals for it and relearn them + if (!localEndpointState.isAlive()) + { + logger.debug("Clearing interval times for {} due to generation change", endpoint); + fd.clear(endpoint); + } fd.report(endpoint); return; } @@ -676,6 +683,7 @@ public class Gossiper implements IFailur if ( remoteVersion > localVersion ) { localEndpointState.updateTimestamp(); + // just a version change, report to the fd fd.report(endpoint); } } Modified: cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/IFailureDetector.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/IFailureDetector.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/IFailureDetector.java (original) +++ cassandra/branches/cassandra-0.8/src/java/org/apache/cassandra/gms/IFailureDetector.java Mon Oct 3 20:36:35 2011 @@ -37,7 +37,13 @@ public interface IFailureDetector * @return true if UP and false if DOWN. */ public boolean isAlive(InetAddress ep); - + + /** + * Clear any existing interval timings for this endpoint + * @param ep + */ + public void clear(InetAddress ep); + /** * This method is invoked by any entity wanting to interrogate the status of an endpoint. * In our case it would be the Gossiper. The Failure Detector will then calculate Phi and Modified: cassandra/branches/cassandra-1.0.0/CHANGES.txt URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-1.0.0/CHANGES.txt?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-1.0.0/CHANGES.txt (original) +++ cassandra/branches/cassandra-1.0.0/CHANGES.txt Mon Oct 3 20:36:35 2011 @@ -5,6 +5,8 @@ (CASSANDRA-3269) * Evict gossip state immediately when a token is taken over by a new IP (CASSANDRA-3259) * tolerate whitespace in seed CDL (CASSANDRA-3263) + * Fix bug where the failure detector can take too long to mark a host + down (CASSANDRA-3273) 1.0.0-rc2 Modified: cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/FailureDetector.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/FailureDetector.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/FailureDetector.java (original) +++ cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/FailureDetector.java Mon Oct 3 20:36:35 2011 @@ -149,7 +149,14 @@ public class FailureDetector implements logger_.error("unknown endpoint " + ep); return epState != null && epState.isAlive(); } - + + public void clear(InetAddress ep) + { + ArrivalWindow heartbeatWindow = arrivalSamples_.get(ep); + if (heartbeatWindow != null) + heartbeatWindow.clear(); + } + public void report(InetAddress ep) { if (logger_.isTraceEnabled()) @@ -177,7 +184,9 @@ public class FailureDetector implements logger_.trace("PHI for " + ep + " : " + phi); if ( phi > phiConvictThreshold_ ) - { + { + logger_.trace("notifying listeners that {} is down", ep); + logger_.trace("intervals: {} mean: {}", hbWnd, hbWnd.mean()); for ( IFailureDetectionEventListener listener : fdEvntListeners_ ) { listener.convict(ep, phi); @@ -234,6 +243,11 @@ class ArrivalWindow // change. private final double PHI_FACTOR = 1.0 / Math.log(10.0); + // in the event of a long partition, never record an interval longer than the rpc timeout, + // since if a host is regularly experiencing connectivity problems lasting this long we'd + // rather mark it down quickly instead of adapting + private final double MAX_INTERVAL_IN_MS = DatabaseDescriptor.getRpcTimeout(); + ArrivalWindow(int size) { arrivalIntervals_ = new BoundedStatsDeque(size); @@ -244,14 +258,17 @@ class ArrivalWindow double interArrivalTime; if ( tLast_ > 0L ) { - interArrivalTime = (value - tLast_); + interArrivalTime = (value - tLast_); } else { interArrivalTime = Gossiper.intervalInMillis / 2; } - tLast_ = value; - arrivalIntervals_.add(interArrivalTime); + if (interArrivalTime <= MAX_INTERVAL_IN_MS) + arrivalIntervals_.add(interArrivalTime); + else + logger_.debug("Ignoring interval time of {}", interArrivalTime); + tLast_ = value; } synchronized double sum() Modified: cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/Gossiper.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/Gossiper.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/Gossiper.java (original) +++ cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/Gossiper.java Mon Oct 3 20:36:35 2011 @@ -685,6 +685,13 @@ public class Gossiper implements IFailur if ( remoteGeneration > localGeneration ) { localEndpointState.updateTimestamp(); + // this node was dead and the generation changed, this indicates a reboot, or possibly a takeover + // we will clean the fd intervals for it and relearn them + if (!localEndpointState.isAlive()) + { + logger.debug("Clearing interval times for {} due to generation change", endpoint); + fd.clear(endpoint); + } fd.report(endpoint); return; } @@ -696,6 +703,7 @@ public class Gossiper implements IFailur if ( remoteVersion > localVersion ) { localEndpointState.updateTimestamp(); + // just a version change, report to the fd fd.report(endpoint); } } Modified: cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/IFailureDetector.java URL: http://svn.apache.org/viewvc/cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/IFailureDetector.java?rev=1178563&r1=1178562&r2=1178563&view=diff ============================================================================== --- cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/IFailureDetector.java (original) +++ cassandra/branches/cassandra-1.0.0/src/java/org/apache/cassandra/gms/IFailureDetector.java Mon Oct 3 20:36:35 2011 @@ -37,7 +37,13 @@ public interface IFailureDetector * @return true if UP and false if DOWN. */ public boolean isAlive(InetAddress ep); - + + /** + * Clear any existing interval timings for this endpoint + * @param ep + */ + public void clear(InetAddress ep); + /** * This method is invoked by any entity wanting to interrogate the status of an endpoint. * In our case it would be the Gossiper. The Failure Detector will then calculate Phi and