Fail repair if insufficient responses received. Patch by Simon Zhou; reviewed by Paulo Motta for CASSANDRA-13397.
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/f5b36f12 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/f5b36f12 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/f5b36f12 Branch: refs/heads/trunk Commit: f5b36f12df65a780a52851207c285db7a8b4122f Parents: 175e4f8 Author: Simon Zhou <sz...@uber.com> Authored: Fri Mar 31 20:53:39 2017 -0700 Committer: Paulo Motta <pa...@apache.org> Committed: Thu Apr 20 10:26:16 2017 -0300 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../cassandra/service/ActiveRepairService.java | 22 +++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/f5b36f12/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 6a1e486..7a860fe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 3.0.14 + * Fail repair if insufficient responses received (CASSANDRA-13397) * Fix SSTableLoader fail when the loaded table contains dropped columns (CASSANDRA-13276) * Avoid name clashes in CassandraIndexTest (CASSANDRA-13427) * Handling partially written hint files (CASSANDRA-12728) http://git-wip-us.apache.org/repos/asf/cassandra/blob/f5b36f12/src/java/org/apache/cassandra/service/ActiveRepairService.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 11d4617..b4cea79 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -321,30 +321,36 @@ public class ActiveRepairService implements IEndpointStateChangeSubscriber, IFai } else { - status.set(false); 
- failedNodes.add(neighbour.getHostAddress()); - prepareLatch.countDown(); + // bailout early to avoid potentially waiting for a long time. + failRepair(parentRepairSession, "Endpoint not alive: " + neighbour); } } + try { - prepareLatch.await(1, TimeUnit.HOURS); + // Failed repair is expensive so we wait for longer time. + if (!prepareLatch.await(1, TimeUnit.HOURS)) { + failRepair(parentRepairSession, "Did not get replies from all endpoints."); + } } catch (InterruptedException e) { - removeParentRepairSession(parentRepairSession); - throw new RuntimeException("Did not get replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString(), e); + failRepair(parentRepairSession, "Interrupted while waiting for prepare repair response."); } if (!status.get()) { - removeParentRepairSession(parentRepairSession); - throw new RuntimeException("Did not get positive replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString()); + failRepair(parentRepairSession, "Got negative replies from endpoints " + failedNodes); } return parentRepairSession; } + private void failRepair(UUID parentRepairSession, String errorMsg) { + removeParentRepairSession(parentRepairSession); + throw new RuntimeException(errorMsg); + } + public void registerParentRepairSession(UUID parentRepairSession, InetAddress coordinator, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, boolean isIncremental, long timestamp, boolean isGlobal) { if (!registeredForEndpointChanges)