This is an automated email from the ASF dual-hosted git repository. samt pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/cassandra.git
commit 61aabdfb4426296e9924e2959baa14fe744fb362 Author: Sam Tunnicliffe <s...@apache.org> AuthorDate: Wed Feb 7 11:31:02 2024 +0000 Make RemoveNodeTest::testAbort more deterministic Patch by Sam Tunnicliffe; reviewed by Ekaterina Dimitrova for CASSANDRA-19342 --- .../apache/cassandra/service/StorageService.java | 2 +- .../cassandra/distributed/test/RemoveNodeTest.java | 59 +++++++++++++++++----- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 9c1163b51b..1ea091e771 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -3642,7 +3642,7 @@ public class StorageService extends NotificationBroadcasterSupport implements IE MultiStepOperation<?> seq = metadata.inProgressSequences.get(nodeId); if (seq != null && seq.kind() == MultiStepOperation.Kind.REMOVE) { - sb.append("Removing node ").append(nodeId).append(" (").append(metadata.directory.endpoint(nodeId)).append(')').append(": ").append(seq.status()); + sb.append("Removing node ").append(nodeId.toUUID()).append(" (").append(metadata.directory.endpoint(nodeId)).append(')').append(": ").append(seq.status()); found = true; } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java b/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java index baa7ae4c6d..2710fdc832 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java @@ -18,19 +18,28 @@ package org.apache.cassandra.distributed.test; +import java.util.concurrent.Callable; + import org.junit.Test; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.shared.NetworkTopology; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.net.Verb; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.tcm.sequences.SingleNodeSequences; +import org.apache.cassandra.tcm.transformations.PrepareLeave; +import static org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeCommit; +import static org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeEnacting; +import static org.apache.cassandra.distributed.shared.ClusterUtils.unpauseCommits; +import static org.apache.cassandra.distributed.shared.ClusterUtils.unpauseEnactment; +import static org.apache.cassandra.distributed.shared.ClusterUtils.waitForCMSToQuiesce; import static org.junit.Assert.assertTrue; public class RemoveNodeTest extends TestBaseImpl @@ -44,25 +53,49 @@ public class RemoveNodeTest extends TestBaseImpl .with(Feature.NETWORK, Feature.GOSSIP)) .start())) { - cluster.filters().inbound().verbs(Verb.INITIATE_DATA_MOVEMENTS_REQ.id).drop(); - String nodeId = cluster.get(3).callOnInstance(() -> ClusterMetadata.current().myNodeId().toUUID().toString()); - cluster.get(3).shutdown().get(); + IInvokableInstance cmsInstance = cluster.get(1); + IInvokableInstance leavingInstance = cluster.get(3); + IInvokableInstance otherInstance = cluster.get(2); + String nodeId = leavingInstance.callOnInstance(() -> ClusterMetadata.current().myNodeId().toUUID().toString()); + leavingInstance.shutdown().get(); + + // Have the CMS node pause before the mid-leave step is committed, then initiate the removal. Make a note + // of the _next_ epoch (i.e. when the mid-leave will be enacted), so we can pause before it is enacted + Callable<Epoch> pending = pauseBeforeCommit(cmsInstance, (e) -> e instanceof PrepareLeave.MidLeave); Thread t = new Thread(() -> cluster.get(1).nodetoolResult("removenode", nodeId, "--force")); t.start(); - cluster.get(1).logs().watchFor("Committed StartLeave"); + Epoch pauseBeforeEnacting = pending.call().nextEpoch(); + + // Check the status of the removal String stdout = cluster.get(1).nodetoolResult("removenode", "status").getStdout(); - assertTrue(String.format("\"%s\" does not contain MID_LEAVE", stdout), stdout.contains("MID_LEAVE")); + assertTrue(String.format("\"%s\" does not contain correct node id", stdout), stdout.contains("Removing node " + nodeId)); + assertTrue(String.format("\"%s\" does not contain MID_LEAVE", stdout), stdout.contains("step: MID_LEAVE")); - cluster.get(1).nodetoolResult("removenode", "abort", nodeId).asserts().success(); - cluster.get(2).logs().watchFor("Enacted CancelInProgressSequence"); + // Allow the CMS to proceed with committing the mid-leave step, but make the other node pause before + // enacting it so we can abort the removal before the final step (FINISH_LEAVE) is committed + Callable<?> beforeEnacted = pauseBeforeEnacting(otherInstance, pauseBeforeEnacting); + unpauseCommits(cmsInstance); + beforeEnacted.call(); + // Now abort the removal. This should succeed in committing a cancellation of the removal sequence before + // it can be completed, as non-CMS instance is still paused. + cmsInstance.nodetoolResult("removenode", "abort", nodeId).asserts().success(); - cluster.get(1).runOnInstance(() -> { - ClusterMetadata metadata = ClusterMetadata.current(); - assertTrue(metadata.inProgressSequences.isEmpty()); - assertTrue(metadata.directory.peerState(NodeId.fromString(nodeId)) == NodeState.JOINED); - }); + // Resume processing on the non-CMS instance. It will enact the MID_LEAVE step followed by the cancellation + // of the removal process. + unpauseEnactment(otherInstance); + otherInstance.logs().watchFor("Enacted CancelInProgressSequence"); + // Finally, validate that cluster metadata is in the expected state - i.e. the "leaving" node is still joined + waitForCMSToQuiesce(cluster, cmsInstance, leavingInstance.config().num()); + for (IInvokableInstance instance : new IInvokableInstance[] {cmsInstance, otherInstance}) + { + instance.runOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + assertTrue(metadata.inProgressSequences.isEmpty()); + assertTrue(metadata.directory.peerState(NodeId.fromString(nodeId)) == NodeState.JOINED); + }); + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org For additional commands, e-mail: commits-h...@cassandra.apache.org