(cassandra) 01/02: Make RemoveNodeTest::testAbort more deterministic

samt Mon, 12 Feb 2024 00:53:16 -0800

This is an automated email from the ASF dual-hosted git repository.

samt pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git


commit 61aabdfb4426296e9924e2959baa14fe744fb362
Author: Sam Tunnicliffe <s...@apache.org>
AuthorDate: Wed Feb 7 11:31:02 2024 +0000

    Make RemoveNodeTest::testAbort more deterministic
    
    Patch by Sam Tunnicliffe; reviewed by Ekaterina Dimitrova for 
CASSANDRA-19342
---
 .../apache/cassandra/service/StorageService.java   |  2 +-
 .../cassandra/distributed/test/RemoveNodeTest.java | 59 +++++++++++++++++-----
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/java/org/apache/cassandra/service/StorageService.java 
b/src/java/org/apache/cassandra/service/StorageService.java
index 9c1163b51b..1ea091e771 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -3642,7 +3642,7 @@ public class StorageService extends 
NotificationBroadcasterSupport implements IE
                 MultiStepOperation<?> seq = 
metadata.inProgressSequences.get(nodeId);
                 if (seq != null && seq.kind() == 
MultiStepOperation.Kind.REMOVE)
                 {
-                    sb.append("Removing node ").append(nodeId).append(" 
(").append(metadata.directory.endpoint(nodeId)).append(')').append(": 
").append(seq.status());
+                    sb.append("Removing node 
").append(nodeId.toUUID()).append(" 
(").append(metadata.directory.endpoint(nodeId)).append(')').append(": 
").append(seq.status());
                     found = true;
                 }
             }
diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java 
b/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java
index baa7ae4c6d..2710fdc832 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/RemoveNodeTest.java
@@ -18,19 +18,28 @@
 
 package org.apache.cassandra.distributed.test;
 
+import java.util.concurrent.Callable;
+
 import org.junit.Test;
 
 import org.apache.cassandra.distributed.Cluster;
 import org.apache.cassandra.distributed.api.Feature;
+import org.apache.cassandra.distributed.api.IInvokableInstance;
 import org.apache.cassandra.distributed.shared.NetworkTopology;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.locator.InetAddressAndPort;
-import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.Epoch;
 import org.apache.cassandra.tcm.membership.NodeId;
 import org.apache.cassandra.tcm.membership.NodeState;
 import org.apache.cassandra.tcm.sequences.SingleNodeSequences;
+import org.apache.cassandra.tcm.transformations.PrepareLeave;
 
+import static 
org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeCommit;
+import static 
org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeEnacting;
+import static 
org.apache.cassandra.distributed.shared.ClusterUtils.unpauseCommits;
+import static 
org.apache.cassandra.distributed.shared.ClusterUtils.unpauseEnactment;
+import static 
org.apache.cassandra.distributed.shared.ClusterUtils.waitForCMSToQuiesce;
 import static org.junit.Assert.assertTrue;
 
 public class RemoveNodeTest extends TestBaseImpl
@@ -44,25 +53,49 @@ public class RemoveNodeTest extends TestBaseImpl
                                                                    
.with(Feature.NETWORK, Feature.GOSSIP))
                                            .start()))
         {
-            
cluster.filters().inbound().verbs(Verb.INITIATE_DATA_MOVEMENTS_REQ.id).drop();
-            String nodeId = cluster.get(3).callOnInstance(() -> 
ClusterMetadata.current().myNodeId().toUUID().toString());
-            cluster.get(3).shutdown().get();
+            IInvokableInstance cmsInstance = cluster.get(1);
+            IInvokableInstance leavingInstance = cluster.get(3);
+            IInvokableInstance otherInstance = cluster.get(2);
+            String nodeId = leavingInstance.callOnInstance(() -> 
ClusterMetadata.current().myNodeId().toUUID().toString());
+            leavingInstance.shutdown().get();
+
+            // Have the CMS node pause before the mid-leave step is committed, 
then initiate the removal. Make a note
+            // of the _next_ epoch (i.e. when the mid-leave will be enacted), 
so we can pause before it is enacted
+            Callable<Epoch> pending = pauseBeforeCommit(cmsInstance, (e) -> e 
instanceof PrepareLeave.MidLeave);
             Thread t = new Thread(() -> 
cluster.get(1).nodetoolResult("removenode", nodeId, "--force"));
             t.start();
-            cluster.get(1).logs().watchFor("Committed StartLeave");
+            Epoch pauseBeforeEnacting = pending.call().nextEpoch();
+
+            // Check the status of the removal
             String stdout = cluster.get(1).nodetoolResult("removenode", 
"status").getStdout();
-            assertTrue(String.format("\"%s\" does not contain MID_LEAVE", 
stdout), stdout.contains("MID_LEAVE"));
+            assertTrue(String.format("\"%s\" does not contain correct node 
id", stdout), stdout.contains("Removing node " + nodeId));
+            assertTrue(String.format("\"%s\" does not contain MID_LEAVE", 
stdout), stdout.contains("step: MID_LEAVE"));
 
-            cluster.get(1).nodetoolResult("removenode", "abort", 
nodeId).asserts().success();
-            cluster.get(2).logs().watchFor("Enacted CancelInProgressSequence");
+            // Allow the CMS to proceed with committing the mid-leave step, 
but make the other node pause before
+            // enacting it so we can abort the removal before the final step 
(FINISH_LEAVE) is committed
+            Callable<?> beforeEnacted = pauseBeforeEnacting(otherInstance, 
pauseBeforeEnacting);
+            unpauseCommits(cmsInstance);
+            beforeEnacted.call();
 
+            // Now abort the removal. This should succeed in committing a 
cancellation of the removal sequence before
+            // it can be completed, as non-CMS instance is still paused.
+            cmsInstance.nodetoolResult("removenode", "abort", 
nodeId).asserts().success();
 
-            cluster.get(1).runOnInstance(() -> {
-                ClusterMetadata metadata = ClusterMetadata.current();
-                assertTrue(metadata.inProgressSequences.isEmpty());
-                
assertTrue(metadata.directory.peerState(NodeId.fromString(nodeId)) == 
NodeState.JOINED);
-            });
+            // Resume processing on the non-CMS instance. It will enact the 
MID_LEAVE step followed by the cancellation
+            // of the removal process.
+            unpauseEnactment(otherInstance);
+            otherInstance.logs().watchFor("Enacted CancelInProgressSequence");
 
+            // Finally, validate that cluster metadata is in the expected 
state - i.e. the "leaving" node is still joined
+            waitForCMSToQuiesce(cluster, cmsInstance, 
leavingInstance.config().num());
+            for (IInvokableInstance instance : new IInvokableInstance[] 
{cmsInstance, otherInstance})
+            {
+                instance.runOnInstance(() -> {
+                    ClusterMetadata metadata = ClusterMetadata.current();
+                    assertTrue(metadata.inProgressSequences.isEmpty());
+                    
assertTrue(metadata.directory.peerState(NodeId.fromString(nodeId)) == 
NodeState.JOINED);
+                });
+            }
         }
     }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org
For additional commands, e-mail: commits-h...@cassandra.apache.org

(cassandra) 01/02: Make RemoveNodeTest::testAbort more deterministic

Reply via email to