[ 
https://issues.apache.org/jira/browse/CASSANDRA-19975?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17886261#comment-17886261
 ] 

David Capwell commented on CASSANDRA-19975:
-------------------------------------------

I commented out "ClusterUtils.stopUnchecked(node1);" and the test passes... so 
it does look to be due to the CMS having 1 down node

> TCM unable to allow node to join when there is 1 down voting member
> -------------------------------------------------------------------
>
>                 Key: CASSANDRA-19975
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-19975
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Transactional Cluster Metadata
>            Reporter: David Capwell
>            Priority: Normal
>             Fix For: 5.x
>
>
> This issue was found by the HarryTopologyMixupTest. In the cep-15-accord 
> branch we added stopping nodes as well as restarting nodes (now that Accord 
> supports it), and this looks to break TCM if the down node is a CMS voting 
> member.
> Here is the test that reproduces it:
> {code}
> /*
>  * Licensed to the Apache Software Foundation (ASF) under one
>  * or more contributor license agreements.  See the NOTICE file
>  * distributed with this work for additional information
>  * regarding copyright ownership.  The ASF licenses this file
>  * to you under the Apache License, Version 2.0 (the
>  * "License"); you may not use this file except in compliance
>  * with the License.  You may obtain a copy of the License at
>  *
>  *     http://www.apache.org/licenses/LICENSE-2.0
>  *
>  * Unless required by applicable law or agreed to in writing, software
>  * distributed under the License is distributed on an "AS IS" BASIS,
>  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>  * See the License for the specific language governing permissions and
>  * limitations under the License.
>  */
> package org.apache.cassandra.distributed.test.tcm;
> import accord.utils.Invariants;
> import accord.utils.async.TimeoutUtils;
> import org.agrona.collections.Long2LongHashMap;
> import org.apache.cassandra.distributed.Cluster;
> import org.apache.cassandra.distributed.api.Feature;
> import org.apache.cassandra.distributed.api.IInvokableInstance;
> import org.apache.cassandra.distributed.impl.INodeProvisionStrategy;
> import org.apache.cassandra.distributed.shared.ClusterUtils;
> import org.apache.cassandra.distributed.test.TestBaseImpl;
> import org.junit.Test;
> import java.io.IOException;
> import java.time.Duration;
> import java.util.concurrent.ExecutionException;
> import java.util.concurrent.TimeoutException;
> import java.util.concurrent.atomic.AtomicInteger;
> public class RepoTest extends TestBaseImpl
> {
>     /**
>      * This is the history reported from HarryTopologyMixupTest
>      * 
>         History:
>       2: Add Node3; epoch=18, cms=[1, 2]
>       // hidden - reconfigure to rf=3
>       3: Waiting for CMS to Quiesce; epoch=18, cms=[1, 2]
>       5: Harry Validate All; epoch=31, cms=[1, 2, 3]
>       6: Harry Insert; epoch=31, cms=[1, 2, 3]
>       8: Add Node4; epoch=31, cms=[1, 2, 3]
>       9: Waiting for CMS to Quiesce; epoch=31, cms=[1, 2, 3]
>       10: Harry Validate All; epoch=38, cms=[1, 2, 3]
>       11: nodetool repair harry tbl_0 from node2; epoch=38, cms=[1, 2, 3]
>       12: Stop Node3 for nodetool removenode; epoch=38, cms=[1, 2, 3]
>       13: nodetool removenode node3 from node1; epoch=38, cms=[1, 2, 3]
>       14: nodetool repair harry tbl_0 from node1; epoch=49, cms=[1, 2, 3]
>       15: Waiting for CMS to Quiesce; epoch=49, cms=[1, 2, 3]
>       16: Stop Node1 for Normal Stop; epoch=49, cms=[1, 2, 4]
>       18: Add Node5; epoch=49, cms=[1, 2, 4]
>          */
>     @Test
>     public void test() throws IOException, ExecutionException, 
> InterruptedException, TimeoutException
>     {
>         Long2LongHashMap nodeToToken = new Long2LongHashMap(-0);
>         nodeToToken.put(1, -1799911656L);
>         nodeToToken.put(2, -1005197310L);
>         nodeToToken.put(3, -834315596L);
>         nodeToToken.put(4, 335272232L);
>         nodeToToken.put(5, -1829188286L);
>         final AtomicInteger counter = new AtomicInteger(0);
>         try (Cluster cluster = Cluster.build(2)
>                                       .withTokenSupplier(i -> 
> nodeToToken.get(i))
>                                       .withConfig(c -> 
> c.with(Feature.values()))
>                                       .withNodeProvisionStrategy((subnet, 
> portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap)
>                                       {
>                                           {
>                                               Invariants.checkArgument(subnet 
> == 0, "Unexpected subnet detected: %d", subnet);
>                                           }
>                                           private final String ipPrefix = 
> "127.0." + subnet + '.';
>                                           @Override
>                                           public int seedNodeNum()
>                                           {
>                                               switch 
> (counter.getAndIncrement())
>                                               {
>                                                   case 0:
>                                                   case 1:
>                                                       return 1;
>                                                   default:
>                                                       return 2;
>                                               }
>                                           }
>                                           @Override
>                                           public String ipAddress(int nodeNum)
>                                           {
>                                               return ipPrefix + nodeNum;
>                                           }
>                                       })
>                                       .start())
>         {
>             fixDistributedSchemas(cluster);
>             IInvokableInstance node1 = cluster.get(1);
>             IInvokableInstance node2 = cluster.get(2);
>             node1.nodetoolResult("cms", "reconfigure", 
> "2").asserts().success();
>             IInvokableInstance node3 = ClusterUtils.addInstance(cluster, 
> node1.config(), c -> c.set("auto_bootstrap", true));
>             node3.startup(cluster);
>             node1.nodetoolResult("cms", "reconfigure", 
> Integer.toString(3)).asserts().success();
>             ClusterUtils.waitForCMSToQuiesce(cluster, new int[]{1, 2, 3});
>             IInvokableInstance node4 = ClusterUtils.addInstance(cluster, 
> node1.config(), c -> c.set("auto_bootstrap", true));
>             node4.startup(cluster);
>             ClusterUtils.stopUnchecked(node3);
>             node1.nodetoolResult("removenode", "3").asserts().success();
>             ClusterUtils.stopUnchecked(node1);
>             // expected CMS Voting Group: [1, 2, 4]
>             TimeoutUtils.runBlocking(Duration.ofMinutes(2), "node5 join", () 
> -> {
>                 IInvokableInstance node5 = ClusterUtils.addInstance(cluster, 
> node1.config(), c -> c.set("auto_bootstrap", true));
>                 node5.startup(cluster);
>             });
>         }
>     }
> }
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org
For additional commands, e-mail: commits-h...@cassandra.apache.org

Reply via email to