[ https://issues.apache.org/jira/browse/CASSANDRA-19975?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Marcus Eriksson updated CASSANDRA-19975: ---------------------------------------- Bug Category: Parent values: Correctness(12982)Level 1 values: Test Failure(12990) (was: Parent values: Correctness(12982)Level 1 values: Unrecoverable Corruption / Loss(13161)) > TopologyMixupTestBase does not fix replication factor for Keyspaces after > reaching rf=3 > --------------------------------------------------------------------------------------- > > Key: CASSANDRA-19975 > URL: https://issues.apache.org/jira/browse/CASSANDRA-19975 > Project: Cassandra > Issue Type: Bug > Components: Transactional Cluster Metadata > Reporter: David Capwell > Assignee: David Capwell > Priority: Normal > Fix For: 5.x > > > This issue was found by the HarryTopologyMixupTest… in the cep-15-accord > branch we added stopping nodes as well as restarting nodes (now that accord > supports it) and this looks to break TCM if the down node is a CMS voting > member. > Here is the test that shows it > {code} > /* > * Licensed to the Apache Software Foundation (ASF) under one > * or more contributor license agreements. See the NOTICE file > * distributed with this work for additional information > * regarding copyright ownership. The ASF licenses this file > * to you under the Apache License, Version 2.0 (the > * "License"); you may not use this file except in compliance > * with the License. You may obtain a copy of the License at > * > * http://www.apache.org/licenses/LICENSE-2.0 > * > * Unless required by applicable law or agreed to in writing, software > * distributed under the License is distributed on an "AS IS" BASIS, > * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > * See the License for the specific language governing permissions and > * limitations under the License. 
> */ > package org.apache.cassandra.distributed.test.tcm; > import accord.utils.Invariants; > import accord.utils.async.TimeoutUtils; > import org.agrona.collections.Long2LongHashMap; > import org.apache.cassandra.distributed.Cluster; > import org.apache.cassandra.distributed.api.Feature; > import org.apache.cassandra.distributed.api.IInvokableInstance; > import org.apache.cassandra.distributed.impl.INodeProvisionStrategy; > import org.apache.cassandra.distributed.shared.ClusterUtils; > import org.apache.cassandra.distributed.test.TestBaseImpl; > import org.junit.Test; > import java.io.IOException; > import java.time.Duration; > import java.util.concurrent.ExecutionException; > import java.util.concurrent.TimeoutException; > import java.util.concurrent.atomic.AtomicInteger; > public class RepoTest extends TestBaseImpl > { > /** > * This is the history reported from HarryTopologyMixupTest > * > History: > 2: Add Node3; epoch=18, cms=[1, 2] > // hidden - reconfigure to rf=3 > 3: Waiting for CMS to Quiesce; epoch=18, cms=[1, 2] > 5: Harry Validate All; epoch=31, cms=[1, 2, 3] > 6: Harry Insert; epoch=31, cms=[1, 2, 3] > 8: Add Node4; epoch=31, cms=[1, 2, 3] > 9: Waiting for CMS to Quiesce; epoch=31, cms=[1, 2, 3] > 10: Harry Validate All; epoch=38, cms=[1, 2, 3] > 11: nodetool repair harry tbl_0 from node2; epoch=38, cms=[1, 2, 3] > 12: Stop Node3 for nodetool removenode; epoch=38, cms=[1, 2, 3] > 13: nodetool removenode node3 from node1; epoch=38, cms=[1, 2, 3] > 14: nodetool repair harry tbl_0 from node1; epoch=49, cms=[1, 2, 3] > 15: Waiting for CMS to Quiesce; epoch=49, cms=[1, 2, 3] > 16: Stop Node1 for Normal Stop; epoch=49, cms=[1, 2, 4] > 18: Add Node5; epoch=49, cms=[1, 2, 4] > */ > @Test > public void test() throws IOException, ExecutionException, > InterruptedException, TimeoutException > { > Long2LongHashMap nodeToToken = new Long2LongHashMap(-0); > nodeToToken.put(1, -1799911656L); > nodeToToken.put(2, -1005197310L); > nodeToToken.put(3, -834315596L); > 
nodeToToken.put(4, 335272232L); > nodeToToken.put(5, -1829188286L); > final AtomicInteger counter = new AtomicInteger(0); > try (Cluster cluster = Cluster.build(2) > .withTokenSupplier(i -> > nodeToToken.get(i)) > .withConfig(c -> > c.with(Feature.values())) > .withNodeProvisionStrategy((subnet, > portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap) > { > { > Invariants.checkArgument(subnet > == 0, "Unexpected subnet detected: %d", subnet); > } > private final String ipPrefix = > "127.0." + subnet + '.'; > @Override > public int seedNodeNum() > { > switch > (counter.getAndIncrement()) > { > case 0: > case 1: > return 1; > default: > return 2; > } > } > @Override > public String ipAddress(int nodeNum) > { > return ipPrefix + nodeNum; > } > }) > .start()) > { > fixDistributedSchemas(cluster); > IInvokableInstance node1 = cluster.get(1); > IInvokableInstance node2 = cluster.get(2); > node1.nodetoolResult("cms", "reconfigure", > "2").asserts().success(); > IInvokableInstance node3 = ClusterUtils.addInstance(cluster, > node1.config(), c -> c.set("auto_bootstrap", true)); > node3.startup(cluster); > node1.nodetoolResult("cms", "reconfigure", > Integer.toString(3)).asserts().success(); > ClusterUtils.waitForCMSToQuiesce(cluster, new int[]{1, 2, 3}); > IInvokableInstance node4 = ClusterUtils.addInstance(cluster, > node1.config(), c -> c.set("auto_bootstrap", true)); > node4.startup(cluster); > ClusterUtils.stopUnchecked(node3); > node1.nodetoolResult("removenode", "3").asserts().success(); > ClusterUtils.stopUnchecked(node1); > // expected CMS Voting Group: [1, 2, 4] > TimeoutUtils.runBlocking(Duration.ofMinutes(2), "node5 join", () > -> { > IInvokableInstance node5 = ClusterUtils.addInstance(cluster, > node1.config(), c -> c.set("auto_bootstrap", true)); > node5.startup(cluster); > }); > } > } > } > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To 
unsubscribe, e-mail: commits-unsubscribe@cassandra.apache.org For additional commands, e-mail: commits-help@cassandra.apache.org