chesnokoff commented on code in PR #13184:
URL: https://github.com/apache/ignite/pull/13184#discussion_r3450722241


##########
modules/compatibility/src/test/java/org/apache/ignite/compatibility/ru/IgniteRebalanceOnUpgradeTest.java:
##########
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.compatibility.ru;
+
+import java.io.File;
+//import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.ignite.IgniteCache;
+import org.apache.ignite.Ignition;
+import org.apache.ignite.cache.CacheAtomicityMode;
+import org.apache.ignite.client.ClientCache;
+import org.apache.ignite.client.ClientCacheConfiguration;
+import org.apache.ignite.client.IgniteClient;
+import 
org.apache.ignite.compatibility.testframework.testcontainers.IgniteClusterContainer;
+import 
org.apache.ignite.compatibility.testframework.testcontainers.IgniteContainer;
+import org.apache.ignite.configuration.ClientConfiguration;
+import org.apache.ignite.configuration.DataRegionConfiguration;
+import org.apache.ignite.configuration.DataStorageConfiguration;
+import org.apache.ignite.configuration.IgniteConfiguration;
+import org.apache.ignite.internal.IgniteEx;
+import org.apache.ignite.internal.util.typedef.internal.U;
+import org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi;
+import org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi;
+import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder;
+import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static 
org.apache.ignite.compatibility.testframework.testcontainers.IgniteContainer.LOCAL_WORK_DIR_PATH;
+import static org.apache.ignite.testframework.GridTestUtils.DFLT_TEST_TIMEOUT;
+import static org.apache.ignite.testframework.GridTestUtils.waitForCondition;
+
+/** Smoke test for rolling upgrade with persistence. */
+public class IgniteRebalanceOnUpgradeTest extends GridCommonAbstractTest {
+    /** Node IDs. */
+    private static final List<String> NODE_IDS = List.of(
+        "ad26bff6-5ff5-49f1-9a61-425a827953ed",
+        "c1099d16-e7d7-49f4-925c-53329286c444",
+        "7b880b69-8a9e-4b84-b555-250d365e2e67"
+    );
+
+    /** Source commit hash. */
+    private static final String SOURCE_COMMIT_HASH = "f239499b"; //"6b172a8b";
+
+    /** Cache name. */
+    private static final String CACHE_NAME = "ru-test-cache";
+
+    /** Local work directory. */
+    private static final File LOCAL_WORK_DIR = new File(LOCAL_WORK_DIR_PATH);
+
+    /** Thin client. */
+    private IgniteClient client;
+
+    /** */
+    private final List<IgniteEx> nodes = new ArrayList<>();
+
+    /** */
+    private final Map<String, String> addrs = new HashMap<>();
+
+    /** */
+    @BeforeClass
+    public static void beforeClass() {
+        U.delete(LOCAL_WORK_DIR);
+
+        System.setProperty("java.net.preferIPv4Stack", "true");
+        System.setProperty("java.net.preferIPv6Addresses", "false");
+    }
+
+    /** */
+    @AfterClass
+    public static void afterClass() {
+        U.delete(LOCAL_WORK_DIR);
+    }
+
+    /** {@inheritDoc} */
+    @Override protected boolean isMultiJvm() {
+        return false;
+    }
+
+    /** {@inheritDoc} */
+    @Override protected long getTestTimeout() {
+        return super.getTestTimeout() * 3;
+    }
+
+    /** Basic RU test. */
+    @Test
+    public void testRollingUpgrade() throws Exception {
+        try (IgniteClusterContainer cluster = new 
IgniteClusterContainer(SOURCE_COMMIT_HASH, NODE_IDS)) {
+            cluster.start();
+
+            for (IgniteContainer container : cluster.containers())
+                addrs.put(container.nodeId(), container.discoveryAddress());
+
+            System.out.println(">>> Addresses=" + addrs);
+
+            ClientCacheConfiguration cfg = new ClientCacheConfiguration()
+                .setName(CACHE_NAME)
+                .setBackups(1)
+                .setAtomicityMode(CacheAtomicityMode.TRANSACTIONAL);
+
+            ClientCache<Integer, Integer> cache = 
client(cluster.containers().get(0).clientAddress()).createCache(cfg);
+
+            for (int i = 0; i < 1000; i++)
+                cache.put(i, i);
+
+            closeClient();
+            
+            upgradeCluster(cluster);
+
+            IgniteCache<Integer, Integer> targetCache = 
nodes.get(0).cache(CACHE_NAME);
+
+            for (int i = 0; i < 1000; i++)
+                assertEquals("Data mismatch after upgrade at key: " + i, i, 
(int)targetCache.get(i));
+
+            targetCache.put(1001, 1001);
+
+            assertEquals(1001, (int)targetCache.get(1001));
+        }
+        finally {
+            closeClient();
+        }
+    }
+
+    /** */
+    private void upgradeCluster(IgniteClusterContainer srcCluster) throws 
Exception {
+        for (IgniteContainer container : srcCluster.containers()) {
+            System.out.println(">>> Upgrade " + container.nodeId());
+
+            container.stop();
+
+            addrs.remove(container.nodeId());
+
+            System.out.println(">>> CONNECT TO=" + addrs.values());
+
+            IgniteEx ignite = null;
+
+            try {
+                Thread.sleep(20_000);
+
+                ignite = startGrid(configuration(container.nodeId(), 
container.localWorkDirectory(), addrs.values()));
+            }

Review Comment:
   Looks like there is a problem based on the logs from my local run.
   
   **Prerequisites**:
   We have three old nodes, each running in its own Docker container. During a 
rolling upgrade, one old container is stopped and replaced by a new local node 
outside Docker.
   
   For simplicity:
   - node A: old container node, coordinator
   - node B: another old container node
   - node C: our new local node, started outside Docker
   
   **Problem**:
   Node C cannot join the cluster and fails with `Failed to connect to any 
address from IP finder within join timeout`
   
   **Reason**:
   Node C and the Docker container nodes have different views of node B's address.
   
   Node C connects to coordinator node A. Node A accepts the join request and 
prints `Added new node to topology`
   
   After that, coordinator A sends `TcpDiscoveryNodeAddedMessage` around the 
discovery ring. In our case, node C receives this message and has to pass it to 
node B.
   
   The problem is that node B declares its Docker-internal address, for example 
`172.19.0.3:47500`
   
   This address is valid inside the Docker network, so node A can use it. But 
node C is running on macOS, outside Docker. From node C's point of view, node B 
is reachable only through Docker port mapping, for example 
`192.168.0.101:<mapped-discovery-port>`
   
   As a result, node C tries to connect to node B using `172.19.0.3:47500` and 
cannot reach it. The discovery message cannot complete its round through the 
ring, so the join process is not completed. Finally, node C hits the join 
timeout and node A removes it from the topology
   
   I see two solutions here:
   1) The cleaner one. Start the new node in a container as well, so that all 
nodes share a common Docker network, and connect to the cluster with a thin 
client to manage the nodes. The only problem is that we may want to perform more 
sophisticated operations on server nodes, and we won't be able to call the Java API.
   2) Somehow set an `AddressResolver`, but for now I'm not sure it would be a 
clean solution.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to