This is an automated email from the ASF dual-hosted git repository.
sk0x50 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push:
new 8ec5d50896a IGNITE-17279 Fixed incorrect detection of lost partition
on the coordinator node. Fixes #10126
8ec5d50896a is described below
commit 8ec5d50896a2b5f2d008d0bb8f67f7f3d7cdf584
Author: Slava Koptilin <[email protected]>
AuthorDate: Wed Jul 6 23:42:54 2022 +0300
IGNITE-17279 Fixed incorrect detection of lost partition on the coordinator
node. Fixes #10126
---
.../dht/topology/GridClientPartitionTopology.java | 9 +++
.../dht/topology/GridDhtPartitionTopologyImpl.java | 9 +++
.../CachePartitionLossWithPersistenceTest.java | 71 +++++++++++++++++++++-
.../CachePartitionLossWithRestartsTest.java | 6 ++
...CachePartitionLostAfterSupplierHasLeftTest.java | 5 ++
5 files changed, 99 insertions(+), 1 deletion(-)
diff --git
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridClientPartitionTopology.java
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridClientPartitionTopology.java
index 0314c46c4eb..08b240a8fce 100644
---
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridClientPartitionTopology.java
+++
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridClientPartitionTopology.java
@@ -1316,6 +1316,15 @@ public class GridClientPartitionTopology implements
GridDhtPartitionTopology {
for (Map.Entry<Integer, Set<UUID>> entry :
ownersByUpdCounters.entrySet())
part2node.put(entry.getKey(), entry.getValue());
+ if (lostParts != null) {
+ for (Integer lostPart : lostParts) {
+ for (GridDhtPartitionMap partMap : node2part.values()) {
+ if (partMap.containsKey(lostPart))
+ partMap.put(lostPart, LOST);
+ }
+ }
+ }
+
updateSeq.incrementAndGet();
}
finally {
diff --git
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridDhtPartitionTopologyImpl.java
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridDhtPartitionTopologyImpl.java
index 728fd4f3857..e85453d76f0 100644
---
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridDhtPartitionTopologyImpl.java
+++
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/GridDhtPartitionTopologyImpl.java
@@ -2447,6 +2447,15 @@ public class GridDhtPartitionTopologyImpl implements
GridDhtPartitionTopology {
}
}
+ if (lostParts != null) {
+ for (Integer lostPart : lostParts) {
+ for (GridDhtPartitionMap partMap : node2part.values())
{
+ if (partMap.containsKey(lostPart))
+ partMap.put(lostPart, LOST);
+ }
+ }
+ }
+
node2part = new GridDhtPartitionFullMap(node2part,
updateSeq.incrementAndGet());
}
finally {
diff --git
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithPersistenceTest.java
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithPersistenceTest.java
index 7df1db6b618..c154b744d6b 100644
---
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithPersistenceTest.java
+++
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithPersistenceTest.java
@@ -38,7 +38,11 @@ import org.apache.ignite.internal.IgniteEx;
import org.apache.ignite.internal.TestRecordingCommunicationSpi;
import org.apache.ignite.internal.processors.cache.IgniteInternalCache;
import
org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionDemandMessage;
+import
org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition;
+import
org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState;
+import
org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology;
import org.apache.ignite.internal.util.typedef.G;
+import org.apache.ignite.internal.util.typedef.internal.CU;
import org.apache.ignite.lang.IgniteBiPredicate;
import org.apache.ignite.plugin.extensions.communication.Message;
import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
@@ -46,6 +50,8 @@ import org.junit.Ignore;
import org.junit.Test;
import static org.apache.ignite.cache.PartitionLossPolicy.READ_WRITE_SAFE;
+import static
org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.EVICTED;
+import static
org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.LOST;
/**
*
@@ -221,6 +227,10 @@ public class CachePartitionLossWithPersistenceTest extends
GridCommonAbstractTes
});
}
+ // Check that all lost partitions have the same state on all cluster
nodes.
+ for (Integer lostPart : lostParts)
+ checkLostPartitionAcrossCluster(DEFAULT_CACHE_NAME, lostPart);
+
if (partResetMode == 0)
crd.resetLostPartitions(Collections.singleton(DEFAULT_CACHE_NAME));
@@ -228,8 +238,13 @@ public class CachePartitionLossWithPersistenceTest extends
GridCommonAbstractTes
final Collection<Integer> g2LostParts =
g2.cache(DEFAULT_CACHE_NAME).lostPartitions();
- if (partResetMode != 0)
+ if (partResetMode != 0) {
+ // Check that all lost partitions have the same state on all
cluster nodes.
+ for (Integer lostPart : lostParts)
+ checkLostPartitionAcrossCluster(DEFAULT_CACHE_NAME, lostPart);
+
assertEquals(lostParts, g2LostParts);
+ }
if (partResetMode == 1)
crd.resetLostPartitions(Collections.singleton(DEFAULT_CACHE_NAME));
@@ -261,4 +276,58 @@ public class CachePartitionLossWithPersistenceTest extends
GridCommonAbstractTes
assertEquals("Partition " + p, 0,
ignite.cache(DEFAULT_CACHE_NAME).get(p));
}
}
+
+ /**
+ * Asserts that {@code partId} is seen as lost by every affinity node that holds a local copy of it.
+ * Fails unless it is marked lost and its local and per-node topology states are all LOST or EVICTED.
+ *
+ * @param cacheName Cache name to check.
+ * @param partId Partition to check.
+ */
+ public static void checkLostPartitionAcrossCluster(String cacheName, int
partId) {
+ for (Ignite grid : G.allGrids()) {
+ IgniteEx g = (IgniteEx)grid;
+
+ boolean affNode = CU.affinityNode(
+ g.localNode(),
+
g.cache(cacheName).getConfiguration(CacheConfiguration.class).getNodeFilter());
+
+ // skip non affinity nodes
+ if (!affNode)
+ continue;
+
+ GridDhtPartitionTopology top =
g.context().cache().cacheGroup(CU.cacheId(cacheName)).topology();
+
+ GridDhtLocalPartition p = top.localPartition(partId);
+
+ if (p != null) {
+ GridDhtPartitionState actualState = p.state();
+
+ // check lost partitions
+ if (!top.lostPartitions().contains(partId)) {
+ fail("Unexpected partition state [partId=" + partId + ",
nodeId=" + g.localNode().id() +
+ ", actualPartState=" + actualState + ",
expectedPartState=" + LOST + ", markedAsLost=false]");
+ }
+
+ // check actual partition state
+ if (actualState != LOST && actualState != EVICTED) {
+ fail("Unexpected partition state [partId=" + partId + ",
nodeId=" + g.localNode().id() +
+ ", actualPartState=" + actualState + ",
expectedPartState=" + LOST + ", markedAsLost=true]");
+ }
+
+ // check node2part mapping
+ for (Ignite node : G.allGrids()) {
+ IgniteEx n = (IgniteEx)node;
+
+ GridDhtPartitionState s =
top.partitionState(n.localNode().id(), partId);
+
+ if (s != LOST && s != EVICTED) {
+ fail("Unexpected partition state [partId=" + partId +
", nodeId=" + g.localNode().id() +
+ ", node2partNodeId=" + n.localNode().id() +
+ ", node2partState=" + s + ", expectedPartState=" +
LOST + ", markedAsLost=true]");
+ }
+ }
+ }
+ }
+ }
}
diff --git
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithRestartsTest.java
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithRestartsTest.java
index da527e4284a..e8bfdcc0894 100644
---
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithRestartsTest.java
+++
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLossWithRestartsTest.java
@@ -44,6 +44,8 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
+import static
org.apache.ignite.internal.processors.cache.distributed.CachePartitionLossWithPersistenceTest.checkLostPartitionAcrossCluster;
+
/**
*
*/
@@ -205,6 +207,10 @@ public class CachePartitionLossWithRestartsTest extends
GridCommonAbstractTest {
GridDhtPartitionTopology top =
startGrid(1).cachex(DEFAULT_CACHE_NAME).context().topology();
assertEquals(lost1, top.lostPartitions());
+ // Check that all lost partitions have the same state on all cluster
nodes.
+ for (Integer lostPart : lost1)
+ checkLostPartitionAcrossCluster(DEFAULT_CACHE_NAME, lostPart);
+
// TODO https://issues.apache.org/jira/browse/IGNITE-13053
grid(1).resetLostPartitions(Collections.singleton(DEFAULT_CACHE_NAME));
diff --git
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLostAfterSupplierHasLeftTest.java
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLostAfterSupplierHasLeftTest.java
index 6cd5d2e9374..ad8005a092b 100644
---
a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLostAfterSupplierHasLeftTest.java
+++
b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/CachePartitionLostAfterSupplierHasLeftTest.java
@@ -51,6 +51,7 @@ import org.junit.Test;
import static
org.apache.ignite.IgniteSystemProperties.IGNITE_PREFER_WAL_REBALANCE;
import static org.apache.ignite.cache.CacheAtomicityMode.TRANSACTIONAL;
import static org.apache.ignite.cache.CacheMode.PARTITIONED;
+import static
org.apache.ignite.internal.processors.cache.distributed.CachePartitionLossWithPersistenceTest.checkLostPartitionAcrossCluster;
/**
* Test scenario: last supplier has left while a partition on demander is
cleared before sending first demand request.
@@ -288,6 +289,10 @@ public class CachePartitionLostAfterSupplierHasLeftTest
extends GridCommonAbstra
assertEquals(PARTS_CNT, lostParts2.size());
+ // Check that all lost partitions have the same state on all cluster
nodes.
+ for (Integer lostPart : lostParts2)
+ checkLostPartitionAcrossCluster(DEFAULT_CACHE_NAME, lostPart);
+
spi1.stopBlock();
g0.resetLostPartitions(Collections.singletonList(DEFAULT_CACHE_NAME));