[ https://issues.apache.org/jira/browse/IGNITE-11620?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16800418#comment-16800418 ]
Roman Shtykh commented on IGNITE-11620: --------------------------------------- {noformat} [ERROR][ttl-cleanup-worker-#39][] Critical system error detected. Will be handled accordingly to configured handler [hnd=class o.a.i.failure.StopNodeOrHaltFailureHandler, failureCtx=FailureContext [type=SYSTEM_WORKER_TERMINATION, err=class o.a.i.i.processors.cache.distributed.dht.GridDhtInvalidPartitionException [part=814, msg=Adding entry to partition that is concurrently evicted [grp=OrderLog1, part=814, shouldBeMoving=, belongs=false, topVer=AffinityTopologyVersion [topVer=579, minorTopVer=0], curTopVer=AffinityTopologyVersion [topVer=579, minorTopVer=0]]]]] org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtInvalidPartitionException: Adding entry to partition that is concurrently evicted [grp=OrderLog1, part=814, shouldBeMoving=, belongs=false, topVer=AffinityTopologyVersion [topVer=579, minorTopVer=0], curTopVer=AffinityTopologyVersion [topVer=579, minorTopVer=0]] at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionTopologyImpl.localPartition0(GridDhtPartitionTopologyImpl.java:909) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionTopologyImpl.localPartition(GridDhtPartitionTopologyImpl.java:784) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.distributed.dht.GridCachePartitionedConcurrentMap.localPartition(GridCachePartitionedConcurrentMap.java:69) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.distributed.dht.GridCachePartitionedConcurrentMap.putEntryIfObsoleteOrAbsent(GridCachePartitionedConcurrentMap.java:88) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.GridCacheAdapter.entryEx(GridCacheAdapter.java:955) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtCacheAdapter.entryEx(GridDhtCacheAdapter.java:525) 
~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.GridCacheAdapter.entryEx(GridCacheAdapter.java:946) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl.expire(IgniteCacheOffheapManagerImpl.java:1049) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.GridCacheTtlManager.expire(GridCacheTtlManager.java:197) ~[ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.processors.cache.GridCacheSharedTtlCleanupManager$CleanupWorker.body(GridCacheSharedTtlCleanupManager.java:137) [ignite-core-2.6.0.jar:2.6.0] at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110) [ignite-core-2.6.0.jar:2.6.0] at java.lang.Thread.run(Thread.java:748) [?:1.8.0_181] [ERROR][ttl-cleanup-worker-#39][] JVM will be halted immediately due to the failure: [failureCtx=FailureContext [type=SYSTEM_WORKER_TERMINATION, err=class o.a.i.i.processors.cache.distributed.dht.GridDhtInvalidPartitionException [part=814, msg=Adding entry to partition that is concurrently evicted [grp=OrderLog1, part=814, shouldBeMoving=, belongs=false, topVer=AffinityTopologyVersion [topVer=579, minorTopVer=0], curTopVer=AffinityTopologyVersion [topVer=579, minorTopVer=0]]]]] {noformat} > GridDhtInvalidPartitionException stops the cluster > -------------------------------------------------- > > Key: IGNITE-11620 > URL: https://issues.apache.org/jira/browse/IGNITE-11620 > Project: Ignite > Issue Type: Bug > Affects Versions: 2.6, 2.7 > Reporter: Roman Shtykh > Priority: Critical > > When data is being loaded and is expiring while rebalancing is in progress, > the TTL cleanup worker throws *GridDhtInvalidPartitionException*, which > triggers *SYSTEM_WORKER_TERMINATION*. > This can cause cascading failures and take the whole cluster > down. 
> Simple test case: > {noformat} > import org.apache.ignite.IgniteCache; > import org.apache.ignite.configuration.CacheConfiguration; > import org.apache.ignite.configuration.IgniteConfiguration; > import org.apache.ignite.failure.StopNodeOrHaltFailureHandler; > import org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi; > import org.apache.ignite.spi.discovery.tcp.ipfinder.TcpDiscoveryIpFinder; > import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder; > import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; > import javax.cache.expiry.CreatedExpiryPolicy; > import javax.cache.expiry.Duration; > import java.util.concurrent.CountDownLatch; > import java.util.concurrent.TimeUnit; > import static org.apache.ignite.cache.CacheAtomicityMode.ATOMIC; > import static org.apache.ignite.cache.CacheMode.PARTITIONED; > /** > * > */ > public class ExpireWhileRebalanceTest extends GridCommonAbstractTest { > private static final int ENTRIES = 500000; > /** > * > */ > protected static final TcpDiscoveryIpFinder IP_FINDER = new > TcpDiscoveryVmIpFinder(true); > /** > * {@inheritDoc} > */ > @Override > protected IgniteConfiguration getConfiguration(String gridName) throws > Exception { > IgniteConfiguration cfg = super.getConfiguration(gridName); > ((TcpDiscoverySpi) cfg.getDiscoverySpi()).setIpFinder(IP_FINDER); > cfg.setFailureHandler(new StopNodeOrHaltFailureHandler()); > CacheConfiguration<Object, Object> ccfg = new > CacheConfiguration<>(DEFAULT_CACHE_NAME); > ccfg.setAtomicityMode(ATOMIC); > ccfg.setCacheMode(PARTITIONED); > ccfg.setExpiryPolicyFactory(CreatedExpiryPolicy.factoryOf(new > Duration(TimeUnit.SECONDS, 1))); > cfg.setCacheConfiguration(ccfg); > return cfg; > } > /** > * @throws Exception If failed. 
> */ > public void testExpireWhileRebalancing() throws Exception { > startGridsMultiThreaded(4); > IgniteCache<Object, Object> cache = > ignite(0).cache(DEFAULT_CACHE_NAME); > CountDownLatch latch = new CountDownLatch(1); > new Thread(() -> { > for (int i = 1; i <= ENTRIES; i++) { > cache.put(i, i); > if (i % (ENTRIES / 10) == 0) > System.out.println(">>> Entries put: " + i); > } > latch.countDown(); > }).start(); > // stopping 0 has no effect > stopGrid(3); > awaitPartitionMapExchange(); > startGrid(3); > latch.await(10, TimeUnit.SECONDS); > } > /** > * {@inheritDoc} > */ > @Override > protected void afterTest() throws Exception { > stopAllGrids(); > } > } > {noformat} -- This message was sent by Atlassian JIRA (v7.6.3#76005)