[ https://issues.apache.org/jira/browse/IGNITE-20299?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17759846#comment-17759846 ]
Raymond Wilson commented on IGNITE-20299: ----------------------------------------- Yes, we are using persistence. This is our persistence XML file: {noformat} <?xml version="1.0" encoding="UTF-8"?> <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:util="http://www.springframework.org/schema/util" xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd http://www.springframework.org/schema/util http://www.springframework.org/schema/util/spring-util.xsd"> <bean id="grid.custom.cfg" class="org.apache.ignite.configuration.IgniteConfiguration" scope="singleton"> </bean> </beans> {noformat} Our configuration is mostly in code. Here is the primary configuration for the server nodes: {noformat} public void ConfigureTRexGrid(IgniteConfiguration cfg) { cfg.IgniteInstanceName = TRexGrids.ImmutableGridName(); cfg.JvmOptions = CommonJavaJVMOptions(); var configStore = DIContext.Obtain<IConfigurationStore>(); // Note: Set the PSN JVM heap size minimum and maximum sizes to be the maximum defined JVM heap size for the node. // This is to ensure the JVM always has access to the heap promised to it so will never act to resize the heap // This provide better performance and removes chances of surprise if the OS cannot allocate a larger heap size block // for other reason. cfg.JvmMaxMemoryMb = configStore.GetValueInt(PSNODE_IGNITE_JVM_MAX_HEAP_SIZE_MB, DEFAULT_IGNITE_JVM_MAX_HEAP_SIZE_MB); cfg.JvmInitialMemoryMb = configStore.GetValueInt(PSNODE_IGNITE_JVM_MAX_HEAP_SIZE_MB, DEFAULT_IGNITE_JVM_MAX_HEAP_SIZE_MB); cfg.UserAttributes = new Dictionary<string, object> { { "Owner", TRexGrids.ImmutableGridName() } }; // Configure the Ignite persistence layer to store our data cfg.DataStorageConfiguration = new DataStorageConfiguration { WalMode = WalMode.Fsync, PageSize = IgniteDataRegionPageSize(), StoragePath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "Persistence"), WalPath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "WalStore"), WalArchivePath = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable", "WalArchive"), WalSegmentSize = 512 * 1024 * 1024, // Set the WalSegmentSize to 512Mb to better support high write loads (can be set to max 2Gb) MaxWalArchiveSize = (long)10 * 512 * 1024 * 1024, // Ensure there are 10 segments in the WAL archive at the defined segment size CheckpointThreads = configStore.GetValueInt(IGNITE_NUMBER_OF_CHECKPOINTING_THREADS, DEFAULT_IGNITE_NUMBER_OF_CHECKPOINTING_THREADS), CheckpointFrequency = TimeSpan.FromSeconds(configStore.GetValueInt(IGNITE_CHECKPOINTING_INTERVAL_SECONDS, DEFAULT_IGNITE_CHECKPOINTING_INTERVAL_SECONDS)), DefaultDataRegionConfiguration = new DataRegionConfiguration { Name = DataRegions.DEFAULT_IMMUTABLE_DATA_REGION_NAME, InitialSize = configStore.GetValueLong(IMMUTABLE_DATA_REGION_INITIAL_SIZE_MB, DEFAULT_IMMUTABLE_DATA_REGION_INITIAL_SIZE_MB) * 1024 * 1024, MaxSize = configStore.GetValueLong(IMMUTABLE_DATA_REGION_MAX_SIZE_MB, DEFAULT_IMMUTABLE_DATA_REGION_MAX_SIZE_MB) * 1024 * 1024, PersistenceEnabled = true } }; Log.LogInformation($"cfg.DataStorageConfiguration.StoragePath={cfg.DataStorageConfiguration.StoragePath}"); Log.LogInformation($"cfg.DataStorageConfiguration.WalArchivePath={cfg.DataStorageConfiguration.WalArchivePath}"); Log.LogInformation($"cfg.DataStorageConfiguration.WalPath={cfg.DataStorageConfiguration.WalPath}"); if (!bool.TryParse(Environment.GetEnvironmentVariable("IS_KUBERNETES"), out var isKubernetes)) { Log.LogWarning($"Failed to parse the value of the 'IS_KUBERNETES' environment variable as a bool. Value is {Environment.GetEnvironmentVariable("IS_KUBERNETES")}. Defaulting to true"); } cfg = isKubernetes ? SetKubernetesIgniteConfiguration(cfg) : SetLocalIgniteConfiguration(cfg); cfg.WorkDirectory = Path.Combine(TRexServerConfig.PersistentCacheStoreLocation, "Immutable"); cfg.Logger = new TRexIgniteLogger(configStore, Logger.CreateLogger("ImmutableCacheComputeServer")); // Set an Ignite metrics heartbeat cfg.MetricsLogFrequency = new TimeSpan(0, 0, 0, configStore.GetValueInt(IGNITE_HEARTBEAT_FREQUENCY_SECONDS, DEFAULT_IGNITE_HEARTBEAT_FREQUENCY_SECONDS)); cfg.PublicThreadPoolSize = configStore.GetValueInt(IGNITE_PUBLIC_THREAD_POOL_SIZE, DEFAULT_IGNITE_PUBLIC_THREAD_POOL_SIZE); cfg.SystemThreadPoolSize = configStore.GetValueInt(IGNITE_SYSTEM_THREAD_POOL_SIZE, DEFAULT_IGNITE_SYSTEM_THREAD_POOL_SIZE); cfg.StripedThreadPoolSize = configStore.GetValueInt(IGNITE_STRIPED_THREAD_POOL_SIZE, Math.Max(DEFAULT_IGNITE_MINIMUM_THREAD_POOL_SIZE, Environment.ProcessorCount)); cfg.PeerAssemblyLoadingMode = PeerAssemblyLoadingMode.CurrentAppDomain; cfg.BinaryConfiguration = new BinaryConfiguration { Serializer = new BinarizableSerializer() }; // Add the TRex progressive request custom thread pool cfg.ExecutorConfiguration = new List<ExecutorConfiguration> { new ExecutorConfiguration() {Name = BaseIgniteClass.TREX_PROGRESSIVE_QUERY_CUSTOM_THREAD_POOL_NAME, Size = configStore.GetValueInt(PROGRESSIVE_REQUEST_CUSTOM_POOL_SIZE, DEFAULT_PROGRESSIVE_REQUEST_CUSTOM_POOL_SIZE)}, new ExecutorConfiguration() {Name = BaseIgniteClass.TREX_CHANGE_NOTIFICATION_CUSTOM_THREAD_POOL_NAME, Size = configStore.GetValueInt(CHANGE_NOTIFICATION_REQUEST_CUSTOM_POOL_SIZE, DEFAULT_CHANGE_NOTIFICATION_REQUEST_CUSTOM_POOL_SIZE)}, new ExecutorConfiguration() {Name = BaseIgniteClass.TREX_NODE_AVAILABILITY_THREAD_POOL_NAME, Size = configStore.GetValueInt(NODE_AVAILABILITY_REQUEST_CUSTOM_POOL_SIZE, DEFAULT_NODE_AVAILABILITY_REQUEST_CUSTOM_POOL_SIZE)} }; cfg.FailureDetectionTimeout = TimeSpan.FromMilliseconds(configStore.GetValueInt(IGNITE_FAILURE_DETECTION_TIMEOUT_MS, DEFAULT_IGNITE_FAILURE_DETECTION_TIMEOUT_MS)); cfg.ClientFailureDetectionTimeout = TimeSpan.FromMilliseconds(configStore.GetValueInt(IGNITE_CLIENT_FAILURE_DETECTION_TIMEOUT_MS, DEFAULT_IGNITE_CLIENT_FAILURE_DETECTION_TIMEOUT_MS)); } {noformat} Much of this is parameterised. Let me know if you need any of the values. My repetition of the issues is within our system. I don't have a trivial repeater as such, but I can reproduce it at will. > Creating a cache with an unknown data region name causes total unrecoverable > failure of the grid > ------------------------------------------------------------------------------------------------ > > Key: IGNITE-20299 > URL: https://issues.apache.org/jira/browse/IGNITE-20299 > Project: Ignite > Issue Type: Bug > Components: cache > Affects Versions: 2.15 > Environment: Observed in: > C# client and grid running on Linux in a container > C# client and grid running on Windows > > Reporter: Raymond Wilson > Priority: Major > > Using the Ignite C# client. > > Given a running grid, having a client (and perhaps server) node in the grid > attempt to create a cache using a DataRegionName that does not exist in the > grid causes immediate failure in the client node with the following log > output. > > 2023-08-27 17:08:48,520 [44] INF [ImmutableClientServer] Completed > partition exchange [localNode=15122bd7-bf81-44e6-a548-e70dbd9334c0, > exchange=GridDhtPartitionsExchangeFuture [topVer=AffinityTopologyVersion > [topVer=15, minorTopVer=0], evt=NODE_FAILED, evtNode=TcpDiscoveryNode > [id=9d5ed68d-38bb-447d-aed5-189f52660716, > consistentId=9d5ed68d-38bb-447d-aed5-189f52660716, addrs=ArrayList > [127.0.0.1], sockAddrs=null, discPort=0, order=8, intOrder=8, > lastExchangeTime=1693112858024, loc=false, ver=2.15.0#20230425-sha1:f98f7f35, > isClient=true], rebalanced=false, done=true, newCrdFut=null], > topVer=AffinityTopologyVersion [topVer=15, minorTopVer=0]] > 2023-08-27 17:08:48,520 [44] INF [ImmutableClientServer] Exchange timings > [startVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], > resVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], stage="Waiting in > exchange queue" (14850 ms), stage="Exchange parameters initialization" (2 > ms), stage="Determine exchange type" (3 ms), stage="Exchange done" (4 ms), > stage="Total time" (14859 ms)] > 2023-08-27 17:08:48,522 [44] INF [ImmutableClientServer] Exchange longest > local stages [startVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], > resVer=AffinityTopologyVersion [topVer=15, minorTopVer=0]] > 2023-08-27 17:08:48,524 [44] INF [ImmutableClientServer] Finished exchange > init [topVer=AffinityTopologyVersion [topVer=15, minorTopVer=0], crd=false] > 2023-08-27 17:08:48,525 [44] INF [ImmutableClientServer] > AffinityTopologyVersion [topVer=15, minorTopVer=0], evt=NODE_FAILED, > evtNode=9d5ed68d-38bb-447d-aed5-189f52660716, client=true] > Unhandled exception: Apache.Ignite.Core.Cache.CacheException: class > org.apache.ignite.IgniteCheckedException: Failed to complete exchange process. > ---> Apache.Ignite.Core.Common.IgniteException: Failed to complete exchange > process. > ---> Apache.Ignite.Core.Common.JavaException: javax.cache.CacheException: > class org.apache.ignite.IgniteCheckedException: Failed to complete exchange > process. > at > org.apache.ignite.internal.processors.cache.GridCacheUtils.convertToCacheException(GridCacheUtils.java:1272) > at > org.apache.ignite.internal.IgniteKernal.getOrCreateCache0(IgniteKernal.java:2278) > at > org.apache.ignite.internal.IgniteKernal.getOrCreateCache(IgniteKernal.java:2242) > at > org.apache.ignite.internal.processors.platform.PlatformProcessorImpl.processInStreamOutObject(PlatformProcessorImpl.java:643) > at > org.apache.ignite.internal.processors.platform.PlatformTargetProxyImpl.inStreamOutObject(PlatformTargetProxyImpl.java:79) > Caused by: class org.apache.ignite.IgniteCheckedException: Failed to complete > exchange process. > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.createExchangeException(GridDhtPartitionsExchangeFuture.java:3709) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.sendExchangeFailureMessage(GridDhtPartitionsExchangeFuture.java:3737) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.finishExchangeOnCoordinator(GridDhtPartitionsExchangeFuture.java:3832) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onAllReceived(GridDhtPartitionsExchangeFuture.java:3813) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.distributedExchange(GridDhtPartitionsExchangeFuture.java:1796) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:1053) > at > org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body0(GridCachePartitionExchangeManager.java:3348) > at > org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:3182) > at > org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:125) > at java.base/java.lang.Thread.run(Thread.java:829) > Suppressed: class org.apache.ignite.IgniteCheckedException: Failed to > initialize exchange locally [locNodeId=e9325b04-00fa-452e-9796-989b47b860ea] > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onCacheChangeRequest(GridDhtPartitionsExchangeFuture.java:1483) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:979) > ... 4 more > Caused by: class org.apache.ignite.IgniteCheckedException: Requested > DataRegion is not configured: Default-Mutable > at > org.apache.ignite.internal.processors.cache.persistence.IgniteCacheDatabaseSharedManager.dataRegion(IgniteCacheDatabaseSharedManager.java:896) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.startCacheGroup(GridCacheProcessor.java:2463) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.getOrCreateCacheGroupContext(GridCacheProcessor.java:2181) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareCacheContext(GridCacheProcessor.java:1991) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareCacheStart(GridCacheProcessor.java:1926) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.lambda$prepareStartCaches$55a0e703$1(GridCacheProcessor.java:1801) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.lambda$prepareStartCachesIfPossible$16(GridCacheProcessor.java:1771) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareStartCaches(GridCacheProcessor.java:1798) > at > org.apache.ignite.internal.processors.cache.GridCacheProcessor.prepareStartCachesIfPossible(GridCacheProcessor.java:1769) > at > org.apache.ignite.internal.processors.cache.CacheAffinitySharedManager.processCacheStartRequests(CacheAffinitySharedManager.java:1000) > at > org.apache.ignite.internal.processors.cache.CacheAffinitySharedManager.onCacheChangeRequest(CacheAffinitySharedManager.java:886) > at > org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onCacheChangeRequest(GridDhtPartitionsExchangeFuture.java:1472) > ... 5 more > at Apache.Ignite.Core.Impl.Unmanaged.Jni.Env.ExceptionCheck() > at Apache.Ignite.Core.Impl.Unmanaged.Jni.Env.CallObjectMethod(GlobalRef > obj, IntPtr methodId, Int64* argsPtr) > at > Apache.Ignite.Core.Impl.Unmanaged.UnmanagedUtils.TargetInStreamOutObject(GlobalRef > target, Int32 opType, Int64 inMemPtr) > at Apache.Ignite.Core.Impl.PlatformJniTarget.InStreamOutObject(Int32 type, > Action`1 writeAction) > --- End of inner exception stack trace — > --- End of inner exception stack trace — > at Apache.Ignite.Core.Impl.PlatformJniTarget.InStreamOutObject(Int32 type, > Action`1 writeAction) > at Apache.Ignite.Core.Impl.PlatformTargetAdapter.DoOutOpObject(Int32 type, > Action`1 action) > at > Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration > configuration, NearCacheConfiguration nearConfiguration, > PlatformCacheConfiguration platformCacheConfiguration, Op op) > at > Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration > configuration, NearCacheConfiguration nearConfiguration, > PlatformCacheConfiguration platformCacheConfiguration) > at > Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration > configuration, NearCacheConfiguration nearConfiguration) > at > Apache.Ignite.Core.Impl.Ignite.GetOrCreateCache[TK,TV](CacheConfiguration > configuration) > > > This failure causes issues in the server nodes in the grid which now fail to > restart with errors such as below (for the incorrectly create cache) but > which are repeated for every defined cache in the grid: > > 2023-08-27 17:11:36,882 [42] INF [ImmutableCacheComputeServer] Can not > finish proxy initialization because proxy does not exist, > cacheName=SiteModelMetadata, localNodeId=3d4a75e8-174d-4947-877e-e45784d8d08d > 2 > > At this point the grid is now unusable. > > In summary: Attempted creation of a cache with an unknown DataRegionName > causes immediate and unrecoverable failure in the entire grid. > > On attempted restarted Ignite notes all caches (including system caches) as > being "Started cache in recovery mode" in the log and then scans the WAL. The > incorrectly created cache is not mentioned at this point. > At the point the cluster is activated, (ie: this appears in the log "Started > state transition: activate cluster"), it states the incorrectly created cache > ("SiteModelMetadata") can not be started, log entry is: "Cache can not be > started : cache=SiteModelMetadata" > This is followed by multiple messages like this: "Finished recovery for cache > [cache=ignite-sys-cache, grp=ignite-sys-cache, > startVer=AffinityTopologyVersion [topVer=12, minorTopVer=1]] " > > This is followed by errors relating to proxy initialisation, eg: "Can not > finish proxy initialization because proxy does not exist, > cacheName=ignite-sys-cache, localNodeId=4d44108f-cd96-4953-94db-6365f998a91b" > > All caches are then stopped, eg: "Stopped cache > [cacheName=ignite-sys-cache]", and the grid enters a relatively dormant > inactivated state where it only emits Ignite heartbeat messages. > > This bug appears to be trivially easy to reproduce by creating a cache with > an unknown data region. > > Attempting to destroy the bad cache in the grid with the Control.sh tool > results in this output: > > {noformat} > # ./control.sh --cache destroy --caches SiteModelMetadata > WARNING: An illegal reflective access operation has occurred > WARNING: Illegal reflective access by > org.apache.ignite.internal.util.GridUnsafe$2 > (file:/trex/libs/ignite-core-2.15.0.jar) to field java.nio.Buffer.address > WARNING: Please consider reporting this to the maintainers of > org.apache.ignite.internal.util.GridUnsafe$2 > WARNING: Use --illegal-access=warn to enable warnings of further illegal > reflective access operations > WARNING: All illegal access operations will be denied in a future > releaseWarning! > The command will destroy 1 caches: SiteModelMetadata. > If you continue, the cache data will be impossible to recover. > Press 'y' to continue . . . y > [01:53:38,925][SEVERE][session=24175683][CommandHandlerLog] Connection to > cluster failed. Latest topology update failed.Control utility [ver. > 2.15.0#20230425-sha1:f98f7f35] > 2023 Copyright(C) Apache Software FoundationUser: rootTime: > 2023-08-29T01:53:26.048 > Command [CACHE] startedArguments: --cache destroy --caches > SiteModelMetadata-------------------------------------------------------------------------------- > Connection to cluster failed. > Latest topology update failed.Command [CACHE] finished with code: > Control utility has completed execution at: 2023-08-29T01:53:38.926 > {noformat} > > The Control.sh comment to list caches also fails. > > -- This message was sent by Atlassian Jira (v8.20.10#820010)