[ 
https://issues.apache.org/jira/browse/IGNITE-25673?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Kirill Tkalenko updated IGNITE-25673:
-------------------------------------
    Fix Version/s: 3.1

> Add debug information for SnapshotExecutorImpl#runningJobs to help detect 
> node stop hangs
> -----------------------------------------------------------------------------------------
>
>                 Key: IGNITE-25673
>                 URL: https://issues.apache.org/jira/browse/IGNITE-25673
>             Project: Ignite
>          Issue Type: Improvement
>            Reporter: Kirill Tkalenko
>            Assignee: Kirill Tkalenko
>            Priority: Major
>              Labels: ignite-3
>             Fix For: 3.1
>
>          Time Spent: 20m
>  Remaining Estimate: 0h
>
> While using the cluster, we found that a node stop can hang while stopping 
> the Raft node because some snapshot operation could not complete, most 
> likely due to an exception. To help identify the cause, it is proposed to 
> add 
> *org.apache.ignite.raft.jraft.storage.snapshot.SnapshotExecutorImpl#runningJobs*
>  to the output of operations that did not have time to complete, for 
> whatever reason, during the node stop.
> The following thread dump shows *SnapshotExecutorImpl* waiting on a latch 
> for a Raft snapshot to finish:
> {noformat}
> Thread [name="SIGTERM handler", id=7184, state=WAITING, blockCnt=3, 
> waitCnt=26]
>     Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@6f346369,
>  ownerName=null, ownerId=-1]
>         at [email protected]/jdk.internal.misc.Unsafe.park(Native Method)
>         at 
> [email protected]/java.util.concurrent.locks.LockSupport.park(LockSupport.java:341)
>         at 
> [email protected]/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionNode.block(AbstractQueuedSynchronizer.java:506)
>         at 
> [email protected]/java.util.concurrent.ForkJoinPool.unmanagedBlock(ForkJoinPool.java:3465)
>         at 
> [email protected]/java.util.concurrent.ForkJoinPool.managedBlock(ForkJoinPool.java:3436)
>         at 
> [email protected]/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1630)
>         at 
> app//org.apache.ignite.raft.jraft.util.CountDownEvent.await(CountDownEvent.java:59)
>         at 
> app//org.apache.ignite.raft.jraft.storage.snapshot.SnapshotExecutorImpl.join(SnapshotExecutorImpl.java:704)
>         at 
> app//org.apache.ignite.raft.jraft.core.NodeImpl.join(NodeImpl.java:3259)
>         - locked org.apache.ignite.raft.jraft.core.NodeImpl@17da08e8
>         at 
> app//org.apache.ignite.raft.jraft.RaftGroupService.shutdown(RaftGroupService.java:127)
>         - locked org.apache.ignite.raft.jraft.RaftGroupService@762ce012
>         at 
> app//org.apache.ignite.internal.raft.server.impl.JraftServerImpl.stopRaftNodes(JraftServerImpl.java:591)
>         at 
> app//org.apache.ignite.internal.raft.Loza.stopRaftNodes(Loza.java:505)
>         at 
> app//org.apache.ignite.internal.metastorage.impl.MetaStorageManagerImpl.lambda$stopAsync$31(MetaStorageManagerImpl.java:771)
>         at 
> app//org.apache.ignite.internal.metastorage.impl.MetaStorageManagerImpl$$Lambda$3787/0x00007ffac0d4eab8.close(Unknown
>  Source)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.lambda$closeAllManually$1(IgniteUtils.java:611)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils$$Lambda$3677/0x00007ffac0d36c48.accept(Unknown
>  Source)
>         at 
> [email protected]/java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
>         at 
> [email protected]/java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:179)
>         at 
> [email protected]/java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:992)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
>         at 
> [email protected]/java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
>         at 
> [email protected]/java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
>         at 
> [email protected]/java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:596)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.closeAllManually(IgniteUtils.java:609)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.closeAllManually(IgniteUtils.java:643)
>         at 
> app//org.apache.ignite.internal.metastorage.impl.MetaStorageManagerImpl.stopAsync(MetaStorageManagerImpl.java:767)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.lambda$stopAsync$6(IgniteUtils.java:1256)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils$$Lambda$3739/0x00007ffac0d451f8.apply(Unknown
>  Source)
>         at 
> [email protected]/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:197)
>         at 
> [email protected]/java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:179)
>         at 
> [email protected]/java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1625)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:575)
>         at 
> [email protected]/java.util.stream.AbstractPipeline.evaluateToArrayNode(AbstractPipeline.java:260)
>         at 
> [email protected]/java.util.stream.ReferencePipeline.toArray(ReferencePipeline.java:616)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.stopAsync(IgniteUtils.java:1262)
>         at 
> app//org.apache.ignite.internal.util.IgniteUtils.stopAsync(IgniteUtils.java:1304)
>         at 
> app//org.apache.ignite.internal.app.LifecycleManager.initiateAllComponentsStop(LifecycleManager.java:170)
>         - locked org.apache.ignite.internal.app.LifecycleManager@26304515
>         at 
> app//org.apache.ignite.internal.app.LifecycleManager.stopNode(LifecycleManager.java:144)
>         at 
> app//org.apache.ignite.internal.app.IgniteImpl.stopAsync(IgniteImpl.java:2066)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl.doShutdownAsync(IgniteServerImpl.java:342)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl$$Lambda$3644/0x00007ffac0d2fd68.get(Unknown
>  Source)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl.lambda$chainRestartOrShutdownAction$6(IgniteServerImpl.java:281)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl$$Lambda$3646/0x00007ffac0d30230.apply(Unknown
>  Source)
>         at 
> [email protected]/java.util.concurrent.CompletableFuture.uniComposeStage(CompletableFuture.java:1187)
>         at 
> [email protected]/java.util.concurrent.CompletableFuture.thenCompose(CompletableFuture.java:2309)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl.chainRestartOrShutdownAction(IgniteServerImpl.java:281)
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl.shutdownAsync(IgniteServerImpl.java:318)
>         - locked java.lang.Object@30400aca
>         at 
> app//org.apache.ignite.internal.app.IgniteServerImpl.shutdown(IgniteServerImpl.java:358)
>         at 
> app//org.apache.ignite.internal.app.IgniteRunner.lambda$main$0(IgniteRunner.java:73)
>         at 
> app//org.apache.ignite.internal.app.IgniteRunner$$Lambda$1540/0x00007ffac08ff7e0.handle(Unknown
>  Source)
>         at [email protected]/jdk.internal.misc.Signal$1.run(Signal.java:219)
>         at [email protected]/java.lang.Thread.run(Thread.java:840)
> {noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to