[ https://issues.apache.org/jira/browse/SLING-9162?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Timothee Maret updated SLING-9162: ---------------------------------- Description: Client threads polling for the queue state currently have to wait an indefinite amount of time until the [queue cache is ready|https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L229]. In case of failure, the queue cache may never get ready and the client threads end up being blocked at {code:java} "ForkJoinPool.commonPool-worker-9" daemon prio=5 tid=0x662 nid=0xffffffff in Object.wait() java.lang.Thread.State: WAITING (on object monitor) at java.base@11.0.3/jdk.internal.misc.Unsafe.park(Native Method) - waiting to lock <0x28684ad3> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) owned by "sling-default-5-health-org.apache.sling.distribution.monitor.DistributionQueueHealthCheck" tid=0x141 at java.base@11.0.3/java.util.concurrent.locks.LockSupport.park(LockSupport.java:194) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:885) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:917) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1240) at java.base@11.0.3/java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:267) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.fetchIfNeeded(PubQueueCache.java:225) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.getOffsetQueue(PubQueueCache.java:146) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCacheService.getOffsetQueue(PubQueueCacheService.java:98) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueProviderImpl.getQueue(PubQueueProviderImpl.java:110) at 
org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getPubQueue(DistributionPublisher.java:242) at org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getQueue(DistributionPublisher.java:227) at org.apache.sling.distribution.journal.impl.shared.AgentState$$Lambda$676/0x0000000801ccf840.apply(Unknown Source) at java.base@11.0.3/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:195) at java.base@11.0.3/java.util.HashMap$KeySpliterator.tryAdvance(HashMap.java:1624) at java.base@11.0.3/java.util.stream.ReferencePipeline.forEachWithCancel(ReferencePipeline.java:127) at java.base@11.0.3/java.util.stream.AbstractPipeline.copyIntoWithCancel(AbstractPipeline.java:502) at java.base@11.0.3/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:488) at java.base@11.0.3/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) at java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:306) at java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:277) at java.base@11.0.3/java.util.stream.AbstractShortCircuitTask.compute(AbstractShortCircuitTask.java:115) at java.base@11.0.3/java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:746) at java.base@11.0.3/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) at java.base@11.0.3/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) at java.base@11.0.3/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) at java.base@11.0.3/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) at java.base@11.0.3/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:177) {code} We should time out calls to the [getOffsetQueue|https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L142] method. 
was: Client threads polling for the queue state currently have to wait an indefinite amount of time until the [queue cache is ready|https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L229]. In case of failure, the queue cache may never gets ready and the client threads end up being blocked at {code} "ForkJoinPool.commonPool-worker-9" daemon prio=5 tid=0x662 nid=0xffffffff in Object.wait() java.lang.Thread.State: WAITING (on object monitor) at java.base@11.0.3/jdk.internal.misc.Unsafe.park(Native Method) - waiting to lock <0x28684ad3> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) owned by "sling-default-5-health-org.apache.sling.distribution.monitor.DistributionQueueHealthCheck" tid=0x141 at java.base@11.0.3/java.util.concurrent.locks.LockSupport.park(LockSupport.java:194) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:885) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:917) at java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1240) at java.base@11.0.3/java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:267) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.fetchIfNeeded(PubQueueCache.java:225) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.getOffsetQueue(PubQueueCache.java:146) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCacheService.getOffsetQueue(PubQueueCacheService.java:98) at org.apache.sling.distribution.journal.impl.queue.impl.PubQueueProviderImpl.getQueue(PubQueueProviderImpl.java:110) at org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getPubQueue(DistributionPublisher.java:242) at 
org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getQueue(DistributionPublisher.java:227) at org.apache.sling.distribution.journal.impl.shared.AgentState$$Lambda$676/0x0000000801ccf840.apply(Unknown Source) at java.base@11.0.3/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:195) at java.base@11.0.3/java.util.HashMap$KeySpliterator.tryAdvance(HashMap.java:1624) at java.base@11.0.3/java.util.stream.ReferencePipeline.forEachWithCancel(ReferencePipeline.java:127) at java.base@11.0.3/java.util.stream.AbstractPipeline.copyIntoWithCancel(AbstractPipeline.java:502) at java.base@11.0.3/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:488) at java.base@11.0.3/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) at java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:306) at java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:277) at java.base@11.0.3/java.util.stream.AbstractShortCircuitTask.compute(AbstractShortCircuitTask.java:115) at java.base@11.0.3/java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:746) at java.base@11.0.3/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) at java.base@11.0.3/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) at java.base@11.0.3/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) at java.base@11.0.3/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) at java.base@11.0.3/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:177) {code} We should timeout calls to the [getOffsetQueue|[https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L142]] method. 
> Add timeout when waiting for ready queue cache > ----------------------------------------------- > > Key: SLING-9162 > URL: https://issues.apache.org/jira/browse/SLING-9162 > Project: Sling > Issue Type: Improvement > Components: Content Distribution > Affects Versions: Content Distribution Journal Core 0.1.4 > Reporter: Timothee Maret > Assignee: Timothee Maret > Priority: Major > Fix For: Content Distribution Journal Core 0.1.10 > > > Client threads polling for the queue state currently have to wait an > indefinite amount of time until the [queue cache is > ready|https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L229]. > In case of failure, the queue cache may never get ready and the client > threads end up being blocked at > {code:java} > "ForkJoinPool.commonPool-worker-9" daemon prio=5 tid=0x662 nid=0xffffffff in > Object.wait() > java.lang.Thread.State: WAITING (on object monitor) > at java.base@11.0.3/jdk.internal.misc.Unsafe.park(Native Method) > - waiting to lock <0x28684ad3> (a > java.util.concurrent.locks.ReentrantLock$NonfairSync) owned by > "sling-default-5-health-org.apache.sling.distribution.monitor.DistributionQueueHealthCheck" > tid=0x141 > at > java.base@11.0.3/java.util.concurrent.locks.LockSupport.park(LockSupport.java:194) > at > java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:885) > at > java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:917) > at > java.base@11.0.3/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1240) > at > java.base@11.0.3/java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:267) > at > 
org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.fetchIfNeeded(PubQueueCache.java:225) > at > org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCache.getOffsetQueue(PubQueueCache.java:146) > at > org.apache.sling.distribution.journal.impl.queue.impl.PubQueueCacheService.getOffsetQueue(PubQueueCacheService.java:98) > at > org.apache.sling.distribution.journal.impl.queue.impl.PubQueueProviderImpl.getQueue(PubQueueProviderImpl.java:110) > at > org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getPubQueue(DistributionPublisher.java:242) > at > org.apache.sling.distribution.journal.impl.publisher.DistributionPublisher.getQueue(DistributionPublisher.java:227) > at > org.apache.sling.distribution.journal.impl.shared.AgentState$$Lambda$676/0x0000000801ccf840.apply(Unknown > Source) > at > java.base@11.0.3/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:195) > at > java.base@11.0.3/java.util.HashMap$KeySpliterator.tryAdvance(HashMap.java:1624) > at > java.base@11.0.3/java.util.stream.ReferencePipeline.forEachWithCancel(ReferencePipeline.java:127) > at > java.base@11.0.3/java.util.stream.AbstractPipeline.copyIntoWithCancel(AbstractPipeline.java:502) > at > java.base@11.0.3/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:488) > at > java.base@11.0.3/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) > at > java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:306) > at > java.base@11.0.3/java.util.stream.MatchOps$MatchTask.doLeaf(MatchOps.java:277) > at > java.base@11.0.3/java.util.stream.AbstractShortCircuitTask.compute(AbstractShortCircuitTask.java:115) > at > java.base@11.0.3/java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:746) > at > java.base@11.0.3/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) > at > java.base@11.0.3/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) > 
at > java.base@11.0.3/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) > at > java.base@11.0.3/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) > at > java.base@11.0.3/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:177) > {code} > We should time out calls to the [getOffsetQueue|https://github.com/apache/sling-org-apache-sling-distribution-journal/blob/2de092d5437b2eeafc77d953ce5ffa73d3ee2377/src/main/java/org/apache/sling/distribution/journal/impl/queue/impl/PubQueueCache.java#L142] method. -- This message was sent by Atlassian Jira (v8.3.4#803005)