Oh, that's very helpful to know about, thank you. The overwhelming majority appear to be threads in TIMED_WAITING, all waiting on the same condition object:
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@3b315cbb
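For reference, this is roughly how I've been pulling and summarising the dumps outside the admin UI — a quick sketch rather than anything definitive (4507 is the Solr pid from the log lines quoted below; the filenames are just mine):

# Full thread dump from the JVM (jstack ships with the JDK)
jstack 4507 > dump.txt

# The same information is available from the admin API
curl 'http://localhost:8983/solr/admin/info/threads?wt=json' > threads.json

# Count threads by state to see where they pile up
grep 'java.lang.Thread.State' dump.txt | sort | uniq -c | sort -rn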
I've attached a screenshot which includes the stack trace. Stopping all queries to the instance and waiting didn't result in any noticeable decrease in the number of threads, so it looks like, despite being in a timed wait, they're simply not getting terminated. Restarting the service takes me back down to just 53 threads; re-running a test results in many new threads immediately coming into being, this time with a higher proportion of threads BLOCKED on org.apache.solr.search.function.FileFloatSource$CreationPlaceholder@37b782de (see second screenshot). The stack trace for those is too big for one screen, so here's the output:

qtp178604517-861 (861)
org.apache.solr.search.function.FileFloatSource$CreationPlaceholder@37b782de
- org.apache.solr.search.function.FileFloatSource$Cache.get(FileFloatSource.java:210)
- org.apache.solr.search.function.FileFloatSource.getCachedFloats(FileFloatSource.java:158)
- org.apache.solr.search.function.FileFloatSource.getValues(FileFloatSource.java:97)
- org.apache.lucene.queries.function.ValueSource$WrappedDoubleValuesSource.getValues(ValueSource.java:203)
- org.apache.lucene.queries.function.FunctionScoreQuery$MultiplicativeBoostValuesSource.getValues(FunctionScoreQuery.java:261)
- org.apache.lucene.queries.function.FunctionScoreQuery$FunctionScoreWeight.scorer(FunctionScoreQuery.java:224)
- org.apache.lucene.search.Weight.scorerSupplier(Weight.java:148)
- org.apache.lucene.search.BooleanWeight.scorerSupplier(BooleanWeight.java:379)
- org.apache.lucene.search.BooleanWeight.scorer(BooleanWeight.java:344)
- org.apache.lucene.search.Weight.bulkScorer(Weight.java:182)
- org.apache.lucene.search.BooleanWeight.bulkScorer(BooleanWeight.java:338)
- org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:656)
- org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:443)
- org.apache.solr.search.SolrIndexSearcher.buildAndRunCollectorChain(SolrIndexSearcher.java:211)
- org.apache.solr.search.SolrIndexSearcher.getDocListAndSetNC(SolrIndexSearcher.java:1705)
- org.apache.solr.search.SolrIndexSearcher.getDocListC(SolrIndexSearcher.java:1408)
- org.apache.solr.search.SolrIndexSearcher.search(SolrIndexSearcher.java:596)
- org.apache.solr.handler.component.QueryComponent.doProcessUngroupedSearch(QueryComponent.java:1500)
- org.apache.solr.handler.component.QueryComponent.process(QueryComponent.java:390)
- org.apache.solr.handler.component.SearchHandler.handleRequestBody(SearchHandler.java:369)
- org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:216)
- org.apache.solr.core.SolrCore.execute(SolrCore.java:2637)
- org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:794)
- org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:567)
- org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:427)
- org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:357)
- org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:201)
- org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
- org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548)
- org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
- org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)
- org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
- org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
- org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624)
- org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
- org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1435)
- org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
- org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501)
- org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594)
- org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
- org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1350)
- org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
- org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:191)
- org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177)
- org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
- org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
- org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322)
- org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
- org.eclipse.jetty.server.Server.handle(Server.java:516)
- org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388)
- org.eclipse.jetty.server.HttpChannel$$Lambda$556/0x000000080067a440.dispatch(Unknown Source)
- org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:633)
- org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:380)
- org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)
- org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
- org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
- org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
- org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
- org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
- org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
- org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
- org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:383)
- org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:882)
- org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1036)
- java.base@11.0.5/java.lang.Thread.run(Thread.java:834)
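If I'm reading that right, all of those BLOCKED threads are queued up behind the same FileFloatSource cache entry (the external-file-field float cache), so one thread populates the floats while every other query blocks on the placeholder's monitor. To put numbers on it from a saved dump — a rough sketch only; the exact jstack wording ("waiting to lock ...") varies a bit by JVM, and the context-line count is a guess:

# How many threads mention the FileFloatSource placeholder
grep -Fc 'FileFloatSource$CreationPlaceholder' dump.txt

# Which request threads are sitting in FileFloatSource$Cache.get
grep -B 6 'FileFloatSource$Cache.get' dump.txt | grep '^"qtp'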
On Wed, 13 Oct 2021 at 00:03, Joel Bernstein <joels...@gmail.com> wrote:

> There is a thread dump on the Solr admin. You can use that to determine
> what all those threads are doing and where they are getting stuck. You can
> post parts of the thread dump back to this email thread as well.
>
> Joel Bernstein
> http://joelsolr.blogspot.com/
>
> On Tue, Oct 12, 2021 at 11:15 AM Dominic Humphries
> <domi...@adzuna.com.invalid> wrote:
>
> > We run 8.3.1 in prod without any problems, but we're having issues with
> > trying to upgrade.
> >
> > I've created an 8.9.0 leader & follower, imported our live data into it,
> > and am testing it via replaying requests made to prod. We're seeing a big
> > problem where fairly moderate request rates are causing the instance to
> > become so slow it fails healthcheck.
> > The logs showed a lot of errors around creating threads:
> >
> > solr[4507]: [124136.511s][warning][os,thread] Failed to start thread -
> > pthread_create failed (EAGAIN) for attributes: stacksize: 256k,
> > guardsize: 0k, detached.
> >
> > WARN (qtp178604517-3891) [ ] o.e.j.i.ManagedSelector =>
> > java.lang.OutOfMemoryError: unable to create native thread: possibly
> > out of memory or process/resource limits reached
> >
> > So I monitored thread count for the process whilst running the test
> > suite and saw a persistent pattern: threads increased until maxed out,
> > the logs flooded with errors as it tried to create still more threads,
> > and the instance slowed down until terminated as unhealthy.
> >
> > The DefaultTasksMax is set to 4915; I've tried raising and lowering it,
> > but regardless of value the result is the same: it gets maxed and
> > everything slows down.
> >
> > Is there anything I can do to stop Solr spinning up so many threads it
> > ceases to function? There have been a few test passes where it
> > spontaneously dropped threadcount from thousands to hundreds and stayed
> > up longer, but there seems to be no pattern to when this happens.
> > Running the same tests on 8.3.1 results in a much slower increase in
> > threads and it never quite maxes them, so things continue to function.
> >
> > See below for the thread count and healthcheck times seen on a (fairly
> > harsh) test run of 100 requests/sec.
> >
> > Thanks
> >
> > Dominic
> >
> >
> > Threadcount:
> >
> > ubuntu@ip-10-40-22-166:~$ while [ 1 ]; do date; ps -eLF | grep 'start.jar' | wc -l; sleep 10s; done
> > Tue Oct 12 14:27:33 UTC 2021
> > 52
> > Tue Oct 12 14:27:43 UTC 2021
> > 52
> > Tue Oct 12 14:27:54 UTC 2021
> > 52
> > Tue Oct 12 14:28:04 UTC 2021
> > 52
> > Tue Oct 12 14:28:14 UTC 2021
> > 569
> > Tue Oct 12 14:28:24 UTC 2021
> > 899
> > Tue Oct 12 14:28:34 UTC 2021
> > 1198
> > Tue Oct 12 14:28:44 UTC 2021
> > 1589
> > Tue Oct 12 14:28:54 UTC 2021
> > 2016
> > Tue Oct 12 14:29:05 UTC 2021
> > 2451
> > Tue Oct 12 14:29:15 UTC 2021
> > 2851
> > Tue Oct 12 14:29:26 UTC 2021
> > 2934
> > Tue Oct 12 14:29:36 UTC 2021
> > 3249
> > Tue Oct 12 14:29:46 UTC 2021
> > 3501
> > Tue Oct 12 14:29:57 UTC 2021
> > 3734
> > Tue Oct 12 14:30:07 UTC 2021
> > 4128
> > Tue Oct 12 14:30:18 UTC 2021
> > 4374
> > Tue Oct 12 14:30:29 UTC 2021
> > 4637
> > Tue Oct 12 14:30:39 UTC 2021
> > 4693
> > Tue Oct 12 14:30:50 UTC 2021
> > 4807
> > Tue Oct 12 14:31:01 UTC 2021
> > 4916
> > Tue Oct 12 14:31:11 UTC 2021
> > 4916
> > Tue Oct 12 14:31:22 UTC 2021
> > Connection to 10.40.22.166 closed by remote host.
> > Healthcheck:
> >
> > ubuntu@ip-10-40-22-166:~$ while [ 1 ]; do date; curl -v localhost:8983/solr/ 2>&1 | grep HTTP; date; echo '----'; sleep 10s; done
> > Tue Oct 12 14:27:34 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:27:34 UTC 2021
> > ----
> > Tue Oct 12 14:27:44 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:27:44 UTC 2021
> > ----
> > Tue Oct 12 14:27:54 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:27:54 UTC 2021
> > ----
> > Tue Oct 12 14:28:04 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:28:04 UTC 2021
> > ----
> > Tue Oct 12 14:28:14 UTC 2021
> > > GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:02 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:28:16 UTC 2021
> > ----
> > Tue Oct 12 14:28:26 UTC 2021
> > > GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:12 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:28:39 UTC 2021
> > ----
> > Tue Oct 12 14:28:49 UTC 2021
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0> GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:23 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:29:13 UTC 2021
> > ----
> > Tue Oct 12 14:29:23 UTC 2021
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0> GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:29:25 UTC 2021
> > ----
> > Tue Oct 12 14:29:35 UTC 2021
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:03 --:--:-- 0> GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:09 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:29:44 UTC 2021
> > ----
> > Tue Oct 12 14:29:54 UTC 2021
> > > GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:11 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:30:06 UTC 2021
> > ----
> > Tue Oct 12 14:30:16 UTC 2021
> > > GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:03 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:30:20 UTC 2021
> > ----
> > Tue Oct 12 14:30:30 UTC 2021
> > > GET /solr/ HTTP/1.1
> > 0 0 0 0 0 0 0 0 --:--:-- 0:00:02 --:--:-- 0< HTTP/1.1 200 OK
> > Tue Oct 12 14:30:33 UTC 2021
> > ----
> > Tue Oct 12 14:30:43 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:30:43 UTC 2021
> > ----
> > Tue Oct 12 14:30:53 UTC 2021
> > > GET /solr/ HTTP/1.1
> > Tue Oct 12 14:30:55 UTC 2021
> > ----
> > Tue Oct 12 14:31:05 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:31:05 UTC 2021
> > ----
> > Tue Oct 12 14:31:15 UTC 2021
> > > GET /solr/ HTTP/1.1
> > < HTTP/1.1 200 OK
> > Tue Oct 12 14:31:15 UTC 2021
> > ----
> > Connection to 10.40.22.166 closed by remote host.
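In the meantime, since pthread_create EAGAIN means the OS itself is refusing new threads, these are the knobs I'm going to double-check on the 8.9.0 box — a sketch only: the systemd unit name (solr) is from my setup, and solr.jetty.threads.max is the property I believe server/etc/jetty.xml reads for the pool cap (default 10000 in 8.x):

# What systemd is actually enforcing for the service
systemctl show solr -p TasksMax

# Kernel-side limit on the running process (threads count against "Max processes")
grep -i 'max processes' /proc/4507/limits

# Cap Jetty's request pool so requests queue rather than spawning unboundedly;
# add to solr.in.sh
SOLR_OPTS="$SOLR_OPTS -Dsolr.jetty.threads.max=500"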