[ 
https://issues.apache.org/jira/browse/SOLR-13953?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Alex Jablonski updated SOLR-13953:
----------------------------------
    Description: 
When using the Prometheus Exporter in SolrCloud mode against a cluster with 
more than 100 nodes, only 100 nodes' metrics are collected. For the other 
nodes, we see "Connection pool shut down" errors show up in logs, and the 
metrics from those nodes aren't reported.

This seems to be tied to the cache implementation in hostClientCache in 
SolrCloudScraper. That cache currently has a fixed maximum size of 100. When it 
approaches that limit and begins to evict HttpSolrClients, it closes those clients.

We use the cache to build up a map of base URL to HttpSolrClient. For a >100 
node cluster, the cache will successfully return clients for all nodes, 
sequentially. But once we add the 101st node, the first HttpSolrClient, which 
the cache still holds a reference to, gets closed. When we then try to get the 
metrics using all of the HttpSolrClients returned from the cache, the ones that 
have been closed throw IllegalStateExceptions with message "Connection pool 
shut down".

 

Original email thread here: 
[http://mail-archives.apache.org/mod_mbox/lucene-dev/201911.mbox/%3CCAOz296DSV-tt7rWBirBZ%2BP4%3DvT5g29FZrR_2zHrHF084Xq%2Bgyw%40mail.gmail.com%3E]

Github PR here: [https://github.com/apache/lucene-solr/pull/1022]

 

Example stacktrace:

 
{code:java}
WARN  - 2019-11-15 21:21:19.584; org.apache.solr.prometheus.scraper.Async; 
Error occurred during metrics collection => 
java.util.concurrent.ExecutionException: java.lang.IllegalStateException: 
Connection pool shut down
        at 
java.base/java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:395)
        at 
java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:177) [?:?]
        at 
java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1654) 
[?:?]
        at 
java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484) [?:?]
        at 
java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) 
[?:?]
        at 
java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150) 
[?:?]
        at 
java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
 [?:?]
        at 
java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) [?:?]
        at 
java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:497) [?:?]
        at 
org.apache.solr.prometheus.scraper.Async.lambda$waitForAllSuccessfulResponses$3(Async.java:43)
 [solr-prometheus-exporter-7.7.2.jar:7.7.2 
d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
        at 
java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:986)
 [?:?]
        at 
java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:970)
 [?:?]
        at 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) 
[?:?]
        at 
java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1705)
 [?:?]
        at 
org.apache.solr.common.util.ExecutorUtil$MDCAwareThreadPoolExecutor.lambda$execute$0(ExecutorUtil.java:209)
 [solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy 
- 2019-05-28 23:37:52]
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) 
[?:?]
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) 
[?:?]
        at java.lang.Thread.run(Thread.java:834) [?:?]
Caused by: java.lang.IllegalStateException: Connection pool shut down
        at org.apache.http.util.Asserts.check(Asserts.java:34) 
~[httpcore-4.4.10.jar:4.4.10]
        at 
org.apache.http.pool.AbstractConnPool.lease(AbstractConnPool.java:191) 
~[httpcore-4.4.10.jar:4.4.10]
        at 
org.apache.http.impl.conn.PoolingHttpClientConnectionManager.requestConnection(PoolingHttpClientConnectionManager.java:267)
 ~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:176) 
~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185) 
~[httpclient-4.5.6.jar:4.5.6]
        at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89) 
~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110) 
~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
 ~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
 ~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
 ~[httpclient-4.5.6.jar:4.5.6]
        at 
org.apache.solr.client.solrj.impl.HttpSolrClient.executeMethod(HttpSolrClient.java:542)
 ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy 
- 2019-05-28 23:37:52]
        at 
org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:255)
 ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy 
- 2019-05-28 23:37:52]
        at 
org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:244)
 ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy 
- 2019-05-28 23:37:52]
        at 
org.apache.solr.client.solrj.SolrClient.request(SolrClient.java:1260) 
~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy 
- 2019-05-28 23:37:52]
        at 
org.apache.solr.prometheus.scraper.SolrScraper.request(SolrScraper.java:102) 
~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
        at 
org.apache.solr.prometheus.scraper.SolrCloudScraper.lambda$metricsForAllHosts$6(SolrCloudScraper.java:119)
 ~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
        at 
org.apache.solr.prometheus.scraper.SolrScraper.lambda$null$0(SolrScraper.java:81)
 ~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
        at 
java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1700)
 ~[?:?]
        ... 4 more
{code}
 

  was:
When using the Prometheus Exporter in SolrCloud mode against a cluster with 
more than 100 nodes, only 100 nodes' metrics are collected. For the other 
nodes, we see "Connection pool shut down" errors show up in logs, and the 
metrics from those nodes aren't reported.

This seems to be tied to the cache implementation in hostClientCache in 
SolrCloudScraper. That cache currently has a fixed maximum size of 100. When it 
approaches that limit begins to evict HttpSolrClients, it closes those clients.

We use the cache to build up a map of base URL to HttpSolrClient. For a >100 
node cluster, the cache will successfully return clients for all nodes, 
sequentially. But once we add the 101st node, the first HttpSolrClient, which 
the cache still holds a reference to, gets closed. When we then try to get the 
metrics using all of the HttpSolrClients returned from the cache, the ones that 
have been closed throw IllegalStateExceptions with message "Connection pool 
shut down".

 

Original email thread here: 
[http://mail-archives.apache.org/mod_mbox/lucene-dev/201911.mbox/%3CCAOz296DSV-tt7rWBirBZ%2BP4%3DvT5g29FZrR_2zHrHF084Xq%2Bgyw%40mail.gmail.com%3E]

Github PR here: [https://github.com/apache/lucene-solr/pull/1022]

 


> Prometheus exporter in SolrCloud mode limited to 100 nodes
> ----------------------------------------------------------
>
>                 Key: SOLR-13953
>                 URL: https://issues.apache.org/jira/browse/SOLR-13953
>             Project: Solr
>          Issue Type: Bug
>      Security Level: Public(Default Security Level. Issues are Public) 
>            Reporter: Alex Jablonski
>            Priority: Major
>
> When using the Prometheus Exporter in SolrCloud mode against a cluster with 
> more than 100 nodes, only 100 nodes' metrics are collected. For the other 
> nodes, we see "Connection pool shut down" errors show up in logs, and the 
> metrics from those nodes aren't reported.
> This seems to be tied to the cache implementation in hostClientCache in 
> SolrCloudScraper. That cache currently has a fixed maximum size of 100. When 
> it approaches that limit and begins to evict HttpSolrClients, it closes those 
> clients.
> We use the cache to build up a map of base URL to HttpSolrClient. For a >100 
> node cluster, the cache will successfully return clients for all nodes, 
> sequentially. But once we add the 101st node, the first HttpSolrClient, which 
> the cache still holds a reference to, gets closed. When we then try to get 
> the metrics using all of the HttpSolrClients returned from the cache, the 
> ones that have been closed throw IllegalStateExceptions with message 
> "Connection pool shut down".
>  
> Original email thread here: 
> [http://mail-archives.apache.org/mod_mbox/lucene-dev/201911.mbox/%3CCAOz296DSV-tt7rWBirBZ%2BP4%3DvT5g29FZrR_2zHrHF084Xq%2Bgyw%40mail.gmail.com%3E]
> Github PR here: [https://github.com/apache/lucene-solr/pull/1022]
>  
> Example stacktrace:
>  
> {code:java}
> WARN  - 2019-11-15 21:21:19.584; org.apache.solr.prometheus.scraper.Async; 
> Error occurred during metrics collection => 
> java.util.concurrent.ExecutionException: java.lang.IllegalStateException: 
> Connection pool shut down
>         at 
> java.base/java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:395)
>         at 
> java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:177) 
> [?:?]
>         at 
> java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1654)
>  [?:?]
>         at 
> java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484) [?:?]
>         at 
> java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) 
> [?:?]
>         at 
> java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150) 
> [?:?]
>         at 
> java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
>  [?:?]
>         at 
> java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) [?:?]
>         at 
> java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:497) [?:?]
>         at 
> org.apache.solr.prometheus.scraper.Async.lambda$waitForAllSuccessfulResponses$3(Async.java:43)
>  [solr-prometheus-exporter-7.7.2.jar:7.7.2 
> d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
>         at 
> java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:986)
>  [?:?]
>         at 
> java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:970)
>  [?:?]
>         at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
>  [?:?]
>         at 
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1705)
>  [?:?]
>         at 
> org.apache.solr.common.util.ExecutorUtil$MDCAwareThreadPoolExecutor.lambda$execute$0(ExecutorUtil.java:209)
>  [solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - 
> janhoy - 2019-05-28 23:37:52]
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
>  [?:?]
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
>  [?:?]
>         at java.lang.Thread.run(Thread.java:834) [?:?]
> Caused by: java.lang.IllegalStateException: Connection pool shut down
>         at org.apache.http.util.Asserts.check(Asserts.java:34) 
> ~[httpcore-4.4.10.jar:4.4.10]
>         at 
> org.apache.http.pool.AbstractConnPool.lease(AbstractConnPool.java:191) 
> ~[httpcore-4.4.10.jar:4.4.10]
>         at 
> org.apache.http.impl.conn.PoolingHttpClientConnectionManager.requestConnection(PoolingHttpClientConnectionManager.java:267)
>  ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:176)
>  ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185) 
> ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89) 
> ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110) 
> ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
>  ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
>  ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
>  ~[httpclient-4.5.6.jar:4.5.6]
>         at 
> org.apache.solr.client.solrj.impl.HttpSolrClient.executeMethod(HttpSolrClient.java:542)
>  ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - 
> janhoy - 2019-05-28 23:37:52]
>         at 
> org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:255)
>  ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - 
> janhoy - 2019-05-28 23:37:52]
>         at 
> org.apache.solr.client.solrj.impl.HttpSolrClient.request(HttpSolrClient.java:244)
>  ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - 
> janhoy - 2019-05-28 23:37:52]
>         at 
> org.apache.solr.client.solrj.SolrClient.request(SolrClient.java:1260) 
> ~[solr-solrj-7.7.2.jar:7.7.2 d4c30fc2856154f2c1fefc589eb7cd070a415b94 - 
> janhoy - 2019-05-28 23:37:52]
>         at 
> org.apache.solr.prometheus.scraper.SolrScraper.request(SolrScraper.java:102) 
> ~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
> d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
>         at 
> org.apache.solr.prometheus.scraper.SolrCloudScraper.lambda$metricsForAllHosts$6(SolrCloudScraper.java:119)
>  ~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
> d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
>         at 
> org.apache.solr.prometheus.scraper.SolrScraper.lambda$null$0(SolrScraper.java:81)
>  ~[solr-prometheus-exporter-7.7.2.jar:7.7.2 
> d4c30fc2856154f2c1fefc589eb7cd070a415b94 - janhoy - 2019-05-28 23:37:41]
>         at 
> java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1700)
>  ~[?:?]
>         ... 4 more
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to