I have been testing 8.5.2, and it looks like the load has moved but is still concentrated on one machine.

Setup:
3 physical machines.
Each machine hosts 8 instances of Solr.
Each instance of Solr hosts one replica.

Another way to say it:
Number of shards = 8. Replication factor = 3.
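
(For reference, the collection was created with something equivalent to the Collections API call below; the host is a placeholder, and the parameters match the cluster state shown next.)

http://<host>:<port>/solr/admin/collections?action=CREATE&name=TEST_COLLECTION&numShards=8&replicationFactor=3&maxShardsPerNode=1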

Here is the cluster state. You can see that the leaders are well distributed. 

{"TEST_COLLECTION":{
    "pullReplicas":"0",
    "replicationFactor":"3",
    "shards":{
      "shard1":{
        "range":"80000000-9fffffff",
        "state":"active",
        "replicas":{
          "core_node3":{
            "core":"TEST_COLLECTION_shard1_replica_n1",
            "base_url":"http://10.156.122.13:10007/solr";,
            "node_name":"10.156.122.13:10007_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node5":{
            "core":"TEST_COLLECTION_shard1_replica_n2",
            "base_url":"http://10.156.112.50:10002/solr";,
            "node_name":"10.156.112.50:10002_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"},
          "core_node7":{
            "core":"TEST_COLLECTION_shard1_replica_n4",
            "base_url":"http://10.156.112.50:10006/solr";,
            "node_name":"10.156.112.50:10006_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"}}},
      "shard2":{
        "range":"a0000000-bfffffff",
        "state":"active",
        "replicas":{
          "core_node9":{
            "core":"TEST_COLLECTION_shard2_replica_n6",
            "base_url":"http://10.156.112.50:10003/solr";,
            "node_name":"10.156.112.50:10003_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node11":{
            "core":"TEST_COLLECTION_shard2_replica_n8",
            "base_url":"http://10.156.122.13:10004/solr";,
            "node_name":"10.156.122.13:10004_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"},
          "core_node12":{
            "core":"TEST_COLLECTION_shard2_replica_n10",
            "base_url":"http://10.156.116.34:10008/solr";,
            "node_name":"10.156.116.34:10008_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"}}},
      "shard3":{
        "range":"c0000000-dfffffff",
        "state":"active",
        "replicas":{
          "core_node15":{
            "core":"TEST_COLLECTION_shard3_replica_n13",
            "base_url":"http://10.156.122.13:10008/solr";,
            "node_name":"10.156.122.13:10008_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node17":{
            "core":"TEST_COLLECTION_shard3_replica_n14",
            "base_url":"http://10.156.116.34:10005/solr";,
            "node_name":"10.156.116.34:10005_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node19":{
            "core":"TEST_COLLECTION_shard3_replica_n16",
            "base_url":"http://10.156.116.34:10002/solr";,
            "node_name":"10.156.116.34:10002_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"}}},
      "shard4":{
        "range":"e0000000-ffffffff",
        "state":"active",
        "replicas":{
          "core_node20":{
            "core":"TEST_COLLECTION_shard4_replica_n18",
            "base_url":"http://10.156.122.13:10001/solr";,
            "node_name":"10.156.122.13:10001_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node23":{
            "core":"TEST_COLLECTION_shard4_replica_n21",
            "base_url":"http://10.156.116.34:10004/solr";,
            "node_name":"10.156.116.34:10004_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node25":{
            "core":"TEST_COLLECTION_shard4_replica_n22",
            "base_url":"http://10.156.112.50:10001/solr";,
            "node_name":"10.156.112.50:10001_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"}}},
      "shard5":{
        "range":"0-1fffffff",
        "state":"active",
        "replicas":{
          "core_node27":{
            "core":"TEST_COLLECTION_shard5_replica_n24",
            "base_url":"http://10.156.116.34:10007/solr";,
            "node_name":"10.156.116.34:10007_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node29":{
            "core":"TEST_COLLECTION_shard5_replica_n26",
            "base_url":"http://10.156.122.13:10006/solr";,
            "node_name":"10.156.122.13:10006_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node31":{
            "core":"TEST_COLLECTION_shard5_replica_n28",
            "base_url":"http://10.156.116.34:10006/solr";,
            "node_name":"10.156.116.34:10006_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"}}},
      "shard6":{
        "range":"20000000-3fffffff",
        "state":"active",
        "replicas":{
          "core_node33":{
            "core":"TEST_COLLECTION_shard6_replica_n30",
            "base_url":"http://10.156.122.13:10002/solr";,
            "node_name":"10.156.122.13:10002_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"},
          "core_node35":{
            "core":"TEST_COLLECTION_shard6_replica_n32",
            "base_url":"http://10.156.112.50:10008/solr";,
            "node_name":"10.156.112.50:10008_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node37":{
            "core":"TEST_COLLECTION_shard6_replica_n34",
            "base_url":"http://10.156.116.34:10003/solr";,
            "node_name":"10.156.116.34:10003_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"}}},
      "shard7":{
        "range":"40000000-5fffffff",
        "state":"active",
        "replicas":{
          "core_node39":{
            "core":"TEST_COLLECTION_shard7_replica_n36",
            "base_url":"http://10.156.122.13:10003/solr";,
            "node_name":"10.156.122.13:10003_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"},
          "core_node41":{
            "core":"TEST_COLLECTION_shard7_replica_n38",
            "base_url":"http://10.156.122.13:10005/solr";,
            "node_name":"10.156.122.13:10005_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node43":{
            "core":"TEST_COLLECTION_shard7_replica_n40",
            "base_url":"http://10.156.112.50:10004/solr";,
            "node_name":"10.156.112.50:10004_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"}}},
      "shard8":{
        "range":"60000000-7fffffff",
        "state":"active",
        "replicas":{
          "core_node45":{
            "core":"TEST_COLLECTION_shard8_replica_n42",
            "base_url":"http://10.156.112.50:10007/solr";,
            "node_name":"10.156.112.50:10007_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"},
          "core_node47":{
            "core":"TEST_COLLECTION_shard8_replica_n44",
            "base_url":"http://10.156.112.50:10005/solr";,
            "node_name":"10.156.112.50:10005_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false",
            "leader":"true"},
          "core_node48":{
            "core":"TEST_COLLECTION_shard8_replica_n46",
            "base_url":"http://10.156.116.34:10001/solr";,
            "node_name":"10.156.116.34:10001_solr",
            "state":"active",
            "type":"NRT",
            "force_set_state":"false"}}}},
    "router":{"name":"compositeId"},
    "maxShardsPerNode":"1",
    "autoAddReplicas":"false",
    "nrtReplicas":"3",
    "tlogReplicas":"0”}}


Running top on each machine while the load tests have been running for 60 minutes:

10.156.112.50   load average: 0.08, 0.35, 1.65
10.156.116.34   load average: 24.71, 24.20, 20.65
10.156.122.13   load average: 5.37, 3.21, 4.04



Here are the stats from each shard leader.

http://10.156.112.50:10002/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":2},
  "metrics":{
    "solr.core.BTS.shard1.replica_n2":{
      "QUERY./select.requestTimes":{
        "count":805,
        "meanRate":0.4385455794526838,
        "1minRate":0.5110237122383522,
        "5minRate":0.4671091682458005,
        "15minRate":0.4057871940723353,
        "min_ms":0.14047,
        "max_ms":12424.589645,
        "mean_ms":796.2194458711818,
        "median_ms":10.534906,
        "stddev_ms":2567.655224710497,
        "p75_ms":22.893306,
        "p95_ms":8316.33323,
        "p99_ms":12424.589645,
        "p999_ms":12424.589645}}}}

http://10.156.122.13:10004/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":2},
  "metrics":{
    "solr.core.BTS.shard2.replica_n8":{
      "QUERY./select.requestTimes":{
        "count":791,
        "meanRate":0.4244162938316224,
        "1minRate":0.4869749626003825,
        "5minRate":0.45856412657687656,
        "15minRate":0.3948063845907493,
        "min_ms":0.168369,
        "max_ms":11022.763933,
        "mean_ms":2572.0670957974603,
        "median_ms":1490.222885,
        "stddev_ms":2718.1710938804276,
        "p75_ms":4292.490478,
        "p95_ms":8487.18506,
        "p99_ms":8855.936617,
        "p999_ms":9589.218502}}}}

http://10.156.116.34:10002/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":83},
  "metrics":{
    "solr.core.BTS.shard3.replica_n16":{
      "QUERY./select.requestTimes":{
        "count":840,
        "meanRate":0.4335334453288775,
        "1minRate":0.5733683837779382,
        "5minRate":0.4931753679028527,
        "15minRate":0.42241330274699623,
        "min_ms":0.155939,
        "max_ms":18125.516406,
        "mean_ms":7097.942850416767,
        "median_ms":8136.862825,
        "stddev_ms":2382.041897221542,
        "p75_ms":8497.844088,
        "p95_ms":9642.430475,
        "p99_ms":9993.694346,
        "p999_ms":12207.982291}}}}

http://10.156.112.50:10001/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":3},
  "metrics":{
    "solr.core.BTS.shard4.replica_n22":{
      "QUERY./select.requestTimes":{
        "count":873,
        "meanRate":0.43420303985137254,
        "1minRate":0.4284437786865815,
        "5minRate":0.44020640429418745,
        "15minRate":0.40860871277629196,
        "min_ms":0.136658,
        "max_ms":11345.407699,
        "mean_ms":511.28573906464504,
        "median_ms":9.063677,
        "stddev_ms":2038.8104673512248,
        "p75_ms":20.270605,
        "p95_ms":8418.131442,
        "p99_ms":8904.78616,
        "p999_ms":10447.78365}}}}

http://10.156.116.34:10006/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":4},
  "metrics":{
    "solr.core.BTS.shard5.replica_n28":{
      "QUERY./select.requestTimes":{
        "count":863,
        "meanRate":0.4419375762840668,
        "1minRate":0.44487242228317025,
        "5minRate":0.45927613542085916,
        "15minRate":0.41056066296443494,
        "min_ms":0.158855,
        "max_ms":16669.411989,
        "mean_ms":6513.057114006753,
        "median_ms":8033.386692,
        "stddev_ms":3002.7487311308896,
        "p75_ms":8446.147616,
        "p95_ms":9888.641316,
        "p99_ms":13624.11926,
        "p999_ms":13624.11926}}}}

http://10.156.122.13:10002/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":2},
  "metrics":{
    "solr.core.BTS.shard6.replica_n30":{
      "QUERY./select.requestTimes":{
        "count":893,
        "meanRate":0.43301141185981046,
        "1minRate":0.4011485529441132,
        "5minRate":0.447654905093643,
        "15minRate":0.41489193746842407,
        "min_ms":0.161571,
        "max_ms":14716.828978,
        "mean_ms":2932.212133523417,
        "median_ms":1289.686481,
        "stddev_ms":3426.22045100954,
        "p75_ms":6230.031884,
        "p95_ms":8109.408506,
        "p99_ms":12904.515311,
        "p999_ms":12904.515311}}}}



http://10.156.122.13:10003/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":16},
  "metrics":{
    "solr.core.BTS.shard7.replica_n36":{
      "QUERY./select.requestTimes":{
        "count":962,
        "meanRate":0.46572438680661055,
        "1minRate":0.4974893681625287,
        "5minRate":0.49072296556429784,
        "15minRate":0.44138205926188756,
        "min_ms":0.164803,
        "max_ms":12481.82656,
        "mean_ms":2606.899631183513,
        "median_ms":1457.505387,
        "stddev_ms":3083.297183477969,
        "p75_ms":4072.543679,
        "p95_ms":8562.456178,
        "p99_ms":9351.230895,
        "p999_ms":10430.483813}}}}

http://10.156.112.50:10005/solr/admin/metrics?group=core&prefix=QUERY./select.requestTimes
{
  "responseHeader":{
    "status":0,
    "QTime":3},
  "metrics":{
    "solr.core.BTS.shard8.replica_n44":{
      "QUERY./select.requestTimes":{
        "count":904,
        "meanRate":0.4356001115451976,
        "1minRate":0.42906831311171356,
        "5minRate":0.4651312663377039,
        "15minRate":0.41812847342709225,
        "min_ms":0.089738,
        "max_ms":10857.092832,
        "mean_ms":304.52127270799156,
        "median_ms":7.098736,
        "stddev_ms":1544.5378594679773,
        "p75_ms":15.599817,
        "p95_ms":93.818662,
        "p99_ms":8510.757117,
        "p999_ms":9353.844994}}}}

I restarted all of the instances on “34” so that there are no leaders on it. Some of the load moved to the other box:

10.156.112.50   load average: 0.00, 0.16, 0.47
10.156.116.34   load average: 17.00, 16.16, 17.07
10.156.122.13   load average: 17.86, 17.49, 14.74

Box “50” is still doing nothing AND it is the leader of 4 of the 8 shards.
Box “13” is the leader of the remaining 4 shards.
Box “34” is not the leader of any shard.
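
(Side note: if I understand the Collections API correctly, leadership can also be moved without restarts by setting the preferredLeader replica property and then calling REBALANCELEADERS; restarting was simply the quickest way to force leaders off “34”.)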

I will continue to test; who knows, it may be something I am doing (not enough RAM, etc.), so I am definitely leaving open the possibility that I am simply not well configured for 8.5.
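
For completeness, here is a quick sketch of how the per-core /select timings can be pulled from every instance in one pass (Python 3, standard library only; the host and port lists match the setup above, everything else is an illustrative assumption):

import json
from urllib.request import urlopen

HOSTS = ["10.156.112.50", "10.156.116.34", "10.156.122.13"]
PORTS = range(10001, 10009)  # 8 Solr instances per machine

for host in HOSTS:
    for port in PORTS:
        url = (f"http://{host}:{port}/solr/admin/metrics"
               "?group=core&prefix=QUERY./select.requestTimes&wt=json")
        try:
            with urlopen(url, timeout=10) as resp:
                data = json.load(resp)
        except OSError as e:
            print(f"{host}:{port} unreachable: {e}")
            continue
        # Each instance hosts a single core in this layout; print its stats.
        for core, group in data["metrics"].items():
            times = group["QUERY./select.requestTimes"]
            print(f"{host}:{port} {core} "
                  f"count={times['count']} mean_ms={times['mean_ms']:.1f}")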

Regards




> On May 16, 2020, at 5:08 PM, Tomás Fernández Löbbe <tomasflo...@gmail.com> 
> wrote:
> 
> I just backported Michael’s fix to be released in 8.5.2
> 
> On Fri, May 15, 2020 at 6:38 AM Michael Gibney <mich...@michaelgibney.net>
> wrote:
> 
>> Hi Wei,
>> SOLR-14471 has been merged, so this issue should be fixed in 8.6.
>> Thanks for reporting the problem!
>> Michael
>> 
>> On Mon, May 11, 2020 at 7:51 PM Wei <weiwan...@gmail.com> wrote:
>>> 
>>> Thanks Michael!  Yes in each shard I have 10 Tlog replicas,  no other
>> type
>>> of replicas, and each Tlog replica is an individual solr instance on its
>>> own physical machine.  In the jira you mentioned 'when "last place
>> matches"
>>> == "first place matches" – e.g. when shards.preference specified matches
>>> *all* available replicas'.   My setting is
>>> shards.preference=replica.location:local,replica.type:TLOG,
>>> I also tried just shards.preference=replica.location:local and it still
>> has
>>> the issue. Can you explain a bit more?
>>> 
>>> On Mon, May 11, 2020 at 12:26 PM Michael Gibney <
>> mich...@michaelgibney.net>
>>> wrote:
>>> 
>>>> FYI: https://issues.apache.org/jira/browse/SOLR-14471
>>>> Wei, assuming you have only TLOG replicas, your "last place" matches
>>>> (to which the random fallback ordering would not be applied -- see
>>>> above issue) would be the same as the "first place" matches selected
>>>> for executing distributed requests.
>>>> 
>>>> 
>>>> On Mon, May 11, 2020 at 1:49 PM Michael Gibney
>>>> <mich...@michaelgibney.net> wrote:
>>>>> 
>>>>> Wei, probably no need to answer my earlier questions; I think I see
>>>>> the problem here, and believe it is indeed a bug, introduced in 8.3.
>>>>> Will file an issue and submit a patch shortly.
>>>>> Michael
>>>>> 
>>>>> On Mon, May 11, 2020 at 12:49 PM Michael Gibney
>>>>> <mich...@michaelgibney.net> wrote:
>>>>>> 
>>>>>> Hi Wei,
>>>>>> 
>>>>>> In considering this problem, I'm stumbling a bit on terminology
>>>>>> (particularly, where you mention "nodes", I think you're referring
>> to
>>>>>> "replicas"?). Could you confirm that you have 10 TLOG replicas per
>>>>>> shard, for each of 6 shards? How many *nodes* (i.e., running solr
>>>>>> server instances) do you have, and what is the replica placement
>> like
>>>>>> across those nodes? What, if any, non-TLOG replicas do you have per
>>>>>> shard (not that it's necessarily relevant, but just to get a
>> complete
>>>>>> picture of the situation)?
>>>>>> 
>>>>>> If you're able without too much trouble, can you determine what the
>>>>>> behavior is like on Solr 8.3? (there were different changes
>> introduced
>>>>>> to potentially relevant code in 8.3 and 8.4, and knowing whether
>> the
>>>>>> behavior you're observing manifests on 8.3 would help narrow down
>>>>>> where to look for an explanation).
>>>>>> 
>>>>>> Michael
>>>>>> 
>>>>>> On Fri, May 8, 2020 at 7:34 PM Wei <weiwan...@gmail.com> wrote:
>>>>>>> 
>>>>>>> Update:  after I remove the shards.preference parameter from
>>>>>>> solrconfig.xml,  issue is gone and internal shard requests are
>> now
>>>>>>> balanced. The same parameter works fine with solr 7.6.  Still not
>>>> sure of
>>>>>>> the root cause, but I observed a strange coincidence: the nodes
>> that
>>>> are
>>>>>>> most frequently picked for shard requests are the first node in
>> each
>>>> shard
>>>>>>> returned from the CLUSTERSTATUS api.  Seems something wrong with
>>>> shuffling
>>>>>>> equally compared nodes when shards.preference is set.  Will
>> report
>>>> back if
>>>>>>> I find more.
>>>>>>> 
>>>>>>> On Mon, Apr 27, 2020 at 5:59 PM Wei <weiwan...@gmail.com> wrote:
>>>>>>> 
>>>>>>>> Hi Eric,
>>>>>>>> 
>>>>>>>> I am measuring the number of shard requests, and it's for query
>>>> only, no
>>>>>>>> indexing requests.  I have an external load balancer and see
>> each
>>>> node
>>>>>>>> received about the equal number of external queries. However
>> for
>>>> the
>>>>>>>> internal shard queries,  the distribution is uneven:    6 nodes
>>>> (one in
>>>>>>>> each shard,  some of them are leaders and some are non-leaders
>> )
>>>> gets about
>>>>>>>> 80% of the shard requests, the other 54 nodes gets about 20% of
>>>> the shard
>>>>>>>> requests.   I checked a few other parameters set:
>>>>>>>> 
>>>>>>>> -Dsolr.disable.shardsWhitelist=true
>>>>>>>> shards.preference=replica.location:local,replica.type:TLOG
>>>>>>>> 
>>>>>>>> Nothing seems to cause the strange behavior.  Any suggestions
>> how
>>>> to
>>>>>>>> debug this?
>>>>>>>> 
>>>>>>>> -Wei
>>>>>>>> 
>>>>>>>> 
>>>>>>>> On Mon, Apr 27, 2020 at 5:42 PM Erick Erickson <
>>>> erickerick...@gmail.com>
>>>>>>>> wrote:
>>>>>>>> 
>>>>>>>>> Wei:
>>>>>>>>> 
>>>>>>>>> How are you measuring utilization here? The number of incoming
>>>> requests
>>>>>>>>> or CPU?
>>>>>>>>> 
>>>>>>>>> The leader for each shard are certainly handling all of the
>>>> indexing
>>>>>>>>> requests since they’re TLOG replicas, so that’s one thing that
>>>> might
>>>>>>>>> skewing your measurements.
>>>>>>>>> 
>>>>>>>>> Best,
>>>>>>>>> Erick
>>>>>>>>> 
>>>>>>>>>> On Apr 27, 2020, at 7:13 PM, Wei <weiwan...@gmail.com>
>> wrote:
>>>>>>>>>> 
>>>>>>>>>> Hi everyone,
>>>>>>>>>> 
>>>>>>>>>> I have a strange issue after upgrade from 7.6.0 to 8.4.1. My
>>>> cloud has 6
>>>>>>>>>> shards with 10 TLOG replicas each shard.  After upgrade I
>>>> noticed that
>>>>>>>>> one
>>>>>>>>>> of the replicas in each shard is handling most of the
>>>> distributed shard
>>>>>>>>>> requests, so 6 nodes are heavily loaded while other nodes
>> are
>>>> idle.
>>>>>>>>> There
>>>>>>>>>> is no change in shard handler configuration:
>>>>>>>>>> 
>>>>>>>>>> <shardHandlerFactory name="shardHandlerFactory" class=
>>>>>>>>>> "HttpShardHandlerFactory">
>>>>>>>>>> 
>>>>>>>>>>   <int name="socketTimeout">30000</int>
>>>>>>>>>> 
>>>>>>>>>>   <int name="connTimeout">30000</int>
>>>>>>>>>> 
>>>>>>>>>>   <int name="maxConnectionsPerHost">500</int>
>>>>>>>>>> 
>>>>>>>>>> </shardHandlerFactory>
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> What could cause the unbalanced internal distributed
>> request?
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> Thanks in advance.
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> Wei
>>>>>>>>> 
>>>>>>>>> 
>>>> 
>> 
