[GitHub] [lucene-solr] madrob commented on a change in pull request #2187: SOLR-15052 Reducing overseer bottlenecks using per-replica states (8x)
madrob commented on a change in pull request #2187: URL: https://github.com/apache/lucene-solr/pull/2187#discussion_r554164613 ## File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java ## @@ -64,31 +64,8 @@ import org.apache.solr.common.AlreadyClosedException; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.solr.common.cloud.BeforeReconnect; -import org.apache.solr.common.cloud.ClusterState; -import org.apache.solr.common.cloud.ConnectionManager; -import org.apache.solr.common.cloud.DefaultConnectionStrategy; -import org.apache.solr.common.cloud.DefaultZkACLProvider; -import org.apache.solr.common.cloud.DefaultZkCredentialsProvider; -import org.apache.solr.common.cloud.DocCollection; -import org.apache.solr.common.cloud.DocCollectionWatcher; -import org.apache.solr.common.cloud.LiveNodesListener; -import org.apache.solr.common.cloud.NodesSysPropsCacher; -import org.apache.solr.common.cloud.OnReconnect; -import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.*; Review comment: please don't do wildcard imports. ## File path: solr/core/src/java/org/apache/solr/cloud/overseer/NodeMutator.java ## @@ -76,7 +80,12 @@ } if (needToUpdateCollection) { Review comment: we no longer need this boolean, can check whether downedReplicas is empty or not to decide ## File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java ## @@ -1609,12 +1586,40 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean if (updateLastState) { cd.getCloudDescriptor().setLastPublished(state); } - overseerJobQueue.offer(Utils.toJSON(m)); + DocCollection coll = zkStateReader.getCollection(collection); + if (forcePublish || sendToOverseer(coll, coreNodeName)) { Review comment: other places we do additional checks, like https://github.com/apache/lucene-solr/pull/2187/files#diff-5307cc9f51d88f5a171591d2f429779e22b0168cd2114275d745bccea9d1a6b3R186 please be consistent about whether we care to optimize for early termination (I suspect we don't) ## File path: solr/core/src/test/org/apache/solr/handler/TestStressThreadBackup.java ## @@ -61,7 +60,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@Nightly +//@Nightly Review comment: ? ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java ## @@ -129,11 +143,23 @@ public Replica(String name, Map propMap, String collection, Strin Objects.requireNonNull(this.nodeName, "'node_name' must not be null"); Objects.requireNonNull(this.core, "'core' must not be null"); Objects.requireNonNull(this.type, "'type' must not be null"); -if (propMap.get(ZkStateReader.STATE_PROP) != null) { - this.state = State.getState((String) propMap.get(ZkStateReader.STATE_PROP)); +ClusterState.getReplicaStatesProvider().get().ifPresent(it -> { + log.debug("A replica {} state fetched from per-replica state", name); Review comment: nit: I think this would be confusing, people might think it means that the state fetched was "A" for Active. Would log "Replica..." ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java ## @@ -1576,14 +1614,24 @@ public static DocCollection getCollectionLive(ZkStateReader zkStateReader, Strin } } - private DocCollection fetchCollectionState(String coll, Watcher watcher) throws KeeperException, InterruptedException { -String collectionPath = getCollectionPath(coll); + public DocCollection fetchCollectionState(String coll, Watcher watcher, String path) throws KeeperException, InterruptedException { +String collectionPath = path == null ? getCollectionPath(coll) : path; while (true) { + ClusterState.initReplicaStateProvider(() -> { +try { + PerReplicaStates replicaStates = PerReplicaStates.fetch(collectionPath, zkClient, null); + log.info("per-replica-state ver: {} fetched for initializing {} ", replicaStates.cversion, collectionPath); + return replicaStates; +} catch (Exception e) { + //TODO Review comment: ? ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java ## @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agree
[GitHub] [lucene-solr] madrob commented on a change in pull request #2187: SOLR-15052 Reducing overseer bottlenecks using per-replica states (8x)
madrob commented on a change in pull request #2187: URL: https://github.com/apache/lucene-solr/pull/2187#discussion_r554164613 ## File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java ## @@ -64,31 +64,8 @@ import org.apache.solr.common.AlreadyClosedException; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.solr.common.cloud.BeforeReconnect; -import org.apache.solr.common.cloud.ClusterState; -import org.apache.solr.common.cloud.ConnectionManager; -import org.apache.solr.common.cloud.DefaultConnectionStrategy; -import org.apache.solr.common.cloud.DefaultZkACLProvider; -import org.apache.solr.common.cloud.DefaultZkCredentialsProvider; -import org.apache.solr.common.cloud.DocCollection; -import org.apache.solr.common.cloud.DocCollectionWatcher; -import org.apache.solr.common.cloud.LiveNodesListener; -import org.apache.solr.common.cloud.NodesSysPropsCacher; -import org.apache.solr.common.cloud.OnReconnect; -import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.*; Review comment: please don't do wildcard imports. ## File path: solr/core/src/java/org/apache/solr/cloud/overseer/NodeMutator.java ## @@ -76,7 +80,12 @@ } if (needToUpdateCollection) { Review comment: we no longer need this boolean, can check whether downedReplicas is empty or not to decide ## File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java ## @@ -1609,12 +1586,40 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean if (updateLastState) { cd.getCloudDescriptor().setLastPublished(state); } - overseerJobQueue.offer(Utils.toJSON(m)); + DocCollection coll = zkStateReader.getCollection(collection); + if (forcePublish || sendToOverseer(coll, coreNodeName)) { Review comment: other places we do additional checks, like https://github.com/apache/lucene-solr/pull/2187/files#diff-5307cc9f51d88f5a171591d2f429779e22b0168cd2114275d745bccea9d1a6b3R186 please be consistent about whether we care to optimize for early termination (I suspect we don't) ## File path: solr/core/src/test/org/apache/solr/handler/TestStressThreadBackup.java ## @@ -61,7 +60,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@Nightly +//@Nightly Review comment: ? ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java ## @@ -129,11 +143,23 @@ public Replica(String name, Map propMap, String collection, Strin Objects.requireNonNull(this.nodeName, "'node_name' must not be null"); Objects.requireNonNull(this.core, "'core' must not be null"); Objects.requireNonNull(this.type, "'type' must not be null"); -if (propMap.get(ZkStateReader.STATE_PROP) != null) { - this.state = State.getState((String) propMap.get(ZkStateReader.STATE_PROP)); +ClusterState.getReplicaStatesProvider().get().ifPresent(it -> { + log.debug("A replica {} state fetched from per-replica state", name); Review comment: nit: I think this would be confusing, people might think it means that the state fetched was "A" for Active. Would log "Replica..." ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java ## @@ -1576,14 +1614,24 @@ public static DocCollection getCollectionLive(ZkStateReader zkStateReader, Strin } } - private DocCollection fetchCollectionState(String coll, Watcher watcher) throws KeeperException, InterruptedException { -String collectionPath = getCollectionPath(coll); + public DocCollection fetchCollectionState(String coll, Watcher watcher, String path) throws KeeperException, InterruptedException { +String collectionPath = path == null ? getCollectionPath(coll) : path; while (true) { + ClusterState.initReplicaStateProvider(() -> { +try { + PerReplicaStates replicaStates = PerReplicaStates.fetch(collectionPath, zkClient, null); + log.info("per-replica-state ver: {} fetched for initializing {} ", replicaStates.cversion, collectionPath); + return replicaStates; +} catch (Exception e) { + //TODO Review comment: ? ## File path: solr/solrj/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java ## @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agree