[GitHub] [lucene-solr] madrob commented on a change in pull request #2187: SOLR-15052 Reducing overseer bottlenecks using per-replica states (8x)

2021-01-09 Thread GitBox


madrob commented on a change in pull request #2187:
URL: https://github.com/apache/lucene-solr/pull/2187#discussion_r554164613



##
File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java
##
@@ -64,31 +64,8 @@
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.cloud.BeforeReconnect;
-import org.apache.solr.common.cloud.ClusterState;
-import org.apache.solr.common.cloud.ConnectionManager;
-import org.apache.solr.common.cloud.DefaultConnectionStrategy;
-import org.apache.solr.common.cloud.DefaultZkACLProvider;
-import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
-import org.apache.solr.common.cloud.DocCollection;
-import org.apache.solr.common.cloud.DocCollectionWatcher;
-import org.apache.solr.common.cloud.LiveNodesListener;
-import org.apache.solr.common.cloud.NodesSysPropsCacher;
-import org.apache.solr.common.cloud.OnReconnect;
-import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.*;

Review comment:
   please don't do wildcard imports.

##
File path: solr/core/src/java/org/apache/solr/cloud/overseer/NodeMutator.java
##
@@ -76,7 +80,12 @@
   }
 
   if (needToUpdateCollection) {

Review comment:
   we no longer need this boolean, can check whether downedReplicas is 
empty or not to decide

##
File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java
##
@@ -1609,12 +1586,40 @@ public void publish(final CoreDescriptor cd, final 
Replica.State state, boolean
   if (updateLastState) {
 cd.getCloudDescriptor().setLastPublished(state);
   }
-  overseerJobQueue.offer(Utils.toJSON(m));
+  DocCollection coll = zkStateReader.getCollection(collection);
+  if (forcePublish || sendToOverseer(coll, coreNodeName)) {

Review comment:
   other places we do additional checks, like 
https://github.com/apache/lucene-solr/pull/2187/files#diff-5307cc9f51d88f5a171591d2f429779e22b0168cd2114275d745bccea9d1a6b3R186
   
   please be consistent about whether we care to optimize for early termination 
(I suspect we don't)

##
File path: 
solr/core/src/test/org/apache/solr/handler/TestStressThreadBackup.java
##
@@ -61,7 +60,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-@Nightly
+//@Nightly

Review comment:
   ?

##
File path: solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java
##
@@ -129,11 +143,23 @@ public Replica(String name, Map propMap, 
String collection, Strin
 Objects.requireNonNull(this.nodeName, "'node_name' must not be null");
 Objects.requireNonNull(this.core, "'core' must not be null");
 Objects.requireNonNull(this.type, "'type' must not be null");
-if (propMap.get(ZkStateReader.STATE_PROP) != null) {
-  this.state = State.getState((String) 
propMap.get(ZkStateReader.STATE_PROP));
+ClusterState.getReplicaStatesProvider().get().ifPresent(it -> {
+  log.debug("A replica  {} state fetched from per-replica state", name);

Review comment:
   nit: I think this would be confusing, people might think it means that 
the state fetched was "A" for Active. Would log "Replica..."

##
File path: solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
##
@@ -1576,14 +1614,24 @@ public static DocCollection 
getCollectionLive(ZkStateReader zkStateReader, Strin
 }
   }
 
-  private DocCollection fetchCollectionState(String coll, Watcher watcher) 
throws KeeperException, InterruptedException {
-String collectionPath = getCollectionPath(coll);
+  public DocCollection fetchCollectionState(String coll, Watcher watcher, 
String path) throws KeeperException, InterruptedException {
+String collectionPath = path == null ? getCollectionPath(coll) : path;
 while (true) {
+  ClusterState.initReplicaStateProvider(() -> {
+try {
+  PerReplicaStates replicaStates = 
PerReplicaStates.fetch(collectionPath, zkClient, null);
+  log.info("per-replica-state ver: {} fetched for initializing {} ", 
replicaStates.cversion, collectionPath);
+  return replicaStates;
+} catch (Exception e) {
+  //TODO

Review comment:
   ?

##
File path: 
solr/solrj/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
##
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agree

[GitHub] [lucene-solr] madrob commented on a change in pull request #2187: SOLR-15052 Reducing overseer bottlenecks using per-replica states (8x)

2021-01-08 Thread GitBox


madrob commented on a change in pull request #2187:
URL: https://github.com/apache/lucene-solr/pull/2187#discussion_r554164613



##
File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java
##
@@ -64,31 +64,8 @@
 import org.apache.solr.common.AlreadyClosedException;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.cloud.BeforeReconnect;
-import org.apache.solr.common.cloud.ClusterState;
-import org.apache.solr.common.cloud.ConnectionManager;
-import org.apache.solr.common.cloud.DefaultConnectionStrategy;
-import org.apache.solr.common.cloud.DefaultZkACLProvider;
-import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
-import org.apache.solr.common.cloud.DocCollection;
-import org.apache.solr.common.cloud.DocCollectionWatcher;
-import org.apache.solr.common.cloud.LiveNodesListener;
-import org.apache.solr.common.cloud.NodesSysPropsCacher;
-import org.apache.solr.common.cloud.OnReconnect;
-import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.*;

Review comment:
   please don't do wildcard imports.

##
File path: solr/core/src/java/org/apache/solr/cloud/overseer/NodeMutator.java
##
@@ -76,7 +80,12 @@
   }
 
   if (needToUpdateCollection) {

Review comment:
   we no longer need this boolean, can check whether downedReplicas is 
empty or not to decide

##
File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java
##
@@ -1609,12 +1586,40 @@ public void publish(final CoreDescriptor cd, final 
Replica.State state, boolean
   if (updateLastState) {
 cd.getCloudDescriptor().setLastPublished(state);
   }
-  overseerJobQueue.offer(Utils.toJSON(m));
+  DocCollection coll = zkStateReader.getCollection(collection);
+  if (forcePublish || sendToOverseer(coll, coreNodeName)) {

Review comment:
   other places we do additional checks, like 
https://github.com/apache/lucene-solr/pull/2187/files#diff-5307cc9f51d88f5a171591d2f429779e22b0168cd2114275d745bccea9d1a6b3R186
   
   please be consistent about whether we care to optimize for early termination 
(I suspect we don't)

##
File path: 
solr/core/src/test/org/apache/solr/handler/TestStressThreadBackup.java
##
@@ -61,7 +60,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-@Nightly
+//@Nightly

Review comment:
   ?

##
File path: solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java
##
@@ -129,11 +143,23 @@ public Replica(String name, Map propMap, 
String collection, Strin
 Objects.requireNonNull(this.nodeName, "'node_name' must not be null");
 Objects.requireNonNull(this.core, "'core' must not be null");
 Objects.requireNonNull(this.type, "'type' must not be null");
-if (propMap.get(ZkStateReader.STATE_PROP) != null) {
-  this.state = State.getState((String) 
propMap.get(ZkStateReader.STATE_PROP));
+ClusterState.getReplicaStatesProvider().get().ifPresent(it -> {
+  log.debug("A replica  {} state fetched from per-replica state", name);

Review comment:
   nit: I think this would be confusing, people might think it means that 
the state fetched was "A" for Active. Would log "Replica..."

##
File path: solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
##
@@ -1576,14 +1614,24 @@ public static DocCollection 
getCollectionLive(ZkStateReader zkStateReader, Strin
 }
   }
 
-  private DocCollection fetchCollectionState(String coll, Watcher watcher) 
throws KeeperException, InterruptedException {
-String collectionPath = getCollectionPath(coll);
+  public DocCollection fetchCollectionState(String coll, Watcher watcher, 
String path) throws KeeperException, InterruptedException {
+String collectionPath = path == null ? getCollectionPath(coll) : path;
 while (true) {
+  ClusterState.initReplicaStateProvider(() -> {
+try {
+  PerReplicaStates replicaStates = 
PerReplicaStates.fetch(collectionPath, zkClient, null);
+  log.info("per-replica-state ver: {} fetched for initializing {} ", 
replicaStates.cversion, collectionPath);
+  return replicaStates;
+} catch (Exception e) {
+  //TODO

Review comment:
   ?

##
File path: 
solr/solrj/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
##
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agree