This is an automated email from the ASF dual-hosted git repository.
psalagnac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new d1498e8d5a7 SOLR-12831: Clean up ZK nodes after shard deletion is
invoked (#3314)
d1498e8d5a7 is described below
commit d1498e8d5a7f4382a83a2800a0d1568bcd0f2a55
Author: Pierre Salagnac <[email protected]>
AuthorDate: Thu Apr 17 10:45:08 2025 +0200
SOLR-12831: Clean up ZK nodes after shard deletion is invoked (#3314)
Delete left over znodes on zookeeper that aren't removed after a delete
shard command is issued. In DeleteShardCmd, use the zk client to issue a
recursive delete from the specified root and its children (including that root).
---
solr/CHANGES.txt | 3 ++
.../solr/cloud/api/collections/DeleteShardCmd.java | 32 +++++++++++++++++++++-
.../org/apache/solr/cloud/DeleteShardTest.java | 29 ++++++++++++++++++--
3 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 2e93c65d47a..4686b33af11 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -254,6 +254,9 @@ Bug Fixes
* SOLR-5386: Fix various methods of invoking "local" solr requests to re-use
the same searcher as the original request.
This notably fixes the use of spellcheck.maxCollationTries in firstSearcher
warming queries to prevent deadlock (hossman)
+* SOLR-12831: Clean up shard metadata in ZooKeeper nodes after shard deletion
is invoked. This makes sure Zookeeper
+ nodes for leader election and terms are not left behind (Andy Vuong, Pierre
Salagnac).
+
Dependency Upgrades
---------------------
* SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git
a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
index 72c93c02c62..d32b80d4b39 100644
---
a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
+++
b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
@@ -38,6 +38,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams;
@@ -201,7 +202,7 @@ public class DeleteShardCmd implements
CollApiCmds.CollectionApiCommand {
} else {
ccc.offerStateUpdate(m);
}
-
+ cleanupZooKeeperShardMetadata(collectionName, sliceId);
zkStateReader.waitForState(
collectionName, 45, TimeUnit.SECONDS, (c) -> c.getSlice(sliceId) ==
null);
@@ -238,4 +239,33 @@ public class DeleteShardCmd implements
CollApiCmds.CollectionApiCommand {
}
return sourceReplicas;
}
+
+ /**
+ * Best effort to delete Zookeeper nodes that stored other details than the
shard itself in
+ * cluster state. If we fail for any reason, we just log and the shard is
still deleted.
+ */
+ private void cleanupZooKeeperShardMetadata(String collection, String sliceId)
+ throws InterruptedException {
+
+ String[] cleanupPaths =
+ new String[] {
+ ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection +
"/leader_elect/" + sliceId,
+ ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" +
sliceId,
+ ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" +
sliceId
+ };
+
+ SolrZkClient client = ccc.getZkStateReader().getZkClient();
+ for (String path : cleanupPaths) {
+ try {
+ client.clean(path);
+ } catch (KeeperException ex) {
+ log.warn(
+ "Non-fatal error occurred when deleting shard metadata {}/{} at
path {}",
+ collection,
+ sliceId,
+ path,
+ ex);
+ }
+ }
+ }
}
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
index dec630995cb..de433340cc7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
@@ -16,13 +16,12 @@
*/
package org.apache.solr.cloud;
-import java.io.IOException;
-import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.Slice.State;
+import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.util.FileUtils;
import org.junit.After;
import org.junit.Before;
@@ -54,6 +53,8 @@ public class DeleteShardTest extends SolrCloudTestCase {
DocCollection state = getCollectionState(collection);
assertEquals(State.ACTIVE, state.getSlice("shard1").getState());
assertEquals(State.ACTIVE, state.getSlice("shard2").getState());
+ assertShardMetadata(collection, "shard1", true);
+ assertShardMetadata(collection, "shard2", true);
// Can't delete an ACTIVE shard
expectThrows(
@@ -67,15 +68,17 @@ public class DeleteShardTest extends SolrCloudTestCase {
// Can delete an INACTIVE shard
CollectionAdminRequest.deleteShard(collection,
"shard1").process(cluster.getSolrClient());
waitForState("Expected 'shard1' to be removed", collection, c ->
c.getSlice("shard1") == null);
+ assertShardMetadata(collection, "shard1", false);
// Can delete a shard under construction
setSliceState(collection, "shard2", Slice.State.CONSTRUCTION);
CollectionAdminRequest.deleteShard(collection,
"shard2").process(cluster.getSolrClient());
waitForState("Expected 'shard2' to be removed", collection, c ->
c.getSlice("shard2") == null);
+ assertShardMetadata(collection, "shard2", false);
}
@Test
- public void testDirectoryCleanupAfterDeleteShard() throws IOException,
SolrServerException {
+ public void testDirectoryCleanupAfterDeleteShard() throws Exception {
final String collection = "deleteshard_test";
CollectionAdminRequest.createCollectionWithImplicitRouter(collection,
"conf", "a,b,c", 1)
@@ -125,4 +128,24 @@ public class DeleteShardTest extends SolrCloudTestCase {
collectionName,
c -> c.getSlice(shardId).getState() == state);
}
+
+ /** Check whether shard metadata exist in Zookeeper. */
+ private void assertShardMetadata(String collection, String sliceId, boolean
shouldExist)
+ throws Exception {
+ String collectionRoot = ZkStateReader.COLLECTIONS_ZKNODE + "/" +
collection;
+
+ String leaderElectPath = collectionRoot + "/leader_elect/" + sliceId;
+ assertEquals(shouldExist, cluster.getZkClient().exists(leaderElectPath,
true));
+
+ String leaderPath = collectionRoot + "/leaders/" + sliceId;
+ assertEquals(shouldExist, cluster.getZkClient().exists(leaderPath, true));
+
+ String termPath = collectionRoot + "/terms/" + sliceId;
+ assertEquals(shouldExist, cluster.getZkClient().exists(termPath, true));
+
+ // Check if the shard name is present in any node under the collection
root.
+ // This way, new/unexpected stuff could be detected.
+ String layout = cluster.getZkClient().listZnode(collectionRoot, true);
+ assertEquals(shouldExist, layout.contains(sliceId));
+ }
}