This is an automated email from the ASF dual-hosted git repository.

psalagnac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new d1498e8d5a7 SOLR-12831: Clean up ZK nodes after shard deletion is 
invoked (#3314)
d1498e8d5a7 is described below

commit d1498e8d5a7f4382a83a2800a0d1568bcd0f2a55
Author: Pierre Salagnac <[email protected]>
AuthorDate: Thu Apr 17 10:45:08 2025 +0200

    SOLR-12831: Clean up ZK nodes after shard deletion is invoked (#3314)
    
    Delete left over znodes on zookeeper that aren't removed after a delete 
shard command is issued. In DeleteShardCmd, use the zk client to issue a 
recursive delete from the specified root and its children (including that root).
---
 solr/CHANGES.txt                                   |  3 ++
 .../solr/cloud/api/collections/DeleteShardCmd.java | 32 +++++++++++++++++++++-
 .../org/apache/solr/cloud/DeleteShardTest.java     | 29 ++++++++++++++++++--
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 2e93c65d47a..4686b33af11 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -254,6 +254,9 @@ Bug Fixes
 * SOLR-5386: Fix various methods of invoking "local" solr requests to re-use 
the same searcher as the original request.
   This notably fixes the use of spellcheck.maxCollationTries in firstSearcher 
warming queries to prevent deadlock (hossman)
 
+* SOLR-12831: Clean up shard metadata in ZooKeeper nodes after shard deletion 
is invoked. This makes sure Zookeeper
+ nodes for leader election and terms are not left behind (Andy Vuong, Pierre 
Salagnac).
+
 Dependency Upgrades
 ---------------------
 * SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git 
a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java 
b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
index 72c93c02c62..d32b80d4b39 100644
--- 
a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
+++ 
b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
@@ -38,6 +38,7 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CoreAdminParams;
@@ -201,7 +202,7 @@ public class DeleteShardCmd implements 
CollApiCmds.CollectionApiCommand {
       } else {
         ccc.offerStateUpdate(m);
       }
-
+      cleanupZooKeeperShardMetadata(collectionName, sliceId);
       zkStateReader.waitForState(
           collectionName, 45, TimeUnit.SECONDS, (c) -> c.getSlice(sliceId) == 
null);
 
@@ -238,4 +239,33 @@ public class DeleteShardCmd implements 
CollApiCmds.CollectionApiCommand {
     }
     return sourceReplicas;
   }
+
+  /**
+   * Best effort to delete Zookeeper nodes that stored other details than the 
shard itself in
+   * cluster state. If we fail for any reason, we just log and the shard is 
still deleted.
+   */
+  private void cleanupZooKeeperShardMetadata(String collection, String sliceId)
+      throws InterruptedException {
+
+    String[] cleanupPaths =
+        new String[] {
+          ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + 
"/leader_elect/" + sliceId,
+          ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + 
sliceId,
+          ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + 
sliceId
+        };
+
+    SolrZkClient client = ccc.getZkStateReader().getZkClient();
+    for (String path : cleanupPaths) {
+      try {
+        client.clean(path);
+      } catch (KeeperException ex) {
+        log.warn(
+            "Non-fatal error occurred when deleting shard metadata {}/{} at 
path {}",
+            collection,
+            sliceId,
+            path,
+            ex);
+      }
+    }
+  }
 }
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java 
b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
index dec630995cb..de433340cc7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
@@ -16,13 +16,12 @@
  */
 package org.apache.solr.cloud;
 
-import java.io.IOException;
-import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.Slice.State;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.util.FileUtils;
 import org.junit.After;
 import org.junit.Before;
@@ -54,6 +53,8 @@ public class DeleteShardTest extends SolrCloudTestCase {
     DocCollection state = getCollectionState(collection);
     assertEquals(State.ACTIVE, state.getSlice("shard1").getState());
     assertEquals(State.ACTIVE, state.getSlice("shard2").getState());
+    assertShardMetadata(collection, "shard1", true);
+    assertShardMetadata(collection, "shard2", true);
 
     // Can't delete an ACTIVE shard
     expectThrows(
@@ -67,15 +68,17 @@ public class DeleteShardTest extends SolrCloudTestCase {
     // Can delete an INACTIVE shard
     CollectionAdminRequest.deleteShard(collection, 
"shard1").process(cluster.getSolrClient());
     waitForState("Expected 'shard1' to be removed", collection, c -> 
c.getSlice("shard1") == null);
+    assertShardMetadata(collection, "shard1", false);
 
     // Can delete a shard under construction
     setSliceState(collection, "shard2", Slice.State.CONSTRUCTION);
     CollectionAdminRequest.deleteShard(collection, 
"shard2").process(cluster.getSolrClient());
     waitForState("Expected 'shard2' to be removed", collection, c -> 
c.getSlice("shard2") == null);
+    assertShardMetadata(collection, "shard2", false);
   }
 
   @Test
-  public void testDirectoryCleanupAfterDeleteShard() throws IOException, 
SolrServerException {
+  public void testDirectoryCleanupAfterDeleteShard() throws Exception {
 
     final String collection = "deleteshard_test";
     CollectionAdminRequest.createCollectionWithImplicitRouter(collection, 
"conf", "a,b,c", 1)
@@ -125,4 +128,24 @@ public class DeleteShardTest extends SolrCloudTestCase {
         collectionName,
         c -> c.getSlice(shardId).getState() == state);
   }
+
+  /** Check whether shard metadata exist in Zookeeper. */
+  private void assertShardMetadata(String collection, String sliceId, boolean 
shouldExist)
+      throws Exception {
+    String collectionRoot = ZkStateReader.COLLECTIONS_ZKNODE + "/" + 
collection;
+
+    String leaderElectPath = collectionRoot + "/leader_elect/" + sliceId;
+    assertEquals(shouldExist, cluster.getZkClient().exists(leaderElectPath, 
true));
+
+    String leaderPath = collectionRoot + "/leaders/" + sliceId;
+    assertEquals(shouldExist, cluster.getZkClient().exists(leaderPath, true));
+
+    String termPath = collectionRoot + "/terms/" + sliceId;
+    assertEquals(shouldExist, cluster.getZkClient().exists(termPath, true));
+
+    // Check if the shard name is present in any node under the collection 
root.
+    // This way, new/unexpected stuff could be detected.
+    String layout = cluster.getZkClient().listZnode(collectionRoot, true);
+    assertEquals(shouldExist, layout.contains(sliceId));
+  }
 }

Reply via email to