This is an automated email from the ASF dual-hosted git repository. psalagnac pushed a commit to branch delete-zk-nodes in repository https://gitbox.apache.org/repos/asf/solr.git
commit 8d1601156745853c5f89f056f10d71d7bad29722 Author: Pierre Salagnac <[email protected]> AuthorDate: Tue Apr 8 11:34:24 2025 +0200 SOLR-14240: Clean up ZK nodes after shard deletion is invoked --- solr/CHANGES.txt | 3 ++ .../solr/cloud/api/collections/DeleteShardCmd.java | 32 +++++++++++++++++++++- .../org/apache/solr/cloud/DeleteShardTest.java | 21 ++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 61a1d98e47a..1a4c70dbc16 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -243,6 +243,9 @@ Bug Fixes * SOLR-17692: Core unload/deletion now preempts all forms of ongoing "recovery", rather than inadvertently waiting for completion in some cases. (Jason Gerlowski) +* SOLR-12831: Clean up shard metadata in ZooKeeper nodes after shard deletion is invoked. This makes sure Zookeeper + nodes for leader election and terms are not left behind (Andy Vuong, Pierre Salagnac). + Dependency Upgrades --------------------- * SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke) diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java index 72c93c02c62..d32b80d4b39 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java @@ -38,6 +38,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CoreAdminParams; @@ -201,7 +202,7 @@ public class DeleteShardCmd implements CollApiCmds.CollectionApiCommand { } else { ccc.offerStateUpdate(m); } - + cleanupZooKeeperShardMetadata(collectionName, sliceId); zkStateReader.waitForState( collectionName, 45, TimeUnit.SECONDS, (c) -> c.getSlice(sliceId) == null); @@ -238,4 +239,33 @@ public class DeleteShardCmd implements CollApiCmds.CollectionApiCommand { } return sourceReplicas; } + + /** + * Best effort to delete Zookeeper nodes that stored other details than the shard itself in + * cluster state. If we fail for any reason, we just log and the shard is still deleted. + */ + private void cleanupZooKeeperShardMetadata(String collection, String sliceId) + throws InterruptedException { + + String[] cleanupPaths = + new String[] { + ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/" + sliceId, + ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + sliceId, + ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + sliceId + }; + + SolrZkClient client = ccc.getZkStateReader().getZkClient(); + for (String path : cleanupPaths) { + try { + client.clean(path); + } catch (KeeperException ex) { + log.warn( + "Non-fatal error occurred when deleting shard metadata {}/{} at path {}", + collection, + sliceId, + path, + ex); + } + } + } } diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java index dec630995cb..08b0f96320f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java @@ -23,7 +23,9 @@ import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.Slice.State; +import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.util.FileUtils; +import org.apache.zookeeper.KeeperException; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -54,6 +56,8 @@ public class DeleteShardTest extends SolrCloudTestCase { DocCollection state = getCollectionState(collection); assertEquals(State.ACTIVE, state.getSlice("shard1").getState()); assertEquals(State.ACTIVE, state.getSlice("shard2").getState()); + assertShardMetadata(collection, "shard1", true); + assertShardMetadata(collection, "shard2", true); // Can't delete an ACTIVE shard expectThrows( @@ -67,11 +71,13 @@ public class DeleteShardTest extends SolrCloudTestCase { // Can delete an INACTIVE shard CollectionAdminRequest.deleteShard(collection, "shard1").process(cluster.getSolrClient()); waitForState("Expected 'shard1' to be removed", collection, c -> c.getSlice("shard1") == null); + assertShardMetadata(collection, "shard1", false); // Can delete a shard under construction setSliceState(collection, "shard2", Slice.State.CONSTRUCTION); CollectionAdminRequest.deleteShard(collection, "shard2").process(cluster.getSolrClient()); waitForState("Expected 'shard2' to be removed", collection, c -> c.getSlice("shard2") == null); + assertShardMetadata(collection, "shard2", false); } @Test @@ -125,4 +131,19 @@ public class DeleteShardTest extends SolrCloudTestCase { collectionName, c -> c.getSlice(shardId).getState() == state); } + + /** Check whether shard metadata exist in Zookeeper. */ + private void assertShardMetadata(String collection, String sliceId, boolean shouldExist) + throws KeeperException, InterruptedException { + + String leaderElectPath = + ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/" + sliceId; + assertEquals(shouldExist, cluster.getZkClient().exists(leaderElectPath, true)); + + String leaderPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leaders/" + sliceId; + assertEquals(shouldExist, cluster.getZkClient().exists(leaderPath, true)); + + String termPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + sliceId; + assertEquals(shouldExist, cluster.getZkClient().exists(termPath, true)); + } }
