This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new d54e2cdf1c4 Backport SOLR-18008 to 9x (#4086)
d54e2cdf1c4 is described below
commit d54e2cdf1c4beeba016657790ecd2b8343a4aabe
Author: Eric Pugh <[email protected]>
AuthorDate: Wed Jan 28 08:31:27 2026 -0500
Backport SOLR-18008 to 9x (#4086)
---
.../SOLR-18008-simulate_solr_core_remnants.yml | 9 +
.../java/org/apache/solr/core/CoreContainer.java | 4 +-
.../apache/solr/core/CorePropertiesLocator.java | 21 ++
.../solr/cloud/DeleteCoreRemnantsOnCreateTest.java | 320 +++++++++++++++++++++
.../configuration-guide/pages/core-discovery.adoc | 2 +
.../pages/collection-management.adoc | 9 +
6 files changed, 363 insertions(+), 2 deletions(-)
diff --git a/changelog/unreleased/SOLR-18008-simulate_solr_core_remnants.yml
b/changelog/unreleased/SOLR-18008-simulate_solr_core_remnants.yml
new file mode 100644
index 00000000000..45d804cccd6
--- /dev/null
+++ b/changelog/unreleased/SOLR-18008-simulate_solr_core_remnants.yml
@@ -0,0 +1,9 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Add solr.cloud.delete.unknown.cores.enabled setting for removing
unknown but existing core data when a core is created in SolrCloud mode.
+type: changed # added, changed, fixed, deprecated, removed, dependency_update,
security, other
+authors:
+ - name: Eric Pugh
+ - name: David Smiley
+links:
+- name: SOLR-18008
+ url: https://issues.apache.org/jira/browse/SOLR-18008
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 149e1c4ffc3..776219293f2 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -1782,7 +1782,7 @@ public class CoreContainer {
} catch (Exception e) {
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
if (e instanceof ZkController.NotInClusterStateException &&
!newCollection) {
- // this mostly happens when the core is deleted when this node is down
+ // this mostly happens when the core is deleted when this node is down,
// but it can also happen if connecting to the wrong zookeeper
final boolean deleteUnknownCores =
Boolean.parseBoolean(System.getProperty("solr.deleteUnknownCores",
"false"));
@@ -1793,7 +1793,7 @@ public class CoreContainer {
(deleteUnknownCores
? " It will be deleted. See SOLR-13396 for more information."
: ""));
- // We alreday have an ongoing CoreOp, so do not wait to start another
one
+ // We already have an ongoing CoreOp, so do not wait to start another
one
unloadWithoutCoreOp(
dcore.getName(), deleteUnknownCores, deleteUnknownCores,
deleteUnknownCores);
throw e;
diff --git a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
index c060fc5d3cd..2fe356dfca8 100644
--- a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
+++ b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java
@@ -39,6 +39,7 @@ import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.EnvUtils;
import org.apache.solr.util.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -65,6 +66,7 @@ public class CorePropertiesLocator implements CoresLocator {
@Override
public void create(CoreContainer cc, CoreDescriptor... coreDescriptors) {
for (CoreDescriptor cd : coreDescriptors) {
+ checkForExistingCore(cd);
Path propertiesFile = cd.getInstanceDir().resolve(PROPERTIES_FILENAME);
if (Files.exists(propertiesFile))
throw new SolrException(
@@ -240,4 +242,23 @@ public class CorePropertiesLocator implements CoresLocator
{
p.putAll(cd.getPersistableUserProperties());
return p;
}
+
+ protected void checkForExistingCore(CoreDescriptor cd) {
+ if (cd.getCloudDescriptor() != null && Files.exists(cd.getInstanceDir())) {
+ final boolean deleteUnknownCores =
+
EnvUtils.getPropertyAsBool("solr.cloud.delete.unknown.cores.enabled", false);
+ if (deleteUnknownCores) {
+ log.warn(
+ "Automatically deleting existing directory at [{}] for core [{}]
because solr.cloud.delete.unknown.cores.enabled is true",
+ cd.getInstanceDir().toAbsolutePath(),
+ cd.getName());
+ SolrCore.deleteUnloadedCore(cd, true, true);
+ } else {
+ log.warn(
+ "Directory at [{}] for core[{}] already exists may prevent create
operation. Set solr.cloud.delete.unknown.cores.enabled=true to delete
directory. (SOLR-18008)",
+ cd.getInstanceDir().toAbsolutePath(),
+ cd.getName());
+ }
+ }
+ }
}
diff --git
a/solr/core/src/test/org/apache/solr/cloud/DeleteCoreRemnantsOnCreateTest.java
b/solr/core/src/test/org/apache/solr/cloud/DeleteCoreRemnantsOnCreateTest.java
new file mode 100644
index 00000000000..3f07879a148
--- /dev/null
+++
b/solr/core/src/test/org/apache/solr/cloud/DeleteCoreRemnantsOnCreateTest.java
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Optional;
+import org.apache.solr.client.solrj.impl.JsonMapResponseParser;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.CoreAdminRequest;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.embedded.JettySolrRunner;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests the "solr.cloud.delete.unknown.cores.enabled" property, which can be used when
Solr is in an
+ * inconsistent state in its core lifecycle, with remnant files left on
disk after various
+ * operations that delete a core.
+ */
+public class DeleteCoreRemnantsOnCreateTest extends SolrCloudTestCase {
+ private static final String DELETE_UNKNOWN_CORES_PROP =
"solr.cloud.delete.unknown.cores.enabled";
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ configureCluster(1).addConfig("conf",
configset("cloud-minimal")).configure();
+ }
+
+ /**
+ * Shared setup for testing collection creation with remnants. Creates a
collection, deletes it,
+ * and then leaves behind a remnant directory.
+ */
+ private String setupCollectionRemnant(String collectionName) throws
Exception {
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ String primaryNode = jettys.get(0).getNodeName();
+
+ CollectionAdminRequest.Create createRequest =
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1);
+ createRequest.process(cluster.getSolrClient());
+
+ waitForState(
+ "Expected collection to be fully active",
+ collectionName,
+ (n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
+
+ Replica primaryReplica = getReplicaOnNode(collectionName, "shard1",
primaryNode);
+ JettySolrRunner primaryJetty = cluster.getReplicaJetty(primaryReplica);
+ String originalCoreName = primaryReplica.getCoreName();
+ Path remnantInstanceDir;
+ try (SolrCore core =
primaryJetty.getCoreContainer().getCore(originalCoreName)) {
+ CoreDescriptor cd = core.getCoreDescriptor();
+ remnantInstanceDir = cd.getInstanceDir();
+ }
+
+
CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient());
+ waitForState("Expected collection deletion", collectionName, (n, c) -> c
== null);
+
+ // Simulate a core remnant that still exists by creating the directory and
core.properties
+ Files.createDirectories(remnantInstanceDir);
+ String propertiesContent = "";
+ Files.writeString(
+ remnantInstanceDir.resolve("core.properties"), propertiesContent,
StandardCharsets.UTF_8);
+
+ return originalCoreName;
+ }
+
+ /**
+ * Shared setup for testing replica addition with remnants. Creates a
collection, then simulates a
+ * remnant directory on the single node that will impact the next addReplica
command.
+ */
+ private void setupReplicaRemnant(String collectionName) throws Exception {
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ String primaryNode = jettys.get(0).getNodeName();
+
+ CollectionAdminRequest.Create createRequest =
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1);
+ createRequest.process(cluster.getSolrClient());
+
+ waitForState(
+ "Expected collection to be fully active",
+ collectionName,
+ (n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
+
+ int nextReplicaIndex = 3; // Yep, from 1 to 3 due to how we count in ZK
and setup.
+ String expectedNewReplicaName = collectionName + "_shard1_replica_n" +
nextReplicaIndex;
+
+ // Simulate a core remnant on the single node adjacent to the existing
replica instance path
+ Replica existing = getReplicaOnNode(collectionName, "shard1", primaryNode);
+ try (SolrCore core =
+
cluster.getReplicaJetty(existing).getCoreContainer().getCore(existing.getCoreName()))
{
+ Path siblingDir =
core.getInstancePath().getParent().resolve(expectedNewReplicaName);
+ Files.createDirectories(siblingDir);
+ Files.writeString(
+ siblingDir.resolve("core.properties"),
+ "name="
+ + expectedNewReplicaName
+ + "_remnant\n"
+ + "collection="
+ + collectionName
+ + "_remnant\n"
+ + "shard=shard1\n"
+ + "coreNodeName=core_node_remnant\n",
+ StandardCharsets.UTF_8);
+ }
+ }
+
+ /**
+ * Shared setup for testing DeleteCore admin API with remnants. Creates a
collection, deletes it,
+ * and then leaves behind a remnant core directory.
+ */
+ private String setupCoreRemnantForUnloadCoreOperation(String collectionName)
throws Exception {
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ String primaryNode = jettys.get(0).getNodeName();
+
+ CollectionAdminRequest.Create createRequest =
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1);
+ createRequest.process(cluster.getSolrClient());
+
+ waitForState(
+ "Expected collection to be fully active",
+ collectionName,
+ (n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
+
+ Replica primaryReplica = getReplicaOnNode(collectionName, "shard1",
primaryNode);
+ JettySolrRunner primaryJetty = cluster.getReplicaJetty(primaryReplica);
+ String originalCoreName = primaryReplica.getCoreName();
+ Path remnantInstanceDir;
+ try (SolrCore core =
primaryJetty.getCoreContainer().getCore(originalCoreName)) {
+ CoreDescriptor cd = core.getCoreDescriptor();
+ remnantInstanceDir = cd.getInstanceDir();
+ }
+
+
CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient());
+ waitForState("Expected collection deletion", collectionName, (n, c) -> c
== null);
+
+ // Simulate a core remnant that still exists by creating the directory and
core.properties
+ Files.createDirectories(remnantInstanceDir);
+ Files.writeString(
+ remnantInstanceDir.resolve("core.properties"),
+ "name=" + originalCoreName + "\n",
+ StandardCharsets.UTF_8);
+
+ return originalCoreName;
+ }
+
+ @Test
+ public void testCreateCollectionWithRemnantsFailsWithoutSetting() throws
Exception {
+ assertNull(
+ "Property should not be set by default",
System.getProperty(DELETE_UNKNOWN_CORES_PROP));
+
+ String collectionName = "coreRemnantCreateNoSetting";
+ setupCollectionRemnant(collectionName);
+
+ // Try to create the collection again - this demonstrates the behavior
without the setting
+ // In typical environments, this might fail, but behavior depends on
configuration
+ CollectionAdminRequest.Create recreateRequest =
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1);
+
+ // The request to create a collection SHOULD fail because of the remnant
file. If it does not, it
+ // means we've changed Solr's behavior when creating a core while
+ // remnants exist, and therefore we should rethink the utility of this
setting.
+ Exception e =
+ assertThrows(
+ "This request to recreate the collection should have failed due to
remnant files.",
+ Exception.class,
+ () -> recreateRequest.process(cluster.getSolrClient()));
+
+ assertTrue(
+ "Verify the exception was due to core creation failed.",
+ e.getMessage().contains("Underlying core creation failed"));
+ }
+
+ @Test
+ public void testCreateCollectionWithRemnantsWithSetting() throws Exception {
+ System.setProperty(DELETE_UNKNOWN_CORES_PROP, "true");
+
+ String collectionName = "coreRemnantCreateWithSetting";
+ setupCollectionRemnant(collectionName);
+
+ // With the setting enabled, collection creation should succeed despite
remnants
+ CollectionAdminRequest.Create recreateRequest =
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1);
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ recreateRequest.process(cluster.getSolrClient());
+
+ waitForState(
+ "Expected recreated collection to be fully active",
+ collectionName,
+ (n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
+
+ // Verify collection was created successfully
+ DocCollection collection = getCollectionState(collectionName);
+ assertNotNull("Collection should exist", collection);
+ assertEquals("Should have 1 replica", 1, collection.getReplicas().size());
+
+ // Verify replica on the node where we had the remnant is active
+ Replica recreatedReplica =
+ getReplicaOnNode(collectionName, "shard1",
jettys.get(0).getNodeName());
+ assertNotNull("Should have a replica on the primary node",
recreatedReplica);
+ assertEquals("Replica should be active", Replica.State.ACTIVE,
recreatedReplica.getState());
+ }
+
+ @Test
+ public void testAddReplicaWithRemnantFailsWithoutSetting() throws Exception {
+ assertNull(
+ "Property should not be set by default",
System.getProperty(DELETE_UNKNOWN_CORES_PROP));
+
+ String collectionName = "coreRemnantAddReplicaNoSetting";
+ setupReplicaRemnant(collectionName);
+
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ String primaryNode = jettys.get(0).getNodeName();
+
+ // Try to add a new replica - this demonstrates the behavior without the
setting
+ CollectionAdminRequest.AddReplica addReplicaRequest =
+ CollectionAdminRequest.addReplicaToShard(collectionName, "shard1");
+ addReplicaRequest.setNode(primaryNode);
+
+ Exception e =
+ assertThrows(
+ "This request to add a replica to the collection should have
failed due to remnant files.",
+ Exception.class,
+ () -> addReplicaRequest.process(cluster.getSolrClient()));
+
+ assertTrue(
+ "Verify the exception was due to core creation failed.",
+ e.getMessage().contains("ADDREPLICA failed to create replica"));
+ }
+
+ @Test
+ public void testAddReplicaWithRemnantWithSetting() throws Exception {
+ System.setProperty(DELETE_UNKNOWN_CORES_PROP, "true");
+
+ String collectionName = "coreRemnantAddReplicaWithSetting";
+ setupReplicaRemnant(collectionName);
+
+ List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+ String primaryNode = jettys.get(0).getNodeName();
+
+ // With the setting enabled, replica addition should succeed despite
remnants
+ CollectionAdminRequest.AddReplica addReplicaRequest =
+ CollectionAdminRequest.addReplicaToShard(collectionName, "shard1");
+ addReplicaRequest.setNode(primaryNode);
+ addReplicaRequest.process(cluster.getSolrClient());
+
+ waitForState(
+ "Expected replica addition to finish",
+ collectionName,
+ (n, c) -> DocCollection.isFullyActive(n, c, 1, 2));
+
+ // Verify collection now has 2 replicas
+ DocCollection collection = getCollectionState(collectionName);
+ assertNotNull("Collection should exist", collection);
+ assertEquals("Should have 2 replicas after adding", 2,
collection.getReplicas().size());
+
+ // Verify the replica was added on the single node and is active
+ Replica addedReplica = getReplicaOnNode(collectionName, "shard1",
primaryNode);
+ assertNotNull("Should have added a replica on the primary node",
addedReplica);
+ assertEquals("Added replica should be active", Replica.State.ACTIVE,
addedReplica.getState());
+ }
+
+ /**
+ * This test demonstrates that you can't call the direct core unload admin
operation to get rid of
+ * a remnant core, not because of the existence of the remnant, but because
the core no longer has
+ * a CoreDescriptor record in ZooKeeper.
+ */
+ @Test
+ public void testDeleteCoreFailsWhenUnknown() throws Exception {
+
+ String collectionName = "coreRemnantDelete";
+ String coreName = setupCollectionRemnant(collectionName);
+
+ // Try to delete a core that only exists as a remnant - and has no record
in ZooKeeper
+ CoreAdminRequest.Unload unloadRequest = new CoreAdminRequest.Unload(true);
+ unloadRequest.setDeleteIndex(true);
+ unloadRequest.setDeleteDataDir(true);
+ unloadRequest.setDeleteInstanceDir(true);
+ unloadRequest.setCoreName(coreName);
+ unloadRequest.setResponseParser(new JsonMapResponseParser());
+
+ Exception e =
+ assertThrows(
+ "Expected request to fail.",
+ Exception.class,
+ () -> cluster.getSolrClient().request(unloadRequest));
+
+ assertTrue(
+ "Verify the exception was due to ZK not knowing about the core
existence.",
+ e.getMessage().contains("Cannot unload non-existent core [" + coreName
+ "]"));
+ }
+
+ private Replica getReplicaOnNode(String collectionName, String shard, String
nodeName) {
+ DocCollection collectionState = getCollectionState(collectionName);
+ Slice slice = collectionState.getSlice(shard);
+ Optional<Replica> replica =
+ slice.getReplicas().stream().filter(r ->
nodeName.equals(r.getNodeName())).findFirst();
+ return replica.orElseThrow(
+ () -> new AssertionError("No replica found on node " + nodeName + "
for " + shard));
+ }
+}
diff --git
a/solr/solr-ref-guide/modules/configuration-guide/pages/core-discovery.adoc
b/solr/solr-ref-guide/modules/configuration-guide/pages/core-discovery.adoc
index 779c0812c3b..944ede33e3a 100644
--- a/solr/solr-ref-guide/modules/configuration-guide/pages/core-discovery.adoc
+++ b/solr/solr-ref-guide/modules/configuration-guide/pages/core-discovery.adoc
@@ -18,6 +18,8 @@
Core discovery means that creating a core is as simple as a `core.properties`
file located on disk.
+TIP: If you are running in SolrCloud mode, you are shielded from this
complexity.
+
== The core.properties File
In Solr, the term _core_ is used to refer to a single index and associated
transaction log and configuration files (including the `solrconfig.xml` and
schema files, among others).
diff --git
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
index f743fb4ea8c..1d70836584a 100644
---
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
+++
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
@@ -292,6 +292,15 @@ Request ID to track this action which will be
xref:configuration-guide:collectio
Collections are first created in read-write mode but can be put in `readOnly`
mode using the
xref:collection-management.adoc#modifycollection[MODIFYCOLLECTION] action.
+Solr can occasionally enter an inconsistent state where remnant core files
remain on disk after previous collection deletion operations fail.
+
+The system property `solr.cloud.delete.unknown.cores.enabled` is an
expert-level setting designed to handle this situation.
+When enabled, Solr automatically deletes, on startup, any remnant core data that
lacks a record in ZooKeeper.
+During core creation (whether for new collections or replicas), Solr also
deletes preexisting remnant files, allowing operations to complete.
+Without this setting, these files would cause new core creation to fail.
+
+Enable this feature with caution: needing it indicates an underlying problem in your
Solr setup that should be investigated.
+
=== CREATE Response
The response will include the status of the request and the new core names.