This is an automated email from the ASF dual-hosted git repository.
jxue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new 2b33ffebe skip min active check if instance replica in unhealthy state
(#3019)
2b33ffebe is described below
commit 2b33ffebea4cd485c4c1e23cdefc886d95686292
Author: Grant Paláu Spencer <[email protected]>
AuthorDate: Wed Apr 30 13:49:58 2025 -0700
skip min active check if instance replica in unhealthy state (#3019)
Stoppable API will skip the min active replica check for a replica if that
replica is already in an unhealthy state. This is because stopping the instance
will not affect that partition's availability as the replica is already in an
unhealthy state (offline/error). Previously, this would prevent a node with an
error state replica from being stopped if min_active for that partition was not
satisfied. With this change, we will not consider the sister replicas and drop
anyways.
---
.../stages/ParticipantDeregistrationStage.java | 4 +-
.../apache/helix/util/InstanceValidationUtil.java | 5 +++
.../org/apache/helix/integration/TestWagedNPE.java | 3 --
.../helix/util/TestInstanceValidationUtil.java | 45 ++++++++++++++++++++++
.../helix/rest/server/TestInstancesAccessor.java | 2 +-
5 files changed, 53 insertions(+), 6 deletions(-)
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
index ef950a185..1918dca76 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
@@ -26,14 +26,14 @@ public class ParticipantDeregistrationStage extends
AbstractAsyncBaseStage {
@Override
public void execute(ClusterEvent event) throws Exception {
HelixManager manager =
event.getAttribute(AttributeName.helixmanager.name());
- ClusterConfig clusterConfig =
manager.getConfigAccessor().getClusterConfig(manager.getClusterName());
+ ResourceControllerDataProvider cache =
event.getAttribute(AttributeName.ControllerDataProvider.name());
+ ClusterConfig clusterConfig = cache.getClusterConfig();
if (clusterConfig == null ||
!clusterConfig.isParticipantDeregistrationEnabled()) {
LOG.debug("Cluster config is null or participant deregistration is not
enabled. "
+ "Skipping participant deregistration.");
return;
}
- ResourceControllerDataProvider cache =
event.getAttribute(AttributeName.ControllerDataProvider.name());
Map<String, Long> offlineTimeMap = cache.getInstanceOfflineTimeMap();
long deregisterDelay = clusterConfig.getParticipantDeregistrationTimeout();
long stageStartTime = System.currentTimeMillis();
diff --git
a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
index 5dea68334..981d43092 100644
--- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
+++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
@@ -449,6 +449,11 @@ public class InstanceValidationUtil {
Map<String, String> stateByInstanceMap =
externalView.getStateMap(partition);
// found the resource hosted on the instance
if (stateByInstanceMap.containsKey(instanceName)) {
+ // If this node's replica is in unhealthy state, skip the sibling
check as removing this replica will not
+ // negatively affect availability.
+ if (unhealthyStates.contains(stateByInstanceMap.get(instanceName))) {
+ continue;
+ }
int numHealthySiblings = 0;
for (Map.Entry<String, String> entry :
stateByInstanceMap.entrySet()) {
String siblingInstanceName = entry.getKey();
diff --git
a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
index 57930fa71..111ea5fa6 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
@@ -2,13 +2,10 @@ package org.apache.helix.integration;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.HashSet;
import java.util.List;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.TestHelper;
import org.apache.helix.common.ZkTestBase;
-import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer;
-import
org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy;
import org.apache.helix.controller.rebalancer.waged.WagedRebalancer;
import org.apache.helix.integration.manager.ClusterControllerManager;
import org.apache.helix.integration.manager.MockParticipantManager;
diff --git
a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
index 88dd05351..c95dbb707 100644
---
a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
+++
b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
@@ -544,6 +544,51 @@ public class TestInstanceValidationUtil {
InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor,
TEST_INSTANCE);
}
+ // Test that min active replica check only run on partitions where removing
the instance's replica would reduce
+ // the partition's availability. This means min active check will not be
performed on partitions where the instance's
+ // replica is in an unhealthy state
+ @Test
+ public void
TestSiblingNodesActiveReplicaCheckSuccessWithReplicaInErrorState() {
+ String resource = "resource";
+ Mock mock = new Mock();
+ doReturn(ImmutableList.of(resource)).when(mock.dataAccessor)
+ .getChildNames(argThat(new
PropertyKeyArgument(PropertyType.IDEALSTATES)));
+ // set ideal state
+ IdealState idealState = mock(IdealState.class);
+ when(idealState.isEnabled()).thenReturn(true);
+ when(idealState.isValid()).thenReturn(true);
+ when(idealState.getStateModelDefRef()).thenReturn("MasterSlave");
+ doReturn(idealState).when(mock.dataAccessor).getProperty(argThat(new
PropertyKeyArgument(PropertyType.IDEALSTATES)));
+
+ ExternalView externalView = mock(ExternalView.class);
+ when(externalView.getMinActiveReplicas()).thenReturn(3);
+ when(externalView.getStateModelDefRef()).thenReturn("MasterSlave");
+ when(externalView.getPartitionSet()).thenReturn(ImmutableSet.of("db0"));
+ when(externalView.getStateMap("db0")).thenReturn(
+ ImmutableMap.of(TEST_INSTANCE, "ERROR", "instance1", "ERROR",
"instance2", "ERROR"));
+ doReturn(externalView).when(mock.dataAccessor)
+ .getProperty(argThat(new
PropertyKeyArgument(PropertyType.EXTERNALVIEW)));
+ StateModelDefinition stateModelDefinition =
mock(StateModelDefinition.class);
+ when(stateModelDefinition.getInitialState()).thenReturn("OFFLINE");
+ doReturn(stateModelDefinition).when(mock.dataAccessor)
+ .getProperty(argThat(new
PropertyKeyArgument(PropertyType.STATEMODELDEFS)));
+
+ // This min active replica check should pass as the instance's replica is
in ERROR state, so it will not affect
+ // availability of the partition even though partition does not meet
healthy min active replica count
+ boolean result =
+
InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor,
TEST_INSTANCE);
+ Assert.assertTrue(result);
+
+ // This min active replica check should fail as the instance is in a
healthy state and removing it would reduce
+ // availability of the partition and partition would not meet the healthy
min active replica count
+ when(externalView.getStateMap("db0")).thenReturn(
+ ImmutableMap.of(TEST_INSTANCE, "Master", "instance1", "ERROR",
"instance2", "ERROR"));
+
+ result =
+
InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor,
TEST_INSTANCE);
+ Assert.assertFalse(result);
+ }
+
private class Mock {
HelixDataAccessor dataAccessor;
ConfigAccessor configAccessor;
diff --git
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
index 7fe4bae28..fc20d1d92 100644
---
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
+++
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
@@ -426,7 +426,7 @@ public class TestInstancesAccessor extends
AbstractTestClass {
JsonNode jsonResult =
OBJECT_MAPPER.readTree(response.readEntity(String.class));
Assert.assertFalse(jsonResult.get("stoppable").asBoolean());
Assert.assertEquals(getStringSet(jsonResult, "failedChecks"),
-
ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE","HELIX:MIN_ACTIVE_REPLICA_CHECK_FAILED"));
+
ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE"));
// Reenable instance0, it should passed the check
instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);