This is an automated email from the ASF dual-hosted git repository.
xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new e6adf5bc9 Exclude Custom Checks When Instance Is Not Alive (#2993)
e6adf5bc9 is described below
commit e6adf5bc934a01ab87bc44c03015208a45ca6b4f
Author: Xiaxuan Gao <[email protected]>
AuthorDate: Tue Jan 21 11:20:45 2025 -0800
Exclude Custom Checks When Instance Is Not Alive (#2993)
Exclude Custom Checks When Instance Is Not Alive
---
.../MaintenanceManagementService.java | 62 ++++++++++++++++++++--
.../server/resources/helix/InstancesAccessor.java | 9 ++++
.../helix/rest/server/TestInstancesAccessor.java | 47 +++++++++++++++-
3 files changed, 112 insertions(+), 6 deletions(-)
diff --git
a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
index 925b2d216..8d5d025b3 100644
---
a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
+++
b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
@@ -45,6 +45,7 @@ import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.HelixException;
+import org.apache.helix.PropertyKey;
import org.apache.helix.manager.zk.ZKHelixDataAccessor;
import org.apache.helix.model.CurrentState;
import org.apache.helix.model.ExternalView;
@@ -98,6 +99,8 @@ public class MaintenanceManagementService {
// maintain the backward compatibility with users who don't use
MaintenanceManagementServiceBuilder
// to create the MaintenanceManagementService object.
private List<HealthCheck> _skipStoppableHealthCheckList =
Collections.emptyList();
+ // default value false to maintain backward compatibility
+ private boolean _skipCustomChecksIfNoLiveness = false;
public MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
ConfigAccessor configAccessor, boolean skipZKRead, String namespace) {
@@ -152,7 +155,7 @@ public class MaintenanceManagementService {
private MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
ConfigAccessor configAccessor, CustomRestClient customRestClient,
boolean skipZKRead,
Set<String> nonBlockingHealthChecks, Set<StoppableCheck.Category>
skipHealthCheckCategories,
- List<HealthCheck> skipStoppableHealthCheckList, String namespace) {
+ List<HealthCheck> skipStoppableHealthCheckList, String namespace,
boolean skipCustomChecksIfNoLiveness) {
_dataAccessor =
new HelixDataAccessorWrapper(dataAccessor, customRestClient,
namespace);
@@ -166,6 +169,7 @@ public class MaintenanceManagementService {
_skipStoppableHealthCheckList = skipStoppableHealthCheckList == null ?
Collections.emptyList()
: skipStoppableHealthCheckList;
_namespace = namespace;
+ _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
}
/**
@@ -502,15 +506,20 @@ public class MaintenanceManagementService {
return instances;
}
+ // Skip performing a custom check on any dead instance if the user set
_skipCustomCheckIfInstanceNotAlive
+ // to true.
+ List<String> instanceIdsForCustomCheck =
filterOutDeadInstancesIfNeeded(instances);
+
// If the config has exactUrl and the CLUSTER level customer check is not
skipped, we will
// perform the custom check at cluster level.
if (restConfig.getCompleteConfiguredHealthUrl().isPresent()) {
- if
(_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK))
{
+ if
(_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK)
+ || instanceIdsForCustomCheck.isEmpty()) {
return instances;
}
Map<String, StoppableCheck> clusterLevelCustomCheckResult =
- performAggregatedCustomCheck(clusterId, instances,
+ performAggregatedCustomCheck(clusterId, instanceIdsForCustomCheck,
restConfig.getCompleteConfiguredHealthUrl().get(),
customPayLoads,
toBeStoppedInstances);
List<String> instancesForNextCheck = new ArrayList<>();
@@ -526,7 +535,7 @@ public class MaintenanceManagementService {
// Reaching here means the rest config requires instances/partition level
checks. We will
// perform the custom check at instance/partition level if they are not
skipped.
- List<String> instancesForCustomPartitionLevelChecks = instances;
+ List<String> instancesForCustomPartitionLevelChecks =
instanceIdsForCustomCheck;
if
(!_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_INSTANCE_CHECK))
{
Map<String, Future<StoppableCheck>> customInstanceLevelChecks =
instances.stream().collect(
Collectors.toMap(Function.identity(), instance -> POOL.submit(
@@ -560,6 +569,42 @@ public class MaintenanceManagementService {
return instancesForCustomPartitionLevelChecks;
}
+ /**
+ * Helper Methods
+ * <p>
+ * If users set skipCustomCheckIfInstanceNotAlive to true, filter out dead
instances
+ * to avoid running custom checks on them.
+ *
+ * @param instanceIds the list of instances
+ * @return either the original list or a filtered list of only live instances
+ */
+ private List<String> filterOutDeadInstancesIfNeeded(List<String>
instanceIds) {
+ if (!_skipCustomChecksIfNoLiveness) {
+ // We are not skipping the not-alive check, so just return all instances.
+ return instanceIds;
+ }
+
+ // Retrieve the set of currently live instances
+ PropertyKey.Builder keyBuilder = _dataAccessor.keyBuilder();
+ List<String> liveNodes =
_dataAccessor.getChildNames(keyBuilder.liveInstances());
+
+ // Filter out instances that are not in the live list
+ List<String> filtered = new ArrayList<>();
+ List<String> skipped = new ArrayList<>();
+ for (String instanceId : instanceIds) {
+ if (liveNodes.contains(instanceId)) {
+ filtered.add(instanceId);
+ } else {
+ skipped.add(instanceId);
+ }
+ }
+
+ if (!skipped.isEmpty()) {
+ LOG.info("Skipping any custom checks for instances due to liveness: {}",
skipped);
+ }
+ return filtered;
+ }
+
private Map<String, MaintenanceManagementInstanceInfo>
batchInstanceHealthCheck(String clusterId,
List<String> instances, List<String> healthChecks, Map<String, String>
healthCheckConfig) {
List<String> instancesForNext = new ArrayList<>(instances);
@@ -890,6 +935,7 @@ public class MaintenanceManagementService {
public static class MaintenanceManagementServiceBuilder {
private ConfigAccessor _configAccessor;
private boolean _skipZKRead;
+ private boolean _skipCustomChecksIfNoLiveness = false;
private String _namespace;
private ZKHelixDataAccessor _dataAccessor;
private CustomRestClient _customRestClient;
@@ -942,11 +988,17 @@ public class MaintenanceManagementService {
return this;
}
+ public MaintenanceManagementServiceBuilder setSkipCustomChecksIfNoLiveness(
+ boolean skipCustomChecksIfNoLiveness) {
+ _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
+ return this;
+ }
+
public MaintenanceManagementService build() {
validate();
return new MaintenanceManagementService(_dataAccessor, _configAccessor,
_customRestClient,
_skipZKRead, _nonBlockingHealthChecks, _skipHealthCheckCategories,
- _skipStoppableHealthCheckList, _namespace);
+ _skipStoppableHealthCheckList, _namespace,
_skipCustomChecksIfNoLiveness);
}
private void validate() throws IllegalArgumentException {
diff --git
a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
index 52313a66b..1d6e249db 100644
---
a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
+++
b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
@@ -73,6 +73,7 @@ public class InstancesAccessor extends AbstractHelixResource {
online,
disabled,
selection_base,
+ skip_custom_check_if_instance_not_alive,
zone_order,
to_be_stopped_instances,
skip_stoppable_check_list,
@@ -296,6 +297,13 @@ public class InstancesAccessor extends
AbstractHelixResource {
}
}
+ boolean skipCustomChecksIfNoLiveness = false;
+ if
(node.get(InstancesProperties.skip_custom_check_if_instance_not_alive.name())
!= null) {
+ skipCustomChecksIfNoLiveness = node.get(
+
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name())
+ .asBoolean();
+ }
+
ClusterTopology clusterTopology =
clusterService.getClusterTopology(clusterId);
if (selectionBase != InstanceHealthSelectionBase.non_zone_based) {
if (!clusterService.isClusterTopologyAware(clusterId)) {
@@ -335,6 +343,7 @@ public class InstancesAccessor extends
AbstractHelixResource {
.setSkipHealthCheckCategories(skipHealthCheckCategories)
.setNamespace(namespace)
.setSkipStoppableHealthCheckList(skipStoppableCheckList)
+ .setSkipCustomChecksIfNoLiveness(skipCustomChecksIfNoLiveness)
.build();
StoppableInstancesSelector stoppableInstancesSelector =
diff --git
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
index 1c9cbcb51..7fe4bae28 100644
---
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
+++
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
@@ -34,6 +34,7 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
+import org.apache.helix.ConfigAccessor;
import org.apache.helix.TestHelper;
import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.model.ClusterConfig;
@@ -248,7 +249,51 @@ public class TestInstancesAccessor extends
AbstractTestClass {
System.out.println("End test :" + TestHelper.getTestMethodName());
}
- @Test(dependsOnMethods =
"testInstanceStoppableCrossZoneBasedWithSelectedCheckList")
+ @Test(dependsOnMethods = "testCrossZoneStoppableWithoutZoneOrder")
+ public void testSkipCustomChecksIfInstanceNotAlive() throws
JsonProcessingException {
+ System.out.println("Start test :" + TestHelper.getTestMethodName());
+
+ // Instance 4 and 5 in stoppable cluster 1 are not alive
+ String content = String.format(
+ "{\"%s\":\"%s\",\"%s\":[\"%s\",\"%s\", \"%s\"], \"%s\":[\"%s\",
\"%s\", \"%s\"], \"%s"
+ + "\": \"%b\"}",
+ InstancesAccessor.InstancesProperties.selection_base.name(),
+ InstancesAccessor.InstanceHealthSelectionBase.cross_zone_based.name(),
+ InstancesAccessor.InstancesProperties.instances.name(), "instance4",
"instance5", "invalidInstance",
+
InstancesAccessor.InstancesProperties.skip_stoppable_check_list.name(),
"INSTANCE_NOT_ALIVE", "EMPTY_RESOURCE_ASSIGNMENT", "INSTANCE_NOT_STABLE",
+
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name(),
true);
+
+ // Set the dummy custom checks for the cluster. The custom checks should
be skipped.
+ ConfigAccessor configAccessor = new ConfigAccessor(ZK_ADDR);
+ Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+ RESTConfig restConfig = new RESTConfig(STOPPABLE_CLUSTER);
+ restConfig.set(RESTConfig.SimpleFields.CUSTOMIZED_HEALTH_URL, "TEST_URL");
+ configAccessor.setRESTConfig(STOPPABLE_CLUSTER, restConfig);
+ Assert.assertEquals(restConfig,
configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+
+ // Even if we don't skip custom stoppable checks, the instance is not
alive so it should be stoppable
+ Response response = new JerseyUriRequestBuilder(
+ "clusters/{}/instances?command=stoppable").format(
+ STOPPABLE_CLUSTER).post(this, Entity.entity(content,
MediaType.APPLICATION_JSON_TYPE));
+ JsonNode jsonNode =
OBJECT_MAPPER.readTree(response.readEntity(String.class));
+ Set<String> stoppableSet = getStringSet(jsonNode,
+
InstancesAccessor.InstancesProperties.instance_stoppable_parallel.name());
+ Assert.assertTrue(stoppableSet.contains("instance4"));
+ Assert.assertTrue(stoppableSet.contains("instance5"));
+ JsonNode nonStoppableInstances = jsonNode.get(
+
InstancesAccessor.InstancesProperties.instance_not_stoppable_with_reasons.name());
+
+ Assert.assertEquals(getStringSet(nonStoppableInstances, "invalidInstance"),
+ ImmutableSet.of("HELIX:INSTANCE_NOT_EXIST"));
+
+ // After the test finishes, remove the dummy custom checks REST config
+ configAccessor.deleteRESTConfig(STOPPABLE_CLUSTER);
+ Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+
+ System.out.println("End test :" + TestHelper.getTestMethodName());
+ }
+
+ @Test(dependsOnMethods = "testSkipCustomChecksIfInstanceNotAlive")
public void testInstanceStoppableCrossZoneBasedWithEvacuatingInstances()
throws IOException {
System.out.println("Start test :" + TestHelper.getTestMethodName());
String content = String.format(