This is an automated email from the ASF dual-hosted git repository.

xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new e6adf5bc9 Exclude Custom Checks When Instance Is Not Alive (#2993)
e6adf5bc9 is described below

commit e6adf5bc934a01ab87bc44c03015208a45ca6b4f
Author: Xiaxuan Gao <[email protected]>
AuthorDate: Tue Jan 21 11:20:45 2025 -0800

    Exclude Custom Checks When Instance Is Not Alive (#2993)
    
    Exclude Custom Checks When Instance Is Not Alive
---
 .../MaintenanceManagementService.java              | 62 ++++++++++++++++++++--
 .../server/resources/helix/InstancesAccessor.java  |  9 ++++
 .../helix/rest/server/TestInstancesAccessor.java   | 47 +++++++++++++++-
 3 files changed, 112 insertions(+), 6 deletions(-)

diff --git 
a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
 
b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
index 925b2d216..8d5d025b3 100644
--- 
a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
+++ 
b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java
@@ -45,6 +45,7 @@ import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import org.apache.helix.ConfigAccessor;
 import org.apache.helix.HelixException;
+import org.apache.helix.PropertyKey;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.model.CurrentState;
 import org.apache.helix.model.ExternalView;
@@ -98,6 +99,8 @@ public class MaintenanceManagementService {
   // maintain the backward compatibility with users who don't use 
MaintenanceManagementServiceBuilder
   // to create the MaintenanceManagementService object.
   private List<HealthCheck> _skipStoppableHealthCheckList = 
Collections.emptyList();
+  // default value false to maintain backward compatibility
+  private boolean _skipCustomChecksIfNoLiveness = false;
 
   public MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
       ConfigAccessor configAccessor, boolean skipZKRead, String namespace) {
@@ -152,7 +155,7 @@ public class MaintenanceManagementService {
   private MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
       ConfigAccessor configAccessor, CustomRestClient customRestClient, 
boolean skipZKRead,
       Set<String> nonBlockingHealthChecks, Set<StoppableCheck.Category> 
skipHealthCheckCategories,
-      List<HealthCheck> skipStoppableHealthCheckList, String namespace) {
+      List<HealthCheck> skipStoppableHealthCheckList, String namespace, 
boolean skipCustomChecksIfNoLiveness) {
     _dataAccessor =
         new HelixDataAccessorWrapper(dataAccessor, customRestClient,
             namespace);
@@ -166,6 +169,7 @@ public class MaintenanceManagementService {
     _skipStoppableHealthCheckList = skipStoppableHealthCheckList == null ? 
Collections.emptyList()
             : skipStoppableHealthCheckList;
     _namespace = namespace;
+    _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
   }
 
   /**
@@ -502,15 +506,20 @@ public class MaintenanceManagementService {
       return instances;
     }
 
+    // Skip performing a custom check on any dead instance if the user set 
_skipCustomCheckIfInstanceNotAlive
+    // to true.
+    List<String> instanceIdsForCustomCheck = 
filterOutDeadInstancesIfNeeded(instances);
+
     // If the config has exactUrl and the CLUSTER level customer check is not 
skipped, we will
     // perform the custom check at cluster level.
     if (restConfig.getCompleteConfiguredHealthUrl().isPresent()) {
-      if 
(_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK))
 {
+      if 
(_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK)
+          || instanceIdsForCustomCheck.isEmpty()) {
         return instances;
       }
 
       Map<String, StoppableCheck> clusterLevelCustomCheckResult =
-          performAggregatedCustomCheck(clusterId, instances,
+          performAggregatedCustomCheck(clusterId, instanceIdsForCustomCheck,
               restConfig.getCompleteConfiguredHealthUrl().get(), 
customPayLoads,
               toBeStoppedInstances);
       List<String> instancesForNextCheck = new ArrayList<>();
@@ -526,7 +535,7 @@ public class MaintenanceManagementService {
 
     // Reaching here means the rest config requires instances/partition level 
checks. We will
     // perform the custom check at instance/partition level if they are not 
skipped.
-    List<String> instancesForCustomPartitionLevelChecks = instances;
+    List<String> instancesForCustomPartitionLevelChecks = 
instanceIdsForCustomCheck;
     if 
(!_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_INSTANCE_CHECK))
 {
       Map<String, Future<StoppableCheck>> customInstanceLevelChecks = 
instances.stream().collect(
           Collectors.toMap(Function.identity(), instance -> POOL.submit(
@@ -560,6 +569,42 @@ public class MaintenanceManagementService {
     return instancesForCustomPartitionLevelChecks;
   }
 
+  /**
+   * Helper Methods
+   * <p>
+   * If users set skipCustomCheckIfInstanceNotAlive to true, filter out dead 
instances
+   * to avoid running custom checks on them.
+   *
+   * @param instanceIds  the list of instances
+   * @return either the original list or a filtered list of only live instances
+   */
+  private List<String> filterOutDeadInstancesIfNeeded(List<String> 
instanceIds) {
+    if (!_skipCustomChecksIfNoLiveness) {
+      // We are not skipping the not-alive check, so just return all instances.
+      return instanceIds;
+    }
+
+    // Retrieve the set of currently live instances
+    PropertyKey.Builder keyBuilder = _dataAccessor.keyBuilder();
+    List<String> liveNodes = 
_dataAccessor.getChildNames(keyBuilder.liveInstances());
+
+    // Filter out instances that are not in the live list
+    List<String> filtered = new ArrayList<>();
+    List<String> skipped = new ArrayList<>();
+    for (String instanceId : instanceIds) {
+      if (liveNodes.contains(instanceId)) {
+        filtered.add(instanceId);
+      } else {
+        skipped.add(instanceId);
+      }
+    }
+
+    if (!skipped.isEmpty()) {
+      LOG.info("Skipping any custom checks for instances due to liveness: {}", 
skipped);
+    }
+    return filtered;
+  }
+
   private Map<String, MaintenanceManagementInstanceInfo> 
batchInstanceHealthCheck(String clusterId,
       List<String> instances, List<String> healthChecks, Map<String, String> 
healthCheckConfig) {
     List<String> instancesForNext = new ArrayList<>(instances);
@@ -890,6 +935,7 @@ public class MaintenanceManagementService {
   public static class MaintenanceManagementServiceBuilder {
     private ConfigAccessor _configAccessor;
     private boolean _skipZKRead;
+    private boolean _skipCustomChecksIfNoLiveness = false;
     private String _namespace;
     private ZKHelixDataAccessor _dataAccessor;
     private CustomRestClient _customRestClient;
@@ -942,11 +988,17 @@ public class MaintenanceManagementService {
       return this;
     }
 
+    public MaintenanceManagementServiceBuilder setSkipCustomChecksIfNoLiveness(
+        boolean skipCustomChecksIfNoLiveness) {
+      _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
+      return this;
+    }
+
     public MaintenanceManagementService build() {
       validate();
       return new MaintenanceManagementService(_dataAccessor, _configAccessor, 
_customRestClient,
           _skipZKRead, _nonBlockingHealthChecks, _skipHealthCheckCategories,
-          _skipStoppableHealthCheckList, _namespace);
+          _skipStoppableHealthCheckList, _namespace, 
_skipCustomChecksIfNoLiveness);
     }
 
     private void validate() throws IllegalArgumentException {
diff --git 
a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
 
b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
index 52313a66b..1d6e249db 100644
--- 
a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
+++ 
b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
@@ -73,6 +73,7 @@ public class InstancesAccessor extends AbstractHelixResource {
     online,
     disabled,
     selection_base,
+    skip_custom_check_if_instance_not_alive,
     zone_order,
     to_be_stopped_instances,
     skip_stoppable_check_list,
@@ -296,6 +297,13 @@ public class InstancesAccessor extends 
AbstractHelixResource {
         }
       }
 
+      boolean skipCustomChecksIfNoLiveness = false;
+      if 
(node.get(InstancesProperties.skip_custom_check_if_instance_not_alive.name()) 
!= null) {
+        skipCustomChecksIfNoLiveness = node.get(
+                
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name())
+            .asBoolean();
+      }
+
       ClusterTopology clusterTopology = 
clusterService.getClusterTopology(clusterId);
       if (selectionBase != InstanceHealthSelectionBase.non_zone_based) {
         if (!clusterService.isClusterTopologyAware(clusterId)) {
@@ -335,6 +343,7 @@ public class InstancesAccessor extends 
AbstractHelixResource {
               .setSkipHealthCheckCategories(skipHealthCheckCategories)
               .setNamespace(namespace)
               .setSkipStoppableHealthCheckList(skipStoppableCheckList)
+              .setSkipCustomChecksIfNoLiveness(skipCustomChecksIfNoLiveness)
               .build();
 
       StoppableInstancesSelector stoppableInstancesSelector =
diff --git 
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
 
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
index 1c9cbcb51..7fe4bae28 100644
--- 
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
+++ 
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
@@ -34,6 +34,7 @@ import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
+import org.apache.helix.ConfigAccessor;
 import org.apache.helix.TestHelper;
 import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.model.ClusterConfig;
@@ -248,7 +249,51 @@ public class TestInstancesAccessor extends 
AbstractTestClass {
     System.out.println("End test :" + TestHelper.getTestMethodName());
   }
 
-  @Test(dependsOnMethods = 
"testInstanceStoppableCrossZoneBasedWithSelectedCheckList")
+  @Test(dependsOnMethods = "testCrossZoneStoppableWithoutZoneOrder")
+  public void testSkipCustomChecksIfInstanceNotAlive() throws 
JsonProcessingException {
+    System.out.println("Start test :" + TestHelper.getTestMethodName());
+
+    // Instance 4 and 5 in stoppable cluster 1 are not alive
+    String content = String.format(
+        "{\"%s\":\"%s\",\"%s\":[\"%s\",\"%s\", \"%s\"], \"%s\":[\"%s\", 
\"%s\", \"%s\"], \"%s"
+            + "\": \"%b\"}",
+        InstancesAccessor.InstancesProperties.selection_base.name(),
+        InstancesAccessor.InstanceHealthSelectionBase.cross_zone_based.name(),
+        InstancesAccessor.InstancesProperties.instances.name(), "instance4", 
"instance5", "invalidInstance",
+        
InstancesAccessor.InstancesProperties.skip_stoppable_check_list.name(), 
"INSTANCE_NOT_ALIVE", "EMPTY_RESOURCE_ASSIGNMENT", "INSTANCE_NOT_STABLE",
+        
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name(),
 true);
+
+    // Set the dummy custom checks for the cluster. The custom checks should 
be skipped.
+    ConfigAccessor configAccessor = new ConfigAccessor(ZK_ADDR);
+    Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+    RESTConfig restConfig = new RESTConfig(STOPPABLE_CLUSTER);
+    restConfig.set(RESTConfig.SimpleFields.CUSTOMIZED_HEALTH_URL, "TEST_URL");
+    configAccessor.setRESTConfig(STOPPABLE_CLUSTER, restConfig);
+    Assert.assertEquals(restConfig, 
configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+
+    // Even if we don't skip custom stoppable checks, the instance is not 
alive so it should be stoppable
+    Response response = new JerseyUriRequestBuilder(
+        "clusters/{}/instances?command=stoppable").format(
+        STOPPABLE_CLUSTER).post(this, Entity.entity(content, 
MediaType.APPLICATION_JSON_TYPE));
+    JsonNode jsonNode = 
OBJECT_MAPPER.readTree(response.readEntity(String.class));
+    Set<String> stoppableSet = getStringSet(jsonNode,
+        
InstancesAccessor.InstancesProperties.instance_stoppable_parallel.name());
+    Assert.assertTrue(stoppableSet.contains("instance4"));
+    Assert.assertTrue(stoppableSet.contains("instance5"));
+    JsonNode nonStoppableInstances = jsonNode.get(
+        
InstancesAccessor.InstancesProperties.instance_not_stoppable_with_reasons.name());
+
+    Assert.assertEquals(getStringSet(nonStoppableInstances, "invalidInstance"),
+        ImmutableSet.of("HELIX:INSTANCE_NOT_EXIST"));
+
+    // After the test finishes, remove the dummy custom checks REST config
+    configAccessor.deleteRESTConfig(STOPPABLE_CLUSTER);
+    Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
+
+    System.out.println("End test :" + TestHelper.getTestMethodName());
+  }
+
+  @Test(dependsOnMethods = "testSkipCustomChecksIfInstanceNotAlive")
   public void testInstanceStoppableCrossZoneBasedWithEvacuatingInstances() 
throws IOException {
     System.out.println("Start test :" + TestHelper.getTestMethodName());
     String content = String.format(

Reply via email to