adoroszlai commented on a change in pull request #874:
URL: https://github.com/apache/hadoop-ozone/pull/874#discussion_r426571715



##########
File path: 
hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
##########
@@ -358,6 +233,11 @@ protected void initializeConfiguration() throws 
IOException {
       conf.setInt("hdds.scm.replication.event.timeout", 20 * 1000);
       conf.setInt(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 100);
       conf.setInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_LOG_PURGE_GAP, 100);
+
+      conf.setInt(OMConfigKeys.
+              OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, 100);
+      conf.setInt(OMConfigKeys.
+              OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, 100);

Review comment:
       Duplicate config setting.  Did you intend to set some other property?

##########
File path: 
hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
##########
@@ -406,19 +280,71 @@ public MiniOzoneChaosCluster build() throws IOException {
       final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
           scm, null);
 
-      MiniOzoneChaosCluster cluster;
-      if (failureService == FailureService.DATANODE) {
-        cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      } else {
-        cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      }
+      MiniOzoneChaosCluster cluster =
+          new MiniOzoneChaosCluster(conf, omList, scm, hddsDatanodes,
+              omServiceId, clazzes);
 
       if (startDataNodes) {
         cluster.startHddsDatanodes();
       }
       return cluster;
     }
   }
+
+  // OzoneManager specifc
+  public List<OzoneManager> omToFail() {
+    int numNodesToFail = FailureManager.getNumberOfOmToFail();
+    if (failedOmSet.size() >= numOzoneManagers/2) {
+      return Collections.emptyList();
+    }
+
+    int numOms = getOzoneManagersList().size();
+    List<OzoneManager> oms = new ArrayList<>(numNodesToFail);
+    for (int i = 0; i < numNodesToFail; i++) {
+      int failedNodeIndex = FailureManager.getBoundedRandomIndex(numOms);
+      oms.add(getOzoneManager(failedNodeIndex));
+    }
+    return oms;
+  }
+
+  public void shutdownOzoneManager(OzoneManager om) {
+    super.shutdownOzoneManager(om);
+    failedOmSet.add(om);
+  }
+
+  public void restartOzoneManager(OzoneManager om, boolean waitForOM)
+      throws IOException, TimeoutException, InterruptedException {
+    super.restartOzoneManager(om, waitForOM);
+    failedOmSet.remove(om);
+  }
+
+  // Should the selected node be stopped or started.
+  public boolean shouldStop() {
+    if (failedOmSet.size() >= numOzoneManagers/2) {
+      return false;
+    }
+    return RandomUtils.nextBoolean();
+  }
+
+  public List<DatanodeDetails> dnToFail() {
+    int numNodesToFail = FailureManager.getNumberOfDnToFail();
+    int numDns = getHddsDatanodes().size();
+    List<DatanodeDetails> dns = new ArrayList<>(numNodesToFail);
+    for (int i = 0; i < numNodesToFail; i++) {
+      int failedNodeIndex = FailureManager.getBoundedRandomIndex(numDns);
+      dns.add(getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails());
+    }
+    return dns;
+  }
+  
+  @Override
+  public void restartHddsDatanode(DatanodeDetails dn, boolean waitForDatanode)
+      throws InterruptedException, TimeoutException, IOException {
+    super.restartHddsDatanode(dn, waitForDatanode);
+  }
+
+  @Override
+  public void shutdownHddsDatanode(int i) {
+    super.shutdownHddsDatanode(i);
+  }

Review comment:
       Do we need these overrides?

##########
File path: 
hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
##########
@@ -264,12 +121,30 @@ public void shutdown() {
     }
   }
 
+  /**
+   * Check if cluster is ready for a restart or shutdown of an OM node. If
+   * yes, then set isClusterReady to false so that another thread cannot
+   * restart/ shutdown OM till all OMs are up again.
+   */
+  public void waitForClusterToBeReady()

Review comment:
       ```suggestion
     @Override
     public void waitForClusterToBeReady()
   ```

##########
File path: 
hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
##########
@@ -406,19 +280,71 @@ public MiniOzoneChaosCluster build() throws IOException {
       final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
           scm, null);
 
-      MiniOzoneChaosCluster cluster;
-      if (failureService == FailureService.DATANODE) {
-        cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      } else {
-        cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      }
+      MiniOzoneChaosCluster cluster =
+          new MiniOzoneChaosCluster(conf, omList, scm, hddsDatanodes,
+              omServiceId, clazzes);
 
       if (startDataNodes) {
         cluster.startHddsDatanodes();
       }
       return cluster;
     }
   }
+
+  // OzoneManager specifc
+  public List<OzoneManager> omToFail() {
+    int numNodesToFail = FailureManager.getNumberOfOmToFail();
+    if (failedOmSet.size() >= numOzoneManagers/2) {
+      return Collections.emptyList();
+    }
+
+    int numOms = getOzoneManagersList().size();
+    List<OzoneManager> oms = new ArrayList<>(numNodesToFail);
+    for (int i = 0; i < numNodesToFail; i++) {
+      int failedNodeIndex = FailureManager.getBoundedRandomIndex(numOms);
+      oms.add(getOzoneManager(failedNodeIndex));

Review comment:
       This can add the same OM multiple times to the list.  Is that OK?

##########
File path: 
hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
##########
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Manages all the failures in the MiniOzoneChaosCluster.
+ */
+public class FailureManager {
+
+  static final Logger LOG =
+      LoggerFactory.getLogger(Failures.class);
+
+  private final MiniOzoneChaosCluster cluster;
+  private final List<Failures> failures;
+  private ScheduledFuture scheduledFuture;
+  private final ScheduledExecutorService executorService;
+  public FailureManager(MiniOzoneChaosCluster cluster,
+                        Configuration conf,
+                        List<Class<? extends Failures>> clazzes) {
+    this.cluster = cluster;
+    this.executorService = Executors.newSingleThreadScheduledExecutor();
+
+    failures = new ArrayList<>();
+    for (Class<? extends Failures> clazz : clazzes) {
+      Failures f = ReflectionUtils.newInstance(clazz, conf);
+      f.validateFailure(cluster.getOzoneManagersList(),
+          cluster.getStorageContainerManager(),
+          cluster.getHddsDatanodes());
+      failures.add(f);
+    }
+
+  }
+
+
+  // Fail nodes randomly at configured timeout period.
+  private void fail() {
+    try {
+      Failures f = failures.get(getBoundedRandomIndex(failures.size()));
+      LOG.info("time failure with {}", f.getName());
+      f.fail(cluster);
+    } catch (Throwable t) {
+      LOG.info("failing with ", t);

Review comment:
       Nit: `fail` seems to be an overloaded term here (refers to both managed 
failure and unexpected failure while trying to induce failure).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: ozone-issues-h...@hadoop.apache.org

Reply via email to