Repository: hadoop Updated Branches: refs/heads/HDFS-7240 841742cdd -> 1fc744c6c
HDFS-10912. Ozone:SCM: Add chill mode support to NodeManager. Contributed by Anu Engineer. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1fc744c6 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1fc744c6 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1fc744c6 Branch: refs/heads/HDFS-7240 Commit: 1fc744c6c5bfffa9aedde5aa66c38ac47847e9a0 Parents: 841742c Author: Anu Engineer <aengin...@apache.org> Authored: Thu Oct 13 16:00:29 2016 -0700 Committer: Anu Engineer <aengin...@apache.org> Committed: Thu Oct 13 16:00:29 2016 -0700 ---------------------------------------------------------------------- .../hadoop/ozone/scm/node/NodeManager.java | 43 +++++++- .../hadoop/ozone/scm/node/SCMNodeManager.java | 107 ++++++++++++++++--- .../hadoop/ozone/scm/node/TestNodeManager.java | 92 ++++++++++++---- 3 files changed, 204 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/1fc744c6/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/NodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/NodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/NodeManager.java index 699c789..9de6c81 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/NodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/NodeManager.java @@ -94,18 +94,51 @@ public interface NodeManager extends Closeable, Runnable { List<DatanodeID> getAllNodes(); /** - * Get the minimum number of nodes to get out of safe mode. + * Get the minimum number of nodes to get out of chill mode. * * @return int */ - int getMinimumSafeModeNodes(); + int getMinimumChillModeNodes(); /** - * Reports if we have exited out of safe mode by discovering enough nodes. + * Reports if we have exited out of chill mode by discovering enough nodes. * - * @return True if we are out of Node layer safe mode, false otherwise. + * @return True if we are out of Node layer chill mode, false otherwise. */ - boolean isOutOfNodeSafeMode(); + boolean isOutOfNodeChillMode(); + + /** + * Chill mode is the period when node manager waits for a minimum + * configured number of datanodes to report in. This is called chill mode + * to indicate the period before node manager gets into action. + * + * Forcefully exits the chill mode, even if we have not met the minimum + * criteria of the nodes reporting in. + */ + void forceExitChillMode(); + + /** + * Forcefully enters chill mode, even if all minimum node conditions are met. + */ + void forceEnterChillMode(); + + /** + * Clears the manual chill mode flag. + */ + void clearChillModeFlag(); + + /** + * Returns a chill mode status string. + * @return String + */ + String getChillModeStatus(); + + /** + * Returns the status of manual chill mode flag. + * @return true if forceEnterChillMode has been called, + * false if forceExitChillMode or status is not set. eg. clearChillModeFlag. + */ + boolean isInManualChillMode(); /** * Enum that represents the Node State. This is used in calls to getNodeList http://git-wip-us.apache.org/repos/asf/hadoop/blob/1fc744c6/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/SCMNodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/SCMNodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/SCMNodeManager.java index e866dbc..da3710f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/SCMNodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/ozone/scm/node/SCMNodeManager.java @@ -17,6 +17,7 @@ package org.apache.hadoop.ozone.scm.node; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.conf.Configuration; @@ -35,7 +36,6 @@ import java.util.Map; import java.util.Queue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -101,8 +101,9 @@ public class SCMNodeManager implements NodeManager { private long lastHBcheckStart; private long lastHBcheckFinished = 0; private long lastHBProcessedCount; - private int safeModeNodeCount; + private int chillModeNodeCount; private final int maxHBToProcessPerLoop; + private Optional<Boolean> inManualChillMode; /** * Constructs SCM machine Manager. @@ -120,7 +121,7 @@ public class SCMNodeManager implements NodeManager { totalNodes = new AtomicInteger(0); // TODO: Support this value as a Percentage of known machines. - safeModeNodeCount = 1; + chillModeNodeCount = 1; staleNodeIntervalMs = OzoneClientUtils.getStaleNodeInterval(conf); deadNodeIntervalMs = OzoneClientUtils.getDeadNodeInterval(conf); @@ -132,6 +133,7 @@ public class SCMNodeManager implements NodeManager { executorService = HadoopExecutors.newScheduledThreadPool(1, new ThreadFactoryBuilder().setDaemon(true) .setNameFormat("SCM Heartbeat Processing Thread - %d").build()); + this.inManualChillMode = Optional.absent(); Preconditions.checkState(heartbeatCheckerIntervalMs > 0); executorService.schedule(this, heartbeatCheckerIntervalMs, @@ -243,36 +245,111 @@ public class SCMNodeManager implements NodeManager { } /** - * Get the minimum number of nodes to get out of safe mode. + * Get the minimum number of nodes to get out of Chill mode. * * @return int */ @Override - public int getMinimumSafeModeNodes() { - return safeModeNodeCount; + public int getMinimumChillModeNodes() { + return chillModeNodeCount; } /** - * Sets the Minimum SafeModeNode count, used only in testing. + * Sets the Minimum chill mode nodes count, used only in testing. * * @param count - Number of nodes. */ @VisibleForTesting - public void setMinimumSafeModeNodes(int count) { - safeModeNodeCount = count; + public void setMinimumChillModeNodes(int count) { + chillModeNodeCount = count; } /** - * Reports if we have exited out of safe mode. + * Reports if we have exited out of chill mode. * - * @return true if we are out of safe mode. + * @return true if we are out of chill mode. */ @Override - public boolean isOutOfNodeSafeMode() { - LOG.trace("Node count : {}", totalNodes.get()); + public boolean isOutOfNodeChillMode() { + if (inManualChillMode.isPresent()) { + return !inManualChillMode.get(); + } + + return (totalNodes.get() >= getMinimumChillModeNodes()); + } + + /** + * Clears the manual chill mode. + */ + @Override + public void clearChillModeFlag() { + this.inManualChillMode = Optional.absent(); + } + + /** + * Returns chill mode Status string. + * @return String + */ + @Override + public String getChillModeStatus() { + if (inManualChillMode.isPresent() && inManualChillMode.get()) { + return "Manual chill mode is set to true." + + getNodeStatus(); + } + + if (inManualChillMode.isPresent() && !inManualChillMode.get()) { + return "Manual chill mode is set to false." + + getNodeStatus(); + } + + if (isOutOfNodeChillMode()) { + return "Out of chill mode." + getNodeStatus(); + } else { + return "Still in chill mode. Waiting on nodes to report in." + + getNodeStatus(); + } + } - //TODO : Support a boolean to force getting out of Safe mode. - return (totalNodes.get() >= getMinimumSafeModeNodes()); + /** + * Returns a node status string. + * @return - String + */ + private String getNodeStatus() { + final String chillModeStatus = " %d of out of total " + + "%d nodes have reported in."; + return String.format(chillModeStatus, totalNodes.get(), + getMinimumChillModeNodes()); + } + + /** + * Returns the status of Manual chill Mode flag. + * + * @return true if forceEnterChillMode has been called, false if + * forceExitChillMode or status is not set. eg. clearChillModeFlag. + */ + @Override + public boolean isInManualChillMode() { + if(this.inManualChillMode.isPresent()) { + return this.inManualChillMode.get(); + } + return false; + } + + /** + * Forcefully exits the chill mode even if we have not met the minimum + * criteria of exiting the chill mode. + */ + @Override + public void forceExitChillMode() { + this.inManualChillMode = Optional.of(false); + } + + /** + * Forcefully enters chill mode, even if all chill mode conditions are met. + */ + @Override + public void forceEnterChillMode() { + this.inManualChillMode = Optional.of(true); } /** http://git-wip-us.apache.org/repos/asf/hadoop/blob/1fc744c6/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java index 81fea89..3d990cd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java @@ -23,6 +23,7 @@ import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.OzoneConfiguration; import org.apache.hadoop.test.GenericTestUtils; import org.hamcrest.CoreMatchers; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; @@ -107,14 +108,14 @@ public class TestNodeManager { SCMNodeManager createNodeManager(Configuration config) throws IOException { SCMNodeManager nodeManager = new SCMNodeManager(config); - assertFalse("Node manager should be in safe mode", - nodeManager.isOutOfNodeSafeMode()); + assertFalse("Node manager should be in chill mode", + nodeManager.isOutOfNodeChillMode()); return nodeManager; } /** - * Tests that Node manager handles heartbeats correctly, and comes out of Safe - * Mode. + * Tests that Node manager handles heartbeats correctly, and comes out of + * chill Mode. * * @throws IOException * @throws InterruptedException @@ -127,7 +128,7 @@ public class TestNodeManager { try (SCMNodeManager nodeManager = createNodeManager(getConf())) { // Send some heartbeats from different nodes. - for (int x = 0; x < nodeManager.getMinimumSafeModeNodes(); x++) { + for (int x = 0; x < nodeManager.getMinimumChillModeNodes(); x++) { nodeManager.updateHeartbeat(getDatanodeID()); } @@ -136,13 +137,13 @@ public class TestNodeManager { 4 * 1000); assertTrue("Heartbeat thread should have picked up the scheduled " + - "heartbeats and transitioned out of safe mode.", - nodeManager.isOutOfNodeSafeMode()); + "heartbeats and transitioned out of chill mode.", + nodeManager.isOutOfNodeChillMode()); } } /** - * asserts that if we send no heartbeats node manager stays in safemode. + * asserts that if we send no heartbeats node manager stays in chillmode. * * @throws IOException * @throws InterruptedException @@ -155,13 +156,13 @@ public class TestNodeManager { try (SCMNodeManager nodeManager = createNodeManager(getConf())) { GenericTestUtils.waitFor(() -> nodeManager.waitForHeartbeatThead(), 100, 4 * 1000); - assertFalse("No heartbeats, Node manager should have been in safe mode.", - nodeManager.isOutOfNodeSafeMode()); + assertFalse("No heartbeats, Node manager should have been in chill mode.", + nodeManager.isOutOfNodeChillMode()); } } /** - * Asserts that if we don't get enough unique nodes we stay in safemode. + * Asserts that if we don't get enough unique nodes we stay in chillmode. * * @throws IOException * @throws InterruptedException @@ -172,13 +173,13 @@ public class TestNodeManager { InterruptedException, TimeoutException { try (SCMNodeManager nodeManager = createNodeManager(getConf())) { - // Need 100 nodes to come out of safe mode, only one node is sending HB. - nodeManager.setMinimumSafeModeNodes(100); + // Need 100 nodes to come out of chill mode, only one node is sending HB. + nodeManager.setMinimumChillModeNodes(100); nodeManager.updateHeartbeat(getDatanodeID()); GenericTestUtils.waitFor(() -> nodeManager.waitForHeartbeatThead(), 100, 4 * 1000); assertFalse("Not enough heartbeat, Node manager should have been in " + - "safemode.", nodeManager.isOutOfNodeSafeMode()); + "chillmode.", nodeManager.isOutOfNodeChillMode()); } } @@ -195,10 +196,10 @@ public class TestNodeManager { InterruptedException, TimeoutException { try (SCMNodeManager nodeManager = createNodeManager(getConf())) { - nodeManager.setMinimumSafeModeNodes(3); + nodeManager.setMinimumChillModeNodes(3); DatanodeID datanodeID = getDatanodeID(); - // Send 10 heartbeat from same node, and assert we never leave safe mode. + // Send 10 heartbeat from same node, and assert we never leave chill mode. for (int x = 0; x < 10; x++) { nodeManager.updateHeartbeat(datanodeID); } @@ -206,7 +207,7 @@ public class TestNodeManager { GenericTestUtils.waitFor(() -> nodeManager.waitForHeartbeatThead(), 100, 4 * 1000); assertFalse("Not enough nodes have send heartbeat to node manager.", - nodeManager.isOutOfNodeSafeMode()); + nodeManager.isOutOfNodeChillMode()); } } @@ -234,7 +235,7 @@ public class TestNodeManager { Thread.sleep(2 * 1000); assertFalse("Node manager executor service is shutdown, should never exit" + - " safe mode", nodeManager.isOutOfNodeSafeMode()); + " chill mode", nodeManager.isOutOfNodeChillMode()); assertEquals("Assert new HBs were never processed", 0, nodeManager.getLastHBProcessedCount()); @@ -861,4 +862,59 @@ public class TestNodeManager { "counts.")); } } + + @Test + public void testScmEnterAndExistChillMode() throws IOException, + InterruptedException { + Configuration conf = getConf(); + conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100); + + try (SCMNodeManager nodeManager = createNodeManager(conf)) { + nodeManager.setMinimumChillModeNodes(10); + nodeManager.updateHeartbeat(getDatanodeID()); + String status = nodeManager.getChillModeStatus(); + Assert.assertThat(status, CoreMatchers.containsString("Still in chill " + + "mode. Waiting on nodes to report in.")); + + // Should not exist chill mode since 10 nodes have not heartbeat yet. + assertFalse(nodeManager.isOutOfNodeChillMode()); + assertFalse((nodeManager.isInManualChillMode())); + + // Force exit chill mode. + nodeManager.forceExitChillMode(); + assertTrue(nodeManager.isOutOfNodeChillMode()); + status = nodeManager.getChillModeStatus(); + Assert.assertThat(status, + CoreMatchers.containsString("Manual chill mode is set to false.")); + assertFalse((nodeManager.isInManualChillMode())); + + + // Enter back to into chill mode. + nodeManager.forceEnterChillMode(); + assertFalse(nodeManager.isOutOfNodeChillMode()); + status = nodeManager.getChillModeStatus(); + Assert.assertThat(status, + CoreMatchers.containsString("Manual chill mode is set to true.")); + assertTrue((nodeManager.isInManualChillMode())); + + + // Assert that node manager force enter cannot be overridden by nodes HBs. + for(int x= 0; x < 20; x++) { + nodeManager.updateHeartbeat(getDatanodeID()); + } + + Thread.sleep(500); + assertFalse(nodeManager.isOutOfNodeChillMode()); + + // Make sure that once we clear the manual chill mode flag, we fall back + // to the number of nodes to get out chill mode. + nodeManager.clearChillModeFlag(); + assertTrue(nodeManager.isOutOfNodeChillMode()); + status = nodeManager.getChillModeStatus(); + Assert.assertThat(status, + CoreMatchers.containsString("Out of chill mode.")); + assertFalse(nodeManager.isInManualChillMode()); + } + + } } --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org