Repository: hadoop
Updated Branches:
  refs/heads/branch-2 1e7eb2d2a -> 7e36b9159


MAPREDUCE-6657. Job history server can fail on startup when NameNode is in 
start phase. Contributed by Haibo Chen.
(cherry picked from commit f6ef876fe158a5334cad7075f1966573a1c4dec9)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/7e36b915
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/7e36b915
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/7e36b915

Branch: refs/heads/branch-2
Commit: 7e36b9159e55ced14620c97bf6a98ac30f9d5fae
Parents: 1e7eb2d
Author: Junping Du <junping...@apache.org>
Authored: Tue May 17 14:41:47 2016 -0700
Committer: Junping Du <junping...@apache.org>
Committed: Tue May 17 14:42:11 2016 -0700

----------------------------------------------------------------------
 .../hadoop/hdfs/server/namenode/NameNode.java   |  4 ++
 .../hdfs/server/namenode/NameNodeRpcServer.java |  3 +-
 .../hadoop-mapreduce-client-hs/pom.xml          |  4 ++
 .../mapreduce/v2/hs/HistoryFileManager.java     | 20 ++++--
 ...HistoryFileManagerInitWithNonRunningDFS.java | 75 ++++++++++++++++++++
 5 files changed, 100 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/7e36b915/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index 3d2fbda..c9006a1 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -517,6 +517,10 @@ public class NameNode extends ReconfigurableBase implements
     return role.equals(that);
   }
 
+  public static String composeNotStartedMessage(NamenodeRole role) {
+    return role + " still not started";
+  }
+
   /**
    * Given a configuration get the address of the lifeline RPC server.
    * If the lifeline RPC is not configured returns null.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7e36b915/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
index 4f62884..098b5d6 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@@ -2050,7 +2050,8 @@ class NameNodeRpcServer implements NamenodeProtocols {
 
   private void checkNNStartup() throws IOException {
     if (!this.nn.isStarted()) {
-      throw new RetriableException(this.nn.getRole() + " still not started");
+      String message = NameNode.composeNotStartedMessage(this.nn.getRole());
+      throw new RetriableException(message);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7e36b915/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
index 5b2591e..3aa2501 100644
--- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
+++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
@@ -39,6 +39,10 @@
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-common</artifactId>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7e36b915/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
index 02da8e5..7bfb11c 100644
--- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
+++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
@@ -55,6 +55,9 @@ import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.mapred.JobACLsManager;
 import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -599,12 +602,19 @@ public class HistoryFileManager extends AbstractService {
   }
 
   /**
+   * Check if the NameNode is still not started yet as indicated by the
+   * exception type and message.
    * DistributedFileSystem returns a RemoteException with a message stating
    * SafeModeException in it. So this is only way to check it is because of
-   * being in safe mode.
+   * being in safe mode. In addition, Name Node may have not started yet, in
+   * which case, the message contains "NameNode still not started".
    */
-  private boolean isBecauseSafeMode(Throwable ex) {
-    return ex.toString().contains("SafeModeException");
+  private boolean isNameNodeStillNotStarted(Exception ex) {
+    String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
+        HdfsServerConstants.NamenodeRole.NAMENODE);
+    return ex.toString().contains("SafeModeException") ||
+        (ex instanceof RetriableException && ex.getMessage().contains(
+            nameNodeNotStartedMsg));
   }
 
   /**
@@ -631,7 +641,7 @@ public class HistoryFileManager extends AbstractService {
       }
       succeeded = false;
     } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
         succeeded = false;
         if (logWait) {
           LOG.info("Waiting for FileSystem at " +
@@ -661,7 +671,7 @@ public class HistoryFileManager extends AbstractService {
               "to be available");
         }
       } catch (IOException e) {
-        if (isBecauseSafeMode(e)) {
+        if (isNameNodeStillNotStarted(e)) {
           succeeded = false;
           if (logWait) {
             LOG.info("Waiting for FileSystem at " +

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7e36b915/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
new file mode 100644
index 0000000..d0fefef
--- /dev/null
+++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.v2.hs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Test service initialization of HistoryFileManager when
+ * HDFS is not running normally (either in start phase or
+ * in safe mode).
+ */
+public class TestHistoryFileManagerInitWithNonRunningDFS {
+  private static final String CLUSTER_BASE_DIR =
+      MiniDFSCluster.getBaseDirectory();
+
+  /**
+   * Verify if JHS keeps retrying to connect to HDFS, if the name node is
+   * in safe mode, when it creates history directories during service
+   * initialization. The expected behavior of JHS is to keep retrying for
+   * a time limit as specified by
+   * JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and give up by throwing
+   * a YarnRuntimeException with a time out message.
+   */
+  @Test
+  public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
+    Configuration conf = new Configuration();
+    // set maximum wait time for JHS to wait for HDFS NameNode to start running
+    final long maxJhsWaitTime = 500;
+    conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
+    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
+
+    MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
+    try {
+      // set up a cluster with its name node in safe mode
+      dfsCluster.getFileSystem().setSafeMode(
+          HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
+      Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
+
+      HistoryFileManager hfm = new HistoryFileManager();
+      hfm.serviceInit(conf);
+      Assert.fail("History File Manager did not retry to connect to name 
node");
+    } catch (YarnRuntimeException yex) {
+      String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
+          "ms' waiting for FileSystem to become available";
+      Assert.assertEquals("Unexpected reconnect timeout exception message",
+          expectedExceptionMsg, yex.getMessage());
+    } finally {
+      dfsCluster.shutdown(true);
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to