diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index 3fc4e37d7e..ae7a937124 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
return role.equals(that);
}
+ public static String composeNotStartedMessage(NamenodeRole role) {
+ return role + " still not started";
+ }
+
/**
* Given a configuration get the address of the lifeline RPC server.
* If the lifeline RPC is not configured returns null.
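
The new helper centralizes the text that NameNodeRpcServer used to build inline. A minimal sketch, not part of the patch and assuming NamenodeRole.NAMENODE renders as "NameNode", of what it returns:

  import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
  import org.apache.hadoop.hdfs.server.namenode.NameNode;

  public class ComposeMessageDemo {
    public static void main(String[] args) {
      // Prints "NameNode still not started": the helper appends a fixed
      // suffix to the role's display name.
      System.out.println(NameNode.composeNotStartedMessage(
          HdfsServerConstants.NamenodeRole.NAMENODE));
    }
  }

The RPC server change below throws this text, and the JobHistoryServer change further down matches on it, so the producer and the consumer of the message cannot drift apart.
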
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
index 2501be948b..a63edeaaf9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
private void checkNNStartup() throws IOException {
if (!this.nn.isStarted()) {
- throw new RetriableException(this.nn.getRole() + " still not started");
+ String message = NameNode.composeNotStartedMessage(this.nn.getRole());
+ throw new RetriableException(message);
}
}
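
On the caller's side, a RetriableException thrown by the server typically travels as a RemoteException that carries the original class name and message. A small sketch, not part of the patch (UnwrapDemo is a made-up name), of how the existing RemoteException.unwrapRemoteException API restores the original type before the message is inspected:

  import java.io.IOException;
  import org.apache.hadoop.ipc.RemoteException;
  import org.apache.hadoop.ipc.RetriableException;

  public class UnwrapDemo {
    public static void main(String[] args) {
      // What the wire-level exception looks like to an RPC client.
      RemoteException wire = new RemoteException(
          RetriableException.class.getName(), "NameNode still not started");
      // unwrapRemoteException re-instantiates the declared class when it matches
      // one of the supplied types, yielding a client-side RetriableException.
      IOException client = wire.unwrapRemoteException(RetriableException.class);
      System.out.println(client instanceof RetriableException); // true
      System.out.println(client.getMessage()); // NameNode still not started
    }
  }
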
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
index ce76165d09..6166e8097a 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
@@ -37,6 +37,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-hdfs-client</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-common</artifactId>
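
The extra server-side artifact is needed because the HistoryFileManager change below references classes that ship in hadoop-hdfs rather than hadoop-hdfs-client:

  // These imports (added in the next file) only resolve against the hadoop-hdfs
  // server artifact; the existing hadoop-hdfs-client dependency is not enough.
  import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
  import org.apache.hadoop.hdfs.server.namenode.NameNode;
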
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
index 27e743c53b..a1f4014323 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
@@ -55,6 +55,9 @@
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.mapred.JobACLsManager;
import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
}
/**
+ * Check whether the NameNode has not started yet, as indicated by the
+ * exception type and message.
* DistributedFileSystem returns a RemoteException with a message stating
* SafeModeException in it. So this is only way to check it is because of
- * being in safe mode.
+ * being in safe mode. In addition, the NameNode may not have started yet,
+ * in which case the message contains "NameNode still not started".
*/
- private boolean isBecauseSafeMode(Throwable ex) {
- return ex.toString().contains("SafeModeException");
+ private boolean isNameNodeStillNotStarted(Exception ex) {
+ String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
+ HdfsServerConstants.NamenodeRole.NAMENODE);
+ return ex.toString().contains("SafeModeException") ||
+ (ex instanceof RetriableException && ex.getMessage().contains(
+ nameNodeNotStartedMsg));
}
/**
@@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
}
succeeded = false;
} catch (IOException e) {
- if (isBecauseSafeMode(e)) {
+ if (isNameNodeStillNotStarted(e)) {
succeeded = false;
if (logWait) {
LOG.info("Waiting for FileSystem at " +
@@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
"to be available");
}
} catch (IOException e) {
- if (isBecauseSafeMode(e)) {
+ if (isNameNodeStillNotStarted(e)) {
succeeded = false;
if (logWait) {
LOG.info("Waiting for FileSystem at " +
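
A minimal sketch, not part of the patch (RetryCheckDemo and wouldRetry are made-up names), of the two situations the renamed check now treats as "keep waiting", assuming NamenodeRole.NAMENODE renders as "NameNode":

  import java.io.IOException;
  import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
  import org.apache.hadoop.hdfs.server.namenode.NameNode;
  import org.apache.hadoop.ipc.RetriableException;

  public class RetryCheckDemo {
    // Mirrors HistoryFileManager#isNameNodeStillNotStarted.
    static boolean wouldRetry(Exception ex) {
      String notStarted = NameNode.composeNotStartedMessage(
          HdfsServerConstants.NamenodeRole.NAMENODE);
      return ex.toString().contains("SafeModeException")
          || (ex instanceof RetriableException
              && ex.getMessage().contains(notStarted));
    }

    public static void main(String[] args) {
      // NameNode process is up but has not finished starting: retriable.
      System.out.println(wouldRetry(
          new RetriableException("NameNode still not started"))); // true
      // NameNode is in safe mode: retriable, as before the patch.
      System.out.println(wouldRetry(new IOException(
          "org.apache.hadoop.hdfs.server.namenode.SafeModeException"))); // true
      // Anything else is not treated as retriable by this check.
      System.out.println(wouldRetry(new IOException("No route to host"))); // false
    }
  }

In both retriable cases tryCreatingHistoryDirs sets succeeded to false, so the surrounding loop in createHistoryDirs keeps waiting, up to the limit that the new test below exercises via MR_HISTORY_MAX_START_WAIT_TIME.
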
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
new file mode 100644
index 0000000000..d0fefef90a
--- /dev/null
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.v2.hs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Test service initialization of HistoryFileManager when
+ * HDFS is not running normally (either in start phase or
+ * in safe mode).
+ */
+public class TestHistoryFileManagerInitWithNonRunningDFS {
+ private static final String CLUSTER_BASE_DIR =
+ MiniDFSCluster.getBaseDirectory();
+
+ /**
+ * Verify that JHS keeps retrying to connect to HDFS while the NameNode is
+ * in safe mode, when it creates history directories during service
+ * initialization. The expected behavior of JHS is to keep retrying up to
+ * the time limit specified by
+ * JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and then give up by
+ * throwing a YarnRuntimeException with a timeout message.
+ */
+ @Test
+ public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
+ Configuration conf = new Configuration();
+ // set maximum wait time for JHS to wait for HDFS NameNode to start running
+ final long maxJhsWaitTime = 500;
+ conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
+ conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
+
+ MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
+ try {
+ // set up a cluster with its NameNode in safe mode
+ dfsCluster.getFileSystem().setSafeMode(
+ HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
+ Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
+
+ HistoryFileManager hfm = new HistoryFileManager();
+ hfm.serviceInit(conf);
+ Assert.fail("History File Manager did not retry to connect to name node");
+ } catch (YarnRuntimeException yex) {
+ String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
+ "ms' waiting for FileSystem to become available";
+ Assert.assertEquals("Unexpected reconnect timeout exception message",
+ expectedExceptionMsg, yex.getMessage());
+ } finally {
+ dfsCluster.shutdown(true);
+ }
+ }
+}
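
The test above covers the safe-mode case. A hypothetical companion test, not part of the patch (TestNotStartedMessageText is a made-up name), could pin the exact text that the substring match in HistoryFileManager depends on:

  package org.apache.hadoop.mapreduce.v2.hs;

  import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
  import org.apache.hadoop.hdfs.server.namenode.NameNode;
  import org.junit.Assert;
  import org.junit.Test;

  public class TestNotStartedMessageText {
    @Test
    public void testComposedMessageStaysStable() {
      // Assumes NamenodeRole.NAMENODE renders as "NameNode"; if the composed
      // text ever changes, the retry check in HistoryFileManager must change too.
      Assert.assertEquals("NameNode still not started",
          NameNode.composeNotStartedMessage(
              HdfsServerConstants.NamenodeRole.NAMENODE));
    }
  }
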