diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index 3fc4e37d7e..ae7a937124 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
     return role.equals(that);
   }
 
+  public static String composeNotStartedMessage(NamenodeRole role) {
+    return role + " still not started";
+  }
+
   /**
    * Given a configuration get the address of the lifeline RPC server.
    * If the lifeline RPC is not configured returns null.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
index 2501be948b..a63edeaaf9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
 
   private void checkNNStartup() throws IOException {
     if (!this.nn.isStarted()) {
-      throw new RetriableException(this.nn.getRole() + " still not started");
+      String message = NameNode.composeNotStartedMessage(this.nn.getRole());
+      throw new RetriableException(message);
     }
   }
 
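These two hunks put the "still not started" text behind a single helper, so NameNodeRpcServer, which throws the RetriableException, and any caller that inspects it compose the message in one place. A minimal sketch of the resulting message and exception, assuming only the APIs shown in the patch (the class name below is made up for illustration):

```java
// Illustrative sketch, not part of the patch: the message checkNNStartup()
// now wraps while the NameNode is still starting up.
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.RetriableException;

public class NotStartedMessageSketch {
  public static void main(String[] args) {
    String message = NameNode.composeNotStartedMessage(
        HdfsServerConstants.NamenodeRole.NAMENODE);
    System.out.println(message);  // "NameNode still not started"

    // The RPC server wraps exactly this string, so clients can match on the
    // helper's output instead of a hard-coded literal.
    RetriableException thrown = new RetriableException(message);
    System.out.println(thrown.getMessage().contains(message));  // true
  }
}
```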
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
index ce76165d09..6166e8097a 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
@@ -37,6 +37,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-hdfs-client</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-common</artifactId>
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
index 27e743c53b..a1f4014323 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
@@ -55,6 +55,9 @@
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.mapred.JobACLsManager;
 import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
   }
 
   /**
+   * Check whether the NameNode is not yet available (still starting up or
+   * in safe mode), as indicated by the exception type and message.
    * DistributedFileSystem returns a RemoteException with a message stating
    * SafeModeException in it. So this is only way to check it is because of
-   * being in safe mode.
+   * being in safe mode. In addition, the NameNode may not have started yet,
+   * in which case the message contains "NameNode still not started".
    */
-  private boolean isBecauseSafeMode(Throwable ex) {
-    return ex.toString().contains("SafeModeException");
+  private boolean isNameNodeStillNotStarted(Exception ex) {
+    String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
+        HdfsServerConstants.NamenodeRole.NAMENODE);
+    return ex.toString().contains("SafeModeException") ||
+        (ex instanceof RetriableException && ex.getMessage().contains(
+        nameNodeNotStartedMsg));
   }
 
   /**
@@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
         }
         succeeded = false;
       } catch (IOException e) {
-        if (isBecauseSafeMode(e)) {
+        if (isNameNodeStillNotStarted(e)) {
           succeeded = false;
           if (logWait) {
             LOG.info("Waiting for FileSystem at " +
@@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
                 "to be available");
         }
       } catch (IOException e) {
-        if (isBecauseSafeMode(e)) {
+        if (isNameNodeStillNotStarted(e)) {
           succeeded = false;
           if (logWait) {
             LOG.info("Waiting for FileSystem at " +
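The new check feeds the history-directory bootstrap: tryCreatingHistoryDirs() treats a matching exception as a recoverable failure, and its caller keeps retrying until a configured deadline. The following sketch shows only the shape of that loop, not the actual createHistoryDirs() body; the Attempt interface and method names are invented for illustration, and the timeout message reuses the format the new test asserts on.

```java
// Sketch of the retry-until-deadline shape (assumption: not the real Hadoop
// code). tryOnce() returns true on success, false when the failure is
// recoverable (NameNode still starting or in safe mode, i.e. the new
// isNameNodeStillNotStarted()-style check matched), and throws otherwise.
import java.io.IOException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;

public class HistoryDirRetrySketch {
  interface Attempt {
    boolean tryOnce() throws IOException;
  }

  static void createDirsWithRetry(Attempt attempt, long maxWaitMillis,
      long intervalMillis) throws IOException, InterruptedException {
    long deadline = System.currentTimeMillis() + maxWaitMillis;
    while (!attempt.tryOnce()) {
      if (System.currentTimeMillis() > deadline) {
        // Same message format the new test asserts on.
        throw new YarnRuntimeException("Timed out '" + maxWaitMillis
            + "ms' waiting for FileSystem to become available");
      }
      Thread.sleep(intervalMillis);  // NameNode not ready yet; wait and retry
    }
  }
}
```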
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
new file mode 100644
index 0000000000..d0fefef90a
--- /dev/null
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.v2.hs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Test service initialization of HistoryFileManager when
+ * HDFS is not running normally (either still starting up or
+ * in safe mode).
+ */
+public class TestHistoryFileManagerInitWithNonRunningDFS {
+  private static final String CLUSTER_BASE_DIR =
+      MiniDFSCluster.getBaseDirectory();
+
+  /**
+   * Verify that the JHS keeps retrying to connect to HDFS when the NameNode
+   * is in safe mode while the JHS creates history directories during
+   * service initialization. The expected behavior is to keep retrying for
+   * the time limit specified by JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME,
+   * and then give up by throwing a YarnRuntimeException with a timeout
+   * message.
+   */
+  @Test
+  public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
+    Configuration conf = new Configuration();
+    // set the maximum time the JHS waits for the HDFS NameNode to start running
+    final long maxJhsWaitTime = 500;
+    conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
+    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
+
+    MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
+    try {
+      // set up a cluster with its name node in safe mode
+      dfsCluster.getFileSystem().setSafeMode(
+          HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
+      Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
+
+      HistoryFileManager hfm = new HistoryFileManager();
+      hfm.serviceInit(conf);
+      Assert.fail("History File Manager did not retry to connect to name node");
+    } catch (YarnRuntimeException yex) {
+      String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
+          "ms' waiting for FileSystem to become available";
+      Assert.assertEquals("Unexpected reconnect timeout exception message",
+          expectedExceptionMsg, yex.getMessage());
+    } finally {
+      dfsCluster.shutdown(true);
+    }
+  }
+}
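The MiniDFSCluster test above drives the safe-mode branch end to end. The other branch the new check covers, a NameNode that has not finished starting and therefore answers with a RetriableException, could be exercised at the unit level along the following lines. This is a hypothetical companion sketch, not part of the patch; it restates the matching condition rather than calling the package-private HistoryFileManager#isNameNodeStillNotStarted directly.

```java
// Hypothetical companion test (not in the patch): checks the exception shape
// NameNodeRpcServer#checkNNStartup now produces against the condition that
// HistoryFileManager#isNameNodeStillNotStarted matches on.
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.RetriableException;
import org.junit.Assert;
import org.junit.Test;

public class TestNotStartedExceptionMatching {
  @Test
  public void testRetriableNotStartedExceptionIsRecognized() {
    String notStarted = NameNode.composeNotStartedMessage(
        HdfsServerConstants.NamenodeRole.NAMENODE);
    // What the NameNode RPC layer throws before startup completes.
    Exception stillStarting = new RetriableException(notStarted);
    // Mirrors the RetriableException clause added to isNameNodeStillNotStarted().
    Assert.assertTrue(stillStarting instanceof RetriableException
        && stillStarting.getMessage().contains(notStarted));
  }
}
```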