MAPREDUCE-6657. Job history server can fail on startup when NameNode is in start phase. Contributed by Haibo Chen.

Junping Du 2016-05-17 14:41:47 -07:00
parent e24fe2641b
commit f6ef876fe1
5 changed files with 100 additions and 6 deletions

NameNode.java

@@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
     return role.equals(that);
   }
 
+  public static String composeNotStartedMessage(NamenodeRole role) {
+    return role + " still not started";
+  }
+
   /**
    * Given a configuration get the address of the lifeline RPC server.
    * If the lifeline RPC is not configured returns null.
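
For orientation, a minimal sketch (not part of the patch) of what the new helper produces, assuming HdfsServerConstants.NamenodeRole.NAMENODE, whose string form is "NameNode":

  // Illustrative only: the helper centralizes the "still not started" wording
  // so the server side and any client matching on that message stay in sync.
  String msg = NameNode.composeNotStartedMessage(
      HdfsServerConstants.NamenodeRole.NAMENODE);
  // msg is "NameNode still not started"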

NameNodeRpcServer.java

@@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
 
   private void checkNNStartup() throws IOException {
     if (!this.nn.isStarted()) {
-      throw new RetriableException(this.nn.getRole() + " still not started");
+      String message = NameNode.composeNotStartedMessage(this.nn.getRole());
+      throw new RetriableException(message);
     }
   }
 

hadoop-mapreduce-client-hs/pom.xml

@@ -37,6 +37,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-hdfs-client</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-common</artifactId>

HistoryFileManager.java

@@ -55,6 +55,9 @@
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.mapred.JobACLsManager;
 import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
   }
 
   /**
+   * Check if the NameNode is still not started yet as indicated by the
+   * exception type and message.
    * DistributedFileSystem returns a RemoteException with a message stating
    * SafeModeException in it. So this is only way to check it is because of
-   * being in safe mode.
+   * being in safe mode. In addition, Name Node may have not started yet, in
+   * which case, the message contains "NameNode still not started".
    */
-  private boolean isBecauseSafeMode(Throwable ex) {
-    return ex.toString().contains("SafeModeException");
+  private boolean isNameNodeStillNotStarted(Exception ex) {
+    String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
+        HdfsServerConstants.NamenodeRole.NAMENODE);
+    return ex.toString().contains("SafeModeException") ||
+        (ex instanceof RetriableException && ex.getMessage().contains(
+            nameNodeNotStartedMsg));
   }
 
   /**
@@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
       }
       succeeded = false;
     } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
         succeeded = false;
         if (logWait) {
           LOG.info("Waiting for FileSystem at " +
@@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
             "to be available");
       }
     } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
         succeeded = false;
         if (logWait) {
           LOG.info("Waiting for FileSystem at " +

TestHistoryFileManagerInitWithNonRunningDFS.java (new file)

@@ -0,0 +1,75 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapreduce.v2.hs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.junit.Assert;
import org.junit.Test;

/**
 * Test service initialization of HistoryFileManager when
 * HDFS is not running normally (either in start phase or
 * in safe mode).
 */
public class TestHistoryFileManagerInitWithNonRunningDFS {
  private static final String CLUSTER_BASE_DIR =
      MiniDFSCluster.getBaseDirectory();

  /**
   * Verify if JHS keeps retrying to connect to HDFS, if the name node is
   * in safe mode, when it creates history directories during service
   * initialization. The expected behavior of JHS is to keep retrying for
   * a time limit as specified by
   * JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and give up by throwing
   * a YarnRuntimeException with a time out message.
   */
  @Test
  public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
    Configuration conf = new Configuration();
    // set maximum wait time for JHS to wait for HDFS NameNode to start running
    final long maxJhsWaitTime = 500;
    conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);

    MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
    try {
      // set up a cluster with its name node in safe mode
      dfsCluster.getFileSystem().setSafeMode(
          HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
      Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());

      HistoryFileManager hfm = new HistoryFileManager();
      hfm.serviceInit(conf);
      Assert.fail("History File Manager did not retry to connect to name node");
    } catch (YarnRuntimeException yex) {
      String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
          "ms' waiting for FileSystem to become available";
      Assert.assertEquals("Unexpected reconnect timeout exception message",
          expectedExceptionMsg, yex.getMessage());
    } finally {
      dfsCluster.shutdown(true);
    }
  }
}
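
If reproducing this locally, the new test can typically be run on its own with Maven's test filter, e.g. mvn test -Dtest=TestHistoryFileManagerInitWithNonRunningDFS, from the module that owns HistoryFileManager (assumed here to be hadoop-mapreduce-client-hs). It spins up a MiniDFSCluster, forces safe mode as a stand-in for an unavailable NameNode, and expects HistoryFileManager.serviceInit to give up with the timeout message after roughly 500 ms.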