MAPREDUCE-6657. Job history server can fail on startup when NameNode is in start phase. Contributed by Haibo Chen.
commit f6ef876fe1
parent e24fe2641b
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
     return role.equals(that);
   }
 
+  public static String composeNotStartedMessage(NamenodeRole role) {
+    return role + " still not started";
+  }
+
   /**
    * Given a configuration get the address of the lifeline RPC server.
    * If the lifeline RPC is not configured returns null.
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java

@@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
 
   private void checkNNStartup() throws IOException {
     if (!this.nn.isStarted()) {
-      throw new RetriableException(this.nn.getRole() + " still not started");
+      String message = NameNode.composeNotStartedMessage(this.nn.getRole());
+      throw new RetriableException(message);
     }
   }
 
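checkNNStartup() continues to reject RPCs that arrive before the NameNode has finished starting, but the RetriableException it throws is now built from the shared message helper, so callers can recognize the condition and retry rather than fail outright. The sketch below is not part of the patch: the NameNodeCall interface, the class name, and the retry parameters are hypothetical, and only org.apache.hadoop.ipc.RetriableException is a real Hadoop class. It only illustrates the retry pattern such a caller might apply.

import java.io.IOException;

import org.apache.hadoop.ipc.RetriableException;

public class StartupRetrySketch {
  /** Hypothetical unit of work issued against the NameNode. */
  interface NameNodeCall<T> {
    T run() throws IOException;
  }

  /** Retry while the NameNode reports a transient startup condition. */
  static <T> T callWithRetry(NameNodeCall<T> call, int maxAttempts,
      long sleepMillis) throws IOException, InterruptedException {
    IOException lastFailure = null;
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
      try {
        return call.run();
      } catch (RetriableException e) {
        // e.g. "NameNode still not started": transient, so back off and retry.
        lastFailure = e;
        Thread.sleep(sleepMillis);
      }
    }
    throw lastFailure != null ? lastFailure
        : new IOException("callWithRetry invoked with maxAttempts <= 0");
  }
}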
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml

@@ -37,6 +37,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-hdfs-client</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-common</artifactId>
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java

@@ -55,6 +55,9 @@
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.mapred.JobACLsManager;
 import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
   }
 
   /**
+   * Check if the NameNode is still not started yet as indicated by the
+   * exception type and message.
    * DistributedFileSystem returns a RemoteException with a message stating
    * SafeModeException in it. So this is only way to check it is because of
-   * being in safe mode.
+   * being in safe mode. In addition, Name Node may have not started yet, in
+   * which case, the message contains "NameNode still not started".
    */
-  private boolean isBecauseSafeMode(Throwable ex) {
-    return ex.toString().contains("SafeModeException");
+  private boolean isNameNodeStillNotStarted(Exception ex) {
+    String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
+        HdfsServerConstants.NamenodeRole.NAMENODE);
+    return ex.toString().contains("SafeModeException") ||
+        (ex instanceof RetriableException && ex.getMessage().contains(
+            nameNodeNotStartedMsg));
   }
 
   /**
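A small, self-contained illustration of the two failure shapes the renamed helper now recognizes: a RemoteException whose text names SafeModeException, and a RetriableException carrying the message built by NameNode.composeNotStartedMessage(). The driver class, the hand-built exceptions, and the sample messages are assumptions for demonstration only, not code from the patch.

import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.RetriableException;

public class NotStartedCheckSketch {
  /** Same classification logic as the patched isNameNodeStillNotStarted(). */
  static boolean isNameNodeStillNotStarted(Exception ex) {
    String notStartedMsg = NameNode.composeNotStartedMessage(
        HdfsServerConstants.NamenodeRole.NAMENODE);
    return ex.toString().contains("SafeModeException") ||
        (ex instanceof RetriableException && ex.getMessage().contains(
            notStartedMsg));
  }

  public static void main(String[] args) {
    // Safe mode: DistributedFileSystem surfaces a RemoteException whose
    // toString() includes the remote exception class name.
    Exception safeMode = new RemoteException(
        "org.apache.hadoop.hdfs.server.namenode.SafeModeException",
        "Name node is in safe mode.");
    // Start phase: the NameNode RPC server throws a RetriableException with
    // the "still not started" message composed by the shared helper.
    Exception notStarted = new RetriableException(
        NameNode.composeNotStartedMessage(
            HdfsServerConstants.NamenodeRole.NAMENODE));

    System.out.println(isNameNodeStillNotStarted(safeMode));   // true
    System.out.println(isNameNodeStillNotStarted(notStarted)); // true
  }
}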
@@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
       }
       succeeded = false;
     } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
         succeeded = false;
         if (logWait) {
           LOG.info("Waiting for FileSystem at " +
@@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
             "to be available");
       }
     } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
         succeeded = false;
         if (logWait) {
           LOG.info("Waiting for FileSystem at " +
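Both changed call sites sit inside HistoryFileManager#tryCreatingHistoryDirs, which the createHistoryDirs method named in an earlier hunk header drives in a loop during service initialization. The following is a rough, self-contained sketch of that retry-until-timeout pattern, not the actual Hadoop code: the timeout message mirrors the one asserted by the new test below, while the class and method names, the Attempt interface, and the System.currentTimeMillis() clock are assumed for illustration.

import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;

public class RetryUntilTimeoutSketch {
  /** One attempt to create the history directories. */
  interface Attempt {
    /** @return true on success, false on a transient failure worth retrying. */
    boolean tryOnce();
  }

  static void retryUntilTimeout(Attempt attempt, long intervalCheckMillis,
      long timeOutMillis) {
    long start = System.currentTimeMillis();
    boolean succeeded = false;
    while (!succeeded && (timeOutMillis == -1
        || System.currentTimeMillis() - start < timeOutMillis)) {
      succeeded = attempt.tryOnce();
      if (!succeeded) {
        try {
          Thread.sleep(intervalCheckMillis);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new YarnRuntimeException(e);
        }
      }
    }
    if (!succeeded) {
      // Same shape of message the new test below expects from the JHS.
      throw new YarnRuntimeException("Timed out '" + timeOutMillis
          + "ms' waiting for FileSystem to become available");
    }
  }
}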
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java (new file)

@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.v2.hs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Test service initialization of HistoryFileManager when
+ * HDFS is not running normally (either in start phase or
+ * in safe mode).
+ */
+public class TestHistoryFileManagerInitWithNonRunningDFS {
+  private static final String CLUSTER_BASE_DIR =
+      MiniDFSCluster.getBaseDirectory();
+
+  /**
+   * Verify if JHS keeps retrying to connect to HDFS, if the name node is
+   * in safe mode, when it creates history directories during service
+   * initialization. The expected behavior of JHS is to keep retrying for
+   * a time limit as specified by
+   * JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and give up by throwing
+   * a YarnRuntimeException with a time out message.
+   */
+  @Test
+  public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
+    Configuration conf = new Configuration();
+    // set maximum wait time for JHS to wait for HDFS NameNode to start running
+    final long maxJhsWaitTime = 500;
+    conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
+    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
+
+    MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
+    try {
+      // set up a cluster with its name node in safe mode
+      dfsCluster.getFileSystem().setSafeMode(
+          HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
+      Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
+
+      HistoryFileManager hfm = new HistoryFileManager();
+      hfm.serviceInit(conf);
+      Assert.fail("History File Manager did not retry to connect to name node");
+    } catch (YarnRuntimeException yex) {
+      String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
+          "ms' waiting for FileSystem to become available";
+      Assert.assertEquals("Unexpected reconnect timeout exception message",
+          expectedExceptionMsg, yex.getMessage());
+    } finally {
+      dfsCluster.shutdown(true);
+    }
+  }
+}