MAPREDUCE-6657. Job history server can fail on startup when NameNode is in start phase. Contributed by Haibo Chen.
This commit is contained in:
parent
e24fe2641b
commit
f6ef876fe1
@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
|
|||||||
return role.equals(that);
|
return role.equals(that);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String composeNotStartedMessage(NamenodeRole role) {
|
||||||
|
return role + " still not started";
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a configuration get the address of the lifeline RPC server.
|
* Given a configuration get the address of the lifeline RPC server.
|
||||||
* If the lifeline RPC is not configured returns null.
|
* If the lifeline RPC is not configured returns null.
|
||||||
|
@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
|
|||||||
|
|
||||||
private void checkNNStartup() throws IOException {
|
private void checkNNStartup() throws IOException {
|
||||||
if (!this.nn.isStarted()) {
|
if (!this.nn.isStarted()) {
|
||||||
throw new RetriableException(this.nn.getRole() + " still not started");
|
String message = NameNode.composeNotStartedMessage(this.nn.getRole());
|
||||||
|
throw new RetriableException(message);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,6 +37,10 @@
|
|||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-hdfs-client</artifactId>
|
<artifactId>hadoop-hdfs-client</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs</artifactId>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-mapreduce-client-common</artifactId>
|
<artifactId>hadoop-mapreduce-client-common</artifactId>
|
||||||
|
@ -55,6 +55,9 @@
|
|||||||
import org.apache.hadoop.fs.RemoteIterator;
|
import org.apache.hadoop.fs.RemoteIterator;
|
||||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
import org.apache.hadoop.fs.permission.FsPermission;
|
import org.apache.hadoop.fs.permission.FsPermission;
|
||||||
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
|
import org.apache.hadoop.ipc.RetriableException;
|
||||||
import org.apache.hadoop.mapred.JobACLsManager;
|
import org.apache.hadoop.mapred.JobACLsManager;
|
||||||
import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
|
import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
||||||
@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Check if the NameNode is still not started yet as indicated by the
|
||||||
|
* exception type and message.
|
||||||
* DistributedFileSystem returns a RemoteException with a message stating
|
* DistributedFileSystem returns a RemoteException with a message stating
|
||||||
* SafeModeException in it. So this is only way to check it is because of
|
* SafeModeException in it. So this is only way to check it is because of
|
||||||
* being in safe mode.
|
* being in safe mode. In addition, Name Node may have not started yet, in
|
||||||
|
* which case, the message contains "NameNode still not started".
|
||||||
*/
|
*/
|
||||||
private boolean isBecauseSafeMode(Throwable ex) {
|
private boolean isNameNodeStillNotStarted(Exception ex) {
|
||||||
return ex.toString().contains("SafeModeException");
|
String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
|
||||||
|
HdfsServerConstants.NamenodeRole.NAMENODE);
|
||||||
|
return ex.toString().contains("SafeModeException") ||
|
||||||
|
(ex instanceof RetriableException && ex.getMessage().contains(
|
||||||
|
nameNodeNotStartedMsg));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
|
|||||||
}
|
}
|
||||||
succeeded = false;
|
succeeded = false;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
if (isBecauseSafeMode(e)) {
|
if (isNameNodeStillNotStarted(e)) {
|
||||||
succeeded = false;
|
succeeded = false;
|
||||||
if (logWait) {
|
if (logWait) {
|
||||||
LOG.info("Waiting for FileSystem at " +
|
LOG.info("Waiting for FileSystem at " +
|
||||||
@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
|
|||||||
"to be available");
|
"to be available");
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
if (isBecauseSafeMode(e)) {
|
if (isNameNodeStillNotStarted(e)) {
|
||||||
succeeded = false;
|
succeeded = false;
|
||||||
if (logWait) {
|
if (logWait) {
|
||||||
LOG.info("Waiting for FileSystem at " +
|
LOG.info("Waiting for FileSystem at " +
|
||||||
|
@ -0,0 +1,75 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.mapreduce.v2.hs;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||||
|
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test service initialization of HistoryFileManager when
|
||||||
|
* HDFS is not running normally (either in start phase or
|
||||||
|
* in safe mode).
|
||||||
|
*/
|
||||||
|
public class TestHistoryFileManagerInitWithNonRunningDFS {
|
||||||
|
private static final String CLUSTER_BASE_DIR =
|
||||||
|
MiniDFSCluster.getBaseDirectory();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify if JHS keeps retrying to connect to HDFS, if the name node is
|
||||||
|
* in safe mode, when it creates history directories during service
|
||||||
|
* initialization. The expected behavior of JHS is to keep retrying for
|
||||||
|
* a time limit as specified by
|
||||||
|
* JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and give up by throwing
|
||||||
|
* a YarnRuntimeException with a time out message.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
// set maximum wait time for JHS to wait for HDFS NameNode to start running
|
||||||
|
final long maxJhsWaitTime = 500;
|
||||||
|
conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
|
||||||
|
conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
|
||||||
|
|
||||||
|
MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
|
||||||
|
try {
|
||||||
|
// set up a cluster with its name node in safe mode
|
||||||
|
dfsCluster.getFileSystem().setSafeMode(
|
||||||
|
HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
|
||||||
|
Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
|
||||||
|
|
||||||
|
HistoryFileManager hfm = new HistoryFileManager();
|
||||||
|
hfm.serviceInit(conf);
|
||||||
|
Assert.fail("History File Manager did not retry to connect to name node");
|
||||||
|
} catch (YarnRuntimeException yex) {
|
||||||
|
String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
|
||||||
|
"ms' waiting for FileSystem to become available";
|
||||||
|
Assert.assertEquals("Unexpected reconnect timeout exception message",
|
||||||
|
expectedExceptionMsg, yex.getMessage());
|
||||||
|
} finally {
|
||||||
|
dfsCluster.shutdown(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user