MAPREDUCE-6657. Job history server can fail on startup when NameNode is in start phase. Contributed by Haibo Chen.

2016-05-17 14:41:47 -07:00 · 2016-05-17 14:41:47 -07:00 · f6ef876fe1
commit f6ef876fe1
parent e24fe2641b
5 changed files with 100 additions and 6 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@ -516,6 +516,10 @@ boolean isRole(NamenodeRole that) {
    return role.equals(that);
  }
  public static String composeNotStartedMessage(NamenodeRole role) {
    return role + " still not started";
  }
  /**
   * Given a configuration get the address of the lifeline RPC server.
   * If the lifeline RPC is not configured returns null.
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@ -2073,7 +2073,8 @@ public void removeXAttr(String src, XAttr xAttr) throws IOException {
  private void checkNNStartup() throws IOException {
    if (!this.nn.isStarted()) {
-      throw new RetriableException(this.nn.getRole() + " still not started");
+      String message = NameNode.composeNotStartedMessage(this.nn.getRole());
      throw new RetriableException(message);
    }
  }
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml
@ -37,6 +37,10 @@
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs-client</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
@ -55,6 +55,9 @@
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.mapred.JobACLsManager;
 import org.apache.hadoop.mapreduce.jobhistory.JobSummary;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@ -599,12 +602,19 @@ void createHistoryDirs(Clock clock, long intervalCheckMillis,
  }
  /**
   * Check if the NameNode is still not started yet as indicated by the
   * exception type and message.
   * DistributedFileSystem returns a RemoteException with a message stating
   * SafeModeException in it. So this is only way to check it is because of
-   * being in safe mode.
+   * being in safe mode. In addition, Name Node may have not started yet, in
   * which case, the message contains "NameNode still not started".
   */
-  private boolean isBecauseSafeMode(Throwable ex) {
+  private boolean isNameNodeStillNotStarted(Exception ex) {
-    return ex.toString().contains("SafeModeException");
+    String nameNodeNotStartedMsg = NameNode.composeNotStartedMessage(
        HdfsServerConstants.NamenodeRole.NAMENODE);
    return ex.toString().contains("SafeModeException") ||
        (ex instanceof RetriableException && ex.getMessage().contains(
            nameNodeNotStartedMsg));
  }
  /**
@ -631,7 +641,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
      }
      succeeded = false;
    } catch (IOException e) {
-      if (isBecauseSafeMode(e)) {
+      if (isNameNodeStillNotStarted(e)) {
        succeeded = false;
        if (logWait) {
          LOG.info("Waiting for FileSystem at " +
@ -661,7 +671,7 @@ boolean tryCreatingHistoryDirs(boolean logWait) throws IOException {
              "to be available");
        }
      } catch (IOException e) {
-        if (isBecauseSafeMode(e)) {
+        if (isNameNodeStillNotStarted(e)) {
          succeeded = false;
          if (logWait) {
            LOG.info("Waiting for FileSystem at " +
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestHistoryFileManagerInitWithNonRunningDFS.java
@ -0,0 +1,75 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.hadoop.mapreduce.v2.hs;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.junit.Assert;
 import org.junit.Test;
 /**
 * Test service initialization of HistoryFileManager when
 * HDFS is not running normally (either in start phase or
 * in safe mode).
 */
 public class TestHistoryFileManagerInitWithNonRunningDFS {
  private static final String CLUSTER_BASE_DIR =
      MiniDFSCluster.getBaseDirectory();
  /**
   * Verify if JHS keeps retrying to connect to HDFS, if the name node is
   * in safe mode, when it creates history directories during service
   * initialization. The expected behavior of JHS is to keep retrying for
   * a time limit as specified by
   * JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, and give up by throwing
   * a YarnRuntimeException with a time out message.
   */
  @Test
  public void testKeepRetryingWhileNameNodeInSafeMode() throws Exception {
    Configuration conf = new Configuration();
    // set maximum wait time for JHS to wait for HDFS NameNode to start running
    final long maxJhsWaitTime = 500;
    conf.setLong(JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, maxJhsWaitTime);
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, CLUSTER_BASE_DIR);
    MiniDFSCluster dfsCluster = new MiniDFSCluster.Builder(conf).build();
    try {
      // set up a cluster with its name node in safe mode
      dfsCluster.getFileSystem().setSafeMode(
          HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
      Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
      HistoryFileManager hfm = new HistoryFileManager();
      hfm.serviceInit(conf);
      Assert.fail("History File Manager did not retry to connect to name node");
    } catch (YarnRuntimeException yex) {
      String expectedExceptionMsg = "Timed out '" + maxJhsWaitTime +
          "ms' waiting for FileSystem to become available";
      Assert.assertEquals("Unexpected reconnect timeout exception message",
          expectedExceptionMsg, yex.getMessage());
    } finally {
      dfsCluster.shutdown(true);
    }
  }
 }