HDFS-4128. 2NN gets stuck in inconsistent state if edit log replay fails in the middle (kihwal via daryn)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1452384 13f79535-47bb-0310-9956-ffa450edef68
commit cfa86e6110
parent ec13f1eb3a
CHANGES.txt
@@ -2317,6 +2317,9 @@ Release 0.23.7 - UNRELEASED

    HDFS-4495. Allow client-side lease renewal to be retried beyond soft-limit
    (kihwal)

    HDFS-4128. 2NN gets stuck in inconsistent state if edit log replay fails
    in the middle (kihwal via daryn)

Release 0.23.6 - UNRELEASED

  INCOMPATIBLE CHANGES
DFSConfigKeys.java
@@ -121,6 +121,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
  public static final long DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT = 3600;
  public static final String DFS_NAMENODE_CHECKPOINT_TXNS_KEY = "dfs.namenode.checkpoint.txns";
  public static final long DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT = 40000;
  public static final String DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_KEY = "dfs.namenode.checkpoint.max-retries";
  public static final int DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_DEFAULT = 3;
  public static final String DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY = "dfs.namenode.heartbeat.recheck-interval";
  public static final int DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT = 5*60*1000;
  public static final String DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY = "dfs.namenode.tolerate.heartbeat.multiplier";
CheckpointConf.java
@@ -39,6 +39,8 @@ public class CheckpointConf {
  /** checkpoint once every this many transactions, regardless of time */
  private final long checkpointTxnCount;

  /** maximum number of retries when merge errors occur */
  private final int maxRetriesOnMergeError;

  public CheckpointConf(Configuration conf) {
    checkpointCheckPeriod = conf.getLong(
@@ -49,6 +51,8 @@ public CheckpointConf(Configuration conf) {
        DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
    checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
        DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
    maxRetriesOnMergeError = conf.getInt(DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_KEY,
        DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_DEFAULT);
    warnForDeprecatedConfigs(conf);
  }
@@ -75,4 +79,8 @@ public long getCheckPeriod() {
  public long getTxnCount() {
    return checkpointTxnCount;
  }

  public int getMaxRetriesOnMergeError() {
    return maxRetriesOnMergeError;
  }
}
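For context, a minimal sketch of how the new setting is consumed. This is an illustrative standalone snippet, not part of the commit; it assumes CheckpointConf lives in the usual org.apache.hadoop.hdfs.server.namenode package and that a plain Configuration is acceptable.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.server.namenode.CheckpointConf;

    public class MaxRetriesExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Override the new key; left unset, CheckpointConf falls back to
        // DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_DEFAULT (3).
        conf.setInt("dfs.namenode.checkpoint.max-retries", 5);
        CheckpointConf checkpointConf = new CheckpointConf(conf);
        // The 2NN's doWork() reads this bound and terminates once the
        // merge error count exceeds it.
        System.out.println(checkpointConf.getMaxRetriesOnMergeError()); // prints 5
      }
    }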
CheckpointFaultInjector.java
@@ -33,6 +33,7 @@ static CheckpointFaultInjector getInstance() {

  public void beforeGetImageSetsHeaders() throws IOException {}
  public void afterSecondaryCallsRollEditLog() throws IOException {}
  public void duringMerge() throws IOException {}
  public void afterSecondaryUploadsNewImage() throws IOException {}
  public void aboutToSendFile(File localfile) throws IOException {}
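The new duringMerge() hook is a no-op in production; the tests below replace the singleton with a Mockito spy and make the hook throw mid-merge. A minimal sketch of that pattern, under stated assumptions: FaultInjector and MergeStep are illustrative names, and the mutable instance field mirrors what CheckpointFaultInjector is assumed to do, not its verified layout.

    import java.io.IOException;

    class FaultInjector {
      // Tests may swap this for a Mockito spy to inject failures.
      static FaultInjector instance = new FaultInjector();
      static FaultInjector getInstance() { return instance; }
      // No-op in production; a spy overrides it to throw mid-merge.
      public void duringMerge() throws IOException {}
    }

    class MergeStep {
      static void merge() throws IOException {
        // ... load fsimage and apply edits ...
        FaultInjector.getInstance().duringMerge(); // error simulation hook
        // ... save the merged image ...
      }
    }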
SecondaryNameNode.java
@@ -144,6 +144,11 @@ FSImage getFSImage() {
    return checkpointImage;
  }

  @VisibleForTesting
  int getMergeErrorCount() {
    return checkpointImage.getMergeErrorCount();
  }

  @VisibleForTesting
  FSNamesystem getFSNamesystem() {
    return namesystem;
@@ -339,6 +344,7 @@ public void doWork() {
    // number of transactions in the edit log that haven't yet been checkpointed.
    //
    long period = checkpointConf.getCheckPeriod();
    int maxRetries = checkpointConf.getMaxRetriesOnMergeError();

    while (shouldRun) {
      try {
@@ -364,6 +370,13 @@ public void doWork() {
      } catch (IOException e) {
        LOG.error("Exception in doCheckpoint", e);
        e.printStackTrace();
        // Prevent a huge number of edits from being created due to
        // unrecoverable conditions and endless retries.
        if (checkpointImage.getMergeErrorCount() > maxRetries) {
          LOG.fatal("Merging failed " +
              checkpointImage.getMergeErrorCount() + " times.");
          terminate(1);
        }
      } catch (Throwable e) {
        LOG.fatal("Throwable Exception in doCheckpoint", e);
        e.printStackTrace();
@@ -498,9 +511,21 @@ boolean doCheckpoint() throws IOException {
    RemoteEditLogManifest manifest =
        namenode.getEditLogManifest(sig.mostRecentCheckpointTxId + 1);

    // Fetch fsimage and edits. Reload the image if previous merge failed.
    loadImage |= downloadCheckpointFiles(
-       fsName, checkpointImage, sig, manifest); // Fetch fsimage and edits
+       fsName, checkpointImage, sig, manifest) |
+       checkpointImage.hasMergeError();
    try {
      doMerge(sig, manifest, loadImage, checkpointImage, namesystem);
    } catch (IOException ioe) {
      // A merge error occurred. The in-memory file system state may be
      // inconsistent, so the image and edits need to be reloaded.
      checkpointImage.setMergeError();
      throw ioe;
    }
    // Clear any error since merge was successful.
    checkpointImage.clearMergeError();

    //
    // Upload the new image into the NameNode. Then tell the Namenode
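Read together with the CheckpointStorage changes below, these hunks amount to a small retry state machine around a merge-error counter. The following condensation is a sketch only: MergeRetryTracker is an illustrative name, and the real counter lives in SecondaryNameNode.CheckpointStorage.

    class MergeRetryTracker {
      private int mergeErrorCount = 0;  // CheckpointStorage.mergeErrorCount
      private final int maxRetries;     // dfs.namenode.checkpoint.max-retries

      MergeRetryTracker(int maxRetries) { this.maxRetries = maxRetries; }

      // doCheckpoint(): a recorded failure forces the fsimage to be
      // re-downloaded and reloaded, since in-memory state may be inconsistent.
      boolean mustReloadImage() { return mergeErrorCount > 0; }  // hasMergeError()

      // doCheckpoint() catch block: count the failure, then rethrow.
      void recordMergeFailure() { mergeErrorCount++; }           // setMergeError()

      // doCheckpoint() after a successful merge.
      void recordMergeSuccess() { mergeErrorCount = 0; }         // clearMergeError()

      // doWork() catch block: exit the 2NN once retries are exhausted.
      boolean shouldTerminate() { return mergeErrorCount > maxRetries; }
    }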
@@ -754,6 +779,7 @@ private static CommandLineOpts parseArgs(String[] argv) {

  static class CheckpointStorage extends FSImage {

    private int mergeErrorCount;
    private static class CheckpointLogPurger implements LogsPurgeable {

      private NNStorage storage;
@@ -815,6 +841,7 @@ public void selectInputStreams(Collection<EditLogInputStream> streams,
      // we shouldn't have any editLog instance. Setting to null
      // makes sure we don't accidentally depend on it.
      editLog = null;
      mergeErrorCount = 0;

      // Replace the archival manager with one that can actually work on the
      // 2NN's edits storage.
@@ -882,6 +909,23 @@ void recoverCreate(boolean format) throws IOException {
      }
    }

    boolean hasMergeError() {
      return (mergeErrorCount > 0);
    }

    int getMergeErrorCount() {
      return mergeErrorCount;
    }

    void setMergeError() {
      mergeErrorCount++;
    }

    void clearMergeError() {
      mergeErrorCount = 0;
    }

    /**
     * Ensure that the current/ directory exists in all storage
     * directories
@@ -915,6 +959,8 @@ static void doMerge(
      dstImage.reloadFromImageFile(file, dstNamesystem);
      dstNamesystem.dir.imageLoadComplete();
    }
    // error simulation code for junit test
    CheckpointFaultInjector.getInstance().duringMerge();

    Checkpointer.rollForwardByApplyingLogs(manifest, dstImage, dstNamesystem);
    // The following has the side effect of purging old fsimages/edit logs.
hdfs-default.xml
@@ -640,6 +640,15 @@
  </description>
</property>

<property>
  <name>dfs.namenode.checkpoint.max-retries</name>
  <value>3</value>
  <description>The SecondaryNameNode retries failed checkpointing. If the
    failure occurs while loading fsimage or replaying edits, the number of
    retries is limited by this variable.
  </description>
</property>

<property>
  <name>dfs.namenode.num.checkpoints.retained</name>
  <value>2</value>
TestCheckpoint.java
@@ -74,6 +74,8 @@
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.ExitUtil.ExitException;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Level;
import org.junit.After;
@@ -226,6 +228,111 @@ public void testWriteTransactionIdHandlesIOE() throws Exception {
        toString().indexOf("storageDirToCheck") != -1);
  }

  /*
   * Simulate exception during edit replay.
   */
  @Test(timeout=5000)
  public void testReloadOnEditReplayFailure() throws IOException {
    Configuration conf = new HdfsConfiguration();
    FSDataOutputStream fos = null;
    SecondaryNameNode secondary = null;
    MiniDFSCluster cluster = null;
    FileSystem fs = null;

    try {
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
          .build();
      cluster.waitActive();
      fs = cluster.getFileSystem();
      secondary = startSecondaryNameNode(conf);
      fos = fs.create(new Path("tmpfile0"));
      fos.write(new byte[] { 0, 1, 2, 3 });
      secondary.doCheckpoint();
      fos.write(new byte[] { 0, 1, 2, 3 });
      fos.hsync();

      // Cause merge to fail in next checkpoint.
      Mockito.doThrow(new IOException(
          "Injecting failure during merge"))
          .when(faultInjector).duringMerge();

      try {
        secondary.doCheckpoint();
        fail("Fault injection failed.");
      } catch (IOException ioe) {
        // This is expected.
      }
      Mockito.reset(faultInjector);

      // The error must be recorded, so next checkpoint will reload image.
      fos.write(new byte[] { 0, 1, 2, 3 });
      fos.hsync();

      assertTrue("Another checkpoint should have reloaded image",
          secondary.doCheckpoint());
    } finally {
      if (secondary != null) {
        secondary.shutdown();
      }
      if (fs != null) {
        fs.close();
      }
      if (cluster != null) {
        cluster.shutdown();
      }
      Mockito.reset(faultInjector);
    }
  }

  /*
   * Simulate 2NN exit due to too many merge failures.
   */
  @Test(timeout=10000)
  public void testTooManyEditReplayFailures() throws IOException {
    Configuration conf = new HdfsConfiguration();
    conf.set(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_MAX_RETRIES_KEY, "1");
    conf.set(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, "1");

    FSDataOutputStream fos = null;
    SecondaryNameNode secondary = null;
    MiniDFSCluster cluster = null;
    FileSystem fs = null;

    try {
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
          .checkExitOnShutdown(false).build();
      cluster.waitActive();
      fs = cluster.getFileSystem();
      fos = fs.create(new Path("tmpfile0"));
      fos.write(new byte[] { 0, 1, 2, 3 });

      // Cause merge to fail in next checkpoint.
      Mockito.doThrow(new IOException(
          "Injecting failure during merge"))
          .when(faultInjector).duringMerge();

      secondary = startSecondaryNameNode(conf);
      secondary.doWork();
      // Fail if we get here.
      fail("2NN did not exit.");
    } catch (ExitException ee) {
      // ignore
      ExitUtil.resetFirstExitException();
      assertEquals("Max retries", 1, secondary.getMergeErrorCount() - 1);
    } finally {
      if (secondary != null) {
        secondary.shutdown();
      }
      if (fs != null) {
        fs.close();
      }
      if (cluster != null) {
        cluster.shutdown();
      }
      Mockito.reset(faultInjector);
    }
  }

  /*
   * Simulate namenode crashing after rolling edit log.
   */