From f3296501e09fa7f1e81548dfcefa56f20fe337ca Mon Sep 17 00:00:00 2001
From: Inigo Goiri
Date: Mon, 5 Nov 2018 16:48:37 -0800
Subject: [PATCH] HDFS-14043. Tolerate corrupted seen_txid file. Contributed
 by Lukas Majercak.

---
 .../hadoop/hdfs/util/PersistentLongFile.java  |  2 +
 .../server/namenode/TestSaveNamespace.java    | 56 +++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/PersistentLongFile.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/PersistentLongFile.java
index 777dd87cfe..a94d7edff0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/PersistentLongFile.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/PersistentLongFile.java
@@ -98,6 +98,8 @@ public static long readFile(File file, long defaultVal) throws IOException {
       val = Long.parseLong(br.readLine());
       br.close();
       br = null;
+    } catch (NumberFormatException e) {
+      throw new IOException(e);
     } finally {
       IOUtils.cleanupWithLogger(LOG, br);
     }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
index 8fa870186c..6688ef2831 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
@@ -28,13 +28,20 @@ import static org.mockito.Mockito.spy;
 
 import java.io.File;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.util.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -737,6 +744,55 @@ public void testSaveNamespaceBeforeShutdown() throws Exception {
     }
   }
 
+  @Test(timeout=30000)
+  public void testTxFaultTolerance() throws Exception {
+    String baseDir = MiniDFSCluster.getBaseDirectory();
+    List<String> nameDirs = new ArrayList<>();
+    nameDirs.add(fileAsURI(new File(baseDir, "name1")).toString());
+    nameDirs.add(fileAsURI(new File(baseDir, "name2")).toString());
+
+    Configuration conf = new HdfsConfiguration();
+    String nameDirsStr = StringUtils.join(",", nameDirs);
+    conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, nameDirsStr);
+    conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, nameDirsStr);
+
+    NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
+    DFSTestUtil.formatNameNode(conf);
+    FSNamesystem fsn = FSNamesystem.loadFromDisk(conf);
+    try {
+      // We have a BEGIN_LOG_SEGMENT txn to start
+      assertEquals(1, fsn.getEditLog().getLastWrittenTxId());
+
+      doAnEdit(fsn, 1);
+
+      assertEquals(2, fsn.getEditLog().getLastWrittenTxId());
+
+      // Shut down
+      fsn.close();
+
+      // Corrupt one of the seen_txid files
+      File txidFile0 = new File(new URI(nameDirs.get(0)
+          + "/current/seen_txid"));
+      FileWriter fw = new FileWriter(txidFile0, false);
+      try (PrintWriter pw = new PrintWriter(fw)) {
+        pw.print("corrupt____!");
+      }
+
+      // Restart
+      fsn = FSNamesystem.loadFromDisk(conf);
+      assertEquals(4, fsn.getEditLog().getLastWrittenTxId());
+
+      // Check seen_txid is same in both dirs
+      File txidFile1 = new File(new URI(nameDirs.get(1)
+          + "/current/seen_txid"));
+      assertTrue(FileUtils.contentEquals(txidFile0, txidFile1));
+    } finally {
+      if (fsn != null) {
+        fsn.close();
+      }
+    }
+  }
+
   private void doAnEdit(FSNamesystem fsn, int id) throws IOException {
     // Make an edit
     fsn.mkdirs("/test" + id, new PermissionStatus("test", "Test",
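
Note (reviewer illustration, not part of the patch above): the crux of the fix
is that Long.parseLong throws an unchecked NumberFormatException when
seen_txid holds garbage, and that exception previously escaped
PersistentLongFile.readFile as-is; rethrowing it as a checked IOException
routes the corruption through the callers' existing storage-directory error
handling, so a redundant copy can be used instead, which is what the new test
exercises. Below is a minimal, self-contained sketch of that behavior;
SeenTxidSketch and readTxidFile are hypothetical stand-ins for
PersistentLongFile.readFile, using only JDK classes and the platform charset
rather than the real class's UTF-8 reader.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

public class SeenTxidSketch {
  /** Hypothetical stand-in for PersistentLongFile.readFile. */
  static long readTxidFile(File file, long defaultVal) throws IOException {
    if (!file.exists()) {
      return defaultVal;
    }
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
      // Long.parseLong throws an unchecked NumberFormatException on
      // garbage (or on the null returned for an empty file). Before the
      // patch it escaped this method unchanged; wrapping it in a checked
      // IOException lets callers treat the file's directory as failed
      // and fall back to a redundant copy.
      return Long.parseLong(br.readLine());
    } catch (NumberFormatException e) {
      throw new IOException(e);
    }
  }

  public static void main(String[] args) throws Exception {
    File f = File.createTempFile("seen_txid", null);
    f.deleteOnExit();
    try (FileWriter fw = new FileWriter(f, false)) {
      fw.write("corrupt____!");  // same payload the new test injects
    }
    try {
      readTxidFile(f, -1);
    } catch (IOException e) {
      // With the patched behavior the corruption surfaces as a checked
      // IOException the caller can recover from, not a runtime crash.
      System.out.println("recoverable: " + e);
    }
  }
}

Running the sketch prints the wrapped exception rather than dying with a
NumberFormatException; in the test above, the same tolerance is what lets the
restarted NameNode survive the corrupted copy under name1 and leave both
seen_txid files with matching contents.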