From 6be13332db5342465c2f279a5984b4b8a33420fc Mon Sep 17 00:00:00 2001 From: Eli Collins Date: Tue, 31 Jan 2012 02:17:53 +0000 Subject: [PATCH] HDFS-2853. HA: NN fails to start if the shared edits dir is marked required. Contributed by Aaron T. Myers. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1238134 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-hdfs/CHANGES.HDFS-1623.txt | 2 + .../hdfs/server/namenode/FSEditLog.java | 2 +- .../namenode/NameNodeResourcePolicy.java | 9 -- .../apache/hadoop/hdfs/MiniDFSCluster.java | 7 +- .../hdfs/server/namenode/TestEditLog.java | 2 +- .../namenode/TestNameNodeResourcePolicy.java | 8 +- .../namenode/ha/TestFailureOfSharedDir.java | 93 +++++++++++++++++++ 7 files changed, 104 insertions(+), 19 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt index a426f0926e..fe88dbbe97 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt @@ -141,3 +141,5 @@ HDFS-2841. HAAdmin does not work if security is enabled. (atm) HDFS-2691. Fixes for pipeline recovery in an HA cluster: report RBW replicas immediately upon pipeline creation. (todd) HDFS-2824. Fix failover when prior NN died just after creating an edit log segment. (atm via todd) + +HDFS-2853. HA: NN fails to start if the shared edits dir is marked required (atm via eli) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java index cd7ff5b0c8..a78039f2c2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java @@ -865,7 +865,7 @@ synchronized void startLogSegment(final long segmentTxId, editLogStream = journalSet.startLogSegment(segmentTxId); } catch (IOException ex) { throw new IOException("Unable to start log segment " + - segmentTxId + ": no journals successfully started."); + segmentTxId + ": too few journals successfully started.", ex); } curSegmentTxId = segmentTxId; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java index 53cd867fbc..3896165ff3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java @@ -37,9 +37,6 @@ final class NameNodeResourcePolicy { * required to continue operation. * @return true if and only if there are sufficient NN resources to * continue logging edits. - * @throws RuntimeException if the number of configured - * redundant resources is fewer than the minimum number of available - * redundant resources. */ static boolean areResourcesAvailable( Collection resources, @@ -63,12 +60,6 @@ static boolean areResourcesAvailable( } } - if (redundantResourceCount < minimumRedundantResources) { - throw new RuntimeException("Need a minimum of " + minimumRedundantResources - + " for NN to operate but only " + redundantResourceCount - + " are configured."); - } - if (redundantResourceCount == 0) { // If there are no redundant resources, return true if there are any // required resources available. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java index bf3af609d2..dc3074aeb5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java @@ -664,7 +664,12 @@ private void createNameNodesAndSetConf(MiniDFSNNTopology nnTopology, } public URI getSharedEditsDir(int minNN, int maxNN) throws IOException { - return fileAsURI(new File(base_dir, "shared-edits-" + + return formatSharedEditsDir(base_dir, minNN, maxNN); + } + + public static URI formatSharedEditsDir(File baseDir, int minNN, int maxNN) + throws IOException { + return fileAsURI(new File(baseDir, "shared-edits-" + minNN + "-through-" + maxNN)); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java index f2f4d930db..9281eb2f04 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java @@ -807,7 +807,7 @@ public void testFailedOpen() throws Exception { fail("Did no throw exception on only having a bad dir"); } catch (IOException ioe) { GenericTestUtils.assertExceptionContains( - "no journals successfully started", ioe); + "too few journals successfully started", ioe); } finally { logDir.setWritable(true); log.close(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java index 559d165726..49a96e9b66 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java @@ -50,13 +50,7 @@ public void testMultipleRedundantResources() { assertFalse(testResourceScenario(4, 0, 3, 0, 2)); assertTrue(testResourceScenario(4, 0, 3, 0, 1)); assertFalse(testResourceScenario(4, 0, 4, 0, 1)); - try { - testResourceScenario(1, 0, 0, 0, 2); - fail("Should fail if there are more minimum redundant resources than " + - "total redundant resources"); - } catch (RuntimeException rte) { - assertTrue(rte.getMessage().startsWith("Need a minimum")); - } + assertFalse(testResourceScenario(1, 0, 0, 0, 2)); } @Test diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java new file mode 100644 index 0000000000..20c93b7e73 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; + +public class TestFailureOfSharedDir { + + private static final Log LOG = LogFactory.getLog(TestFailureOfSharedDir.class); + + /** + * Test that marking the shared edits dir as being "required" causes the NN to + * fail if that dir can't be accessed. + */ + @Test + public void testFailureOfSharedDir() throws Exception { + Configuration conf = new Configuration(); + URI sharedEditsUri = MiniDFSCluster.formatSharedEditsDir( + new File(MiniDFSCluster.getBaseDirectory()), 0, 1); + // Mark the shared edits dir required. + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, + sharedEditsUri.toString()); + + MiniDFSCluster cluster = null; + try { + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + + assertEquals(sharedEditsUri, cluster.getSharedEditsDir(0, 1)); + + cluster.waitActive(); + cluster.transitionToActive(0); + + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + + assertTrue(fs.mkdirs(new Path("/test1"))); + + // Blow away the shared edits dir. + FileUtil.fullyDelete(new File(sharedEditsUri)); + + NameNode nn0 = cluster.getNameNode(0); + try { + // Make sure that subsequent operations on the NN fail. + nn0.getRpcServer().rollEditLog(); + fail("Succeeded in rolling edit log despite shared dir being deleted"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "Unable to start log segment 4: too few journals successfully started", + ioe); + LOG.info("Got expected exception", ioe); + } + } finally { + if (cluster != null) { + cluster.shutdown(); + } + } + } +}