diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index ed1bdd39c6..beb6389ac1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -300,6 +300,9 @@ Branch-2 ( Unreleased changes ) HDFS-3485. DataTransferThrottler will over-throttle when currentTimeMillis jumps (Andy Isaacson via todd) + HDFS-2914. HA: Standby should not enter safemode when resources are low. + (Vinay via atm) + BREAKDOWN OF HDFS-3042 SUBTASKS HDFS-2185. HDFS portion of ZK-based FailoverController (todd) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index fbca355b25..386b5eb467 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -572,8 +572,6 @@ void startCommonServices(Configuration conf, HAContext haContext) throws IOExcep !safeMode.isPopulatingReplQueues(); setBlockTotal(); blockManager.activate(conf); - this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); - nnrmthread.start(); } finally { writeUnlock(); } @@ -590,7 +588,6 @@ void stopCommonServices() { writeLock(); try { if (blockManager != null) blockManager.close(); - if (nnrmthread != null) nnrmthread.interrupt(); } finally { writeUnlock(); } @@ -644,6 +641,10 @@ void startActiveServices() throws IOException { } leaseManager.startMonitor(); startSecretManagerIfNecessary(); + + //ResourceMonitor required only at ActiveNN. 
See HDFS-2914 + this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); + nnrmthread.start(); } finally { writeUnlock(); } @@ -666,6 +667,10 @@ void stopActiveServices() { if (leaseManager != null) { leaseManager.stopMonitor(); } + if (nnrmthread != null) { + ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor(); + nnrmthread.interrupt(); + } if (dir != null && dir.fsImage != null) { if (dir.fsImage.editLog != null) { dir.fsImage.editLog.close(); @@ -3193,10 +3198,11 @@ void checkAvailableResources() { * acceptable levels, this daemon will cause the NN to exit safe mode. */ class NameNodeResourceMonitor implements Runnable { + volatile boolean shouldNNRmRun = true; /* volatile: cleared by stopMonitor() from a thread other than the one executing run() */ @Override public void run () { try { - while (fsRunning) { + while (fsRunning && shouldNNRmRun) { checkAvailableResources(); if(!nameNodeHasResourcesAvailable()) { String lowResourcesMsg = "NameNode low on available disk space. "; @@ -3217,7 +3223,11 @@ public void run () { FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e); } } - } + + public void stopMonitor() { + shouldNNRmRun = false; + } + } public FSImage getFSImage() { return dir.fsImage; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java index cc9552aec2..a158a5ed6b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hdfs.server.namenode.ha; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY; import static org.junit.Assert.*; import java.io.File; @@ 
-127,6 +129,7 @@ public void testSharedDirsComeFirstInEditsList() throws Exception { @Test public void testFailureOfSharedDir() throws Exception { Configuration conf = new Configuration(); + conf.setLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, 2000); // The shared edits dir will automatically be marked required. MiniDFSCluster cluster = null; @@ -151,6 +154,15 @@ public void testFailureOfSharedDir() throws Exception { assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", true)); + Thread.sleep(conf.getLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, + DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT) * 2); + + NameNode nn1 = cluster.getNameNode(1); + assertTrue(nn1.isStandbyState()); + assertFalse( + "StandBy NameNode should not go to SafeMode on resource unavailability", + nn1.isInSafeMode()); + NameNode nn0 = cluster.getNameNode(0); nn0.getNamesystem().getFSImage().getEditLog().getJournalSet() .setRuntimeForTesting(mockRuntime);