From 190dc1c91b0ae0f3f128cc6603e354a3ec83288a Mon Sep 17 00:00:00 2001 From: Todd Lipcon Date: Wed, 11 Jan 2012 05:55:32 +0000 Subject: [PATCH] HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. Contributed by Hari Mankude and Todd Lipcon. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1229898 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-hdfs/CHANGES.HDFS-1623.txt | 2 ++ .../server/blockmanagement/BlockManager.java | 2 +- .../blockmanagement/DatanodeDescriptor.java | 9 +++++ .../server/namenode/ha/TestHASafeMode.java | 35 ++++++++++++++++++- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt index 018dfeaa55..fae2f31304 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt @@ -93,3 +93,5 @@ HDFS-2730. Refactor shared HA-related test code into HATestUtil class (todd) HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd) HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd) + +HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. 
(Hari Mankude and todd via todd) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index ec978f6ea1..ce01502972 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -1361,7 +1361,7 @@ public void processReport(final DatanodeID nodeID, final String poolId, // To minimize startup time, we discard any second (or later) block reports // that we receive while still in startup phase. - if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) { + if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) { NameNode.stateChangeLog.info("BLOCK* processReport: " + "discarded non-initial block report from " + nodeID.getName() + " because namenode still in startup phase"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java index 807213ed17..984456f142 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java @@ -151,6 +151,10 @@ synchronized void clear() { private long lastBlocksScheduledRollTime = 0; private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min private int volumeFailures = 0; + + /** Set to false after processing first block report */ + private boolean firstBlockReport = true; + /** * When set to true, the node is not in include list and is not allowed * to communicate with the 
namenode @@ -608,6 +612,11 @@ public void receivedBlockReport() { if (heartbeatedSinceFailover) { blockContentsStale = false; } + firstBlockReport = false; + } + + boolean isFirstBlockReport() { + return firstBlockReport; } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java index a23f38e97f..a76470f1c4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java @@ -192,7 +192,7 @@ public void testBlocksAddedWhileInSafeMode() throws Exception { * knows there should only be 90 blocks, but it's still in safemode. * 8. NN2 doesn't ever recheck whether it should leave safemode. * - * This is essentially the inverse of {@link #testBlocksAddedWhileStandbyShutdown()} + * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()} */ @Test public void testBlocksRemovedBeforeStandbyRestart() throws Exception { @@ -328,6 +328,39 @@ public void testComplexFailoverIntoSafemode() throws Exception { "total blocks 5. Safe mode will be turned off automatically")); } + /** + * Regression test for HDFS-2753. In this bug, the following sequence was + * observed: + * - Some blocks are written to DNs while the SBN was down. This causes + * the blockReceived messages to get queued in the BPServiceActor on the + * DN. + * - When the SBN returns, the DN re-registers with the SBN, and then + * flushes its blockReceived queue to the SBN before it sends its + * first block report. This caused the first block report to be + * incorrectly ignored. + * - The SBN would become stuck in safemode. 
+ */ + @Test + public void testBlocksAddedWhileStandbyIsDown() throws Exception { + DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); + + banner("Stopping standby"); + cluster.shutdownNameNode(1); + + DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L); + + banner("Rolling edit log so standby gets all edits on restart"); + nn0.getRpcServer().rollEditLog(); + + restartStandby(); + String status = nn1.getNamesystem().getSafemode(); + assertTrue("Bad safemode status: '" + status + "'", + status.startsWith( + "Safe mode is ON." + + "The reported blocks 6 has reached the threshold 0.9990 of " + + "total blocks 6. Safe mode will be turned off automatically")); + } + /** * Print a big banner in the test log to make debug easier. */