diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 3e54fe828f..b76918d951 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -322,6 +322,8 @@ Release 2.5.0 - UNRELEASED HDFS-6169. Move the address in WebImageViewer. (Akira Ajisaka via wheat9) + HDFS-6160. TestSafeMode occasionally fails. (Arpit Agarwal) + Release 2.4.1 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java index 5b9114d656..eca2f00ba5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java @@ -1022,6 +1022,7 @@ public DatanodeCommand blockReport(DatanodeRegistration nodeReg, for(StorageBlockReport r : reports) { final BlockListAsLongs blocks = new BlockListAsLongs(r.getBlocks()); hasStaleStorages = bm.processReport(nodeReg, r.getStorage(), poolId, blocks); + metrics.incrStorageBlockReportOps(); } if (nn.getFSImage().isUpgradeFinalized() && diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java index 1397131f4a..52ad185cdf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java @@ -73,6 +73,8 @@ public class NameNodeMetrics { MutableCounterLong snapshotDiffReportOps; @Metric("Number of blockReceivedAndDeleted calls") MutableCounterLong blockReceivedAndDeletedOps; + @Metric("Number of blockReports from individual storages") + MutableCounterLong storageBlockReportOps; @Metric("Journal transactions") MutableRate transactions; @Metric("Journal syncs") MutableRate syncs; @@ -221,6 +223,10 @@ public void incrSnapshotDiffReportOps() { public void incrBlockReceivedAndDeletedOps() { blockReceivedAndDeletedOps.incr(); } + + public void incrStorageBlockReportOps() { + storageBlockReportOps.incr(); + } public void addTransaction(long latency) { transactions.add(latency); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java index c98cb8992d..8e1b92ccde 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs; +import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; +import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -26,6 +28,8 @@ import java.io.IOException; import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; @@ -55,12 +59,14 @@ * Tests to verify safe mode correctness. */ public class TestSafeMode { + public static final Log LOG = LogFactory.getLog(TestSafeMode.class); private static final Path TEST_PATH = new Path("/test"); private static final int BLOCK_SIZE = 1024; Configuration conf; MiniDFSCluster cluster; FileSystem fs; DistributedFileSystem dfs; + private static final String NN_METRICS = "NameNodeActivity"; @Before public void startUp() throws IOException { @@ -158,6 +164,7 @@ public void testNoExtensionIfNoBlocks() throws IOException { */ @Test(timeout=45000) public void testInitializeReplQueuesEarly() throws Exception { + LOG.info("Starting testInitializeReplQueuesEarly"); // Spray the blocks around the cluster when we add DNs instead of // concentrating all blocks on the first node. BlockManagerTestUtil.setWritingPrefersLocalNode( @@ -165,9 +172,11 @@ public void testInitializeReplQueuesEarly() throws Exception { cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null); cluster.waitActive(); + + LOG.info("Creating files"); DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L); - + LOG.info("Stopping all DataNodes"); List dnprops = Lists.newLinkedList(); dnprops.add(cluster.stopDataNode(0)); dnprops.add(cluster.stopDataNode(0)); @@ -176,6 +185,7 @@ public void testInitializeReplQueuesEarly() throws Exception { cluster.getConfiguration(0).setFloat( DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f); + LOG.info("Restarting NameNode"); cluster.restartNameNode(); final NameNode nn = cluster.getNameNode(); @@ -189,27 +199,37 @@ public void testInitializeReplQueuesEarly() throws Exception { "until threshold is crossed", NameNodeAdapter.safeModeInitializedReplQueues(nn)); + LOG.info("Restarting one DataNode"); cluster.restartDataNode(dnprops.remove(0)); - // Wait for the block report from the restarted DN to come in. + // Wait for block reports from all attached storages of + // the restarted DN to come in. GenericTestUtils.waitFor(new Supplier() { @Override public Boolean get() { - return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0; + return getLongCounter("StorageBlockReportOps", getMetrics(NN_METRICS)) == + MiniDFSCluster.DIRS_PER_DATANODE; } }, 10, 10000); - // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks - // call is still going on at this point - wait until it's done by grabbing - // the lock. - nn.getNamesystem().writeLock(); - nn.getNamesystem().writeUnlock(); - int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); - assertTrue("Expected first block report to make some but not all blocks " + - "safe. Got: " + safe, safe >= 1 && safe < 15); - BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); - + + final int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); + assertTrue("Expected first block report to make some blocks safe.", safe > 0); + assertTrue("Did not expect first block report to make all blocks safe.", safe < 15); + assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn)); - assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks()); + + // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated + // blocks are processed asynchronously so this may take a few seconds. + // Failure here will manifest as a test timeout. + BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); + long underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks(); + while (underReplicatedBlocks != (15 - safe)) { + LOG.info("UnderReplicatedBlocks expected=" + (15 - safe) + + ", actual=" + underReplicatedBlocks); + Thread.sleep(100); + BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); + underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks(); + } cluster.restartDataNodes(); }