From 0ef9c6f71a4397d60a024a9de6695d6e5f764c80 Mon Sep 17 00:00:00 2001 From: Suresh Srinivas Date: Tue, 16 Oct 2012 19:39:50 +0000 Subject: [PATCH] HDFS-4059. Add number of stale DataNodes to metrics. Contributed by Jing Zhao. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1398949 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 + .../blockmanagement/DatanodeManager.java | 2 +- .../hdfs/server/namenode/FSNamesystem.java | 7 ++++ .../namenode/metrics/FSNamesystemMBean.java | 6 +++ .../blockmanagement/BlockManagerTestUtil.java | 8 ++++ .../namenode/metrics/TestNameNodeMetrics.java | 41 ++++++++++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 83a1386ec7..46dd83047b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -344,6 +344,8 @@ Release 2.0.3-alpha - Unreleased HDFS-3912. Detect and avoid stale datanodes for writes. (Jing Zhao via suresh) + HDFS-4059. Add number of stale DataNodes to metrics. (Jing Zhao via suresh) + IMPROVEMENTS HDFS-3925. Prettify PipelineAck#toString() for printing to a log diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index cad3540d7e..8b2613acc0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -885,7 +885,7 @@ void setNumStaleNodes(int numStaleNodes) { * @return Return the current number of stale DataNodes (detected by * HeartbeatManager). 
*/ - int getNumStaleNodes() { + public int getNumStaleNodes() { return this.numStaleNodes; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 32d367879a..0cbbd1a8c1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4677,6 +4677,13 @@ public int getNumLiveDataNodes() { public int getNumDeadDataNodes() { return getBlockManager().getDatanodeManager().getNumDeadDataNodes(); } + + @Override // FSNamesystemMBean + @Metric({"StaleDataNodes", + "Number of datanodes marked stale due to delayed heartbeat"}) + public int getNumStaleDataNodes() { + return getBlockManager().getDatanodeManager().getNumStaleNodes(); + } /** * Sets the generation stamp for this filesystem diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java index d4033965df..b618cc64e1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java @@ -112,4 +112,10 @@ public interface FSNamesystemMBean { * @return number of dead data nodes */ public int getNumDeadDataNodes(); + + /** + * Number of stale data nodes + * @return number of stale data nodes + */ + public int getNumStaleDataNodes(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java index db84943791..6a0b687f75 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java @@ -191,4 +191,12 @@ public static void setWritingPrefersLocalNode( "Must use default policy, got %s", bpp.getClass()); ((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer); } + + /** + * Call heartbeat check function of HeartbeatManager + * @param bm the BlockManager to manipulate + */ + public static void checkHeartbeat(BlockManager bm) { + bm.getDatanodeManager().getHeartbeatManager().heartbeatCheck(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index c4d659904e..e213379151 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -41,10 +41,14 @@ import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.test.MetricsAsserts; 
+import org.apache.hadoop.util.Time; import org.apache.log4j.Level; import org.junit.After; import org.junit.Before; @@ -77,7 +81,8 @@ public class TestNameNodeMetrics { DFS_REPLICATION_INTERVAL); CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, "" + PERCENTILES_INTERVAL); - + // Enable stale DataNodes checking + CONF.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true); ((Log4JLogger)LogFactory.getLog(MetricsAsserts.class)) .getLogger().setLevel(Level.DEBUG); } @@ -119,6 +124,40 @@ private void readFile(FileSystem fileSys,Path name) throws IOException { stm.close(); } + /** Test metrics indicating the number of stale DataNodes */ + @Test + public void testStaleNodes() throws Exception { + // Set two datanodes as stale + for (int i = 0; i < 2; i++) { + DataNode dn = cluster.getDataNodes().get(i); + DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true); + long staleInterval = CONF.getLong( + DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY, + DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT); + cluster.getNameNode().getNamesystem().getBlockManager() + .getDatanodeManager().getDatanode(dn.getDatanodeId()) + .setLastUpdate(Time.now() - staleInterval - 1); + } + // Let HeartbeatManager check heartbeats + BlockManagerTestUtil.checkHeartbeat(cluster.getNameNode().getNamesystem() + .getBlockManager()); + assertGauge("StaleDataNodes", 2, getMetrics(NS_METRICS)); + + // Reset stale datanodes + for (int i = 0; i < 2; i++) { + DataNode dn = cluster.getDataNodes().get(i); + DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false); + cluster.getNameNode().getNamesystem().getBlockManager() + .getDatanodeManager().getDatanode(dn.getDatanodeId()) + .setLastUpdate(Time.now()); + } + + // Let HeartbeatManager re-check heartbeats + BlockManagerTestUtil.checkHeartbeat(cluster.getNameNode().getNamesystem() + .getBlockManager()); + assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS)); + } + /** Test metrics associated with addition of a 
file */ @Test public void testFileAdd() throws Exception {