From 6af0d74a75f0f58d5e92e2e91e87735b9a62bb12 Mon Sep 17 00:00:00 2001
From: Andrew Wang
Date: Wed, 8 Apr 2015 15:52:06 -0700
Subject: [PATCH] HDFS-7725. Incorrect 'nodes in service' metrics caused all
 writes to fail. Contributed by Ming Ma.

---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |  3 ++
 .../blockmanagement/DecommissionManager.java  | 28 +++++++++---------
 .../blockmanagement/HeartbeatManager.java     | 29 +++++++++++++------
 .../namenode/TestNamenodeCapacityReport.java  |  5 ++++
 4 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 852006d87d..95c6912ea0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -441,6 +441,9 @@ Release 2.8.0 - UNRELEASED
     HDFS-5215. dfs.datanode.du.reserved is not considered while computing
     available space ( Brahma Reddy Battula via Yongjun Zhang)
 
+    HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
+    (Ming Ma via wang)
+
 Release 2.7.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
index 9355329637..7f3d77802e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
@@ -197,23 +197,21 @@ void close() {
    */
   @VisibleForTesting
   public void startDecommission(DatanodeDescriptor node) {
-    if (!node.isDecommissionInProgress()) {
-      if (!node.isAlive) {
-        LOG.info("Dead node {} is decommissioned immediately.", node);
-        node.setDecommissioned();
-      } else if (!node.isDecommissioned()) {
+    if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
+      // Update DN stats maintained by HeartbeatManager
+      hbManager.startDecommission(node);
+      // hbManager.startDecommission will set dead node to decommissioned.
+      if (node.isDecommissionInProgress()) {
         for (DatanodeStorageInfo storage : node.getStorageInfos()) {
-          LOG.info("Starting decommission of {} {} with {} blocks", 
+          LOG.info("Starting decommission of {} {} with {} blocks",
               node, storage, storage.numBlocks());
         }
-        // Update DN stats maintained by HeartbeatManager
-        hbManager.startDecommission(node);
         node.decommissioningStatus.setStartTime(monotonicNow());
         pendingNodes.add(node);
       }
     } else {
-      LOG.trace("startDecommission: Node {} is already decommission in "
-          + "progress, nothing to do.", node);
+      LOG.trace("startDecommission: Node {} in {}, nothing to do.",
+          node, node.getAdminState());
     }
   }
 
@@ -221,12 +219,12 @@ public void startDecommission(DatanodeDescriptor node) {
    * Stop decommissioning the specified datanode.
    * @param node
    */
-  void stopDecommission(DatanodeDescriptor node) {
+  @VisibleForTesting
+  public void stopDecommission(DatanodeDescriptor node) {
     if (node.isDecommissionInProgress() || node.isDecommissioned()) {
-      LOG.info("Stopping decommissioning of node {}", node);
       // Update DN stats maintained by HeartbeatManager
       hbManager.stopDecommission(node);
-      // Over-replicated blocks will be detected and processed when 
+      // Over-replicated blocks will be detected and processed when
       // the dead node comes back and send in its full block report.
       if (node.isAlive) {
         blockManager.processOverReplicatedBlocksOnReCommission(node);
@@ -235,8 +233,8 @@ void stopDecommission(DatanodeDescriptor node) {
       pendingNodes.remove(node);
       decomNodeBlocks.remove(node);
     } else {
-      LOG.trace("stopDecommission: Node {} is not decommission in progress " +
-          "or decommissioned, nothing to do.", node);
+      LOG.trace("stopDecommission: Node {} in {}, nothing to do.",
+          node, node.getAdminState());
     }
   }
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index d2905a29b7..b0ab31560f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -20,8 +20,6 @@
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
@@ -31,6 +29,8 @@
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
 import org.apache.hadoop.util.Time;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Manage the heartbeats received from datanodes.
@@ -38,7 +38,7 @@
  * by the heartbeat manager lock.
  */
 class HeartbeatManager implements DatanodeStatistics {
-  static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
+  static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
 
   /**
    * Stores a subset of the datanodeMap in DatanodeManager,
@@ -227,15 +227,26 @@ synchronized void updateHeartbeat(final DatanodeDescriptor node,
   }
 
   synchronized void startDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.startDecommission();
-    stats.add(node);
+    if (!node.isAlive) {
+      LOG.info("Dead node {} is decommissioned immediately.", node);
+      node.setDecommissioned();
+    } else {
+      stats.subtract(node);
+      node.startDecommission();
+      stats.add(node);
+    }
   }
 
   synchronized void stopDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.stopDecommission();
-    stats.add(node);
+    LOG.info("Stopping decommissioning of {} node {}",
+        node.isAlive ? "live" : "dead", node);
+    if (!node.isAlive) {
+      node.stopDecommission();
+    } else {
+      stats.subtract(node);
+      node.stopDecommission();
+      stats.add(node);
+    }
   }
 
   /**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
index fd611ce1c1..6f547223f9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
@@ -202,9 +202,14 @@ public void testXceiverCount() throws Exception {
       dn.shutdown();
       DFSTestUtil.setDatanodeDead(dnd);
       BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
+      //Verify decommission of dead node won't impact nodesInService metrics.
+      dnm.getDecomManager().startDecommission(dnd);
       expectedInServiceNodes--;
       assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
       assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
+      //Verify recommission of dead node won't impact nodesInService metrics.
+      dnm.getDecomManager().stopDecommission(dnd);
+      assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
     }
 
     // restart the nodes to verify that counts are correct after
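
The crux of the patch is the HeartbeatManager change: the stats aggregate only ever
includes live datanodes, so the old unconditional subtract()/add() bracketing around
an admin-state change corrupts the "nodes in service" count whenever the node is dead.
The code below is a simplified, self-contained model of the recommission path, not
HDFS source: Stats, Node, and the load math are invented stand-ins for
HeartbeatManager.Stats and the default block placement policy's load check.

// Sketch only -- models why counting a dead node as "in service"
// eventually rejects every write. All names here are illustrative.
public class NodesInServiceSketch {

  static final class Node {
    final boolean alive;
    boolean decommissioned;
    int xceiverCount; // active transceivers reported via heartbeats
    Node(boolean alive) { this.alive = alive; }
  }

  static final class Stats {
    int nodesInService;
    // Decommissioning/decommissioned nodes do not count as in service.
    // Callers bracket a state change: subtract() before, add() after.
    void add(Node n)      { if (!n.decommissioned) { nodesInService++; } }
    void subtract(Node n) { if (!n.decommissioned) { nodesInService--; } }
  }

  public static void main(String[] args) {
    Stats stats = new Stats();
    Node live = new Node(true);
    live.xceiverCount = 12;
    stats.add(live); // the only node that can actually serve writes

    // Two nodes were decommissioned while dead; dead nodes are never
    // added to stats, so bracketing their state changes is a mistake.
    for (int i = 0; i < 2; i++) {
      Node dead = new Node(false);
      dead.decommissioned = true;
      // Pre-HDFS-7725 stopDecommission bracketed unconditionally:
      stats.subtract(dead);        // no-op: still marked decommissioned
      dead.decommissioned = false; // recommission flips the state...
      stats.add(dead);             // ...so add() now counts a dead node
    }

    // nodesInService == 3 although only one node is alive. The default
    // placement policy rejects any node whose load exceeds 2x the
    // cluster average; the inflated denominator drags the average low
    // enough that even the healthy node looks overloaded.
    double avgLoad = (double) live.xceiverCount / stats.nodesInService; // 4.0
    System.out.println("nodesInService = " + stats.nodesInService);    // 3
    System.out.println("live node rejected for writes? "
        + (live.xceiverCount > 2 * avgLoad));                          // true
  }
}

Moving the isAlive check into HeartbeatManager (rather than leaving it in
DecommissionManager) keeps the stats update and the admin-state change inside one
synchronized method, so every caller gets the dead-node handling for free.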
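The commons-logging to SLF4J migration in HeartbeatManager is what enables the {}
placeholder calls in the new code. As a reference, here is a standalone sketch of
the pattern (the class name and sample values are invented); it also shows the easy
concatenation mistake the parameterized style is meant to avoid.

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PlaceholderLoggingSketch {
  private static final Logger LOG =
      LoggerFactory.getLogger(PlaceholderLoggingSketch.class);

  public static void main(String[] args) {
    String node = "datanode-1:50010";
    String state = "DECOMMISSIONED";

    // Correct: each {} consumes one trailing argument, and no message
    // string is built unless TRACE is actually enabled.
    LOG.trace("stopDecommission: Node {} in {}, nothing to do.",
        node, state);

    // Mistake: a '+' before the first argument splices it into the
    // format string eagerly, so the first {} is filled by the second
    // argument and the second {} is printed literally.
    LOG.trace("stopDecommission: Node {} in {}, nothing to do." +
        node, state);
  }
}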