HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.

This commit is contained in:
Andrew Wang 2015-04-08 15:52:06 -07:00
parent a42bb1cd91
commit 6af0d74a75
4 changed files with 41 additions and 24 deletions

View File

@ -441,6 +441,9 @@ Release 2.8.0 - UNRELEASED
HDFS-5215. dfs.datanode.du.reserved is not considered while computing
available space (Brahma Reddy Battula via Yongjun Zhang)
HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
(Ming Ma via wang)
Release 2.7.0 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -197,23 +197,21 @@ void close() {
*/
@VisibleForTesting
public void startDecommission(DatanodeDescriptor node) {
if (!node.isDecommissionInProgress()) {
if (!node.isAlive) {
LOG.info("Dead node {} is decommissioned immediately.", node);
node.setDecommissioned();
} else if (!node.isDecommissioned()) {
if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
// Update DN stats maintained by HeartbeatManager
hbManager.startDecommission(node);
// hbManager.startDecommission will set dead node to decommissioned.
if (node.isDecommissionInProgress()) {
for (DatanodeStorageInfo storage : node.getStorageInfos()) {
LOG.info("Starting decommission of {} {} with {} blocks",
node, storage, storage.numBlocks());
}
// Update DN stats maintained by HeartbeatManager
hbManager.startDecommission(node);
node.decommissioningStatus.setStartTime(monotonicNow());
pendingNodes.add(node);
}
} else {
LOG.trace("startDecommission: Node {} is already decommission in "
+ "progress, nothing to do.", node);
LOG.trace("startDecommission: Node {} in {}, nothing to do." +
node, node.getAdminState());
}
}
@ -221,9 +219,9 @@ public void startDecommission(DatanodeDescriptor node) {
* Stop decommissioning the specified datanode.
* @param node
*/
void stopDecommission(DatanodeDescriptor node) {
@VisibleForTesting
public void stopDecommission(DatanodeDescriptor node) {
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
LOG.info("Stopping decommissioning of node {}", node);
// Update DN stats maintained by HeartbeatManager
hbManager.stopDecommission(node);
// Over-replicated blocks will be detected and processed when
@ -235,8 +233,8 @@ void stopDecommission(DatanodeDescriptor node) {
pendingNodes.remove(node);
decomNodeBlocks.remove(node);
} else {
LOG.trace("stopDecommission: Node {} is not decommission in progress " +
"or decommissioned, nothing to do.", node);
LOG.trace("stopDecommission: Node {} in {}, nothing to do." +
node, node.getAdminState());
}
}

View File

@ -20,8 +20,6 @@
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
@ -31,6 +29,8 @@
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Manage the heartbeats received from datanodes.
@ -38,7 +38,7 @@
* by the heartbeat manager lock.
*/
class HeartbeatManager implements DatanodeStatistics {
static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
/**
* Stores a subset of the datanodeMap in DatanodeManager,
@ -227,16 +227,27 @@ synchronized void updateHeartbeat(final DatanodeDescriptor node,
}
/**
 * Begin decommissioning the given datanode.
 *
 * A live node is removed from {@code stats}, switched to the
 * decommissioning state, and then added back (presumably so the aggregated
 * statistics reflect its new admin state -- confirm against the stats
 * implementation). A node that is not alive is marked decommissioned
 * right away and {@code stats} is left untouched.
 *
 * @param node the datanode to start decommissioning
 */
synchronized void startDecommission(final DatanodeDescriptor node) {
  if (node.isAlive) {
    stats.subtract(node);
    node.startDecommission();
    stats.add(node);
    return;
  }
  // Dead node: skip the stats bookkeeping entirely.
  LOG.info("Dead node {} is decommissioned immediately.", node);
  node.setDecommissioned();
}
/**
 * Stop decommissioning the given datanode.
 *
 * The node's decommission is always stopped. Only when the node is alive
 * is it also removed from {@code stats} beforehand and added back
 * afterwards; a dead node never touches {@code stats}.
 *
 * @param node the datanode whose decommissioning is being stopped
 */
synchronized void stopDecommission(final DatanodeDescriptor node) {
  final boolean alive = node.isAlive;
  LOG.info("Stopping decommissioning of {} node {}",
      alive ? "live" : "dead", node);
  if (alive) {
    stats.subtract(node);
  }
  node.stopDecommission();
  if (alive) {
    stats.add(node);
  }
}
/**
* Check if there are any expired heartbeats, and if so,

View File

@ -202,9 +202,14 @@ public void testXceiverCount() throws Exception {
dn.shutdown();
DFSTestUtil.setDatanodeDead(dnd);
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
//Verify decommission of dead node won't impact nodesInService metrics.
dnm.getDecomManager().startDecommission(dnd);
expectedInServiceNodes--;
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
//Verify recommission of dead node won't impact nodesInService metrics.
dnm.getDecomManager().stopDecommission(dnd);
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
}
// restart the nodes to verify that counts are correct after