HDFS-9391. Update webUI/JMX to display maintenance state info. (Manoj Govindassamy via mingma)
This commit is contained in:
parent
4db119b7b5
commit
467f5f1735
@ -730,7 +730,7 @@ public int hashCode() {
|
||||
// Super implementation is sufficient
|
||||
return super.hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
// Sufficient to use super equality as datanodes are uniquely identified
|
||||
@ -745,14 +745,14 @@ public class LeavingServiceStatus {
|
||||
private int underReplicatedInOpenFiles;
|
||||
private long startTime;
|
||||
|
||||
synchronized void set(int underRep,
|
||||
int onlyRep, int underConstruction) {
|
||||
synchronized void set(int underRepInOpenFiles, int underRepBlocks,
|
||||
int outOfServiceOnlyRep) {
|
||||
if (!isDecommissionInProgress() && !isEnteringMaintenance()) {
|
||||
return;
|
||||
}
|
||||
underReplicatedBlocks = underRep;
|
||||
outOfServiceOnlyReplicas = onlyRep;
|
||||
underReplicatedInOpenFiles = underConstruction;
|
||||
underReplicatedInOpenFiles = underRepInOpenFiles;
|
||||
underReplicatedBlocks = underRepBlocks;
|
||||
outOfServiceOnlyReplicas = outOfServiceOnlyRep;
|
||||
}
|
||||
|
||||
/** @return the number of under-replicated blocks */
|
||||
|
@ -634,9 +634,12 @@ private void processBlocksInternal(
|
||||
final List<BlockInfo> insufficientList,
|
||||
boolean pruneReliableBlocks) {
|
||||
boolean firstReplicationLog = true;
|
||||
int lowRedundancyBlocks = 0;
|
||||
int outOfServiceOnlyReplicas = 0;
|
||||
// Low redundancy in UC Blocks only
|
||||
int lowRedundancyInOpenFiles = 0;
|
||||
// All low redundancy blocks. Includes lowRedundancyInOpenFiles.
|
||||
int lowRedundancyBlocks = 0;
|
||||
// All maintenance and decommission replicas.
|
||||
int outOfServiceOnlyReplicas = 0;
|
||||
while (it.hasNext()) {
|
||||
if (insufficientList == null
|
||||
&& numBlocksCheckedPerLock >= numBlocksPerCheck) {
|
||||
@ -726,8 +729,8 @@ private void processBlocksInternal(
|
||||
}
|
||||
}
|
||||
|
||||
datanode.getLeavingServiceStatus().set(lowRedundancyBlocks,
|
||||
outOfServiceOnlyReplicas, lowRedundancyInOpenFiles);
|
||||
datanode.getLeavingServiceStatus().set(lowRedundancyInOpenFiles,
|
||||
lowRedundancyBlocks, outOfServiceOnlyReplicas);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -77,7 +77,7 @@ public int readOnlyReplicas() {
|
||||
* @return decommissioned and decommissioning replicas
|
||||
*/
|
||||
public int decommissionedAndDecommissioning() {
|
||||
return (int) (get(DECOMMISSIONED) + get(DECOMMISSIONING));
|
||||
return decommissioned() + decommissioning();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -5500,6 +5500,7 @@ public String getDeadNodes() {
|
||||
Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
|
||||
.put("lastContact", getLastContact(node))
|
||||
.put("decommissioned", node.isDecommissioned())
|
||||
.put("adminState", node.getAdminState().toString())
|
||||
.put("xferaddr", node.getXferAddr())
|
||||
.build();
|
||||
info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
|
||||
@ -5524,7 +5525,6 @@ public String getDecomNodes() {
|
||||
.put("xferaddr", node.getXferAddr())
|
||||
.put("underReplicatedBlocks",
|
||||
node.getLeavingServiceStatus().getUnderReplicatedBlocks())
|
||||
// TODO use another property name for outOfServiceOnlyReplicas.
|
||||
.put("decommissionOnlyReplicas",
|
||||
node.getLeavingServiceStatus().getOutOfServiceOnlyReplicas())
|
||||
.put("underReplicateInOpenFiles",
|
||||
@ -5535,6 +5535,33 @@ public String getDecomNodes() {
|
||||
return JSON.toString(info);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returned information is a JSON representation of map with host name of
|
||||
* nodes entering maintenance as the key and value as a map of various node
|
||||
* attributes to its values.
|
||||
*/
|
||||
@Override // NameNodeMXBean
|
||||
public String getEnteringMaintenanceNodes() {
|
||||
final Map<String, Map<String, Object>> nodesMap =
|
||||
new HashMap<String, Map<String, Object>>();
|
||||
final List<DatanodeDescriptor> enteringMaintenanceNodeList =
|
||||
blockManager.getDatanodeManager().getEnteringMaintenanceNodes();
|
||||
for (DatanodeDescriptor node : enteringMaintenanceNodeList) {
|
||||
Map<String, Object> attrMap = ImmutableMap
|
||||
.<String, Object> builder()
|
||||
.put("xferaddr", node.getXferAddr())
|
||||
.put("underReplicatedBlocks",
|
||||
node.getLeavingServiceStatus().getUnderReplicatedBlocks())
|
||||
.put("maintenanceOnlyReplicas",
|
||||
node.getLeavingServiceStatus().getOutOfServiceOnlyReplicas())
|
||||
.put("underReplicateInOpenFiles",
|
||||
node.getLeavingServiceStatus().getUnderReplicatedInOpenFiles())
|
||||
.build();
|
||||
nodesMap.put(node.getHostName() + ":" + node.getXferPort(), attrMap);
|
||||
}
|
||||
return JSON.toString(nodesMap);
|
||||
}
|
||||
|
||||
private long getLastContact(DatanodeDescriptor alivenode) {
|
||||
return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
|
||||
}
|
||||
|
@ -190,7 +190,14 @@ public interface NameNodeMXBean {
|
||||
* @return the decommissioning node information
|
||||
*/
|
||||
public String getDecomNodes();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the information on nodes entering maintenance.
|
||||
*
|
||||
* @return the information on nodes entering maintenance
|
||||
*/
|
||||
String getEnteringMaintenanceNodes();
|
||||
|
||||
/**
|
||||
* Gets the cluster id.
|
||||
*
|
||||
|
@ -171,9 +171,10 @@
|
||||
{/nn}
|
||||
|
||||
{#fs}
|
||||
<tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes})</td></tr>
|
||||
<tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes})</td></tr>
|
||||
<tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes}, In Maintenance: {NumInMaintenanceLiveDataNodes})</td></tr>
|
||||
<tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes}, In Maintenance: {NumInMaintenanceDeadDataNodes})</td></tr>
|
||||
<tr><th><a href="#tab-datanode">Decommissioning Nodes</a></th><td>{NumDecommissioningDataNodes}</td></tr>
|
||||
<tr><th><a href="#tab-datanode">Entering Maintenance Nodes</a></th><td> {NumEnteringMaintenanceDataNodes}</td></tr>
|
||||
<tr><th><a href="#tab-datanode-volume-failures">Total Datanode Volume Failures</a></th><td>{VolumeFailuresTotal} ({EstimatedCapacityLostTotal|fmt_bytes})</td></tr>
|
||||
{@eq key=nnstat.State value="active"}
|
||||
<tr><th title="Excludes missing blocks.">Number of Under-Replicated Blocks</th><td>{UnderReplicatedBlocks}</td></tr>
|
||||
@ -295,6 +296,7 @@
|
||||
<li class="dfshealth-node-icon dfshealth-node-down">Down</li>
|
||||
<li class="dfshealth-node-icon dfshealth-node-decommissioned">Decommissioned</li>
|
||||
<li class="dfshealth-node-icon dfshealth-node-down-decommissioned">Decommissioned & dead</li>
|
||||
<li class="dfshealth-node-icon dfshealth-node-down-maintenance">In Maintenance & dead</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="page-header"><h1><small>In operation</small></h1></div>
|
||||
@ -344,6 +346,32 @@
|
||||
</table>
|
||||
</small>
|
||||
|
||||
<div class="page-header"><h1><small>Entering Maintenance</small></h1></div>
|
||||
<small>
|
||||
{?EnteringMaintenanceNodes}
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Node</th>
|
||||
<th>Under replicated blocks</th>
|
||||
<th>Blocks with no live replicas</th>
|
||||
<th>Under Replicated Blocks <br/>In files under construction</th>
|
||||
</tr>
|
||||
</thead>
|
||||
{#EnteringMaintenanceNodes}
|
||||
<tr>
|
||||
<td>{name} ({xferaddr})</td>
|
||||
<td>{underReplicatedBlocks}</td>
|
||||
<td>{maintenanceOnlyReplicas}</td>
|
||||
<td>{underReplicateInOpenFiles}</td>
|
||||
</tr>
|
||||
{/EnteringMaintenanceNodes}
|
||||
</table>
|
||||
{:else}
|
||||
No nodes are entering maintenance.
|
||||
{/EnteringMaintenanceNodes}
|
||||
</small>
|
||||
|
||||
<div class="page-header"><h1><small>Decommissioning</small></h1></div>
|
||||
<small>
|
||||
{?DecomNodes}
|
||||
|
@ -223,17 +223,23 @@
|
||||
if (n.adminState === "In Service") {
|
||||
n.state = "alive";
|
||||
} else if (nodes[i].adminState === "Decommission In Progress") {
|
||||
n.state = "decommisioning";
|
||||
n.state = "decommissioning";
|
||||
} else if (nodes[i].adminState === "Decommissioned") {
|
||||
n.state = "decommissioned";
|
||||
} else if (nodes[i].adminState === "Entering Maintenance") {
|
||||
n.state = "entering-maintenance";
|
||||
} else if (nodes[i].adminState === "In Maintenance") {
|
||||
n.state = "in-maintenance";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function augment_dead_nodes(nodes) {
|
||||
for (var i = 0, e = nodes.length; i < e; ++i) {
|
||||
if (nodes[i].decommissioned) {
|
||||
if (nodes[i].adminState === "Decommissioned") {
|
||||
nodes[i].state = "down-decommissioned";
|
||||
} else if (nodes[i].adminState === "In Maintenance") {
|
||||
nodes[i].state = "down-maintenance";
|
||||
} else {
|
||||
nodes[i].state = "down";
|
||||
}
|
||||
@ -245,6 +251,7 @@
|
||||
r.DeadNodes = node_map_to_array(JSON.parse(r.DeadNodes));
|
||||
augment_dead_nodes(r.DeadNodes);
|
||||
r.DecomNodes = node_map_to_array(JSON.parse(r.DecomNodes));
|
||||
r.EnteringMaintenanceNodes = node_map_to_array(JSON.parse(r.EnteringMaintenanceNodes));
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -37,8 +37,10 @@
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.CombinedHostFileManager;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.HostConfigManager;
|
||||
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
||||
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
|
||||
@ -50,10 +52,13 @@
|
||||
import org.apache.hadoop.io.nativeio.NativeIO.POSIX.NoMlockCacheManipulator;
|
||||
import org.apache.hadoop.net.ServerSocketUtil;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.util.Time;
|
||||
import org.apache.hadoop.util.VersionInfo;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.eclipse.jetty.util.ajax.JSON;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.management.MBeanServer;
|
||||
import javax.management.ObjectName;
|
||||
@ -64,6 +69,7 @@
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@ -80,6 +86,9 @@
|
||||
*/
|
||||
public class TestNameNodeMXBean {
|
||||
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TestNameNodeMXBean.class);
|
||||
|
||||
/**
|
||||
* Used to assert equality between doubles
|
||||
*/
|
||||
@ -180,10 +189,10 @@ public void testNameNodeMXBeanInfo() throws Exception {
|
||||
assertFalse(xferAddr.equals(dnXferAddrInMaintenance) ^ inMaintenance);
|
||||
}
|
||||
assertEquals(fsn.getLiveNodes(), alivenodeinfo);
|
||||
// get attribute deadnodeinfo
|
||||
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName,
|
||||
// get attributes DeadNodes
|
||||
String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
|
||||
"DeadNodes"));
|
||||
assertEquals(fsn.getDeadNodes(), deadnodeinfo);
|
||||
assertEquals(fsn.getDeadNodes(), deadNodeInfo);
|
||||
// get attribute NodeUsage
|
||||
String nodeUsage = (String) (mbs.getAttribute(mxbeanName,
|
||||
"NodeUsage"));
|
||||
@ -295,16 +304,16 @@ public void testLastContactTime() throws Exception {
|
||||
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
|
||||
}
|
||||
|
||||
// get attribute deadnodeinfo
|
||||
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName,
|
||||
// get attribute DeadNodes
|
||||
String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
|
||||
"DeadNodes"));
|
||||
assertEquals(fsn.getDeadNodes(), deadnodeinfo);
|
||||
assertEquals(fsn.getDeadNodes(), deadNodeInfo);
|
||||
Map<String, Map<String, Object>> deadNodes =
|
||||
(Map<String, Map<String, Object>>) JSON.parse(deadnodeinfo);
|
||||
(Map<String, Map<String, Object>>) JSON.parse(deadNodeInfo);
|
||||
assertTrue(deadNodes.size() > 0);
|
||||
for (Map<String, Object> deadNode : deadNodes.values()) {
|
||||
assertTrue(deadNode.containsKey("lastContact"));
|
||||
assertTrue(deadNode.containsKey("decommissioned"));
|
||||
assertTrue(deadNode.containsKey("adminState"));
|
||||
assertTrue(deadNode.containsKey("xferaddr"));
|
||||
}
|
||||
} finally {
|
||||
@ -415,6 +424,106 @@ public Boolean get() {
|
||||
}
|
||||
}
|
||||
|
||||
@Test (timeout = 120000)
|
||||
public void testMaintenanceNodes() throws Exception {
|
||||
LOG.info("Starting testMaintenanceNodes");
|
||||
int expirationInMs = 30 * 1000;
|
||||
Configuration conf = new Configuration();
|
||||
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
|
||||
conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
|
||||
expirationInMs);
|
||||
conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
|
||||
CombinedHostFileManager.class, HostConfigManager.class);
|
||||
MiniDFSCluster cluster = null;
|
||||
HostsFileWriter hostsFileWriter = new HostsFileWriter();
|
||||
hostsFileWriter.initialize(conf, "temp/TestNameNodeMXBean");
|
||||
|
||||
try {
|
||||
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
|
||||
cluster.waitActive();
|
||||
|
||||
FSNamesystem fsn = cluster.getNameNode().namesystem;
|
||||
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
|
||||
ObjectName mxbeanName = new ObjectName(
|
||||
"Hadoop:service=NameNode,name=NameNodeInfo");
|
||||
|
||||
List<String> hosts = new ArrayList<>();
|
||||
for(DataNode dn : cluster.getDataNodes()) {
|
||||
hosts.add(dn.getDisplayName());
|
||||
}
|
||||
hostsFileWriter.initIncludeHosts(hosts.toArray(
|
||||
new String[hosts.size()]));
|
||||
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
|
||||
|
||||
// 1. Verify nodes for DatanodeReportType.LIVE state
|
||||
String liveNodesInfo = (String) (mbs.getAttribute(mxbeanName,
|
||||
"LiveNodes"));
|
||||
LOG.info("Live Nodes: " + liveNodesInfo);
|
||||
Map<String, Map<String, Object>> liveNodes =
|
||||
(Map<String, Map<String, Object>>) JSON.parse(liveNodesInfo);
|
||||
assertEquals(fsn.getLiveNodes(), liveNodesInfo);
|
||||
assertEquals(fsn.getNumLiveDataNodes(), liveNodes.size());
|
||||
|
||||
for (Map<String, Object> liveNode : liveNodes.values()) {
|
||||
assertTrue(liveNode.containsKey("lastContact"));
|
||||
assertTrue(liveNode.containsKey("xferaddr"));
|
||||
}
|
||||
|
||||
// Add the 1st DataNode to Maintenance list
|
||||
Map<String, Long> maintenanceNodes = new HashMap<>();
|
||||
maintenanceNodes.put(cluster.getDataNodes().get(0).getDisplayName(),
|
||||
Time.monotonicNow() + expirationInMs);
|
||||
hostsFileWriter.initOutOfServiceHosts(null, maintenanceNodes);
|
||||
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
|
||||
|
||||
boolean recheck = true;
|
||||
while (recheck) {
|
||||
// 2. Verify nodes for DatanodeReportType.ENTERING_MAINTENANCE state
|
||||
String enteringMaintenanceNodesInfo =
|
||||
(String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
|
||||
Map<String, Map<String, Object>> enteringMaintenanceNodes =
|
||||
(Map<String, Map<String, Object>>) JSON.parse(
|
||||
enteringMaintenanceNodesInfo);
|
||||
if (enteringMaintenanceNodes.size() <= 0) {
|
||||
LOG.info("Waiting for a node to Enter Maintenance state!");
|
||||
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
|
||||
continue;
|
||||
}
|
||||
LOG.info("Nodes entering Maintenance: " + enteringMaintenanceNodesInfo);
|
||||
recheck = false;
|
||||
assertEquals(fsn.getEnteringMaintenanceNodes(),
|
||||
enteringMaintenanceNodesInfo);
|
||||
assertEquals(fsn.getNumEnteringMaintenanceDataNodes(),
|
||||
enteringMaintenanceNodes.size());
|
||||
assertEquals(0, fsn.getNumInMaintenanceLiveDataNodes());
|
||||
assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
|
||||
}
|
||||
|
||||
// Wait for the DecommissionManager to complete check
|
||||
// and perform state transition
|
||||
while (fsn.getNumInMaintenanceLiveDataNodes() != 1) {
|
||||
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
|
||||
}
|
||||
|
||||
// 3. Verify nodes for AdminStates.IN_MAINTENANCE state
|
||||
String enteringMaintenanceNodesInfo =
|
||||
(String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
|
||||
Map<String, Map<String, Object>> enteringMaintenanceNodes =
|
||||
(Map<String, Map<String, Object>>) JSON.parse(
|
||||
enteringMaintenanceNodesInfo);
|
||||
assertEquals(0, enteringMaintenanceNodes.size());
|
||||
assertEquals(fsn.getEnteringMaintenanceNodes(),
|
||||
enteringMaintenanceNodesInfo);
|
||||
assertEquals(1, fsn.getNumInMaintenanceLiveDataNodes());
|
||||
assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
|
||||
} finally {
|
||||
if (cluster != null) {
|
||||
cluster.shutdown();
|
||||
}
|
||||
hostsFileWriter.cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
@Test(timeout=120000)
|
||||
@SuppressWarnings("unchecked")
|
||||
public void testTopUsers() throws Exception {
|
||||
|
Loading…
Reference in New Issue
Block a user