HDFS-15821. Add metrics for in-service datanodes (#2690). Contributed by Zehao Chen.

(cherry picked from commit 07a4220cd27c69b86b837e8da320bad0031f7895)
This commit is contained in:
Zehao Chen 2021-02-15 11:14:32 -06:00 committed by Jim Brennan
parent 4468378e4b
commit 1768c0096f
4 changed files with 121 additions and 0 deletions

View File

@ -709,6 +709,11 @@ public class NamenodeBeanMetrics
return 0;
}
/**
 * Reports the number of in-service live datanodes for this bean.
 * This implementation is a constant stub that always returns 0.
 * NOTE(review): presumably this bean does not track the metric, like the
 * neighboring stubs that also return 0 - confirm before relying on it.
 *
 * @return always 0
 */
@Override
public int getNumInServiceLiveDataNodes() {
return 0;
}
@Override
public int getVolumeFailuresTotal() {
return 0;

View File

@ -5504,6 +5504,19 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
return deadDecommissioned;
}
/**
 * Counts the live datanodes that are currently in service.
 *
 * The live list is obtained from the DatanodeManager and every node that
 * reports {@code isInMaintenance()} is left out of the count.
 * NOTE(review): the {@code true} flag passed to fetchDatanodes presumably
 * drops decommissioned nodes from the live list - confirm against
 * DatanodeManager#fetchDatanodes.
 *
 * @return number of fetched live datanodes that are not in maintenance
 */
@Override // FSNamesystemMBean
@Metric({"NumInServiceLiveDataNodes",
    "Number of live datanodes which are currently in service"})
public int getNumInServiceLiveDataNodes() {
  final List<DatanodeDescriptor> liveNodes =
      new ArrayList<DatanodeDescriptor>();
  // Only the live list is requested; the dead list argument is null.
  getBlockManager().getDatanodeManager()
      .fetchDatanodes(liveNodes, null, true);
  int inServiceCount = 0;
  for (DatanodeDescriptor dn : liveNodes) {
    if (!dn.isInMaintenance()) {
      inServiceCount++;
    }
  }
  return inServiceCount;
}
@Override // FSNamesystemMBean
@Metric({"VolumeFailuresTotal",
"Total number of volume failures across all Datanodes"})

View File

@ -150,6 +150,12 @@ public interface FSNamesystemMBean {
*/
public int getNumDecomDeadDataNodes();
/**
 * Number of live data nodes which are currently in service, where
 * NumInServiceLiveDataNodes =
 * NumLiveDataNodes - NumDecomLiveDataNodes - NumInMaintenanceLiveDataNodes.
 * @return number of in-service live data nodes
 */
int getNumInServiceLiveDataNodes();
/**
* Number of failed data volumes across all live data nodes.
* @return number of failed data volumes across all live data nodes

View File

@ -435,6 +435,103 @@ public class TestNameNodeMXBean {
}
}
/**
 * Checks the NumInServiceLiveDataNodes attribute of the FSNamesystem MBean.
 *
 * Starts a three-datanode mini cluster with all nodes in the include list
 * and expects three in-service nodes. Then lists one node for
 * decommissioning and one for maintenance, waits until the MBean reports
 * one live decommissioned and one live in-maintenance node, and expects a
 * single in-service node to remain.
 */
@Test(timeout = 120000)
public void testInServiceNodes() throws Exception {
  final Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
      30);
  conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
      CombinedHostFileManager.class, HostConfigManager.class);

  MiniDFSCluster cluster = null;
  final HostsFileWriter hostsFile = new HostsFileWriter();
  hostsFile.initialize(conf, "temp/TestInServiceNodes");

  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    cluster.waitActive();

    final FSNamesystem namesystem = cluster.getNameNode().namesystem;
    final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    final ObjectName fsnBean = new ObjectName(
        "Hadoop:service=NameNode,name=FSNamesystem");

    // Put every datanode of the cluster on the include list.
    final List<String> includeHosts = new ArrayList<>();
    for (DataNode dn : cluster.getDataNodes()) {
      includeHosts.add(dn.getDisplayName());
    }
    hostsFile.initIncludeHosts(includeHosts.toArray(new String[0]));
    namesystem.getBlockManager().getDatanodeManager().refreshNodes(conf);

    // Poll once per second (up to 60s) until all three nodes are live.
    GenericTestUtils.waitFor(new Supplier<Boolean>() {
      @Override
      public Boolean get() {
        try {
          return 3 == (int) mbs.getAttribute(fsnBean, "NumLiveDataNodes");
        } catch (Exception e) {
          // Attribute not readable yet; keep polling.
          return false;
        }
      }
    }, 1000, 60000);

    // Baseline: nothing is out of service, so all three nodes count.
    final int decomLiveBefore = (int) mbs.getAttribute(fsnBean,
        "NumDecomLiveDataNodes");
    final int maintLiveBefore = (int) mbs.getAttribute(fsnBean,
        "NumInMaintenanceLiveDataNodes");
    final int inServiceBefore = (int) mbs.getAttribute(fsnBean,
        "NumInServiceLiveDataNodes");
    assertEquals(0, decomLiveBefore);
    assertEquals(0, maintLiveBefore);
    assertEquals(3, inServiceBefore);

    // Take two nodes out of service: node 0 is decommissioned, node 1 is
    // put into maintenance with an expiry 30 seconds from now.
    final ArrayList<String> decomHosts = new ArrayList<>();
    decomHosts.add(cluster.getDataNodes().get(0).getDisplayName());
    final Map<String, Long> maintenanceHosts = new HashMap<>();
    final int maintenanceWindowMs = 30 * 1000;
    maintenanceHosts.put(cluster.getDataNodes().get(1).getDisplayName(),
        Time.now() + maintenanceWindowMs);
    hostsFile.initOutOfServiceHosts(decomHosts, maintenanceHosts);
    namesystem.getBlockManager().getDatanodeManager().refreshNodes(conf);

    // Poll until the MBean reflects both out-of-service transitions.
    GenericTestUtils.waitFor(new Supplier<Boolean>() {
      @Override
      public Boolean get() {
        try {
          final int live = (int) mbs.getAttribute(fsnBean,
              "NumLiveDataNodes");
          final int decomLive = (int) mbs.getAttribute(fsnBean,
              "NumDecomLiveDataNodes");
          final int maintLive = (int) mbs.getAttribute(fsnBean,
              "NumInMaintenanceLiveDataNodes");
          final boolean settled =
              live == 3 && decomLive == 1 && maintLive == 1;
          return settled;
        } catch (Exception e) {
          // Attribute not readable yet; keep polling.
          return false;
        }
      }
    }, 1000, 60000);

    // Only the untouched third node should still be in service.
    final int inServiceAfter = (int) mbs.getAttribute(fsnBean,
        "NumInServiceLiveDataNodes");
    assertEquals(1, inServiceAfter);
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
    hostsFile.cleanup();
  }
}
@Test (timeout = 120000)
public void testMaintenanceNodes() throws Exception {
LOG.info("Starting testMaintenanceNodes");