HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.
This commit is contained in:
parent
0542e6f86e
commit
f4c6b00a9f
@ -213,7 +213,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
|
||||
| `PendingDataNodeMessageCount` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
|
||||
| `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the standby NameNode last loaded the edit log. In the active NameNode, set to 0 |
|
||||
| `BlockCapacity` | Current block capacity |
|
||||
| `NumLiveDataNodes` | Number of datanodes which are currently live |
|
||||
| `NumDeadDataNodes` | Number of datanodes which are currently dead |
|
||||
| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
|
||||
| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
|
||||
| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
|
||||
| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
|
||||
| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
|
||||
| `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
|
||||
| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
|
||||
| `MissingReplOneBlocks` | Current number of missing blocks with replication factor 1 |
|
||||
| `NumFilesUnderConstruction` | Current number of files under construction |
|
||||
| `NumActiveClients` | Current number of active clients holding lease |
|
||||
@ -224,6 +232,9 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
|
||||
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operations |
|
||||
| `NameDirSize` | NameNode name directories size in bytes |
|
||||
| `NumTimedOutPendingReconstructions` | The number of timed out reconstructions. Not the number of unique blocks that timed out. |
|
||||
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
|
||||
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
|
||||
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
|
||||
| `FSN(Read|Write)Lock`*OperationName*`NumOps` | Total number of times the lock was acquired by the operation |
|
||||
| `FSN(Read|Write)Lock`*OperationName*`AvgTime` | Average time in milliseconds that the operation held the lock |
|
||||
|
||||
|
@ -4843,16 +4843,20 @@ void shutdown() {
|
||||
}
|
||||
|
||||
/**
 * Returns the live-datanode count reported by the DatanodeManager. The
 * value is also tagged as the {@code NumLiveDataNodes} metric through the
 * Metric annotation below.
 */
@Override // FSNamesystemMBean
|
||||
@Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
|
||||
public int getNumLiveDataNodes() {
|
||||
return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
|
||||
}
|
||||
|
||||
/**
 * Returns the dead-datanode count reported by the DatanodeManager. The
 * value is also tagged as the {@code NumDeadDataNodes} metric through the
 * Metric annotation below.
 */
@Override // FSNamesystemMBean
|
||||
@Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
|
||||
public int getNumDeadDataNodes() {
|
||||
return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumDecomLiveDataNodes",
|
||||
"Number of datanodes which have been decommissioned and are now live"})
|
||||
public int getNumDecomLiveDataNodes() {
|
||||
final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
|
||||
@ -4864,6 +4868,8 @@ public int getNumDecomLiveDataNodes() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumDecomDeadDataNodes",
|
||||
"Number of datanodes which have been decommissioned and are now dead"})
|
||||
public int getNumDecomDeadDataNodes() {
|
||||
final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
|
||||
@ -4875,6 +4881,8 @@ public int getNumDecomDeadDataNodes() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"VolumeFailuresTotal",
|
||||
"Total number of volume failures across all Datanodes"})
|
||||
public int getVolumeFailuresTotal() {
|
||||
List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
|
||||
@ -4886,6 +4894,8 @@ public int getVolumeFailuresTotal() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"EstimatedCapacityLostTotal",
|
||||
"An estimate of the total capacity lost due to volume failures"})
|
||||
public long getEstimatedCapacityLostTotal() {
|
||||
List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
|
||||
@ -4901,6 +4911,8 @@ public long getEstimatedCapacityLostTotal() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumDecommissioningDataNodes",
|
||||
"Number of datanodes in decommissioning state"})
|
||||
public int getNumDecommissioningDataNodes() {
|
||||
return getBlockManager().getDatanodeManager().getDecommissioningNodes()
|
||||
.size();
|
||||
@ -4918,6 +4930,8 @@ public int getNumStaleDataNodes() {
|
||||
* before NN receives the first Heartbeat followed by the first Blockreport.
|
||||
*/
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumStaleStorages",
|
||||
"Number of storages marked as content stale"})
|
||||
public int getNumStaleStorages() {
|
||||
// Forwards the value returned by the DatanodeManager unchanged.
return getBlockManager().getDatanodeManager().getNumStaleStorages();
|
||||
}
|
||||
@ -7542,6 +7556,8 @@ public long getBytesInFuture() {
|
||||
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumInMaintenanceLiveDataNodes",
|
||||
"Number of live Datanodes which are in maintenance state"})
|
||||
public int getNumInMaintenanceLiveDataNodes() {
|
||||
final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
|
||||
@ -7553,6 +7569,8 @@ public int getNumInMaintenanceLiveDataNodes() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumInMaintenanceDeadDataNodes",
|
||||
"Number of dead Datanodes which are in maintenance state"})
|
||||
public int getNumInMaintenanceDeadDataNodes() {
|
||||
final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
|
||||
getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
|
||||
@ -7564,6 +7582,8 @@ public int getNumInMaintenanceDeadDataNodes() {
|
||||
}
|
||||
|
||||
@Override // FSNamesystemMBean
|
||||
@Metric({"NumEnteringMaintenanceDataNodes",
|
||||
"Number of Datanodes that are entering the maintenance state"})
|
||||
public int getNumEnteringMaintenanceDataNodes() {
|
||||
return getBlockManager().getDatanodeManager().getEnteringMaintenanceNodes()
|
||||
.size();
|
||||
|
@ -61,6 +61,7 @@
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
@ -148,6 +149,8 @@ public class MiniDFSCluster implements AutoCloseable {
|
||||
public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
|
||||
public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
|
||||
= DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
|
||||
public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
|
||||
= DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
|
||||
|
||||
// Changing this default may break some tests that assume it is 2.
|
||||
private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
|
||||
@ -818,7 +821,10 @@ private void initMiniDFSCluster(
|
||||
int safemodeExtension = conf.getInt(
|
||||
DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
|
||||
conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
|
||||
conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
|
||||
long decommissionInterval = conf.getTimeDuration(
|
||||
DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3, TimeUnit.SECONDS);
|
||||
conf.setTimeDuration(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY,
|
||||
decommissionInterval, TimeUnit.SECONDS);
|
||||
if (!useConfiguredTopologyMappingClass) {
|
||||
conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
|
||||
StaticMapping.class, DNSToSwitchMapping.class);
|
||||
|
@ -17,6 +17,7 @@
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.server.namenode.metrics;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.hadoop.crypto.key.JavaKeyStoreProvider;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.fs.FileSystemTestHelper;
|
||||
@ -41,7 +42,9 @@
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
@ -69,12 +72,15 @@
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
|
||||
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
|
||||
import org.apache.hadoop.hdfs.util.HostsFileWriter;
|
||||
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||
import org.apache.hadoop.metrics2.MetricsSource;
|
||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||
@ -115,6 +121,15 @@ public class TestNameNodeMetrics {
|
||||
CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
|
||||
CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
|
||||
DFS_REDUNDANCY_INTERVAL);
|
||||
// Set it long enough to essentially disable unless we manually call it
|
||||
// Used for decommissioning DataNode metrics
|
||||
CONF.setTimeDuration(
|
||||
MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 999,
|
||||
TimeUnit.DAYS);
|
||||
// Next two configs used for checking failed volume metrics
|
||||
CONF.setTimeDuration(DFSConfigKeys.DFS_DATANODE_DISK_CHECK_MIN_GAP_KEY,
|
||||
10, TimeUnit.MILLISECONDS);
|
||||
CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
|
||||
CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY,
|
||||
DFS_REDUNDANCY_INTERVAL);
|
||||
CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
|
||||
@ -133,6 +148,7 @@ public class TestNameNodeMetrics {
|
||||
private DistributedFileSystem fs;
|
||||
private final Random rand = new Random();
|
||||
private FSNamesystem namesystem;
|
||||
private HostsFileWriter hostsFileWriter;
|
||||
private BlockManager bm;
|
||||
private Path ecDir;
|
||||
|
||||
@ -142,6 +158,8 @@ private static Path getTestPath(String fileName) {
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
hostsFileWriter = new HostsFileWriter();
|
||||
hostsFileWriter.initialize(CONF, "temp/decommission");
|
||||
cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT)
|
||||
.build();
|
||||
cluster.waitActive();
|
||||
@ -161,6 +179,10 @@ public void tearDown() throws Exception {
|
||||
MetricsRecordBuilder rb = getMetrics(source);
|
||||
assertQuantileGauges("GetGroups1s", rb);
|
||||
}
|
||||
if (hostsFileWriter != null) {
|
||||
hostsFileWriter.cleanup();
|
||||
hostsFileWriter = null;
|
||||
}
|
||||
if (cluster != null) {
|
||||
cluster.shutdown();
|
||||
cluster = null;
|
||||
@ -235,6 +257,96 @@ public void testStaleNodes() throws Exception {
|
||||
.getBlockManager());
|
||||
assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test metrics associated with volume failures.
|
||||
*/
|
||||
@Test
|
||||
public void testVolumeFailures() throws Exception {
|
||||
// Baseline: both gauges read zero before any failure is injected.
assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
|
||||
// Record the directory and capacity of the first volume on the first
// DataNode so the expected lost capacity is known after the failure.
DataNode dn = cluster.getDataNodes().get(0);
|
||||
FsDatasetSpi.FsVolumeReferences volumeReferences =
|
||||
DataNodeTestUtils.getFSDataset(dn).getFsVolumeReferences();
|
||||
FsVolumeImpl fsVolume = (FsVolumeImpl) volumeReferences.get(0);
|
||||
File dataDir = new File(fsVolume.getBaseURI());
|
||||
long capacity = fsVolume.getCapacity();
|
||||
// NOTE(review): the references are not released if one of the calls above
// throws; try-with-resources may be safer -- confirm the type is Closeable.
volumeReferences.close();
|
||||
// Fail the data directory, wait until the DataNode has noticed the disk
// error, then push a heartbeat and run the NameNode heartbeat check
// (presumably so the failure is visible to the gauges) before asserting.
DataNodeTestUtils.injectDataDirFailure(dataDir);
|
||||
DataNodeTestUtils.waitForDiskError(dn, fsVolume);
|
||||
DataNodeTestUtils.triggerHeartbeat(dn);
|
||||
BlockManagerTestUtil.checkHeartbeat(bm);
|
||||
// Exactly one failed volume; lost capacity equals the capacity recorded above.
assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
|
||||
assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test metrics associated with liveness and decommission status of DataNodes.
|
||||
*/
|
||||
@Test
|
||||
public void testDataNodeLivenessAndDecom() throws Exception {
|
||||
// Collect each DataNode's descriptor and transfer address; the addresses
// are what gets written to the include/exclude host files below.
List<DataNode> dataNodes = cluster.getDataNodes();
|
||||
DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
|
||||
String[] dnAddresses = new String[DATANODE_COUNT];
|
||||
for (int i = 0; i < DATANODE_COUNT; i++) {
|
||||
dnDescriptors[i] = bm.getDatanodeManager()
|
||||
.getDatanode(dataNodes.get(i).getDatanodeId());
|
||||
dnAddresses[i] = dnDescriptors[i].getXferAddr();
|
||||
}
|
||||
// First put all DNs into include
|
||||
hostsFileWriter.initIncludeHosts(dnAddresses);
|
||||
bm.getDatanodeManager().refreshNodes(CONF);
|
||||
assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
|
||||
|
||||
// Now decommission one DN
|
||||
hostsFileWriter.initExcludeHost(dnAddresses[0]);
|
||||
bm.getDatanodeManager().refreshNodes(CONF);
|
||||
assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
|
||||
// The decommission state is rechecked manually here; the assertions below
// expect the node to move from "decommissioning" to "decommissioned" only
// after this call.
BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
|
||||
assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
|
||||
assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
|
||||
|
||||
// Now kill all DNs by expiring their heartbeats
|
||||
for (int i = 0; i < DATANODE_COUNT; i++) {
|
||||
DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
|
||||
// NOTE(review): expireInterval does not depend on i and could be hoisted
// out of the loop; it is presumably sized to exceed the NameNode's
// heartbeat expiry window -- confirm against DatanodeManager.
long expireInterval = CONF.getLong(
|
||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
|
||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
|
||||
+ CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
|
||||
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
|
||||
DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
|
||||
-(expireInterval + 1));
|
||||
}
|
||||
// Run the heartbeat check so the back-dated update times are evaluated
// before the gauges are read.
BlockManagerTestUtil.checkHeartbeat(bm);
|
||||
assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
|
||||
assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
|
||||
|
||||
// Now remove the decommissioned DN altogether
|
||||
String[] includeHosts = new String[dnAddresses.length - 1];
|
||||
for (int i = 0; i < includeHosts.length; i++) {
|
||||
// Skip index 0, the DN that was excluded (decommissioned) above.
includeHosts[i] = dnAddresses[i + 1];
|
||||
}
|
||||
hostsFileWriter.initIncludeHosts(includeHosts);
|
||||
hostsFileWriter.initExcludeHosts(new ArrayList<>());
|
||||
bm.getDatanodeManager().refreshNodes(CONF);
|
||||
assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
|
||||
|
||||
// Finally mark the remaining DNs as live again
|
||||
for (int i = 1; i < dataNodes.size(); i++) {
|
||||
DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
|
||||
DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
|
||||
}
|
||||
BlockManagerTestUtil.checkHeartbeat(bm);
|
||||
assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
|
||||
assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
|
||||
}
|
||||
|
||||
/** Test metrics associated with addition of a file */
|
||||
@Test
|
||||
|
Loading…
Reference in New Issue
Block a user