HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.
parent 0542e6f86e
commit f4c6b00a9f
@@ -213,7 +213,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `PendingDataNodeMessageCount` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
 | `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0 |
 | `BlockCapacity` | Current number of block capacity |
+| `NumLiveDataNodes` | Number of datanodes which are currently live |
+| `NumDeadDataNodes` | Number of datanodes which are currently dead |
+| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
+| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
+| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
+| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
+| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
 | `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
+| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
 | `MissingReplOneBlocks` | Current number of missing blocks with replication factor 1 |
 | `NumFilesUnderConstruction` | Current number of files under construction |
 | `NumActiveClients` | Current number of active clients holding lease |
@@ -224,6 +232,9 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation|
 | `NameDirSize` | NameNode name directories size in bytes |
 | `NumTimedOutPendingReconstructions` | The number of timed out reconstructions. Not the number of unique blocks that timed out. |
+| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
+| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
+| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
 | `FSN(Read|Write)Lock`*OperationName*`NumOps` | Total number of acquiring lock by operations |
 | `FSN(Read|Write)Lock`*OperationName*`AvgTime` | Average time of holding the lock by operations in milliseconds |
 
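The table rows added above document gauges that, once this patch is in, are published through the metrics system and therefore also visible via the NameNode's `/jmx` servlet. As a minimal sketch of how one might spot-check them over HTTP (host and port are assumptions; 9870 is the default NameNode web port in recent releases, older releases use 50070):

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/** Minimal sketch: dump the NameNode's FSNamesystem beans over HTTP JMX. */
public class JmxSpotCheck {
  public static void main(String[] args) throws Exception {
    // Hypothetical address; substitute your NameNode's HTTP host:port.
    URL url = new URL(
        "http://localhost:9870/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem*");
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    try (BufferedReader in = new BufferedReader(
        new InputStreamReader(conn.getInputStream()))) {
      String line;
      while ((line = in.readLine()) != null) {
        // The response is JSON; grep for gauges added by this patch.
        if (line.contains("NumLiveDataNodes")
            || line.contains("VolumeFailuresTotal")) {
          System.out.println(line.trim());
        }
      }
    }
  }
}
```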
@@ -4843,16 +4843,20 @@ void shutdown() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
   public int getNumLiveDataNodes() {
     return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
   public int getNumDeadDataNodes() {
     return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomLiveDataNodes",
+      "Number of datanodes which have been decommissioned and are now live"})
   public int getNumDecomLiveDataNodes() {
     final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -4864,6 +4868,8 @@ public int getNumDecomLiveDataNodes() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomDeadDataNodes",
+      "Number of datanodes which have been decommissioned and are now dead"})
   public int getNumDecomDeadDataNodes() {
     final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
@@ -4875,6 +4881,8 @@ public int getNumDecomDeadDataNodes() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"VolumeFailuresTotal",
+      "Total number of volume failures across all Datanodes"})
   public int getVolumeFailuresTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -4886,6 +4894,8 @@ public int getVolumeFailuresTotal() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"EstimatedCapacityLostTotal",
+      "An estimate of the total capacity lost due to volume failures"})
   public long getEstimatedCapacityLostTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -4901,6 +4911,8 @@ public long getEstimatedCapacityLostTotal() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecommissioningDataNodes",
+      "Number of datanodes in decommissioning state"})
   public int getNumDecommissioningDataNodes() {
     return getBlockManager().getDatanodeManager().getDecommissioningNodes()
         .size();
@@ -4918,6 +4930,8 @@ public int getNumStaleDataNodes() {
    * before NN receives the first Heartbeat followed by the first Blockreport.
    */
   @Override // FSNamesystemMBean
+  @Metric({"NumStaleStorages",
+      "Number of storages marked as content stale"})
   public int getNumStaleStorages() {
     return getBlockManager().getDatanodeManager().getNumStaleStorages();
   }
@@ -7542,6 +7556,8 @@ public long getBytesInFuture() {
 
 
   @Override // FSNamesystemMBean
+  @Metric({"NumInMaintenanceLiveDataNodes",
+      "Number of live Datanodes which are in maintenance state"})
   public int getNumInMaintenanceLiveDataNodes() {
     final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
@@ -7553,6 +7569,8 @@ public int getNumInMaintenanceLiveDataNodes() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumInMaintenanceDeadDataNodes",
+      "Number of dead Datanodes which are in maintenance state"})
   public int getNumInMaintenanceDeadDataNodes() {
     final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
@@ -7564,6 +7582,8 @@ public int getNumInMaintenanceDeadDataNodes() {
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumEnteringMaintenanceDataNodes",
+      "Number of Datanodes that are entering the maintenance state"})
   public int getNumEnteringMaintenanceDataNodes() {
     return getBlockManager().getDatanodeManager().getEnteringMaintenanceNodes()
         .size();
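Every FSNamesystem hunk above follows the same pattern: a getter already exported through the `FSNamesystemMBean` interface picks up a method-level `@Metric({name, description})` annotation, so the metrics2 framework can discover it reflectively and publish it as a gauge alongside the existing JMX value. A minimal, self-contained sketch of that mechanism (the class and source name here are hypothetical, not part of the patch):

```java
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

// Hypothetical stand-in for FSNamesystem, registered under its own name.
@Metrics(context = "dfs")
public class ExampleSource {
  private volatile int live = 3;

  // Same shape as the patch: {name, description} on a plain getter.
  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
  public int getNumLiveDataNodes() {
    return live;
  }

  public static void main(String[] args) {
    DefaultMetricsSystem.initialize("ExampleProcess");
    // The metrics system snapshots annotated getters on each collection cycle.
    DefaultMetricsSystem.instance().register(
        "ExampleSource", "Example metrics source", new ExampleSource());
  }
}
```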
@@ -61,6 +61,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
 import com.google.common.base.Supplier;
@@ -148,6 +149,8 @@ public class MiniDFSCluster implements AutoCloseable {
   public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
   public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
       = DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
+  public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
+      = DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
 
   // Changing this default may break some tests that assume it is 2.
   private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
@@ -818,7 +821,10 @@ private void initMiniDFSCluster(
     int safemodeExtension = conf.getInt(
         DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
     conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
-    conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
+    long decommissionInterval = conf.getTimeDuration(
+        DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3, TimeUnit.SECONDS);
+    conf.setTimeDuration(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY,
+        decommissionInterval, TimeUnit.SECONDS);
     if (!useConfiguredTopologyMappingClass) {
       conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
           StaticMapping.class, DNSToSwitchMapping.class);
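The MiniDFSCluster change mirrors the existing safemode-extension hook: a `.testing`-suffixed key now lets a test override the decommission monitor interval that was previously hard-coded to 3 seconds, and the switch from `setInt` to `getTimeDuration`/`setTimeDuration` makes the units explicit. A small standalone sketch of those `Configuration` semantics (the key name is made up for illustration):

```java
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;

public class TimeDurationDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration(false);
    // Unset key: the default (3) is interpreted in the supplied unit.
    long unset = conf.getTimeDuration("demo.interval", 3, TimeUnit.SECONDS);
    // A suffixed value carries its own unit and is converted on read.
    conf.set("demo.interval", "999d");
    long days = conf.getTimeDuration("demo.interval", 3, TimeUnit.SECONDS);
    System.out.println(unset); // 3
    System.out.println(days);  // 999 days in seconds: 86313600
  }
}
```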
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.metrics;
 
+import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.crypto.key.JavaKeyStoreProvider;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.FileSystemTestHelper;
@@ -41,7 +42,9 @@
 import java.io.File;
 import java.io.IOException;
 import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
 import java.util.EnumSet;
+import java.util.List;
 import java.util.Random;
 import com.google.common.collect.ImmutableList;
 
@@ -69,12 +72,15 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
 import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
+import org.apache.hadoop.hdfs.util.HostsFileWriter;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -115,6 +121,15 @@ public class TestNameNodeMetrics {
     CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
     CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
         DFS_REDUNDANCY_INTERVAL);
+    // Set it long enough to essentially disable unless we manually call it
+    // Used for decommissioning DataNode metrics
+    CONF.setTimeDuration(
+        MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 999,
+        TimeUnit.DAYS);
+    // Next two configs used for checking failed volume metrics
+    CONF.setTimeDuration(DFSConfigKeys.DFS_DATANODE_DISK_CHECK_MIN_GAP_KEY,
+        10, TimeUnit.MILLISECONDS);
+    CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
     CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY,
         DFS_REDUNDANCY_INTERVAL);
     CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
@@ -133,6 +148,7 @@ public class TestNameNodeMetrics {
   private DistributedFileSystem fs;
   private final Random rand = new Random();
   private FSNamesystem namesystem;
+  private HostsFileWriter hostsFileWriter;
   private BlockManager bm;
   private Path ecDir;
 
@@ -142,6 +158,8 @@ private static Path getTestPath(String fileName) {
 
   @Before
   public void setUp() throws Exception {
+    hostsFileWriter = new HostsFileWriter();
+    hostsFileWriter.initialize(CONF, "temp/decommission");
     cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT)
         .build();
     cluster.waitActive();
@@ -161,6 +179,10 @@ public void tearDown() throws Exception {
       MetricsRecordBuilder rb = getMetrics(source);
       assertQuantileGauges("GetGroups1s", rb);
     }
+    if (hostsFileWriter != null) {
+      hostsFileWriter.cleanup();
+      hostsFileWriter = null;
+    }
     if (cluster != null) {
       cluster.shutdown();
       cluster = null;
@@ -236,6 +258,96 @@ public void testStaleNodes() throws Exception {
     assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
   }
 
+  /**
+   * Test metrics associated with volume failures.
+   */
+  @Test
+  public void testVolumeFailures() throws Exception {
+    assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
+    DataNode dn = cluster.getDataNodes().get(0);
+    FsDatasetSpi.FsVolumeReferences volumeReferences =
+        DataNodeTestUtils.getFSDataset(dn).getFsVolumeReferences();
+    FsVolumeImpl fsVolume = (FsVolumeImpl) volumeReferences.get(0);
+    File dataDir = new File(fsVolume.getBaseURI());
+    long capacity = fsVolume.getCapacity();
+    volumeReferences.close();
+    DataNodeTestUtils.injectDataDirFailure(dataDir);
+    DataNodeTestUtils.waitForDiskError(dn, fsVolume);
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
+  }
+
+  /**
+   * Test metrics associated with liveness and decommission status of DataNodes.
+   */
+  @Test
+  public void testDataNodeLivenessAndDecom() throws Exception {
+    List<DataNode> dataNodes = cluster.getDataNodes();
+    DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
+    String[] dnAddresses = new String[DATANODE_COUNT];
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      dnDescriptors[i] = bm.getDatanodeManager()
+          .getDatanode(dataNodes.get(i).getDatanodeId());
+      dnAddresses[i] = dnDescriptors[i].getXferAddr();
+    }
+    // First put all DNs into include
+    hostsFileWriter.initIncludeHosts(dnAddresses);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now decommission one DN
+    hostsFileWriter.initExcludeHost(dnAddresses[0]);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
+    BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
+    assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now kill all DNs by expiring their heartbeats
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
+      long expireInterval = CONF.getLong(
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
+          + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
+          DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
+          -(expireInterval + 1));
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now remove the decommissioned DN altogether
+    String[] includeHosts = new String[dnAddresses.length - 1];
+    for (int i = 0; i < includeHosts.length; i++) {
+      includeHosts[i] = dnAddresses[i + 1];
+    }
+    hostsFileWriter.initIncludeHosts(includeHosts);
+    hostsFileWriter.initExcludeHosts(new ArrayList<>());
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+
+    // Finally mark the remaining DNs as live again
+    for (int i = 1; i < dataNodes.size(); i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
+  }
+
   /** Test metrics associated with addition of a file */
   @Test
   public void testFileAdd() throws Exception {
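The new tests lean on `org.apache.hadoop.test.MetricsAsserts`: `getMetrics(name)` snapshots the named metrics source and `assertGauge` checks a single value from that snapshot (`NS_METRICS` in this test class refers to the FSNamesystem source). A hedged sketch of the same pattern in isolation, assuming a running MiniDFSCluster has already registered that source with the default metrics system:

```java
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;

import org.apache.hadoop.metrics2.MetricsRecordBuilder;

public class GaugeCheckSketch {
  // Hypothetical helper: verifies the failed-volume gauges added by this
  // patch start at zero on a healthy cluster.
  static void verifyNoVolumeFailures() {
    MetricsRecordBuilder rb = getMetrics("FSNamesystem");
    assertGauge("VolumeFailuresTotal", 0, rb);
    assertGauge("EstimatedCapacityLostTotal", 0L, rb);
  }
}
```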