HDFS-11907. Add metric for time taken by NameNode resource check. Contributed by Chen Liang.
parent a81916ea89
commit 3f0a727f75
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -3743,9 +3743,12 @@ boolean nameNodeHasResourcesAvailable() {
    * Perform resource checks and cache the results.
    */
   void checkAvailableResources() {
+    long resourceCheckTime = monotonicNow();
     Preconditions.checkState(nnResourceChecker != null,
         "nnResourceChecker not initialized");
     hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
+    resourceCheckTime = monotonicNow() - resourceCheckTime;
+    NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime);
   }
 
   /**
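The hunk above brackets the existing disk-space check with a monotonic-clock stopwatch and feeds the elapsed time into the new metric. A minimal standalone sketch of the same measurement pattern using Hadoop's Time utility; the class and variable names here are illustrative, not part of the patch (FSNamesystem itself calls monotonicNow() unqualified, presumably via a static import):

import org.apache.hadoop.util.Time;

public class ElapsedTimeSketch {
  public static void main(String[] args) throws InterruptedException {
    // Monotonic clock: immune to wall-clock (NTP) adjustments, so the
    // difference is a reliable duration even if system time jumps.
    long start = Time.monotonicNow();
    Thread.sleep(50); // stand-in for nnResourceChecker.hasAvailableDiskSpace()
    long elapsedMs = Time.monotonicNow() - start;
    System.out.println("resource check took " + elapsedMs + " ms");
  }
}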
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -86,6 +86,7 @@
 import org.apache.hadoop.util.JvmPauseMonitor;
 import org.apache.hadoop.util.ServicePlugin;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Time;
 import org.apache.htrace.core.Tracer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -365,8 +366,9 @@ public long getProtocolVersion(String protocol,
   private final boolean haEnabled;
   private final HAContext haContext;
   protected final boolean allowStaleStandbyReads;
   private AtomicBoolean started = new AtomicBoolean(false);
 
+  private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;
 
   /** httpServer */
   protected NameNodeHttpServer httpServer;
@@ -1715,7 +1717,14 @@ synchronized void monitorHealth()
     if (!haEnabled) {
       return; // no-op, if HA is not enabled
     }
+    long start = Time.monotonicNow();
     getNamesystem().checkAvailableResources();
+    long end = Time.monotonicNow();
+    if (end - start >= HEALTH_MONITOR_WARN_THRESHOLD_MS) {
+      // log a warning if it takes >= 5 seconds.
+      LOG.warn("Remote IP {} checking available resources took {}ms",
+          Server.getRemoteIp(), end - start);
+    }
     if (!getNamesystem().nameNodeHasResourcesAvailable()) {
       throw new HealthCheckFailedException(
           "The NameNode has no resources available");
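monitorHealth() is the HAServiceProtocol RPC that the ZKFC's health monitor calls periodically, so this hunk times the resource check on every probe but logs only when it crosses the 5-second HEALTH_MONITOR_WARN_THRESHOLD_MS. A reduced sketch of that warn-on-slow idiom, with assumed names (SlowCheckLogger, timed) that do not appear in the patch:

import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SlowCheckLogger {
  private static final Logger LOG =
      LoggerFactory.getLogger(SlowCheckLogger.class);
  // Same threshold as the patch: warn only on checks taking >= 5 seconds.
  private static final int WARN_THRESHOLD_MS = 5000;

  static void timed(Runnable check) {
    long start = Time.monotonicNow();
    check.run();
    long elapsed = Time.monotonicNow() - start;
    if (elapsed >= WARN_THRESHOLD_MS) {
      // Slow path only: healthy, fast checks generate no log traffic.
      LOG.warn("health check took {}ms", elapsed);
    }
  }
}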
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java

@@ -119,6 +119,8 @@ public long totalFileOps(){
   private final MutableQuantiles[] generateEDEKTimeQuantiles;
   @Metric("Warm-up EDEK time") private MutableRate warmUpEDEKTime;
   private final MutableQuantiles[] warmUpEDEKTimeQuantiles;
+  @Metric("Resource check time") private MutableRate resourceCheckTime;
+  private final MutableQuantiles[] resourceCheckTimeQuantiles;
 
   @Metric("Duration in SafeMode at startup in msec")
   MutableGaugeInt safeModeTime;
@@ -145,6 +147,7 @@ public long totalFileOps(){
     cacheReportQuantiles = new MutableQuantiles[len];
     generateEDEKTimeQuantiles = new MutableQuantiles[len];
     warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
+    resourceCheckTimeQuantiles = new MutableQuantiles[len];
 
     for (int i = 0; i < len; i++) {
       int interval = intervals[i];
@@ -163,6 +166,9 @@ public long totalFileOps(){
       warmUpEDEKTimeQuantiles[i] = registry.newQuantiles(
           "warmupEDEKTime" + interval + "s",
           "Warm up EDEK time", "ops", "latency", interval);
+      resourceCheckTimeQuantiles[i] = registry.newQuantiles(
+          "resourceCheckTime" + interval + "s",
+          "resource check time", "ops", "latency", interval);
     }
   }
 
@@ -353,4 +359,11 @@ public void addWarmUpEDEKTime(long latency) {
       q.add(latency);
     }
   }
+
+  public void addResourceCheckTime(long latency) {
+    resourceCheckTime.add(latency);
+    for (MutableQuantiles q : resourceCheckTimeQuantiles) {
+      q.add(latency);
+    }
+  }
 }
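NameNodeMetrics pairs each latency metric with a MutableRate (running count and average) plus a MutableQuantiles array (one rolling percentile window per configured interval), and addResourceCheckTime follows the same shape as the existing addWarmUpEDEKTime. A self-contained sketch of that pairing; the registry name, metric names, and the fixed 60-second window are assumptions for illustration:

import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;

public class LatencyMetricSketch {
  private final MetricsRegistry registry = new MetricsRegistry("sketch");
  // Running count and average of all samples since startup.
  private final MutableRate checkTime =
      registry.newRate("checkTime", "sample latency");
  // Rolling percentile estimates (50/75/90/95/99th) over a 60s window.
  private final MutableQuantiles checkTimeQuantiles =
      registry.newQuantiles("checkTime60s",
          "sample latency", "ops", "latency", 60);

  public void addLatency(long latencyMs) {
    checkTime.add(latencyMs);
    checkTimeQuantiles.add(latencyMs);
  }
}

The "resourceCheckTime" + interval + "s" naming in the earlier hunk produces one such quantile set per interval, e.g. resourceCheckTime60s when dfs.metrics.percentiles.intervals contains 60.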
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java

@@ -22,8 +22,13 @@
 import org.apache.hadoop.fs.FileSystemTestHelper;
 import org.apache.hadoop.fs.FileSystemTestWrapper;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.client.CreateEncryptionZoneFlag;
 import org.apache.hadoop.hdfs.client.HdfsAdmin;
+
+import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
+import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
@@ -60,9 +65,11 @@
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
+import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -683,4 +690,34 @@ public void testGenerateEDEKTime() throws IOException,
       }
     }
   }
+
+  @Test
+  public void testResourceCheck() throws Exception {
+    HdfsConfiguration conf = new HdfsConfiguration();
+    MiniDFSCluster tmpCluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(0)
+        .nnTopology(MiniDFSNNTopology.simpleHATopology())
+        .build();
+    try {
+      MockNameNodeResourceChecker mockResourceChecker =
+          new MockNameNodeResourceChecker(conf);
+      tmpCluster.getNameNode(0).getNamesystem()
+          .setNNResourceChecker(mockResourceChecker);
+      NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
+          DFSUtil.getNamenodeNameServiceId(
+              new HdfsConfiguration()), "nn1");
+      HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, conf.getInt(
+          HA_HM_RPC_TIMEOUT_KEY, HA_HM_RPC_TIMEOUT_DEFAULT));
+
+      MetricsRecordBuilder rb = getMetrics(NN_METRICS);
+      for (long i = 0; i < 10; i++) {
+        rpc.monitorHealth();
+        assertQuantileGauges("ResourceCheckTime1s", rb);
+      }
+    } finally {
+      if (tmpCluster != null) {
+        tmpCluster.shutdown();
+      }
+    }
+  }
 }
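The test drives the real health-check path: it swaps in MockNameNodeResourceChecker, obtains the health-monitor RPC proxy through NNHAServiceTarget, and asserts the ResourceCheckTime1s quantile gauges after each monitorHealth() call. Those gauges exist only for intervals listed in dfs.metrics.percentiles.intervals, so a setup along these lines is presumably in place elsewhere in the test class; this is a hypothetical sketch, not part of the diff:

import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class QuantileIntervalSetup {
  public static void main(String[] args) {
    HdfsConfiguration conf = new HdfsConfiguration();
    // A 1-second percentile window makes NameNodeMetrics register the
    // resourceCheckTime1s quantiles that the test asserts on.
    conf.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, "1");
    System.out.println(
        conf.get(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY));
  }
}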