HDFS-11907. Add metric for time taken by NameNode resource check. Contributed by Chen Liang.
commit 3f0a727f75
parent a81916ea89
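This change times each FSNamesystem#checkAvailableResources call and publishes the result through NameNodeMetrics as a MutableRate (resourceCheckTime) plus per-interval MutableQuantiles, and the NameNode health check now warns when a resource check runs long. As a hedged sketch of how an operator might read the aggregated rate over JMX — the bean name Hadoop:service=NameNode,name=NameNodeActivity and the ResourceCheckTimeNumOps / ResourceCheckTimeAvgTime attribute names are assumptions based on how @Metric fields are normally exported, not something this diff spells out:

    import java.lang.management.ManagementFactory;
    import javax.management.MBeanServer;
    import javax.management.ObjectName;

    public class ResourceCheckMetricReader {
      public static void main(String[] args) throws Exception {
        // Assumed bean/attribute names; run inside the NameNode JVM, or point a
        // JMXConnector at the NameNode's JMX port instead of the platform server.
        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
        ObjectName nnActivity =
            new ObjectName("Hadoop:service=NameNode,name=NameNodeActivity");
        Object numOps = mbs.getAttribute(nnActivity, "ResourceCheckTimeNumOps");
        Object avgMs = mbs.getAttribute(nnActivity, "ResourceCheckTimeAvgTime");
        System.out.println("resource checks: " + numOps + ", avg latency ms: " + avgMs);
      }
    }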
@@ -3743,9 +3743,12 @@ boolean nameNodeHasResourcesAvailable() {
   * Perform resource checks and cache the results.
   */
  void checkAvailableResources() {
    long resourceCheckTime = monotonicNow();
    Preconditions.checkState(nnResourceChecker != null,
        "nnResourceChecker not initialized");
    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
    resourceCheckTime = monotonicNow() - resourceCheckTime;
    NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime);
  }

  /**
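The instrumentation above is the usual metrics2 timing pattern: read monotonicNow() before and after the guarded work and feed the difference into a rate metric. A minimal, self-contained sketch of that pattern using only hadoop-common classes (the registry and metric names here are illustrative, not part of this change):

    import org.apache.hadoop.metrics2.lib.MetricsRegistry;
    import org.apache.hadoop.metrics2.lib.MutableRate;
    import org.apache.hadoop.util.Time;

    public class TimedCheckExample {
      private final MetricsRegistry registry = new MetricsRegistry("ExampleActivity");
      private final MutableRate checkTime =
          registry.newRate("checkTime", "Example check time");

      void runCheck() {
        long start = Time.monotonicNow();  // monotonic clock, immune to wall-clock jumps
        doCheck();
        checkTime.add(Time.monotonicNow() - start);  // records one op and its latency
      }

      private void doCheck() {
        // placeholder for the guarded work, e.g. a disk-space probe
      }
    }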
@@ -86,6 +86,7 @@
import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.ServicePlugin;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.htrace.core.Tracer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -367,6 +368,7 @@ public long getProtocolVersion(String protocol,
  protected final boolean allowStaleStandbyReads;
  private AtomicBoolean started = new AtomicBoolean(false);

  private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;

  /** httpServer */
  protected NameNodeHttpServer httpServer;
@@ -1715,7 +1717,14 @@ synchronized void monitorHealth()
    if (!haEnabled) {
      return; // no-op, if HA is not enabled
    }
    long start = Time.monotonicNow();
    getNamesystem().checkAvailableResources();
    long end = Time.monotonicNow();
    if (end - start >= HEALTH_MONITOR_WARN_THRESHOLD_MS) {
      // log a warning if it takes >= 5 seconds.
      LOG.warn("Remote IP {} checking available resources took {}ms",
          Server.getRemoteIp(), end - start);
    }
    if (!getNamesystem().nameNodeHasResourcesAvailable()) {
      throw new HealthCheckFailedException(
          "The NameNode has no resources available");
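The health-check path times the resource check and logs a warning once it reaches HEALTH_MONITOR_WARN_THRESHOLD_MS (5 seconds), which makes slow disk probes visible before they turn into health-monitor timeouts. The same guard-and-warn pattern in isolation, assuming nothing beyond SLF4J and hadoop-common (the class name and threshold below are illustrative):

    import org.apache.hadoop.util.Time;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class SlowCallWarning {
      private static final Logger LOG = LoggerFactory.getLogger(SlowCallWarning.class);
      private static final long WARN_THRESHOLD_MS = 5000;

      static void timed(Runnable work, String what) {
        long start = Time.monotonicNow();
        work.run();
        long elapsed = Time.monotonicNow() - start;
        if (elapsed >= WARN_THRESHOLD_MS) {
          // Parameterized SLF4J logging; the message is only built if the warning fires.
          LOG.warn("{} took {}ms", what, elapsed);
        }
      }
    }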
@@ -119,6 +119,8 @@ public long totalFileOps(){
  private final MutableQuantiles[] generateEDEKTimeQuantiles;
  @Metric("Warm-up EDEK time") private MutableRate warmUpEDEKTime;
  private final MutableQuantiles[] warmUpEDEKTimeQuantiles;
  @Metric("Resource check time") private MutableRate resourceCheckTime;
  private final MutableQuantiles[] resourceCheckTimeQuantiles;

  @Metric("Duration in SafeMode at startup in msec")
  MutableGaugeInt safeModeTime;
@@ -145,6 +147,7 @@ public long totalFileOps(){
    cacheReportQuantiles = new MutableQuantiles[len];
    generateEDEKTimeQuantiles = new MutableQuantiles[len];
    warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
    resourceCheckTimeQuantiles = new MutableQuantiles[len];

    for (int i = 0; i < len; i++) {
      int interval = intervals[i];
@@ -163,6 +166,9 @@ public long totalFileOps(){
      warmUpEDEKTimeQuantiles[i] = registry.newQuantiles(
          "warmupEDEKTime" + interval + "s",
          "Warm up EDEK time", "ops", "latency", interval);
      resourceCheckTimeQuantiles[i] = registry.newQuantiles(
          "resourceCheckTime" + interval + "s",
          "resource check time", "ops", "latency", interval);
    }
  }
@@ -353,4 +359,11 @@ public void addWarmUpEDEKTime(long latency) {
      q.add(latency);
    }
  }

  public void addResourceCheckTime(long latency) {
    resourceCheckTime.add(latency);
    for (MutableQuantiles q : resourceCheckTimeQuantiles) {
      q.add(latency);
    }
  }
}
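Besides the averaged MutableRate, the latency is also fed into MutableQuantiles, one per configured percentile interval (dfs.metrics.percentiles.intervals), so rolling-window latency percentiles are exported as well. A minimal sketch of registering and feeding a quantiles metric with the same metrics2 API (the names below are illustrative):

    import org.apache.hadoop.metrics2.lib.MetricsRegistry;
    import org.apache.hadoop.metrics2.lib.MutableQuantiles;

    public class QuantilesExample {
      private final MetricsRegistry registry = new MetricsRegistry("ExampleActivity");
      // 60-second rolling window; the NameNode builds one of these per configured interval.
      private final MutableQuantiles checkTimeQuantiles = registry.newQuantiles(
          "checkTime60s", "Example check time", "ops", "latency", 60);

      void record(long latencyMs) {
        checkTimeQuantiles.add(latencyMs);
      }
    }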
@@ -22,8 +22,13 @@
import org.apache.hadoop.fs.FileSystemTestHelper;
import org.apache.hadoop.fs.FileSystemTestWrapper;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.client.CreateEncryptionZoneFlag;
import org.apache.hadoop.hdfs.client.HdfsAdmin;

import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
@@ -60,9 +65,11 @@
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -683,4 +690,34 @@ public void testGenerateEDEKTime() throws IOException,
      }
    }
  }

  @Test
  public void testResourceCheck() throws Exception {
    HdfsConfiguration conf = new HdfsConfiguration();
    MiniDFSCluster tmpCluster = new MiniDFSCluster.Builder(conf)
        .numDataNodes(0)
        .nnTopology(MiniDFSNNTopology.simpleHATopology())
        .build();
    try {
      MockNameNodeResourceChecker mockResourceChecker =
          new MockNameNodeResourceChecker(conf);
      tmpCluster.getNameNode(0).getNamesystem()
          .setNNResourceChecker(mockResourceChecker);
      NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
          DFSUtil.getNamenodeNameServiceId(
              new HdfsConfiguration()), "nn1");
      HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, conf.getInt(
          HA_HM_RPC_TIMEOUT_KEY, HA_HM_RPC_TIMEOUT_DEFAULT));

      MetricsRecordBuilder rb = getMetrics(NN_METRICS);
      for (long i = 0; i < 10; i++) {
        rpc.monitorHealth();
        assertQuantileGauges("ResourceCheckTime1s", rb);
      }
    } finally {
      if (tmpCluster != null) {
        tmpCluster.shutdown();
      }
    }
  }
}
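The test drives monitorHealth() through the HAServiceProtocol health-monitor proxy against a mocked resource checker and asserts that the 1-second quantile gauges appear. A companion assertion one might add for the averaged rate is sketched below; the counter name ResourceCheckTimeNumOps is an assumption derived from how the @Metric field is usually exported, and the helper class itself is hypothetical:

    import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
    import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
    import static org.junit.Assert.assertTrue;

    import org.apache.hadoop.metrics2.MetricsRecordBuilder;

    public class ResourceCheckMetricAssert {
      // Hypothetical helper: call after driving a few rpc.monitorHealth() calls.
      static void assertResourceChecksRecorded() {
        MetricsRecordBuilder rb = getMetrics("NameNodeActivity");
        assertTrue("expected at least one recorded resource check",
            getLongCounter("ResourceCheckTimeNumOps", rb) > 0);
      }
    }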