HDFS-11907. Add metric for time taken by NameNode resource check. Contributed by Chen Liang.

This commit is contained in:
Arpit Agarwal 2017-06-12 15:18:38 -07:00
parent a81916ea89
commit 3f0a727f75
4 changed files with 63 additions and 1 deletions

View File

@@ -3743,9 +3743,12 @@ boolean nameNodeHasResourcesAvailable() {
* Perform resource checks and cache the results.
*/
void checkAvailableResources() {
  long resourceCheckTime = monotonicNow();
  Preconditions.checkState(nnResourceChecker != null,
      "nnResourceChecker not initialized");
  hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
  // Elapsed wall time of the disk-space check, in milliseconds.
  resourceCheckTime = monotonicNow() - resourceCheckTime;
  // NameNode metrics may not be initialized yet (e.g. very early in
  // startup, or in unit tests that construct an FSNamesystem directly);
  // skip recording rather than throw an NPE from the health check path.
  if (NameNode.getNameNodeMetrics() != null) {
    NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime);
  }
}
/**

View File

@@ -86,6 +86,7 @@
import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.ServicePlugin;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.htrace.core.Tracer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -365,8 +366,9 @@ public long getProtocolVersion(String protocol,
private final boolean haEnabled;
private final HAContext haContext;
protected final boolean allowStaleStandbyReads;
private AtomicBoolean started = new AtomicBoolean(false);
private AtomicBoolean started = new AtomicBoolean(false);
private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;
/** httpServer */
protected NameNodeHttpServer httpServer;
@@ -1715,7 +1717,14 @@ synchronized void monitorHealth()
if (!haEnabled) {
return; // no-op, if HA is not enabled
}
long start = Time.monotonicNow();
getNamesystem().checkAvailableResources();
long end = Time.monotonicNow();
if (end - start >= HEALTH_MONITOR_WARN_THRESHOLD_MS) {
// log a warning if it takes >= 5 seconds.
LOG.warn("Remote IP {} checking available resources took {}ms",
Server.getRemoteIp(), end - start);
}
if (!getNamesystem().nameNodeHasResourcesAvailable()) {
throw new HealthCheckFailedException(
"The NameNode has no resources available");

View File

@@ -119,6 +119,8 @@ public long totalFileOps(){
private final MutableQuantiles[] generateEDEKTimeQuantiles;
@Metric("Warm-up EDEK time") private MutableRate warmUpEDEKTime;
private final MutableQuantiles[] warmUpEDEKTimeQuantiles;
@Metric("Resource check time") private MutableRate resourceCheckTime;
private final MutableQuantiles[] resourceCheckTimeQuantiles;
@Metric("Duration in SafeMode at startup in msec")
MutableGaugeInt safeModeTime;
@@ -145,6 +147,7 @@ public long totalFileOps(){
cacheReportQuantiles = new MutableQuantiles[len];
generateEDEKTimeQuantiles = new MutableQuantiles[len];
warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
resourceCheckTimeQuantiles = new MutableQuantiles[len];
for (int i = 0; i < len; i++) {
int interval = intervals[i];
@@ -163,6 +166,9 @@ public long totalFileOps(){
warmUpEDEKTimeQuantiles[i] = registry.newQuantiles(
"warmupEDEKTime" + interval + "s",
"Warm up EDEK time", "ops", "latency", interval);
resourceCheckTimeQuantiles[i] = registry.newQuantiles(
"resourceCheckTime" + interval + "s",
"resource check time", "ops", "latency", interval);
}
}
@@ -353,4 +359,11 @@ public void addWarmUpEDEKTime(long latency) {
q.add(latency);
}
}
/**
 * Record the latency of one NameNode resource check.
 *
 * @param latency time taken by the resource check, in milliseconds
 */
public void addResourceCheckTime(long latency) {
  // Fold into the aggregate rate metric first, then into each of the
  // configured rolling-interval quantile estimators.
  resourceCheckTime.add(latency);
  for (int i = 0; i < resourceCheckTimeQuantiles.length; i++) {
    resourceCheckTimeQuantiles[i].add(latency);
  }
}
}

View File

@@ -22,8 +22,13 @@
import org.apache.hadoop.fs.FileSystemTestHelper;
import org.apache.hadoop.fs.FileSystemTestWrapper;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.client.CreateEncryptionZoneFlag;
import org.apache.hadoop.hdfs.client.HdfsAdmin;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
@@ -60,9 +65,11 @@
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -683,4 +690,34 @@ public void testGenerateEDEKTime() throws IOException,
}
}
}
@Test
public void testResourceCheck() throws Exception {
// Verifies that a health-monitor RPC (monitorHealth) drives the NameNode
// resource check and that the "ResourceCheckTime" quantile metric is
// populated as a result.
HdfsConfiguration conf = new HdfsConfiguration();
// HA topology with zero DataNodes: only NameNode-side behavior is needed.
MiniDFSCluster tmpCluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(0)
.nnTopology(MiniDFSNNTopology.simpleHATopology())
.build();
try {
// Swap in a mock resource checker so the check result is deterministic
// regardless of actual disk space on the test host.
MockNameNodeResourceChecker mockResourceChecker =
new MockNameNodeResourceChecker(conf);
tmpCluster.getNameNode(0).getNamesystem()
.setNNResourceChecker(mockResourceChecker);
// Build an HA service target for "nn1" and obtain a health-monitor
// proxy, mirroring what the ZKFC health monitor does in production.
NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
DFSUtil.getNamenodeNameServiceId(
new HdfsConfiguration()), "nn1");
HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, conf.getInt(
HA_HM_RPC_TIMEOUT_KEY, HA_HM_RPC_TIMEOUT_DEFAULT));
// NOTE(review): rb is a metrics snapshot taken once, before the loop;
// presumably assertQuantileGauges only needs the gauges to exist --
// confirm a fresh getMetrics() per iteration isn't required.
MetricsRecordBuilder rb = getMetrics(NN_METRICS);
for (long i = 0; i < 10; i++) {
// Each monitorHealth() runs checkAvailableResources() on the NN,
// which records into the resourceCheckTime metric and quantiles.
rpc.monitorHealth();
assertQuantileGauges("ResourceCheckTime1s", rb);
}
} finally {
// Always tear the mini cluster down so later tests start clean.
if (tmpCluster != null) {
tmpCluster.shutdown();
}
}
}
}