HDFS-13641. Add metrics for edit log tailing. Contributed by Chao Sun.

This commit is contained in:
Yiqun Lin 2018-06-13 20:05:55 +08:00
parent 6307962b93
commit 8e7548d33b
5 changed files with 140 additions and 5 deletions

View File

@ -181,6 +181,18 @@ Each metrics record contains tags such as ProcessName, SessionId, and Hostname a
| `WarmUpEDEKTimeAvgTime` | Average time of warming up EDEK in milliseconds |
| `ResourceCheckTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of NameNode resource check latency in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `StorageBlockReport`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of storage block report latency in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogTailTimeNumOps` | Total number of times the standby NameNode tailed the edit log |
| `EditLogTailTimeAvgTime` | Average time (in milliseconds) spent by standby NameNode in tailing edit log |
| `EditLogTailTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time spent in tailing edit logs by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogFetchTimeNumOps` | Total number of times the standby NameNode fetched remote edit streams from journal nodes |
| `EditLogFetchTimeAvgTime` | Average time (in milliseconds) spent by standby NameNode in fetching remote edit streams from journal nodes |
| `EditLogFetchTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time spent in fetching edit streams from journal nodes by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `NumEditLogLoadedNumOps` | Total number of times edits were loaded by standby NameNode |
| `NumEditLogLoadedAvgCount` | Average number of edits loaded by standby NameNode in each edit log tailing |
| `NumEditLogLoaded`*num*`s(50/75/90/95/99)thPercentileCount` | The 50/75/90/95/99th percentile of number of edits loaded by standby NameNode in each edit log tailing. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogTailIntervalNumOps` | Total number of intervals between edit log tailings by standby NameNode |
| `EditLogTailIntervalAvgTime` | Average time of intervals between edit log tailings by standby NameNode in milliseconds |
| `EditLogTailInterval`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time between edit log tailings by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
FSNamesystem
------------

View File

@ -367,17 +367,31 @@ public static void assertGaugeGt(String name, double greater,
}
/**
* Asserts that the NumOps and quantiles for a metric have been changed at
* some point to a non-zero value.
* Asserts that the NumOps and quantiles for a metric with value name
* "Latency" have been changed at some point to a non-zero value.
*
* @param prefix of the metric
* @param rb MetricsRecordBuilder with the metric
*/
public static void assertQuantileGauges(String prefix,
MetricsRecordBuilder rb) {
assertQuantileGauges(prefix, rb, "Latency");
}
/**
* Asserts that the NumOps and quantiles for a metric have been changed at
* some point to a non-zero value, for the specified value name of the
* metrics (e.g., "Latency", "Count").
*
* @param prefix of the metric
* @param rb MetricsRecordBuilder with the metric
* @param valueName the value name for the metric
*/
public static void assertQuantileGauges(String prefix,
MetricsRecordBuilder rb, String valueName) {
verify(rb).addGauge(eqName(info(prefix + "NumOps", "")), geq(0l));
for (Quantile q : MutableQuantiles.quantiles) {
String nameTemplate = prefix + "%dthPercentileLatency";
String nameTemplate = prefix + "%dthPercentile" + valueName;
int percentile = (int) (100 * q.quantile);
verify(rb).addGauge(
eqName(info(String.format(nameTemplate, percentile), "")),

View File

@ -62,6 +62,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.util.Time;
/**
@ -292,6 +293,7 @@ void doTailEdits() throws IOException, InterruptedException {
LOG.debug("lastTxnId: " + lastTxnId);
}
Collection<EditLogInputStream> streams;
long startTime = Time.monotonicNow();
try {
streams = editLog.selectInputStreams(lastTxnId + 1, 0,
null, inProgressOk, true);
@ -302,6 +304,9 @@ void doTailEdits() throws IOException, InterruptedException {
LOG.warn("Edits tailer failed to find any streams. Will try again " +
"later.", ioe);
return;
} finally {
NameNode.getNameNodeMetrics().addEditLogFetchTime(
Time.monotonicNow() - startTime);
}
if (LOG.isDebugEnabled()) {
LOG.debug("edit streams to load from: " + streams.size());
@ -322,6 +327,7 @@ void doTailEdits() throws IOException, InterruptedException {
LOG.debug(String.format("Loaded %d edits starting from txid %d ",
editsLoaded, lastTxnId));
}
NameNode.getNameNodeMetrics().addNumEditLogLoaded(editsLoaded);
}
if (editsLoaded > 0) {
@ -446,10 +452,15 @@ private void doWork() {
// name system lock will be acquired to further block even the block
// state updates.
namesystem.cpLockInterruptibly();
long startTime = Time.monotonicNow();
try {
NameNode.getNameNodeMetrics().addEditLogTailInterval(
startTime - lastLoadTimeMs);
doTailEdits();
} finally {
namesystem.cpUnlock();
NameNode.getNameNodeMetrics().addEditLogTailTime(
Time.monotonicNow() - startTime);
}
//Update NameDirSize Metric
namesystem.getFSImage().getStorage().updateNameDirSize();

View File

@ -32,6 +32,7 @@
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.metrics2.lib.MutableStat;
import org.apache.hadoop.metrics2.source.JvmMetrics;
/**
@ -134,6 +135,18 @@ public long totalFileOps(){
@Metric("Time loading FS Image at startup in msec")
MutableGaugeInt fsImageLoadTime;
@Metric("Time tailing edit logs in msec")
MutableRate editLogTailTime;
private final MutableQuantiles[] editLogTailTimeQuantiles;
@Metric MutableRate editLogFetchTime;
private final MutableQuantiles[] editLogFetchTimeQuantiles;
@Metric(value = "Number of edits loaded", valueName = "Count")
MutableStat numEditLogLoaded;
private final MutableQuantiles[] numEditLogLoadedQuantiles;
@Metric("Time between edit log tailing in msec")
MutableRate editLogTailInterval;
private final MutableQuantiles[] editLogTailIntervalQuantiles;
@Metric("GetImageServlet getEdit")
MutableRate getEdit;
@Metric("GetImageServlet getImage")
@ -156,6 +169,10 @@ public long totalFileOps(){
generateEDEKTimeQuantiles = new MutableQuantiles[len];
warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
resourceCheckTimeQuantiles = new MutableQuantiles[len];
editLogTailTimeQuantiles = new MutableQuantiles[len];
editLogFetchTimeQuantiles = new MutableQuantiles[len];
numEditLogLoadedQuantiles = new MutableQuantiles[len];
editLogTailIntervalQuantiles = new MutableQuantiles[len];
for (int i = 0; i < len; i++) {
int interval = intervals[i];
@ -181,6 +198,18 @@ public long totalFileOps(){
resourceCheckTimeQuantiles[i] = registry.newQuantiles(
"resourceCheckTime" + interval + "s",
"resource check time", "ops", "latency", interval);
editLogTailTimeQuantiles[i] = registry.newQuantiles(
"editLogTailTime" + interval + "s",
"Edit log tailing time", "ops", "latency", interval);
editLogFetchTimeQuantiles[i] = registry.newQuantiles(
"editLogFetchTime" + interval + "s",
"Edit log fetch time", "ops", "latency", interval);
numEditLogLoadedQuantiles[i] = registry.newQuantiles(
"numEditLogLoaded" + interval + "s",
"Number of edits loaded", "ops", "count", interval);
editLogTailIntervalQuantiles[i] = registry.newQuantiles(
"editLogTailInterval" + interval + "s",
"Edit log tailing interval", "ops", "latency", interval);
}
}
@ -389,4 +418,32 @@ public void addResourceCheckTime(long latency) {
q.add(latency);
}
}
public void addEditLogTailTime(long elapsed) {
editLogTailTime.add(elapsed);
for (MutableQuantiles q : editLogTailTimeQuantiles) {
q.add(elapsed);
}
}
public void addEditLogFetchTime(long elapsed) {
editLogFetchTime.add(elapsed);
for (MutableQuantiles q : editLogFetchTimeQuantiles) {
q.add(elapsed);
}
}
public void addNumEditLogLoaded(long loaded) {
numEditLogLoaded.add(loaded);
for (MutableQuantiles q : numEditLogLoadedQuantiles) {
q.add(loaded);
}
}
public void addEditLogTailInterval(long elapsed) {
editLogTailInterval.add(elapsed);
for (MutableQuantiles q : editLogTailIntervalQuantiles) {
q.add(elapsed);
}
}
}

View File

@ -32,6 +32,7 @@
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
@ -1059,4 +1060,44 @@ public void testResourceCheck() throws Exception {
}
}
}
@Test
public void testEditLogTailing() throws Exception {
HdfsConfiguration conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
conf.setInt(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, 60);
MiniDFSCluster dfsCluster = null;
try {
dfsCluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(0)
.nnTopology(MiniDFSNNTopology.simpleHATopology())
.build();
DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
dfsCluster.transitionToActive(0);
dfsCluster.waitActive();
Path testDir = new Path("/testdir");
dfs.mkdir(testDir, FsPermission.getDefault());
dfsCluster.getNameNodeRpc(0).rollEditLog();
Thread.sleep(2 * 1000);
// We need to get the metrics for the SBN (excluding the NN from dfs
// cluster created in setUp() and the ANN).
MetricsRecordBuilder rb = getMetrics(NN_METRICS+"-2");
assertQuantileGauges("EditLogTailTime60s", rb);
assertQuantileGauges("EditLogFetchTime60s", rb);
assertQuantileGauges("NumEditLogLoaded60s", rb, "Count");
assertQuantileGauges("EditLogTailInterval60s", rb);
assertCounterGt("EditLogTailTimeNumOps", 0L, rb);
assertCounterGt("EditLogFetchTimeNumOps", 0L, rb);
assertCounterGt("NumEditLogLoadedNumOps", 0L, rb);
assertCounterGt("EditLogTailIntervalNumOps", 0L, rb);
} finally {
if (dfsCluster != null) {
dfsCluster.shutdown();
}
}
}
}