HDFS-6403. Add metrics for log warnings reported by JVM pauses. Contributed by Yongjun Zhang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1604074 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2014-06-20 02:38:00 +00:00
parent 46dc32e125
commit af6c91a80c
10 changed files with 89 additions and 15 deletions

View File

@ -38,6 +38,7 @@
import org.apache.hadoop.metrics2.lib.Interns;
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
import org.apache.hadoop.util.JvmPauseMonitor;
/**
* JVM and logging related metrics.
@ -65,6 +66,7 @@ synchronized JvmMetrics init(String processName, String sessionId) {
ManagementFactory.getGarbageCollectorMXBeans();
final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean();
final String processName, sessionId;
private JvmPauseMonitor pauseMonitor = null;
final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache =
new ConcurrentHashMap<String, MetricsInfo[]>();
@ -73,6 +75,10 @@ synchronized JvmMetrics init(String processName, String sessionId) {
this.sessionId = sessionId;
}
/**
 * Attaches the pause monitor whose counters are reported together with the
 * GC metrics. While no monitor is attached (the field is still null), the
 * GC usage record omits the GcNumWarnThresholdExceeded,
 * GcNumInfoThresholdExceeded and GcTotalExtraSleepTime counters.
 *
 * @param pauseMonitor the monitor to read the counters from
 */
public void setPauseMonitor(final JvmPauseMonitor pauseMonitor) {
this.pauseMonitor = pauseMonitor;
}
public static JvmMetrics create(String processName, String sessionId,
MetricsSystem ms) {
return ms.register(JvmMetrics.name(), JvmMetrics.description(),
@ -120,6 +126,15 @@ private void getGcUsage(MetricsRecordBuilder rb) {
}
rb.addCounter(GcCount, count)
.addCounter(GcTimeMillis, timeMillis);
if (pauseMonitor != null) {
rb.addCounter(GcNumWarnThresholdExceeded,
pauseMonitor.getNumGcWarnThreadholdExceeded());
rb.addCounter(GcNumInfoThresholdExceeded,
pauseMonitor.getNumGcInfoThresholdExceeded());
rb.addCounter(GcTotalExtraSleepTime,
pauseMonitor.getTotalGcExtraSleepTime());
}
}
private MetricsInfo[] getGcInfo(String gcName) {

View File

@ -48,7 +48,10 @@ public enum JvmMetricsInfo implements MetricsInfo {
LogFatal("Total number of fatal log events"),
LogError("Total number of error log events"),
LogWarn("Total number of warning log events"),
LogInfo("Total number of info log events");
LogInfo("Total number of info log events"),
GcNumWarnThresholdExceeded("Number of times that the GC warn threshold is exceeded"),
GcNumInfoThresholdExceeded("Number of times that the GC info threshold is exceeded"),
GcTotalExtraSleepTime("Total GC extra sleep time in milliseconds");
private final String desc;

View File

@ -62,10 +62,13 @@ public class JvmPauseMonitor {
"jvm.pause.info-threshold.ms";
private static final long INFO_THRESHOLD_DEFAULT = 1000;
// Pause statistics. In the code visible here they are updated only inside
// run() and read through the public getters, which the JVM metrics source
// calls when it builds a record. They are volatile so that such readers see
// current values and never a torn 64-bit read. The ++ and += updates are
// not atomic; that is acceptable only while run() stays the single writer.
private volatile long numGcWarnThresholdExceeded = 0;
private volatile long numGcInfoThresholdExceeded = 0;
private volatile long totalGcExtraSleepTime = 0;
private Thread monitorThread;
private volatile boolean shouldRun = true;
public JvmPauseMonitor(Configuration conf) {
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
@ -87,6 +90,22 @@ public void stop() {
Thread.currentThread().interrupt();
}
}
/**
 * Reports whether this monitor currently holds a monitor thread reference.
 * NOTE(review): the assignment of monitorThread is outside this view;
 * presumably start() sets it - confirm whether stop() ever clears it.
 *
 * @return true if monitorThread is non-null
 */
public boolean isStarted() {
  final Thread thread = this.monitorThread;
  return thread != null;
}
/**
 * Returns how many times the extra sleep time measured by the monitor loop
 * was greater than the warn threshold.
 *
 * @return the current value of the warn-threshold-exceeded counter
 */
public long getNumGcWarnThresholdExceeded() {
  return numGcWarnThresholdExceeded;
}

/**
 * Misspelled ("Threadhold") variant of
 * {@link #getNumGcWarnThresholdExceeded()}, retained so that existing
 * callers keep compiling. New code should call the correctly spelled method.
 *
 * @return the current value of the warn-threshold-exceeded counter
 */
public long getNumGcWarnThreadholdExceeded() {
  return getNumGcWarnThresholdExceeded();
}
/**
 * Returns how many times the extra sleep time measured by the monitor loop
 * was greater than the info threshold without also being greater than the
 * warn threshold (the two counters are bumped in exclusive branches).
 *
 * @return the current value of the info-threshold-exceeded counter
 */
public long getNumGcInfoThresholdExceeded() {
  return this.numGcInfoThresholdExceeded;
}
/**
 * Returns the running sum of the extra sleep time measured by the monitor
 * loop. Every iteration adds to it, whether or not a threshold was
 * exceeded. The matching metric is described as being in milliseconds.
 *
 * @return the accumulated extra sleep time
 */
public long getTotalGcExtraSleepTime() {
  return this.totalGcExtraSleepTime;
}
private String formatMessage(long extraSleepTime,
Map<String, GcTimes> gcTimesAfterSleep,
@ -166,13 +185,15 @@ public void run() {
Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
if (extraSleepTime > warnThresholdMs) {
++numGcWarnThresholdExceeded;
LOG.warn(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
} else if (extraSleepTime > infoThresholdMs) {
++numGcInfoThresholdExceeded;
LOG.info(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
}
totalGcExtraSleepTime += extraSleepTime;
gcTimesBeforeSleep = gcTimesAfterSleep;
}
}

View File

@ -86,6 +86,14 @@ jvm context
*-------------------------------------+--------------------------------------+
|<<<LogInfo>>> | Total number of INFO logs
*-------------------------------------+--------------------------------------+
|<<<GcNumWarnThresholdExceeded>>> | Number of times that the GC warn
| threshold is exceeded
*-------------------------------------+--------------------------------------+
|<<<GcNumInfoThresholdExceeded>>> | Number of times that the GC info
| threshold is exceeded
*-------------------------------------+--------------------------------------+
|<<<GcTotalExtraSleepTime>>> | Total GC extra sleep time in msec
*-------------------------------------+--------------------------------------+
rpc context

View File

@ -19,18 +19,25 @@
package org.apache.hadoop.metrics2.source;
import org.junit.Test;
import static org.mockito.Mockito.*;
import static org.apache.hadoop.test.MetricsAsserts.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.util.JvmPauseMonitor;
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
public class TestJvmMetrics {
@Test public void testPresence() {
MetricsRecordBuilder rb = getMetrics(new JvmMetrics("test", "test"));
JvmPauseMonitor pauseMonitor = new JvmPauseMonitor(new Configuration());
JvmMetrics jvmMetrics = new JvmMetrics("test", "test");
jvmMetrics.setPauseMonitor(pauseMonitor);
MetricsRecordBuilder rb = getMetrics(jvmMetrics);
MetricsCollector mc = rb.parent();
verify(mc).addRecord(JvmMetrics);

View File

@ -453,6 +453,9 @@ Release 2.5.0 - UNRELEASED
HDFS-6480. Move waitForReady() from FSDirectory to FSNamesystem. (wheat9)
HDFS-6403. Add metrics for log warnings reported by JVM pauses. (Yongjun
Zhang via atm)
OPTIMIZATIONS
HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn)

View File

@ -778,7 +778,8 @@ void startDataNode(Configuration conf,
initIpcServer(conf);
metrics = DataNodeMetrics.create(conf, getDisplayName());
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
blockPoolManager = new BlockPoolManager(this);
blockPoolManager.refreshNamenodes(conf);

View File

@ -90,13 +90,15 @@ public class DataNodeMetrics {
final MutableQuantiles[] sendDataPacketBlockedOnNetworkNanosQuantiles;
@Metric MutableRate sendDataPacketTransferNanos;
final MutableQuantiles[] sendDataPacketTransferNanosQuantiles;
final MetricsRegistry registry = new MetricsRegistry("datanode");
final String name;
public DataNodeMetrics(String name, String sessionId, int[] intervals) {
JvmMetrics jvmMetrics = null;
public DataNodeMetrics(String name, String sessionId, int[] intervals,
final JvmMetrics jvmMetrics) {
this.name = name;
this.jvmMetrics = jvmMetrics;
registry.tag(SessionId, sessionId);
final int len = intervals.length;
@ -131,7 +133,7 @@ public DataNodeMetrics(String name, String sessionId, int[] intervals) {
public static DataNodeMetrics create(Configuration conf, String dnName) {
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
MetricsSystem ms = DefaultMetricsSystem.instance();
JvmMetrics.create("DataNode", sessionId, ms);
JvmMetrics jm = JvmMetrics.create("DataNode", sessionId, ms);
String name = "DataNodeActivity-"+ (dnName.isEmpty()
? "UndefinedDataNodeName"+ DFSUtil.getRandom().nextInt()
: dnName.replace(':', '-'));
@ -141,11 +143,15 @@ public static DataNodeMetrics create(Configuration conf, String dnName) {
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
return ms.register(name, null, new DataNodeMetrics(name, sessionId,
intervals));
intervals, jm));
}
/**
 * Returns the name this metrics source was constructed with.
 *
 * @return the source name
 */
public String name() {
  return this.name;
}
/**
 * Returns the JVM metrics source that was handed to the constructor, so
 * callers can, for example, attach a pause monitor to it.
 *
 * @return the associated JvmMetrics instance
 */
public JvmMetrics getJvmMetrics() {
  return this.jvmMetrics;
}
/**
 * Adds one heartbeat latency sample to the heartbeats metric.
 *
 * @param latency the latency value to record
 *                (NOTE(review): unit is not visible here - confirm with callers)
 */
public void addHeartbeat(long latency) {
heartbeats.add(latency);
}

View File

@ -598,7 +598,8 @@ protected void initialize(Configuration conf) throws IOException {
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
startCommonServices(conf);
}

View File

@ -98,7 +98,11 @@ public class NameNodeMetrics {
@Metric("GetImageServlet putImage")
MutableRate putImage;
NameNodeMetrics(String processName, String sessionId, int[] intervals) {
JvmMetrics jvmMetrics = null;
NameNodeMetrics(String processName, String sessionId, int[] intervals,
final JvmMetrics jvmMetrics) {
this.jvmMetrics = jvmMetrics;
registry.tag(ProcessName, processName).tag(SessionId, sessionId);
final int len = intervals.length;
@ -124,14 +128,19 @@ public static NameNodeMetrics create(Configuration conf, NamenodeRole r) {
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
String processName = r.toString();
MetricsSystem ms = DefaultMetricsSystem.instance();
JvmMetrics.create(processName, sessionId, ms);
JvmMetrics jm = JvmMetrics.create(processName, sessionId, ms);
// Percentile measurement is off by default, by watching no intervals
int[] intervals =
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
return ms.register(new NameNodeMetrics(processName, sessionId, intervals));
return ms.register(new NameNodeMetrics(processName, sessionId,
intervals, jm));
}
/**
 * Returns the JVM metrics source that was handed to the constructor, so
 * callers can, for example, attach a pause monitor to it.
 *
 * @return the associated JvmMetrics instance
 */
public JvmMetrics getJvmMetrics() {
  return this.jvmMetrics;
}
/**
 * Shuts down metrics by delegating to DefaultMetricsSystem.shutdown().
 * NOTE(review): this acts on the default metrics system that create()
 * registers with, not only on this source - confirm that is intended.
 */
public void shutdown() {
DefaultMetricsSystem.shutdown();
}