HADOOP-14960. Add GC time percentage monitor/alerter. Contributed by Misha Dmitriev.
This commit is contained in:
parent
10a1f557e7
commit
3c6adda291
@ -28,6 +28,8 @@
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.log.metrics.EventCounter;
|
||||
import org.apache.hadoop.metrics2.MetricsCollector;
|
||||
@ -39,6 +41,8 @@
|
||||
import org.apache.hadoop.metrics2.lib.Interns;
|
||||
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
|
||||
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
|
||||
|
||||
import org.apache.hadoop.util.GcTimeMonitor;
|
||||
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||
|
||||
/**
|
||||
@ -85,6 +89,7 @@ public synchronized void registerIfNeeded(){
|
||||
private JvmPauseMonitor pauseMonitor = null;
|
||||
final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache =
|
||||
new ConcurrentHashMap<String, MetricsInfo[]>();
|
||||
private GcTimeMonitor gcTimeMonitor = null;
|
||||
|
||||
@VisibleForTesting
|
||||
JvmMetrics(String processName, String sessionId) {
|
||||
@ -96,6 +101,11 @@ public void setPauseMonitor(final JvmPauseMonitor pauseMonitor) {
|
||||
this.pauseMonitor = pauseMonitor;
|
||||
}
|
||||
|
||||
public void setGcTimeMonitor(GcTimeMonitor gcTimeMonitor) {
|
||||
Preconditions.checkNotNull(gcTimeMonitor);
|
||||
this.gcTimeMonitor = gcTimeMonitor;
|
||||
}
|
||||
|
||||
public static JvmMetrics create(String processName, String sessionId,
|
||||
MetricsSystem ms) {
|
||||
return ms.register(JvmMetrics.name(), JvmMetrics.description(),
|
||||
@ -176,6 +186,11 @@ private void getGcUsage(MetricsRecordBuilder rb) {
|
||||
rb.addCounter(GcTotalExtraSleepTime,
|
||||
pauseMonitor.getTotalGcExtraSleepTime());
|
||||
}
|
||||
|
||||
if (gcTimeMonitor != null) {
|
||||
rb.addCounter(GcTimePercentage,
|
||||
gcTimeMonitor.getLatestGcData().getGcTimePercentage());
|
||||
}
|
||||
}
|
||||
|
||||
private MetricsInfo[] getGcInfo(String gcName) {
|
||||
|
@ -51,7 +51,8 @@ public enum JvmMetricsInfo implements MetricsInfo {
|
||||
LogInfo("Total number of info log events"),
|
||||
GcNumWarnThresholdExceeded("Number of times that the GC warn threshold is exceeded"),
|
||||
GcNumInfoThresholdExceeded("Number of times that the GC info threshold is exceeded"),
|
||||
GcTotalExtraSleepTime("Total GC extra sleep time in milliseconds");
|
||||
GcTotalExtraSleepTime("Total GC extra sleep time in milliseconds"),
|
||||
GcTimePercentage("Percentage of time the JVM was paused in GC");
|
||||
|
||||
private final String desc;
|
||||
|
||||
|
@ -0,0 +1,242 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.util;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import java.lang.management.GarbageCollectorMXBean;
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* This class monitors the percentage of time the JVM is paused in GC within
|
||||
* the specified observation window, say 1 minute. The user can provide a
|
||||
* hook which will be called whenever this percentage exceeds the specified
|
||||
* threshold.
|
||||
*/
|
||||
public class GcTimeMonitor extends Thread {
|
||||
|
||||
private final long maxGcTimePercentage;
|
||||
private final long observationWindowMs, sleepIntervalMs;
|
||||
private final GcTimeAlertHandler alertHandler;
|
||||
|
||||
private final List<GarbageCollectorMXBean> gcBeans =
|
||||
ManagementFactory.getGarbageCollectorMXBeans();
|
||||
// Ring buffers containing GC timings and timestamps when timings were taken
|
||||
private final TsAndData[] gcDataBuf;
|
||||
private int bufSize, startIdx, endIdx;
|
||||
|
||||
private long startTime;
|
||||
private final GcData curData = new GcData();
|
||||
private volatile boolean shouldRun = true;
|
||||
|
||||
/**
|
||||
* Create an instance of GCTimeMonitor. Once it's started, it will stay alive
|
||||
* and monitor GC time percentage until shutdown() is called. If you don't
|
||||
* put a limit on the number of GCTimeMonitor instances that you create, and
|
||||
* alertHandler != null, you should necessarily call shutdown() once the given
|
||||
* instance is not needed. Otherwise, you may create a memory leak, because
|
||||
* each running GCTimeMonitor will keep its alertHandler object in memory,
|
||||
* which in turn may reference and keep in memory many more other objects.
|
||||
*
|
||||
* @param observationWindowMs the interval over which the percentage
|
||||
* of GC time should be calculated. A practical value would be somewhere
|
||||
* between 30 sec and several minutes.
|
||||
* @param sleepIntervalMs how frequently this thread should wake up to check
|
||||
* GC timings. This is also a frequency with which alertHandler will be
|
||||
* invoked if GC time percentage exceeds the specified limit. A practical
|
||||
* value would likely be 500..1000 ms.
|
||||
* @param maxGcTimePercentage A GC time percentage limit (0..100) within
|
||||
* observationWindowMs. Once this is exceeded, alertHandler will be
|
||||
* invoked every sleepIntervalMs milliseconds until GC time percentage
|
||||
* falls below this limit.
|
||||
* @param alertHandler a single method in this interface is invoked when GC
|
||||
* time percentage exceeds the specified limit.
|
||||
*/
|
||||
public GcTimeMonitor(long observationWindowMs, long sleepIntervalMs,
|
||||
int maxGcTimePercentage, GcTimeAlertHandler alertHandler) {
|
||||
Preconditions.checkArgument(observationWindowMs > 0);
|
||||
Preconditions.checkArgument(
|
||||
sleepIntervalMs > 0 && sleepIntervalMs < observationWindowMs);
|
||||
Preconditions.checkArgument(
|
||||
maxGcTimePercentage >= 0 && maxGcTimePercentage <= 100);
|
||||
|
||||
this.observationWindowMs = observationWindowMs;
|
||||
this.sleepIntervalMs = sleepIntervalMs;
|
||||
this.maxGcTimePercentage = maxGcTimePercentage;
|
||||
this.alertHandler = alertHandler;
|
||||
|
||||
bufSize = (int) (observationWindowMs / sleepIntervalMs + 2);
|
||||
// Prevent the user from accidentally creating an abnormally big buffer,
|
||||
// which will result in slow calculations and likely inaccuracy.
|
||||
Preconditions.checkArgument(bufSize <= 128 * 1024);
|
||||
gcDataBuf = new TsAndData[bufSize];
|
||||
for (int i = 0; i < bufSize; i++) {
|
||||
gcDataBuf[i] = new TsAndData();
|
||||
}
|
||||
|
||||
this.setDaemon(true);
|
||||
this.setName("GcTimeMonitor obsWindow = " + observationWindowMs +
|
||||
", sleepInterval = " + sleepIntervalMs +
|
||||
", maxGcTimePerc = " + maxGcTimePercentage);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
startTime = System.currentTimeMillis();
|
||||
curData.timestamp = startTime;
|
||||
gcDataBuf[startIdx].setValues(startTime, 0);
|
||||
|
||||
while (shouldRun) {
|
||||
try {
|
||||
Thread.sleep(sleepIntervalMs);
|
||||
} catch (InterruptedException ie) {
|
||||
return;
|
||||
}
|
||||
|
||||
calculateGCTimePercentageWithinObservedInterval();
|
||||
if (alertHandler != null &&
|
||||
curData.gcTimePercentage > maxGcTimePercentage) {
|
||||
alertHandler.alert(curData.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void shutdown() {
|
||||
shouldRun = false;
|
||||
}
|
||||
|
||||
/** Returns a copy of the most recent data measured by this monitor. */
|
||||
public GcData getLatestGcData() {
|
||||
return curData.clone();
|
||||
}
|
||||
|
||||
private void calculateGCTimePercentageWithinObservedInterval() {
|
||||
long prevTotalGcTime = curData.totalGcTime;
|
||||
long totalGcTime = 0;
|
||||
long totalGcCount = 0;
|
||||
for (GarbageCollectorMXBean gcBean : gcBeans) {
|
||||
totalGcTime += gcBean.getCollectionTime();
|
||||
totalGcCount += gcBean.getCollectionCount();
|
||||
}
|
||||
long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime;
|
||||
|
||||
long ts = System.currentTimeMillis();
|
||||
long gcMonitorRunTime = ts - startTime;
|
||||
|
||||
endIdx = (endIdx + 1) % bufSize;
|
||||
gcDataBuf[endIdx].setValues(ts, gcTimeWithinSleepInterval);
|
||||
|
||||
// Move startIdx forward until we reach the first buffer entry with
|
||||
// timestamp within the observation window.
|
||||
long startObsWindowTs = ts - observationWindowMs;
|
||||
while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) {
|
||||
startIdx = (startIdx + 1) % bufSize;
|
||||
}
|
||||
|
||||
// Calculate total GC time within observationWindowMs.
|
||||
// We should be careful about GC time that passed before the first timestamp
|
||||
// in our observation window.
|
||||
long gcTimeWithinObservationWindow = Math.min(
|
||||
gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs);
|
||||
if (startIdx != endIdx) {
|
||||
for (int i = (startIdx + 1) % bufSize; i != endIdx;
|
||||
i = (i + 1) % bufSize) {
|
||||
gcTimeWithinObservationWindow += gcDataBuf[i].gcPause;
|
||||
}
|
||||
}
|
||||
|
||||
curData.update(ts, gcMonitorRunTime, totalGcTime, totalGcCount,
|
||||
(int) (gcTimeWithinObservationWindow * 100 /
|
||||
Math.min(observationWindowMs, gcMonitorRunTime)));
|
||||
}
|
||||
|
||||
/**
|
||||
* The user can provide an instance of a class implementing this interface
|
||||
* when initializing a GcTimeMonitor to receive alerts when GC time
|
||||
* percentage exceeds the specified threshold.
|
||||
*/
|
||||
public interface GcTimeAlertHandler {
|
||||
void alert(GcData gcData);
|
||||
}
|
||||
|
||||
/** Encapsulates data about GC pauses measured at the specific timestamp. */
|
||||
public static class GcData implements Cloneable {
|
||||
private long timestamp;
|
||||
private long gcMonitorRunTime, totalGcTime, totalGcCount;
|
||||
private int gcTimePercentage;
|
||||
|
||||
/** Returns the absolute timestamp when this measurement was taken. */
|
||||
public long getTimestamp() {
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
/** Returns the time since the start of the associated GcTimeMonitor. */
|
||||
public long getGcMonitorRunTime() {
|
||||
return gcMonitorRunTime;
|
||||
}
|
||||
|
||||
/** Returns accumulated GC time since this JVM started. */
|
||||
public long getAccumulatedGcTime() {
|
||||
return totalGcTime;
|
||||
}
|
||||
|
||||
/** Returns the accumulated number of GC pauses since this JVM started. */
|
||||
public long getAccumulatedGcCount() {
|
||||
return totalGcCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the percentage (0..100) of time that the JVM spent in GC pauses
|
||||
* within the observation window of the associated GcTimeMonitor.
|
||||
*/
|
||||
public int getGcTimePercentage() {
|
||||
return gcTimePercentage;
|
||||
}
|
||||
|
||||
private synchronized void update(long inTimestamp, long inGcMonitorRunTime,
|
||||
long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) {
|
||||
this.timestamp = inTimestamp;
|
||||
this.gcMonitorRunTime = inGcMonitorRunTime;
|
||||
this.totalGcTime = inTotalGcTime;
|
||||
this.totalGcCount = inTotalGcCount;
|
||||
this.gcTimePercentage = inGcTimePercentage;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized GcData clone() {
|
||||
try {
|
||||
return (GcData) super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class TsAndData {
|
||||
private long ts; // Timestamp when this measurement was taken
|
||||
private long gcPause; // Total GC pause time within the interval between ts
|
||||
// and the timestamp of the previous measurement.
|
||||
|
||||
void setValues(long inTs, long inGcPause) {
|
||||
this.ts = inTs;
|
||||
this.gcPause = inGcPause;
|
||||
}
|
||||
}
|
||||
}
|
@ -18,6 +18,7 @@
|
||||
|
||||
package org.apache.hadoop.metrics2.source;
|
||||
|
||||
import org.apache.hadoop.util.GcTimeMonitor;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Rule;
|
||||
@ -35,6 +36,9 @@
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
|
||||
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
|
||||
|
||||
@ -43,17 +47,21 @@ public class TestJvmMetrics {
|
||||
@Rule
|
||||
public Timeout timeout = new Timeout(30000);
|
||||
private JvmPauseMonitor pauseMonitor;
|
||||
private GcTimeMonitor gcTimeMonitor;
|
||||
|
||||
/**
|
||||
* Robust shutdown of the pause monitor if it hasn't been stopped already.
|
||||
* Robust shutdown of the monitors if they haven't been stopped already.
|
||||
*/
|
||||
@After
|
||||
public void teardown() {
|
||||
ServiceOperations.stop(pauseMonitor);
|
||||
if (gcTimeMonitor != null) {
|
||||
gcTimeMonitor.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPresence() {
|
||||
public void testJvmPauseMonitorPresence() {
|
||||
pauseMonitor = new JvmPauseMonitor();
|
||||
pauseMonitor.init(new Configuration());
|
||||
pauseMonitor.start();
|
||||
@ -66,16 +74,37 @@ public void testPresence() {
|
||||
verify(rb).tag(ProcessName, "test");
|
||||
verify(rb).tag(SessionId, "test");
|
||||
for (JvmMetricsInfo info : JvmMetricsInfo.values()) {
|
||||
if (info.name().startsWith("Mem"))
|
||||
if (info.name().startsWith("Mem")) {
|
||||
verify(rb).addGauge(eq(info), anyFloat());
|
||||
else if (info.name().startsWith("Gc"))
|
||||
} else if (info.name().startsWith("Gc") &&
|
||||
!info.name().equals("GcTimePercentage")) {
|
||||
verify(rb).addCounter(eq(info), anyLong());
|
||||
else if (info.name().startsWith("Threads"))
|
||||
} else if (info.name().startsWith("Threads")) {
|
||||
verify(rb).addGauge(eq(info), anyInt());
|
||||
else if (info.name().startsWith("Log"))
|
||||
} else if (info.name().startsWith("Log")) {
|
||||
verify(rb).addCounter(eq(info), anyLong());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGcTimeMonitorPresence() {
|
||||
gcTimeMonitor = new GcTimeMonitor(60000, 1000, 70, null);
|
||||
gcTimeMonitor.start();
|
||||
JvmMetrics jvmMetrics = new JvmMetrics("test", "test");
|
||||
jvmMetrics.setGcTimeMonitor(gcTimeMonitor);
|
||||
MetricsRecordBuilder rb = getMetrics(jvmMetrics);
|
||||
MetricsCollector mc = rb.parent();
|
||||
|
||||
verify(mc).addRecord(JvmMetrics);
|
||||
verify(rb).tag(ProcessName, "test");
|
||||
verify(rb).tag(SessionId, "test");
|
||||
for (JvmMetricsInfo info : JvmMetricsInfo.values()) {
|
||||
if (info.name().equals("GcTimePercentage")) {
|
||||
verify(rb).addCounter(eq(info), anyInt());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoubleStop() throws Throwable {
|
||||
@ -120,4 +149,55 @@ public void testStopBeforeInit() throws Throwable {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGcTimeMonitor() {
|
||||
class Alerter implements GcTimeMonitor.GcTimeAlertHandler {
|
||||
private volatile int numAlerts;
|
||||
private volatile int maxGcTimePercentage;
|
||||
@Override
|
||||
public void alert(GcTimeMonitor.GcData gcData) {
|
||||
numAlerts++;
|
||||
if (gcData.getGcTimePercentage() > maxGcTimePercentage) {
|
||||
maxGcTimePercentage = gcData.getGcTimePercentage();
|
||||
}
|
||||
}
|
||||
}
|
||||
Alerter alerter = new Alerter();
|
||||
|
||||
int alertGcPerc = 10; // Alerter should be called if GC takes >= 10%
|
||||
gcTimeMonitor = new GcTimeMonitor(60*1000, 100, alertGcPerc, alerter);
|
||||
gcTimeMonitor.start();
|
||||
|
||||
int maxGcTimePercentage = 0;
|
||||
long gcCount = 0;
|
||||
|
||||
// Generate a lot of garbage for some time and verify that the monitor
|
||||
// reports at least some percentage of time in GC pauses, and that the
|
||||
// alerter is invoked at least once.
|
||||
|
||||
List<String> garbageStrings = new ArrayList<>();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
// Run this for at least 1 sec for our monitor to collect enough data
|
||||
while (System.currentTimeMillis() - startTime < 1000) {
|
||||
for (int j = 0; j < 100000; j++) {
|
||||
garbageStrings.add(
|
||||
"Long string prefix just to fill memory with garbage " + j);
|
||||
}
|
||||
garbageStrings.clear();
|
||||
System.gc();
|
||||
|
||||
GcTimeMonitor.GcData gcData = gcTimeMonitor.getLatestGcData();
|
||||
int gcTimePercentage = gcData.getGcTimePercentage();
|
||||
if (gcTimePercentage > maxGcTimePercentage) {
|
||||
maxGcTimePercentage = gcTimePercentage;
|
||||
}
|
||||
gcCount = gcData.getAccumulatedGcCount();
|
||||
}
|
||||
|
||||
Assert.assertTrue(maxGcTimePercentage > 0);
|
||||
Assert.assertTrue(gcCount > 0);
|
||||
Assert.assertTrue(alerter.numAlerts > 0);
|
||||
Assert.assertTrue(alerter.maxGcTimePercentage >= alertGcPerc);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user