HDFS-15176. Enable GcTimePercentage Metric in NameNode's JvmMetrics. Contributed by Jinglun.
This commit is contained in:
parent
9eb7a8bdf8
commit
b5698e0c33
@ -23,6 +23,7 @@ import com.google.common.base.Preconditions;
|
|||||||
import java.lang.management.GarbageCollectorMXBean;
|
import java.lang.management.GarbageCollectorMXBean;
|
||||||
import java.lang.management.ManagementFactory;
|
import java.lang.management.ManagementFactory;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class monitors the percentage of time the JVM is paused in GC within
|
* This class monitors the percentage of time the JVM is paused in GC within
|
||||||
@ -46,6 +47,52 @@ public class GcTimeMonitor extends Thread {
|
|||||||
private final GcData curData = new GcData();
|
private final GcData curData = new GcData();
|
||||||
private volatile boolean shouldRun = true;
|
private volatile boolean shouldRun = true;
|
||||||
|
|
||||||
|
public static class Builder {
|
||||||
|
|
||||||
|
private long observationWindowMs = TimeUnit.MINUTES.toMillis(1);
|
||||||
|
private long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5);
|
||||||
|
private int maxGcTimePercentage = 100;
|
||||||
|
private GcTimeAlertHandler handler = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set observation window size in milliseconds.
|
||||||
|
*/
|
||||||
|
public Builder observationWindowMs(long value) {
|
||||||
|
this.observationWindowMs = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set sleep interval in milliseconds.
|
||||||
|
*/
|
||||||
|
public Builder sleepIntervalMs(long value) {
|
||||||
|
this.sleepIntervalMs = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the max GC time percentage that triggers the alert handler.
|
||||||
|
*/
|
||||||
|
public Builder maxGcTimePercentage(int value) {
|
||||||
|
this.maxGcTimePercentage = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the GC alert handler.
|
||||||
|
*/
|
||||||
|
public Builder gcTimeAlertHandler(GcTimeAlertHandler value) {
|
||||||
|
this.handler = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public GcTimeMonitor build() {
|
||||||
|
return new GcTimeMonitor(observationWindowMs, sleepIntervalMs,
|
||||||
|
maxGcTimePercentage, handler);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an instance of GCTimeMonitor. Once it's started, it will stay alive
|
* Create an instance of GCTimeMonitor. Once it's started, it will stay alive
|
||||||
* and monitor GC time percentage until shutdown() is called. If you don't
|
* and monitor GC time percentage until shutdown() is called. If you don't
|
||||||
|
@ -56,6 +56,7 @@ Each metrics record contains tags such as ProcessName, SessionID and Hostname as
|
|||||||
| `GcNumWarnThresholdExceeded` | Number of times that the GC warn threshold is exceeded |
|
| `GcNumWarnThresholdExceeded` | Number of times that the GC warn threshold is exceeded |
|
||||||
| `GcNumInfoThresholdExceeded` | Number of times that the GC info threshold is exceeded |
|
| `GcNumInfoThresholdExceeded` | Number of times that the GC info threshold is exceeded |
|
||||||
| `GcTotalExtraSleepTime` | Total GC extra sleep time in msec |
|
| `GcTotalExtraSleepTime` | Total GC extra sleep time in msec |
|
||||||
|
| `GcTimePercentage` | The percentage (0..100) of time that the JVM spent in GC pauses within the observation window if `dfs.namenode.gc.time.monitor.enable` is set to true. Use `dfs.namenode.gc.time.monitor.sleep.interval.ms` to specify the sleep interval in msec. Use `dfs.namenode.gc.time.monitor.observation.window.ms` to specify the observation window in msec. |
|
||||||
|
|
||||||
rpc context
|
rpc context
|
||||||
===========
|
===========
|
||||||
|
@ -1069,6 +1069,21 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
|||||||
public static final String DFS_NAMENODE_BLOCKPLACEMENTPOLICY_DEFAULT_PREFER_LOCAL_NODE_KEY =
|
public static final String DFS_NAMENODE_BLOCKPLACEMENTPOLICY_DEFAULT_PREFER_LOCAL_NODE_KEY =
|
||||||
"dfs.namenode.block-placement-policy.default.prefer-local-node";
|
"dfs.namenode.block-placement-policy.default.prefer-local-node";
|
||||||
public static final boolean DFS_NAMENODE_BLOCKPLACEMENTPOLICY_DEFAULT_PREFER_LOCAL_NODE_DEFAULT = true;
|
public static final boolean DFS_NAMENODE_BLOCKPLACEMENTPOLICY_DEFAULT_PREFER_LOCAL_NODE_DEFAULT = true;
|
||||||
|
public static final String DFS_NAMENODE_GC_TIME_MONITOR_ENABLE =
|
||||||
|
"dfs.namenode.gc.time.monitor.enable";
|
||||||
|
public static final boolean DFS_NAMENODE_GC_TIME_MONITOR_ENABLE_DEFAULT =
|
||||||
|
true;
|
||||||
|
public static final String
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS =
|
||||||
|
"dfs.namenode.gc.time.monitor.observation.window.ms";
|
||||||
|
public static final long
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS_DEFAULT =
|
||||||
|
TimeUnit.MINUTES.toMillis(1);
|
||||||
|
public static final String DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS =
|
||||||
|
"dfs.namenode.gc.time.monitor.sleep.interval.ms";
|
||||||
|
public static final long
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS_DEFAULT =
|
||||||
|
TimeUnit.SECONDS.toMillis(5);
|
||||||
|
|
||||||
public static final String DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY = "dfs.block.local-path-access.user";
|
public static final String DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY = "dfs.block.local-path-access.user";
|
||||||
public static final String DFS_DOMAIN_SOCKET_PATH_KEY =
|
public static final String DFS_DOMAIN_SOCKET_PATH_KEY =
|
||||||
|
@ -96,6 +96,8 @@ import org.apache.hadoop.util.JvmPauseMonitor;
|
|||||||
import org.apache.hadoop.util.ServicePlugin;
|
import org.apache.hadoop.util.ServicePlugin;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
import org.apache.hadoop.util.Time;
|
import org.apache.hadoop.util.Time;
|
||||||
|
import org.apache.hadoop.util.GcTimeMonitor;
|
||||||
|
import org.apache.hadoop.util.GcTimeMonitor.Builder;
|
||||||
import org.apache.htrace.core.Tracer;
|
import org.apache.htrace.core.Tracer;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -176,6 +178,12 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_STRE
|
|||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT;
|
||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_WORK_MULTIPLIER_PER_ITERATION;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_WORK_MULTIPLIER_PER_ITERATION;
|
||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_WORK_MULTIPLIER_PER_ITERATION_DEFAULT;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_WORK_MULTIPLIER_PER_ITERATION_DEFAULT;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS_DEFAULT;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS_DEFAULT;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_ENABLE;
|
||||||
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_GC_TIME_MONITOR_ENABLE_DEFAULT;
|
||||||
|
|
||||||
import static org.apache.hadoop.util.ExitUtil.terminate;
|
import static org.apache.hadoop.util.ExitUtil.terminate;
|
||||||
import static org.apache.hadoop.util.ToolRunner.confirmPrompt;
|
import static org.apache.hadoop.util.ToolRunner.confirmPrompt;
|
||||||
@ -411,6 +419,7 @@ public class NameNode extends ReconfigurableBase implements
|
|||||||
private NameNodeRpcServer rpcServer;
|
private NameNodeRpcServer rpcServer;
|
||||||
|
|
||||||
private JvmPauseMonitor pauseMonitor;
|
private JvmPauseMonitor pauseMonitor;
|
||||||
|
private GcTimeMonitor gcTimeMonitor;
|
||||||
private ObjectName nameNodeStatusBeanName;
|
private ObjectName nameNodeStatusBeanName;
|
||||||
protected final Tracer tracer;
|
protected final Tracer tracer;
|
||||||
protected final TracerConfigurationManager tracerConfigurationManager;
|
protected final TracerConfigurationManager tracerConfigurationManager;
|
||||||
@ -724,6 +733,22 @@ public class NameNode extends ReconfigurableBase implements
|
|||||||
pauseMonitor.start();
|
pauseMonitor.start();
|
||||||
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
|
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
|
||||||
|
|
||||||
|
if (conf.getBoolean(DFS_NAMENODE_GC_TIME_MONITOR_ENABLE,
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_ENABLE_DEFAULT)) {
|
||||||
|
long observationWindow = conf.getTimeDuration(
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS,
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_OBSERVATION_WINDOW_MS_DEFAULT,
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
long sleepInterval = conf.getTimeDuration(
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS,
|
||||||
|
DFS_NAMENODE_GC_TIME_MONITOR_SLEEP_INTERVAL_MS_DEFAULT,
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
gcTimeMonitor = new Builder().observationWindowMs(observationWindow)
|
||||||
|
.sleepIntervalMs(sleepInterval).build();
|
||||||
|
gcTimeMonitor.start();
|
||||||
|
metrics.getJvmMetrics().setGcTimeMonitor(gcTimeMonitor);
|
||||||
|
}
|
||||||
|
|
||||||
if (NamenodeRole.NAMENODE == role) {
|
if (NamenodeRole.NAMENODE == role) {
|
||||||
startHttpServer(conf);
|
startHttpServer(conf);
|
||||||
}
|
}
|
||||||
|
@ -5761,4 +5761,34 @@
|
|||||||
Determines the namenode automatic lease recovery interval in seconds.
|
Determines the namenode automatic lease recovery interval in seconds.
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.gc.time.monitor.enable</name>
|
||||||
|
<value>true</value>
|
||||||
|
<description>
|
||||||
|
Enable the GcTimePercentage metric in NameNode's JvmMetrics. It will
|
||||||
|
start a thread (GcTimeMonitor) that computes the metric.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.gc.time.monitor.observation.window.ms</name>
|
||||||
|
<value>1m</value>
|
||||||
|
<description>
|
||||||
|
Determines the window size of GcTimeMonitor. A window is a period of time
|
||||||
|
that starts at now-windowSize and ends at now. The GcTimePercentage is the GC
|
||||||
|
time proportion of the window.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.gc.time.monitor.sleep.interval.ms</name>
|
||||||
|
<value>5s</value>
|
||||||
|
<description>
|
||||||
|
Determines the sleep interval in the window. The GcTimeMonitor wakes up once
|
||||||
|
per sleep interval to compute the GC time proportion. The
|
||||||
|
shorter the interval, the more precise the GcTimePercentage. The sleep interval
|
||||||
|
must be shorter than the window size.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
@ -31,6 +31,7 @@ import org.apache.hadoop.hdfs.client.HdfsAdmin;
|
|||||||
|
|
||||||
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
|
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
|
||||||
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
|
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
|
||||||
|
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.GcTimePercentage;
|
||||||
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
|
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
|
||||||
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
|
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
|
||||||
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
|
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
|
||||||
@ -103,6 +104,7 @@ public class TestNameNodeMetrics {
|
|||||||
new Path("/testNameNodeMetrics");
|
new Path("/testNameNodeMetrics");
|
||||||
private static final String NN_METRICS = "NameNodeActivity";
|
private static final String NN_METRICS = "NameNodeActivity";
|
||||||
private static final String NS_METRICS = "FSNamesystem";
|
private static final String NS_METRICS = "FSNamesystem";
|
||||||
|
private static final String JVM_METRICS = "JvmMetrics";
|
||||||
private static final int BLOCK_SIZE = 1024 * 1024;
|
private static final int BLOCK_SIZE = 1024 * 1024;
|
||||||
private static final ErasureCodingPolicy EC_POLICY =
|
private static final ErasureCodingPolicy EC_POLICY =
|
||||||
SystemErasureCodingPolicies.getByID(
|
SystemErasureCodingPolicies.getByID(
|
||||||
@ -223,6 +225,15 @@ public class TestNameNodeMetrics {
|
|||||||
capacityTotal);
|
capacityTotal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that the GcTimePercentage metric can be retrieved successfully.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testGcTimePercentageMetrics() throws Exception {
|
||||||
|
MetricsRecordBuilder rb = getMetrics(JVM_METRICS);
|
||||||
|
MetricsAsserts.getIntGauge(GcTimePercentage.name(), rb);
|
||||||
|
}
|
||||||
|
|
||||||
/** Test metrics indicating the number of stale DataNodes */
|
/** Test metrics indicating the number of stale DataNodes */
|
||||||
@Test
|
@Test
|
||||||
public void testStaleNodes() throws Exception {
|
public void testStaleNodes() throws Exception {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user