YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. Contributed by Robert Kanter.
This commit is contained in:
parent
154c9d2e42
commit
cfee02b3bd
@ -374,6 +374,9 @@ Release 2.8.0 - UNRELEASED
|
||||
YARN-3961. Expose pending, running and reserved containers of a queue in REST
|
||||
api and yarn top (adhoot via asuresh)
|
||||
|
||||
YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. (Robert Kanter
|
||||
via junping_du)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||
|
@ -40,6 +40,7 @@
|
||||
import org.apache.hadoop.service.CompositeService;
|
||||
import org.apache.hadoop.util.ExitUtil;
|
||||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||
import org.apache.hadoop.util.NodeHealthScriptRunner;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
@ -83,6 +84,7 @@ public class NodeManager extends CompositeService
|
||||
private static final Log LOG = LogFactory.getLog(NodeManager.class);
|
||||
private static long nmStartupTime = System.currentTimeMillis();
|
||||
protected final NodeManagerMetrics metrics = NodeManagerMetrics.create();
|
||||
private JvmPauseMonitor pauseMonitor;
|
||||
private ApplicationACLsManager aclsManager;
|
||||
private NodeHealthCheckerService nodeHealthChecker;
|
||||
private NodeLabelsProvider nodeLabelsProvider;
|
||||
@ -307,13 +309,16 @@ protected void serviceInit(Configuration conf) throws Exception {
|
||||
dispatcher.register(ContainerManagerEventType.class, containerManager);
|
||||
dispatcher.register(NodeManagerEventType.class, this);
|
||||
addService(dispatcher);
|
||||
|
||||
|
||||
pauseMonitor = new JvmPauseMonitor(conf);
|
||||
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
|
||||
|
||||
DefaultMetricsSystem.initialize("NodeManager");
|
||||
|
||||
// StatusUpdater should be added last so that it gets started last
|
||||
// so that we make sure everything is up before registering with RM.
|
||||
addService(nodeStatusUpdater);
|
||||
|
||||
|
||||
super.serviceInit(conf);
|
||||
// TODO add local dirs to del
|
||||
}
|
||||
@ -325,6 +330,7 @@ protected void serviceStart() throws Exception {
|
||||
} catch (IOException e) {
|
||||
throw new YarnRuntimeException("Failed NodeManager login", e);
|
||||
}
|
||||
pauseMonitor.start();
|
||||
super.serviceStart();
|
||||
}
|
||||
|
||||
@ -336,6 +342,9 @@ protected void serviceStop() throws Exception {
|
||||
try {
|
||||
super.serviceStop();
|
||||
DefaultMetricsSystem.shutdown();
|
||||
if (pauseMonitor != null) {
|
||||
pauseMonitor.stop();
|
||||
}
|
||||
} finally {
|
||||
// YARN-3641: a failure while stopping the NM's services shouldn't block the
|
||||
// release of NMLevelDBStore.
|
||||
|
@ -57,17 +57,26 @@ public class NodeManagerMetrics {
|
||||
@Metric("Disk utilization % on good log dirs")
|
||||
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
|
||||
|
||||
private JvmMetrics jvmMetrics = null;
|
||||
|
||||
private long allocatedMB;
|
||||
private long availableMB;
|
||||
|
||||
public NodeManagerMetrics(JvmMetrics jvmMetrics) {
|
||||
this.jvmMetrics = jvmMetrics;
|
||||
}
|
||||
|
||||
public static NodeManagerMetrics create() {
|
||||
return create(DefaultMetricsSystem.instance());
|
||||
}
|
||||
|
||||
static NodeManagerMetrics create(MetricsSystem ms) {
|
||||
JvmMetrics.create("NodeManager", null, ms);
|
||||
return ms.register(new NodeManagerMetrics());
|
||||
JvmMetrics jm = JvmMetrics.create("NodeManager", null, ms);
|
||||
return ms.register(new NodeManagerMetrics(jm));
|
||||
}
|
||||
|
||||
public JvmMetrics getJvmMetrics() {
|
||||
return jvmMetrics;
|
||||
}
|
||||
|
||||
// Potential instrumentation interface methods
|
||||
|
@ -39,6 +39,7 @@
|
||||
import org.apache.hadoop.service.Service;
|
||||
import org.apache.hadoop.util.ExitUtil;
|
||||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
@ -157,6 +158,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||
private WebApp webApp;
|
||||
private AppReportFetcher fetcher = null;
|
||||
protected ResourceTrackerService resourceTracker;
|
||||
private JvmPauseMonitor pauseMonitor;
|
||||
|
||||
@VisibleForTesting
|
||||
protected String webAppAddress;
|
||||
@ -511,7 +513,9 @@ protected void serviceInit(Configuration configuration) throws Exception {
|
||||
rmContext.setResourceTrackerService(resourceTracker);
|
||||
|
||||
DefaultMetricsSystem.initialize("ResourceManager");
|
||||
JvmMetrics.initSingleton("ResourceManager", null);
|
||||
JvmMetrics jm = JvmMetrics.initSingleton("ResourceManager", null);
|
||||
pauseMonitor = new JvmPauseMonitor(conf);
|
||||
jm.setPauseMonitor(pauseMonitor);
|
||||
|
||||
// Initialize the Reservation system
|
||||
if (conf.getBoolean(YarnConfiguration.RM_RESERVATION_SYSTEM_ENABLE,
|
||||
@ -566,6 +570,8 @@ protected void serviceStart() throws Exception {
|
||||
// need events to move to further states.
|
||||
rmStore.start();
|
||||
|
||||
pauseMonitor.start();
|
||||
|
||||
if(recoveryEnabled) {
|
||||
try {
|
||||
LOG.info("Recovery started");
|
||||
@ -591,6 +597,9 @@ protected void serviceStart() throws Exception {
|
||||
protected void serviceStop() throws Exception {
|
||||
|
||||
DefaultMetricsSystem.shutdown();
|
||||
if (pauseMonitor != null) {
|
||||
pauseMonitor.stop();
|
||||
}
|
||||
|
||||
if (rmContext != null) {
|
||||
RMStateStore store = rmContext.getStateStore();
|
||||
|
Loading…
Reference in New Issue
Block a user