YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. Contributed by Robert Kanter.
This commit is contained in:
parent
154c9d2e42
commit
cfee02b3bd
@ -374,6 +374,9 @@ Release 2.8.0 - UNRELEASED
|
|||||||
YARN-3961. Expose pending, running and reserved containers of a queue in REST
|
YARN-3961. Expose pending, running and reserved containers of a queue in REST
|
||||||
api and yarn top (adhoot via asuresh)
|
api and yarn top (adhoot via asuresh)
|
||||||
|
|
||||||
|
YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. (Robert Kanter
|
||||||
|
via junping_du)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||||
|
@ -40,6 +40,7 @@
|
|||||||
import org.apache.hadoop.service.CompositeService;
|
import org.apache.hadoop.service.CompositeService;
|
||||||
import org.apache.hadoop.util.ExitUtil;
|
import org.apache.hadoop.util.ExitUtil;
|
||||||
import org.apache.hadoop.util.GenericOptionsParser;
|
import org.apache.hadoop.util.GenericOptionsParser;
|
||||||
|
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||||
import org.apache.hadoop.util.NodeHealthScriptRunner;
|
import org.apache.hadoop.util.NodeHealthScriptRunner;
|
||||||
import org.apache.hadoop.util.ReflectionUtils;
|
import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.ShutdownHookManager;
|
import org.apache.hadoop.util.ShutdownHookManager;
|
||||||
@ -83,6 +84,7 @@ public class NodeManager extends CompositeService
|
|||||||
private static final Log LOG = LogFactory.getLog(NodeManager.class);
|
private static final Log LOG = LogFactory.getLog(NodeManager.class);
|
||||||
private static long nmStartupTime = System.currentTimeMillis();
|
private static long nmStartupTime = System.currentTimeMillis();
|
||||||
protected final NodeManagerMetrics metrics = NodeManagerMetrics.create();
|
protected final NodeManagerMetrics metrics = NodeManagerMetrics.create();
|
||||||
|
private JvmPauseMonitor pauseMonitor;
|
||||||
private ApplicationACLsManager aclsManager;
|
private ApplicationACLsManager aclsManager;
|
||||||
private NodeHealthCheckerService nodeHealthChecker;
|
private NodeHealthCheckerService nodeHealthChecker;
|
||||||
private NodeLabelsProvider nodeLabelsProvider;
|
private NodeLabelsProvider nodeLabelsProvider;
|
||||||
@ -307,13 +309,16 @@ protected void serviceInit(Configuration conf) throws Exception {
|
|||||||
dispatcher.register(ContainerManagerEventType.class, containerManager);
|
dispatcher.register(ContainerManagerEventType.class, containerManager);
|
||||||
dispatcher.register(NodeManagerEventType.class, this);
|
dispatcher.register(NodeManagerEventType.class, this);
|
||||||
addService(dispatcher);
|
addService(dispatcher);
|
||||||
|
|
||||||
|
pauseMonitor = new JvmPauseMonitor(conf);
|
||||||
|
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
|
||||||
|
|
||||||
DefaultMetricsSystem.initialize("NodeManager");
|
DefaultMetricsSystem.initialize("NodeManager");
|
||||||
|
|
||||||
// StatusUpdater should be added last so that it gets started last
|
// StatusUpdater should be added last so that it gets started last
|
||||||
// so that we make sure everything is up before registering with RM.
|
// so that we make sure everything is up before registering with RM.
|
||||||
addService(nodeStatusUpdater);
|
addService(nodeStatusUpdater);
|
||||||
|
|
||||||
super.serviceInit(conf);
|
super.serviceInit(conf);
|
||||||
// TODO add local dirs to del
|
// TODO add local dirs to del
|
||||||
}
|
}
|
||||||
@ -325,6 +330,7 @@ protected void serviceStart() throws Exception {
|
|||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new YarnRuntimeException("Failed NodeManager login", e);
|
throw new YarnRuntimeException("Failed NodeManager login", e);
|
||||||
}
|
}
|
||||||
|
pauseMonitor.start();
|
||||||
super.serviceStart();
|
super.serviceStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -336,6 +342,9 @@ protected void serviceStop() throws Exception {
|
|||||||
try {
|
try {
|
||||||
super.serviceStop();
|
super.serviceStop();
|
||||||
DefaultMetricsSystem.shutdown();
|
DefaultMetricsSystem.shutdown();
|
||||||
|
if (pauseMonitor != null) {
|
||||||
|
pauseMonitor.stop();
|
||||||
|
}
|
||||||
} finally {
|
} finally {
|
||||||
// YARN-3641: a failure while stopping the NM's services shouldn't block the
|
// YARN-3641: a failure while stopping the NM's services shouldn't block the
|
||||||
// release of NMLevelDBStore.
|
// release of NMLevelDBStore.
|
||||||
|
@ -57,17 +57,26 @@ public class NodeManagerMetrics {
|
|||||||
@Metric("Disk utilization % on good log dirs")
|
@Metric("Disk utilization % on good log dirs")
|
||||||
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
|
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
|
||||||
|
|
||||||
|
private JvmMetrics jvmMetrics = null;
|
||||||
|
|
||||||
private long allocatedMB;
|
private long allocatedMB;
|
||||||
private long availableMB;
|
private long availableMB;
|
||||||
|
|
||||||
|
public NodeManagerMetrics(JvmMetrics jvmMetrics) {
|
||||||
|
this.jvmMetrics = jvmMetrics;
|
||||||
|
}
|
||||||
|
|
||||||
public static NodeManagerMetrics create() {
|
public static NodeManagerMetrics create() {
|
||||||
return create(DefaultMetricsSystem.instance());
|
return create(DefaultMetricsSystem.instance());
|
||||||
}
|
}
|
||||||
|
|
||||||
static NodeManagerMetrics create(MetricsSystem ms) {
|
static NodeManagerMetrics create(MetricsSystem ms) {
|
||||||
JvmMetrics.create("NodeManager", null, ms);
|
JvmMetrics jm = JvmMetrics.create("NodeManager", null, ms);
|
||||||
return ms.register(new NodeManagerMetrics());
|
return ms.register(new NodeManagerMetrics(jm));
|
||||||
|
}
|
||||||
|
|
||||||
|
public JvmMetrics getJvmMetrics() {
|
||||||
|
return jvmMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Potential instrumentation interface methods
|
// Potential instrumentation interface methods
|
||||||
|
@ -39,6 +39,7 @@
|
|||||||
import org.apache.hadoop.service.Service;
|
import org.apache.hadoop.service.Service;
|
||||||
import org.apache.hadoop.util.ExitUtil;
|
import org.apache.hadoop.util.ExitUtil;
|
||||||
import org.apache.hadoop.util.GenericOptionsParser;
|
import org.apache.hadoop.util.GenericOptionsParser;
|
||||||
|
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||||
import org.apache.hadoop.util.ReflectionUtils;
|
import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.ShutdownHookManager;
|
import org.apache.hadoop.util.ShutdownHookManager;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
@ -157,6 +158,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
|||||||
private WebApp webApp;
|
private WebApp webApp;
|
||||||
private AppReportFetcher fetcher = null;
|
private AppReportFetcher fetcher = null;
|
||||||
protected ResourceTrackerService resourceTracker;
|
protected ResourceTrackerService resourceTracker;
|
||||||
|
private JvmPauseMonitor pauseMonitor;
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected String webAppAddress;
|
protected String webAppAddress;
|
||||||
@ -511,7 +513,9 @@ protected void serviceInit(Configuration configuration) throws Exception {
|
|||||||
rmContext.setResourceTrackerService(resourceTracker);
|
rmContext.setResourceTrackerService(resourceTracker);
|
||||||
|
|
||||||
DefaultMetricsSystem.initialize("ResourceManager");
|
DefaultMetricsSystem.initialize("ResourceManager");
|
||||||
JvmMetrics.initSingleton("ResourceManager", null);
|
JvmMetrics jm = JvmMetrics.initSingleton("ResourceManager", null);
|
||||||
|
pauseMonitor = new JvmPauseMonitor(conf);
|
||||||
|
jm.setPauseMonitor(pauseMonitor);
|
||||||
|
|
||||||
// Initialize the Reservation system
|
// Initialize the Reservation system
|
||||||
if (conf.getBoolean(YarnConfiguration.RM_RESERVATION_SYSTEM_ENABLE,
|
if (conf.getBoolean(YarnConfiguration.RM_RESERVATION_SYSTEM_ENABLE,
|
||||||
@ -566,6 +570,8 @@ protected void serviceStart() throws Exception {
|
|||||||
// need events to move to further states.
|
// need events to move to further states.
|
||||||
rmStore.start();
|
rmStore.start();
|
||||||
|
|
||||||
|
pauseMonitor.start();
|
||||||
|
|
||||||
if(recoveryEnabled) {
|
if(recoveryEnabled) {
|
||||||
try {
|
try {
|
||||||
LOG.info("Recovery started");
|
LOG.info("Recovery started");
|
||||||
@ -591,6 +597,9 @@ protected void serviceStart() throws Exception {
|
|||||||
protected void serviceStop() throws Exception {
|
protected void serviceStop() throws Exception {
|
||||||
|
|
||||||
DefaultMetricsSystem.shutdown();
|
DefaultMetricsSystem.shutdown();
|
||||||
|
if (pauseMonitor != null) {
|
||||||
|
pauseMonitor.stop();
|
||||||
|
}
|
||||||
|
|
||||||
if (rmContext != null) {
|
if (rmContext != null) {
|
||||||
RMStateStore store = rmContext.getStateStore();
|
RMStateStore store = rmContext.getStateStore();
|
||||||
|
Loading…
Reference in New Issue
Block a user