From cfee02b3bdd1117370200c9d8ce216676cff8888 Mon Sep 17 00:00:00 2001 From: Junping Du Date: Thu, 6 Aug 2015 06:49:45 -0700 Subject: [PATCH] YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. Contributed by Robert Kanter. --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../hadoop/yarn/server/nodemanager/NodeManager.java | 13 +++++++++++-- .../nodemanager/metrics/NodeManagerMetrics.java | 13 +++++++++++-- .../server/resourcemanager/ResourceManager.java | 11 ++++++++++- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 6b409ddd5c..1840b1bdec 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -374,6 +374,9 @@ Release 2.8.0 - UNRELEASED YARN-3961. Expose pending, running and reserved containers of a queue in REST api and yarn top (adhoot via asuresh) + YARN-4019. Add JvmPauseMonitor to ResourceManager and NodeManager. (Robert Kanter + via junping_du) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index b8889eeded..a06293dc8b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -40,6 +40,7 @@ import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.JvmPauseMonitor; import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ShutdownHookManager; @@ -83,6 +84,7 @@ public class NodeManager extends CompositeService private static final Log LOG = LogFactory.getLog(NodeManager.class); private static long nmStartupTime = System.currentTimeMillis(); protected final NodeManagerMetrics metrics = NodeManagerMetrics.create(); + private JvmPauseMonitor pauseMonitor; private ApplicationACLsManager aclsManager; private NodeHealthCheckerService nodeHealthChecker; private NodeLabelsProvider nodeLabelsProvider; @@ -307,13 +309,16 @@ protected void serviceInit(Configuration conf) throws Exception { dispatcher.register(ContainerManagerEventType.class, containerManager); dispatcher.register(NodeManagerEventType.class, this); addService(dispatcher); - + + pauseMonitor = new JvmPauseMonitor(conf); + metrics.getJvmMetrics().setPauseMonitor(pauseMonitor); + DefaultMetricsSystem.initialize("NodeManager"); // StatusUpdater should be added last so that it get started last // so that we make sure everything is up before registering with RM. addService(nodeStatusUpdater); - + super.serviceInit(conf); // TODO add local dirs to del } @@ -325,6 +330,7 @@ protected void serviceStart() throws Exception { } catch (IOException e) { throw new YarnRuntimeException("Failed NodeManager login", e); } + pauseMonitor.start(); super.serviceStart(); } @@ -336,6 +342,9 @@ protected void serviceStop() throws Exception { try { super.serviceStop(); DefaultMetricsSystem.shutdown(); + if (pauseMonitor != null) { + pauseMonitor.stop(); + } } finally { // YARN-3641: NM's services stop get failed shouldn't block the // release of NMLevelDBStore. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index 400f14bfcc..56797d11c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -57,17 +57,26 @@ public class NodeManagerMetrics { @Metric("Disk utilization % on good log dirs") MutableGaugeInt goodLogDirsDiskUtilizationPerc; + private JvmMetrics jvmMetrics = null; private long allocatedMB; private long availableMB; + public NodeManagerMetrics(JvmMetrics jvmMetrics) { + this.jvmMetrics = jvmMetrics; + } + public static NodeManagerMetrics create() { return create(DefaultMetricsSystem.instance()); } static NodeManagerMetrics create(MetricsSystem ms) { - JvmMetrics.create("NodeManager", null, ms); - return ms.register(new NodeManagerMetrics()); + JvmMetrics jm = JvmMetrics.create("NodeManager", null, ms); + return ms.register(new NodeManagerMetrics(jm)); + } + + public JvmMetrics getJvmMetrics() { + return jvmMetrics; } // Potential instrumentation interface methods diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 1b606b4654..817565b572 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -39,6 +39,7 @@ import org.apache.hadoop.service.Service; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.JvmPauseMonitor; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.StringUtils; @@ -157,6 +158,7 @@ public class ResourceManager extends CompositeService implements Recoverable { private WebApp webApp; private AppReportFetcher fetcher = null; protected ResourceTrackerService resourceTracker; + private JvmPauseMonitor pauseMonitor; @VisibleForTesting protected String webAppAddress; @@ -511,7 +513,9 @@ protected void serviceInit(Configuration configuration) throws Exception { rmContext.setResourceTrackerService(resourceTracker); DefaultMetricsSystem.initialize("ResourceManager"); - JvmMetrics.initSingleton("ResourceManager", null); + JvmMetrics jm = JvmMetrics.initSingleton("ResourceManager", null); + pauseMonitor = new JvmPauseMonitor(conf); + jm.setPauseMonitor(pauseMonitor); // Initialize the Reservation system if (conf.getBoolean(YarnConfiguration.RM_RESERVATION_SYSTEM_ENABLE, @@ -566,6 +570,8 @@ protected void serviceStart() throws Exception { // need events to move to further states. rmStore.start(); + pauseMonitor.start(); + if(recoveryEnabled) { try { LOG.info("Recovery started"); @@ -591,6 +597,9 @@ protected void serviceStart() throws Exception { protected void serviceStop() throws Exception { DefaultMetricsSystem.shutdown(); + if (pauseMonitor != null) { + pauseMonitor.stop(); + } if (rmContext != null) { RMStateStore store = rmContext.getStateStore();