From b41e65e5bc9459b4d950a2c53860a223f1a0d2ec Mon Sep 17 00:00:00 2001 From: Varun Vasudev Date: Wed, 6 Apr 2016 13:41:33 +0530 Subject: [PATCH] YARN-4906. Capture container start/finish time in container metrics. Contributed by Jian He. --- .../container/ContainerImpl.java | 22 +++++++++++++++++++ .../monitor/ContainerMetrics.java | 18 +++++++++++++++ .../containermanager/TestAuxServices.java | 2 +- .../container/TestContainer.java | 11 ++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index da8a3a6715..a43a005880 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent; import org.apache.hadoop.yarn.server.nodemanager.Context; @@ -100,6 +101,7 @@ public class ContainerImpl implements Container { private boolean wasLaunched; private long containerLocalizationStartTime; private long containerLaunchStartTime; + private ContainerMetrics containerMetrics; private static Clock clock = SystemClock.getInstance(); /** The NM-wide configuration - not specific to this container */ @@ -147,6 +149,21 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, this.readLock = readWriteLock.readLock(); this.writeLock = readWriteLock.writeLock(); this.context = context; + boolean containerMetricsEnabled = + conf.getBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE, + YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_ENABLE); + + if (containerMetricsEnabled) { + long flushPeriod = + conf.getLong(YarnConfiguration.NM_CONTAINER_METRICS_PERIOD_MS, + YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_PERIOD_MS); + long unregisterDelay = conf.getLong( + YarnConfiguration.NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS, + YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS); + containerMetrics = ContainerMetrics + .forContainer(containerId, flushPeriod, unregisterDelay); + containerMetrics.recordStartTime(clock.getTime()); + } stateMachine = stateMachineFactory.make(this); } @@ -989,6 +1006,11 @@ static class ContainerDoneTransition implements @SuppressWarnings("unchecked") public void transition(ContainerImpl container, ContainerEvent event) { container.metrics.releaseContainer(container.resource); + if (container.containerMetrics != null) { + container.containerMetrics + .recordFinishTimeAndExitCode(clock.getTime(), container.exitCode); + container.containerMetrics.finished(); + } container.sendFinishedEvents(); //if the current state is NEW it means the CONTAINER_INIT was never // sent for the event, thus no need to send the CONTAINER_STOP diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java index 9d17db0488..f85431ee0b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java @@ -100,6 +100,15 @@ public class ContainerMetrics implements MetricsSource { @Metric public MutableGaugeLong localizationDurationMs; + @Metric + public MutableGaugeLong startTime; + + @Metric + public MutableGaugeLong finishTime; + + @Metric + public MutableGaugeInt exitCode; + static final MetricsInfo RECORD_INFO = info("ContainerResource", "Resource limit and usage by container"); @@ -277,6 +286,15 @@ public void recordStateChangeDurations(long launchDuration, this.localizationDurationMs.set(localizationDuration); } + public void recordStartTime(long startTime) { + this.startTime.set(startTime); + } + + public void recordFinishTimeAndExitCode(long finishTime, int exitCode) { + this.finishTime.set(finishTime); + this.exitCode.set(exitCode); + } + private synchronized void scheduleTimerTaskIfRequired() { if (flushPeriodMs > 0) { // Lazily initialize timer diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java index 91466e8f0c..9d0d0c037d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java @@ -195,7 +195,7 @@ public void testAuxEventDispatch() { ContainerId.newContainerId(attemptId, 1), "", "", Resource.newInstance(1, 1), 0,0,0, Priority.newInstance(0), 0); Context context = mock(Context.class); - Container container = new ContainerImpl(null, null, null, null, + Container container = new ContainerImpl(new YarnConfiguration(), null, null, null, null, cti, context); ContainerId containerId = container.getContainerId(); Resource resource = container.getResource(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index 3e062367e9..cc98bdc54d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -85,6 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; @@ -333,6 +334,7 @@ public void testCleanupOnKillRequest() throws Exception { @Test public void testKillOnNew() throws Exception { WrappedContainer wc = null; + try { wc = new WrappedContainer(13, 314159265358979L, 4344, "yak"); assertEquals(ContainerState.NEW, wc.c.getContainerState()); @@ -345,6 +347,15 @@ public void testKillOnNew() throws Exception { assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics() .contains("KillRequest")); assertEquals(killed + 1, metrics.getKilledContainers()); + // check container metrics is generated. + ContainerMetrics containerMetrics = + ContainerMetrics.forContainer(wc.cId, 1, 5000); + Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, + containerMetrics.exitCode.value()); + Assert.assertTrue(containerMetrics.startTime.value() > 0); + Assert.assertTrue( + containerMetrics.finishTime.value() > containerMetrics.startTime + .value()); } finally { if (wc != null) { wc.finished();