YARN-4906. Capture container start/finish time in container metrics. Contributed by Jian He.
This commit is contained in:
parent
21eb428448
commit
b41e65e5bc
@ -65,6 +65,7 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
@ -100,6 +101,7 @@ public class ContainerImpl implements Container {
|
||||
private boolean wasLaunched;
|
||||
private long containerLocalizationStartTime;
|
||||
private long containerLaunchStartTime;
|
||||
private ContainerMetrics containerMetrics;
|
||||
private static Clock clock = SystemClock.getInstance();
|
||||
|
||||
/** The NM-wide configuration - not specific to this container */
|
||||
@ -147,6 +149,21 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher,
|
||||
this.readLock = readWriteLock.readLock();
|
||||
this.writeLock = readWriteLock.writeLock();
|
||||
this.context = context;
|
||||
boolean containerMetricsEnabled =
|
||||
conf.getBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_ENABLE);
|
||||
|
||||
if (containerMetricsEnabled) {
|
||||
long flushPeriod =
|
||||
conf.getLong(YarnConfiguration.NM_CONTAINER_METRICS_PERIOD_MS,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_PERIOD_MS);
|
||||
long unregisterDelay = conf.getLong(
|
||||
YarnConfiguration.NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS);
|
||||
containerMetrics = ContainerMetrics
|
||||
.forContainer(containerId, flushPeriod, unregisterDelay);
|
||||
containerMetrics.recordStartTime(clock.getTime());
|
||||
}
|
||||
|
||||
stateMachine = stateMachineFactory.make(this);
|
||||
}
|
||||
@ -989,6 +1006,11 @@ static class ContainerDoneTransition implements
|
||||
@SuppressWarnings("unchecked")
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
container.metrics.releaseContainer(container.resource);
|
||||
if (container.containerMetrics != null) {
|
||||
container.containerMetrics
|
||||
.recordFinishTimeAndExitCode(clock.getTime(), container.exitCode);
|
||||
container.containerMetrics.finished();
|
||||
}
|
||||
container.sendFinishedEvents();
|
||||
//if the current state is NEW it means the CONTAINER_INIT was never
|
||||
// sent for the event, thus no need to send the CONTAINER_STOP
|
||||
|
@ -100,6 +100,15 @@ public class ContainerMetrics implements MetricsSource {
|
||||
@Metric
|
||||
public MutableGaugeLong localizationDurationMs;
|
||||
|
||||
/** Gauge holding the container start timestamp; written by {@link #recordStartTime(long)}. */
@Metric
|
||||
public MutableGaugeLong startTime;
|
||||
|
||||
/** Gauge holding the container finish timestamp; written by {@link #recordFinishTimeAndExitCode(long, int)}. */
@Metric
|
||||
public MutableGaugeLong finishTime;
|
||||
|
||||
/** Gauge holding the container exit code; written by {@link #recordFinishTimeAndExitCode(long, int)}. */
@Metric
|
||||
public MutableGaugeInt exitCode;
|
||||
|
||||
static final MetricsInfo RECORD_INFO =
|
||||
info("ContainerResource", "Resource limit and usage by container");
|
||||
|
||||
@ -277,6 +286,15 @@ public void recordStateChangeDurations(long launchDuration,
|
||||
this.localizationDurationMs.set(localizationDuration);
|
||||
}
|
||||
|
||||
/**
 * Stores the given timestamp in the {@code startTime} gauge.
 *
 * @param startTime the start timestamp to publish; the value is stored
 *          as-is, with no validation or unit conversion
 */
public void recordStartTime(long startTime) {
|
||||
this.startTime.set(startTime);
|
||||
}
|
||||
|
||||
/**
 * Stores the given finish timestamp and exit code in the
 * {@code finishTime} and {@code exitCode} gauges respectively.
 *
 * @param finishTime the finish timestamp to publish; stored as-is
 * @param exitCode the container exit code to publish; stored as-is
 */
public void recordFinishTimeAndExitCode(long finishTime, int exitCode) {
|
||||
this.finishTime.set(finishTime);
|
||||
this.exitCode.set(exitCode);
|
||||
}
|
||||
|
||||
private synchronized void scheduleTimerTaskIfRequired() {
|
||||
if (flushPeriodMs > 0) {
|
||||
// Lazily initialize timer
|
||||
|
@ -195,7 +195,7 @@ public void testAuxEventDispatch() {
|
||||
ContainerId.newContainerId(attemptId, 1), "", "",
|
||||
Resource.newInstance(1, 1), 0,0,0, Priority.newInstance(0), 0);
|
||||
Context context = mock(Context.class);
|
||||
Container container = new ContainerImpl(null, null, null, null,
|
||||
Container container = new ContainerImpl(new YarnConfiguration(), null, null, null,
|
||||
null, cti, context);
|
||||
ContainerId containerId = container.getContainerId();
|
||||
Resource resource = container.getResource();
|
||||
|
@ -85,6 +85,7 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
@ -333,6 +334,7 @@ public void testCleanupOnKillRequest() throws Exception {
|
||||
@Test
|
||||
public void testKillOnNew() throws Exception {
|
||||
WrappedContainer wc = null;
|
||||
|
||||
try {
|
||||
wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
|
||||
assertEquals(ContainerState.NEW, wc.c.getContainerState());
|
||||
@ -345,6 +347,15 @@ public void testKillOnNew() throws Exception {
|
||||
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
|
||||
.contains("KillRequest"));
|
||||
assertEquals(killed + 1, metrics.getKilledContainers());
|
||||
// check container metrics is generated.
|
||||
ContainerMetrics containerMetrics =
|
||||
ContainerMetrics.forContainer(wc.cId, 1, 5000);
|
||||
Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||
containerMetrics.exitCode.value());
|
||||
Assert.assertTrue(containerMetrics.startTime.value() > 0);
|
||||
Assert.assertTrue(
|
||||
containerMetrics.finishTime.value() > containerMetrics.startTime
|
||||
.value());
|
||||
} finally {
|
||||
if (wc != null) {
|
||||
wc.finished();
|
||||
|
Loading…
Reference in New Issue
Block a user