YARN-5662. Provide an option to enable ContainerMonitor. Contributed by Jian He.
This commit is contained in:
parent
03f519a757
commit
bc2656f09f
@ -1106,7 +1106,7 @@ public static boolean isAclEnabled(Configuration conf) {
|
||||
public static final String NM_VMEM_PMEM_RATIO =
|
||||
NM_PREFIX + "vmem-pmem-ratio";
|
||||
public static final float DEFAULT_NM_VMEM_PMEM_RATIO = 2.1f;
|
||||
|
||||
|
||||
/** Number of Virtual CPU Cores which can be allocated for containers.*/
|
||||
public static final String NM_VCORES = NM_PREFIX + "resource.cpu-vcores";
|
||||
public static final int DEFAULT_NM_VCORES = 8;
|
||||
@ -1259,6 +1259,10 @@ public static boolean isAclEnabled(Configuration conf) {
|
||||
NM_PREFIX + "resource-monitor.interval-ms";
|
||||
public static final int DEFAULT_NM_RESOURCE_MON_INTERVAL_MS = 3000;
|
||||
|
||||
public static final String NM_CONTAINER_MONITOR_ENABLED =
|
||||
NM_PREFIX + "container-monitor.enabled";
|
||||
public static final boolean DEFAULT_NM_CONTAINER_MONITOR_ENABLED = true;
|
||||
|
||||
/** How often to monitor containers.*/
|
||||
public final static String NM_CONTAINER_MON_INTERVAL_MS =
|
||||
NM_PREFIX + "container-monitor.interval-ms";
|
||||
|
@ -95,6 +95,8 @@ public String toString() {
|
||||
sb.append("Capability: ").append(getCapability()).append(", ");
|
||||
sb.append("Diagnostics: ").append(getDiagnostics()).append(", ");
|
||||
sb.append("ExitStatus: ").append(getExitStatus()).append(", ");
|
||||
sb.append("IP: ").append(getIPs()).append(", ");
|
||||
sb.append("Host: ").append(getHost());
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
@ -1364,6 +1364,12 @@
|
||||
<name>yarn.nodemanager.resource-calculator.class</name>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Whether to enable the NodeManager's container monitor.</description>
|
||||
<name>yarn.nodemanager.container-monitor.enabled</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>How often to monitor containers. If not set, the value for
|
||||
yarn.nodemanager.resource-monitor.interval-ms will be used.</description>
|
||||
|
@ -171,7 +171,7 @@ protected void serviceInit(Configuration conf) throws Exception {
|
||||
LOG.info("Physical memory check enabled: " + pmemCheckEnabled);
|
||||
LOG.info("Virtual memory check enabled: " + vmemCheckEnabled);
|
||||
|
||||
containersMonitorEnabled = isEnabled();
|
||||
containersMonitorEnabled = isContainerMonitorEnabled();
|
||||
LOG.info("ContainersMonitor enabled: " + containersMonitorEnabled);
|
||||
|
||||
nodeCpuPercentageForYARN =
|
||||
@ -204,23 +204,24 @@ protected void serviceInit(Configuration conf) throws Exception {
|
||||
super.serviceInit(conf);
|
||||
}
|
||||
|
||||
private boolean isEnabled() {
|
||||
private boolean isContainerMonitorEnabled() {
|
||||
return conf.getBoolean(YarnConfiguration.NM_CONTAINER_MONITOR_ENABLED,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_MONITOR_ENABLED);
|
||||
}
|
||||
|
||||
private boolean isResourceCalculatorAvailable() {
|
||||
if (resourceCalculatorPlugin == null) {
|
||||
LOG.info("ResourceCalculatorPlugin is unavailable on this system. "
|
||||
+ this.getClass().getName() + " is disabled.");
|
||||
return false;
|
||||
}
|
||||
if (ResourceCalculatorProcessTree.getResourceCalculatorProcessTree("0", processTreeClass, conf) == null) {
|
||||
LOG.info("ResourceCalculatorProcessTree is unavailable on this system. "
|
||||
+ this.getClass().getName() + " is disabled.");
|
||||
return false;
|
||||
}
|
||||
if (!(isPmemCheckEnabled() || isVmemCheckEnabled())) {
|
||||
LOG.info("Neither virtual-memory nor physical-memory monitoring is " +
|
||||
"needed. Not running the monitor-thread");
|
||||
LOG.info("ResourceCalculatorPlugin is unavailable on this system. " + this
|
||||
.getClass().getName() + " is disabled.");
|
||||
return false;
|
||||
}
|
||||
if (ResourceCalculatorProcessTree
|
||||
.getResourceCalculatorProcessTree("0", processTreeClass, conf)
|
||||
== null) {
|
||||
LOG.info("ResourceCalculatorProcessTree is unavailable on this system. "
|
||||
+ this.getClass().getName() + " is disabled.");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -462,7 +463,7 @@ public void run() {
|
||||
}
|
||||
// End of initializing any uninitialized processTrees
|
||||
|
||||
if (pId == null) {
|
||||
if (pId == null || !isResourceCalculatorAvailable()) {
|
||||
continue; // processTree cannot be tracked
|
||||
}
|
||||
|
||||
|
@ -29,15 +29,18 @@
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
|
||||
@ -181,6 +184,42 @@ public void testProcessTreeLimits() throws IOException {
|
||||
}
|
||||
}
|
||||
|
||||
// Test that even if VMEM_PMEM_CHECK is not enabled, container monitor will
|
||||
// run.
|
||||
@Test
|
||||
public void testContainerMonitor() throws Exception {
|
||||
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
||||
conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
||||
containerManager.start();
|
||||
ContainerLaunchContext context =
|
||||
recordFactory.newRecordInstance(ContainerLaunchContext.class);
|
||||
context.setCommands(Arrays.asList("sleep 6"));
|
||||
ContainerId cId = createContainerId(1705);
|
||||
|
||||
// start the container
|
||||
StartContainerRequest scRequest = StartContainerRequest.newInstance(context,
|
||||
createContainerToken(cId, DUMMY_RM_IDENTIFIER, this.context.getNodeId(),
|
||||
user, this.context.getContainerTokenSecretManager()));
|
||||
StartContainersRequest allRequests =
|
||||
StartContainersRequest.newInstance(Arrays.asList(scRequest));
|
||||
containerManager.startContainers(allRequests);
|
||||
BaseContainerManagerTest
|
||||
.waitForContainerState(containerManager, cId, ContainerState.RUNNING);
|
||||
Thread.sleep(2000);
|
||||
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||
public Boolean get() {
|
||||
try {
|
||||
return containerManager.getContainerStatuses(
|
||||
GetContainerStatusesRequest.newInstance(Arrays.asList(cId)))
|
||||
.getContainerStatuses().get(0).getHost() != null;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}, 300, 10000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainerKillOnMemoryOverflow() throws IOException,
|
||||
InterruptedException, YarnException {
|
||||
|
Loading…
Reference in New Issue
Block a user