YARN-5662. Provide an option to enable ContainerMonitor. Contributed by Jian He.
This commit is contained in:
parent
03f519a757
commit
bc2656f09f
@ -1259,6 +1259,10 @@ public static boolean isAclEnabled(Configuration conf) {
|
|||||||
NM_PREFIX + "resource-monitor.interval-ms";
|
NM_PREFIX + "resource-monitor.interval-ms";
|
||||||
public static final int DEFAULT_NM_RESOURCE_MON_INTERVAL_MS = 3000;
|
public static final int DEFAULT_NM_RESOURCE_MON_INTERVAL_MS = 3000;
|
||||||
|
|
||||||
|
public static final String NM_CONTAINER_MONITOR_ENABLED =
|
||||||
|
NM_PREFIX + "container-monitor.enabled";
|
||||||
|
public static final boolean DEFAULT_NM_CONTAINER_MONITOR_ENABLED = true;
|
||||||
|
|
||||||
/** How often to monitor containers.*/
|
/** How often to monitor containers.*/
|
||||||
public final static String NM_CONTAINER_MON_INTERVAL_MS =
|
public final static String NM_CONTAINER_MON_INTERVAL_MS =
|
||||||
NM_PREFIX + "container-monitor.interval-ms";
|
NM_PREFIX + "container-monitor.interval-ms";
|
||||||
|
@ -95,6 +95,8 @@ public String toString() {
|
|||||||
sb.append("Capability: ").append(getCapability()).append(", ");
|
sb.append("Capability: ").append(getCapability()).append(", ");
|
||||||
sb.append("Diagnostics: ").append(getDiagnostics()).append(", ");
|
sb.append("Diagnostics: ").append(getDiagnostics()).append(", ");
|
||||||
sb.append("ExitStatus: ").append(getExitStatus()).append(", ");
|
sb.append("ExitStatus: ").append(getExitStatus()).append(", ");
|
||||||
|
sb.append("IP: ").append(getIPs()).append(", ");
|
||||||
|
sb.append("Host: ").append(getHost());
|
||||||
sb.append("]");
|
sb.append("]");
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
@ -1364,6 +1364,12 @@
|
|||||||
<name>yarn.nodemanager.resource-calculator.class</name>
|
<name>yarn.nodemanager.resource-calculator.class</name>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Enable container monitor</description>
|
||||||
|
<name>yarn.nodemanager.container-monitor.enabled</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>How often to monitor containers. If not set, the value for
|
<description>How often to monitor containers. If not set, the value for
|
||||||
yarn.nodemanager.resource-monitor.interval-ms will be used.</description>
|
yarn.nodemanager.resource-monitor.interval-ms will be used.</description>
|
||||||
|
@ -171,7 +171,7 @@ protected void serviceInit(Configuration conf) throws Exception {
|
|||||||
LOG.info("Physical memory check enabled: " + pmemCheckEnabled);
|
LOG.info("Physical memory check enabled: " + pmemCheckEnabled);
|
||||||
LOG.info("Virtual memory check enabled: " + vmemCheckEnabled);
|
LOG.info("Virtual memory check enabled: " + vmemCheckEnabled);
|
||||||
|
|
||||||
containersMonitorEnabled = isEnabled();
|
containersMonitorEnabled = isContainerMonitorEnabled();
|
||||||
LOG.info("ContainersMonitor enabled: " + containersMonitorEnabled);
|
LOG.info("ContainersMonitor enabled: " + containersMonitorEnabled);
|
||||||
|
|
||||||
nodeCpuPercentageForYARN =
|
nodeCpuPercentageForYARN =
|
||||||
@ -204,23 +204,24 @@ protected void serviceInit(Configuration conf) throws Exception {
|
|||||||
super.serviceInit(conf);
|
super.serviceInit(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isEnabled() {
|
private boolean isContainerMonitorEnabled() {
|
||||||
|
return conf.getBoolean(YarnConfiguration.NM_CONTAINER_MONITOR_ENABLED,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_MONITOR_ENABLED);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isResourceCalculatorAvailable() {
|
||||||
if (resourceCalculatorPlugin == null) {
|
if (resourceCalculatorPlugin == null) {
|
||||||
LOG.info("ResourceCalculatorPlugin is unavailable on this system. "
|
LOG.info("ResourceCalculatorPlugin is unavailable on this system. " + this
|
||||||
+ this.getClass().getName() + " is disabled.");
|
.getClass().getName() + " is disabled.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (ResourceCalculatorProcessTree.getResourceCalculatorProcessTree("0", processTreeClass, conf) == null) {
|
if (ResourceCalculatorProcessTree
|
||||||
LOG.info("ResourceCalculatorProcessTree is unavailable on this system. "
|
.getResourceCalculatorProcessTree("0", processTreeClass, conf)
|
||||||
+ this.getClass().getName() + " is disabled.");
|
== null) {
|
||||||
return false;
|
LOG.info("ResourceCalculatorProcessTree is unavailable on this system. "
|
||||||
}
|
+ this.getClass().getName() + " is disabled.");
|
||||||
if (!(isPmemCheckEnabled() || isVmemCheckEnabled())) {
|
|
||||||
LOG.info("Neither virtual-memory nor physical-memory monitoring is " +
|
|
||||||
"needed. Not running the monitor-thread");
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -462,7 +463,7 @@ public void run() {
|
|||||||
}
|
}
|
||||||
// End of initializing any uninitialized processTrees
|
// End of initializing any uninitialized processTrees
|
||||||
|
|
||||||
if (pId == null) {
|
if (pId == null || !isResourceCalculatorAvailable()) {
|
||||||
continue; // processTree cannot be tracked
|
continue; // processTree cannot be tracked
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,15 +29,18 @@
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import com.google.common.base.Supplier;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.fs.FileUtil;
|
import org.apache.hadoop.fs.FileUtil;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
|
||||||
@ -181,6 +184,42 @@ public void testProcessTreeLimits() throws IOException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test that the container monitor still runs even when both the virtual and
|
||||||
|
// physical memory checks are disabled.
|
||||||
|
@Test
|
||||||
|
public void testContainerMonitor() throws Exception {
|
||||||
|
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
||||||
|
conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
||||||
|
containerManager.start();
|
||||||
|
ContainerLaunchContext context =
|
||||||
|
recordFactory.newRecordInstance(ContainerLaunchContext.class);
|
||||||
|
context.setCommands(Arrays.asList("sleep 6"));
|
||||||
|
ContainerId cId = createContainerId(1705);
|
||||||
|
|
||||||
|
// start the container
|
||||||
|
StartContainerRequest scRequest = StartContainerRequest.newInstance(context,
|
||||||
|
createContainerToken(cId, DUMMY_RM_IDENTIFIER, this.context.getNodeId(),
|
||||||
|
user, this.context.getContainerTokenSecretManager()));
|
||||||
|
StartContainersRequest allRequests =
|
||||||
|
StartContainersRequest.newInstance(Arrays.asList(scRequest));
|
||||||
|
containerManager.startContainers(allRequests);
|
||||||
|
BaseContainerManagerTest
|
||||||
|
.waitForContainerState(containerManager, cId, ContainerState.RUNNING);
|
||||||
|
Thread.sleep(2000);
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
public Boolean get() {
|
||||||
|
try {
|
||||||
|
return containerManager.getContainerStatuses(
|
||||||
|
GetContainerStatusesRequest.newInstance(Arrays.asList(cId)))
|
||||||
|
.getContainerStatuses().get(0).getHost() != null;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}, 300, 10000);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testContainerKillOnMemoryOverflow() throws IOException,
|
public void testContainerKillOnMemoryOverflow() throws IOException,
|
||||||
InterruptedException, YarnException {
|
InterruptedException, YarnException {
|
||||||
|
Loading…
Reference in New Issue
Block a user