YARN-11709. NodeManager should be shut down or blacklisted when it cannot run program /var/lib/yarn-ce/bin/container-executor (#6960)

This commit is contained in:
Ferenc Erdelyi 2024-08-16 16:33:10 +02:00 committed by GitHub
parent 5f93edfd70
commit f00094203b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 5 deletions

View File

@ -451,8 +451,10 @@ public void startLocalizer(LocalizerStartContext ctx)
} catch (PrivilegedOperationException e) {
int exitCode = e.getExitCode();
LOG.warn("Exit code from container {} startLocalizer is : {}",
locId, exitCode, e);
LOG.error("Unrecoverable issue occurred. Marking the node as unhealthy to prevent "
+ "further containers to get scheduled on the node and cause application failures. " +
"Exit code from the container " + locId + "startLocalizer is : " + exitCode, e);
nmContext.getNodeStatusUpdater().reportException(e);
throw new IOException("Application " + appId + " initialization failed" +
" (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);

View File

@ -26,6 +26,7 @@
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyBoolean;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
@ -37,6 +38,7 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.lang.reflect.Field;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URISyntaxException;
@ -345,7 +347,8 @@ public void testStartLocalizer() throws IOException {
@Test
public void testContainerLaunchError()
throws IOException, ContainerExecutionException, URISyntaxException {
throws IOException, ContainerExecutionException, URISyntaxException, IllegalAccessException,
NoSuchFieldException {
final String[] expecetedMessage = {"badcommand", "Exit code: 24"};
final String[] executor = {
@ -387,6 +390,14 @@ public Object answer(InvocationOnMock invocationOnMock)
dirsHandler.init(conf);
mockExec.setConf(conf);
//set the private nmContext field without initing the LinuxContainerExecutor
NodeManager nodeManager = new NodeManager();
NodeManager.NMContext nmContext =
nodeManager.createNMContext(null, null, null, false, conf);
Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
lceNmContext.setAccessible(true);
lceNmContext.set(mockExec, nmContext);
String appSubmitter = "nobody";
String cmd = String
.valueOf(PrivilegedOperation.RunAsUserCommand.LAUNCH_CONTAINER.
@ -601,8 +612,6 @@ public void testNoExitCodeFromPrivilegedOperation() throws Exception {
LinuxContainerRuntime runtime = new DefaultLinuxContainerRuntime(
spyPrivilegedExecutor);
runtime.initialize(conf, null);
mockExec = new LinuxContainerExecutor(runtime);
mockExec.setConf(conf);
LinuxContainerExecutor lce = new LinuxContainerExecutor(runtime) {
@Override
protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
@ -610,6 +619,23 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
}
};
lce.setConf(conf);
//set the private nmContext field without initing the LinuxContainerExecutor
NodeManager nodeManager = new NodeManager();
NodeManager.NMContext nmContext =
nodeManager.createNMContext(null, null, null, false, conf);
NodeManager.NMContext spyNmContext = spy(nmContext);
//initialize a mock NodeStatusUpdater
NodeStatusUpdaterImpl nodeStatusUpdater = mock(NodeStatusUpdaterImpl.class);
nmContext.setNodeStatusUpdater(nodeStatusUpdater);
//imitate a void method call on the NodeStatusUpdater when setting NM unhealthy.
doNothing().when(nodeStatusUpdater).reportException(any());
Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
lceNmContext.setAccessible(true);
lceNmContext.set(lce, nmContext);
InetSocketAddress address = InetSocketAddress.createUnresolved(
"localhost", 8040);
Path nmPrivateCTokensPath= new Path("file:///bin/nmPrivateCTokensPath");
@ -672,6 +698,9 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
assertTrue("Unexpected exception " + e,
e.getMessage().contains("exit code"));
}
//verify that the NM was set unhealthy on PrivilegedOperationException
verify(nodeStatusUpdater, times(1)).reportException(any());
}
@Test