From f00094203bf40a8c3f2216cf22eaa5599e3b9b4d Mon Sep 17 00:00:00 2001 From: Ferenc Erdelyi <55103964+ferdelyi@users.noreply.github.com> Date: Fri, 16 Aug 2024 16:33:10 +0200 Subject: [PATCH] YARN-11709. NodeManager should be shut down or blacklisted when it cannot run program /var/lib/yarn-ce/bin/container-executor (#6960) --- .../nodemanager/LinuxContainerExecutor.java | 6 ++-- .../TestLinuxContainerExecutorWithMocks.java | 35 +++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 19335045c8..19c0673603 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -451,8 +451,10 @@ public void startLocalizer(LocalizerStartContext ctx) } catch (PrivilegedOperationException e) { int exitCode = e.getExitCode(); - LOG.warn("Exit code from container {} startLocalizer is : {}", - locId, exitCode, e); + LOG.error("Unrecoverable issue occurred. Marking the node as unhealthy to prevent " + + "further containers to get scheduled on the node and cause application failures. 
" + + "Exit code from the container " + locId + "startLocalizer is : " + exitCode, e); + nmContext.getNodeStatusUpdater().reportException(e); throw new IOException("Application " + appId + " initialization failed" + " (exitCode=" + exitCode + ") with output: " + e.getOutput(), e); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java index 3d9d33c5a1..7d49cab4a8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java @@ -26,6 +26,7 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; @@ -37,6 +38,7 @@ import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; +import java.lang.reflect.Field; import java.net.InetSocketAddress; import java.net.URI; import java.net.URISyntaxException; @@ -345,7 +347,8 @@ public void testStartLocalizer() throws IOException { @Test public void testContainerLaunchError() - throws IOException, ContainerExecutionException, URISyntaxException { + throws IOException, ContainerExecutionException, URISyntaxException, IllegalAccessException, + NoSuchFieldException { final String[] expecetedMessage = {"badcommand", "Exit code: 
24"}; final String[] executor = { @@ -387,6 +390,14 @@ public Object answer(InvocationOnMock invocationOnMock) dirsHandler.init(conf); mockExec.setConf(conf); + //set the private nmContext field without initing the LinuxContainerExecutor + NodeManager nodeManager = new NodeManager(); + NodeManager.NMContext nmContext = + nodeManager.createNMContext(null, null, null, false, conf); + Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext"); + lceNmContext.setAccessible(true); + lceNmContext.set(mockExec, nmContext); + String appSubmitter = "nobody"; String cmd = String .valueOf(PrivilegedOperation.RunAsUserCommand.LAUNCH_CONTAINER. @@ -601,8 +612,6 @@ public void testNoExitCodeFromPrivilegedOperation() throws Exception { LinuxContainerRuntime runtime = new DefaultLinuxContainerRuntime( spyPrivilegedExecutor); runtime.initialize(conf, null); - mockExec = new LinuxContainerExecutor(runtime); - mockExec.setConf(conf); LinuxContainerExecutor lce = new LinuxContainerExecutor(runtime) { @Override protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() { @@ -610,6 +619,23 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() { } }; lce.setConf(conf); + + //set the private nmContext field without initing the LinuxContainerExecutor + NodeManager nodeManager = new NodeManager(); + NodeManager.NMContext nmContext = + nodeManager.createNMContext(null, null, null, false, conf); + NodeManager.NMContext spyNmContext = spy(nmContext); + + //initialize a mock NodeStatusUpdater + NodeStatusUpdaterImpl nodeStatusUpdater = mock(NodeStatusUpdaterImpl.class); + nmContext.setNodeStatusUpdater(nodeStatusUpdater); + //imitate a void method call on the NodeStatusUpdater when setting NM unhealthy. 
+ doNothing().when(nodeStatusUpdater).reportException(any()); + + Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext"); + lceNmContext.setAccessible(true); + lceNmContext.set(lce, nmContext); + InetSocketAddress address = InetSocketAddress.createUnresolved( "localhost", 8040); Path nmPrivateCTokensPath= new Path("file:///bin/nmPrivateCTokensPath"); @@ -672,6 +698,9 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() { assertTrue("Unexpected exception " + e, e.getMessage().contains("exit code")); } + + //verify that the NM was set unhealthy on PrivilegedOperationException + verify(nodeStatusUpdater, times(1)).reportException(any()); } @Test