diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index c1f343d499..ddbaa4e062 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -397,6 +397,9 @@ Release 2.1.0-beta - UNRELEASED YARN-578. Fixed NM to use SecureIOUtils for reading and aggregating logs. (Omkar Vinit Joshi via vinodkv) + YARN-733. Fixed TestNMClient from failing occasionally. (Zhijie Shen via + vinodkv) + BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS YARN-158. Yarn creating package-info.java must not depend on sh. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java index 4cf12b07d6..1a564f4a48 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java @@ -64,6 +64,17 @@ * continue to run even after this client is stopped and till the application * runs at which point ResourceManager will forcefully kill them. *

+ * + *

+ * Note that the blocking APIs ensure the RPC calls to NodeManager + * are executed immediately, and the responses are received before these APIs + * return. However, when {@link #startContainer} or {@link #stopContainer} + * returns, NodeManager may still need some time to either start + * or stop the container because of its asynchronous implementation. Therefore, + * {@link #getContainerStatus} is likely to return a transient container status + * if it is executed immediately after {@link #startContainer} or + * {@link #stopContainer}. + *

*/ public class NMClientImpl extends AbstractService implements NMClient { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java index 34ca1ae754..8e1c3926f5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java @@ -20,8 +20,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; @@ -228,7 +228,7 @@ private Set allocateContainers( } private void testContainerManagement(NMClientImpl nmClient, - Set containers) throws IOException { + Set containers) throws YarnRemoteException, IOException { int size = containers.size(); int i = 0; for (Container container : containers) { @@ -271,17 +271,9 @@ private void testContainerManagement(NMClientImpl nmClient, // leave one container unclosed if (++i < size) { - try { - ContainerStatus status = nmClient.getContainerStatus(container.getId(), - container.getNodeId(), container.getContainerToken()); - // verify the container is started and in good shape - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertEquals("", status.getDiagnostics()); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + // NodeManager may still need some time to make the container started + testGetContainerStatus(container, i, ContainerState.RUNNING, "", + -1000); try { 
nmClient.stopContainer(container.getId(), container.getNodeId(), @@ -291,18 +283,8 @@ private void testContainerManagement(NMClientImpl nmClient, } // getContainerStatus can be called after stopContainer - try { - ContainerStatus status = nmClient.getContainerStatus( - container.getId(), container.getNodeId(), - container.getContainerToken()); - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertTrue("" + i, status.getDiagnostics().contains( - "Container killed by the ApplicationMaster.")); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + testGetContainerStatus(container, i, ContainerState.COMPLETE, + "Container killed by the ApplicationMaster.", 143); } } } @@ -315,4 +297,46 @@ private void sleep(int sleepTime) { } } 
+  /**
+   * Polls the NodeManager until the container reports the expected state,
+   * then verifies the remaining fields of that status.
+   * The status is re-queried every 100 ms with no upper bound on the wait.
+   *
+   * @param container the container whose status is queried
+   * @param index position of the container in the test run; only used to
+   *          label an assertion failure
+   * @param state the container state to wait for
+   * @param diagnostics text the reported diagnostics must contain
+   * @param exitStatus exit status expected once {@code state} is reached
+   * @throws YarnRemoteException propagated from the status query
+   * @throws IOException propagated from the status query
+   */
+  private void testGetContainerStatus(Container container, int index,
+      ContainerState state, String diagnostics, int exitStatus)
+          throws YarnRemoteException, IOException {
+    while (true) {
+      ContainerStatus status = nmClient.getContainerStatus(
+          container.getId(), container.getNodeId(),
+          container.getContainerToken());
+      // NodeManager may still need some time to get the stable
+      // container status
+      if (status.getState() == state) {
+        assertEquals(container.getId(), status.getContainerId());
+        assertTrue("" + index + ": " + status.getDiagnostics(),
+            status.getDiagnostics().contains(diagnostics));
+        assertEquals(exitStatus, status.getExitStatus());
+        return;
+      }
+      try {
+        Thread.sleep(100);
+      } catch (InterruptedException e) {
+        // Swallowing the interrupt would leave this loop polling forever
+        // after the thread was asked to stop; restore the flag and fail.
+        Thread.currentThread().interrupt();
+        fail("Interrupted while waiting for container " + container.getId()
+            + " to reach state " + state);
+      }
+    }
+  }
+
 }