diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index c1f343d499..ddbaa4e062 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -397,6 +397,9 @@ Release 2.1.0-beta - UNRELEASED
YARN-578. Fixed NM to use SecureIOUtils for reading and aggregating logs.
(Omkar Vinit Joshi via vinodkv)
+ YARN-733. Fixed TestNMClient from failing occasionally. (Zhijie Shen via
+ vinodkv)
+
BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS
YARN-158. Yarn creating package-info.java must not depend on sh.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java
index 4cf12b07d6..1a564f4a48 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java
@@ -64,6 +64,17 @@
* continue to run even after this client is stopped and till the application
* runs at which point ResourceManager will forcefully kill them.
*
+ *
+ *
+ * Note that the blocking APIs ensure the RPC calls to NodeManager
+ * are executed immediately, and the responses are received before these APIs
+ * return. However, when {@link #startContainer} or {@link #stopContainer}
+ * returns, NodeManager
may still need some time to either start
+ * or stop the container because of its asynchronous implementation. Therefore,
+ * {@link #getContainerStatus} is likely to return a transit container status
+ * if it is executed immediately after {@link #startContainer} or
+ * {@link #stopContainer}.
+ *
*/
public class NMClientImpl extends AbstractService implements NMClient {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java
index 34ca1ae754..8e1c3926f5 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java
@@ -20,8 +20,8 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
@@ -228,7 +228,7 @@ private Set allocateContainers(
}
private void testContainerManagement(NMClientImpl nmClient,
- Set containers) throws IOException {
+ Set containers) throws YarnRemoteException, IOException {
int size = containers.size();
int i = 0;
for (Container container : containers) {
@@ -271,17 +271,9 @@ private void testContainerManagement(NMClientImpl nmClient,
// leave one container unclosed
if (++i < size) {
- try {
- ContainerStatus status = nmClient.getContainerStatus(container.getId(),
- container.getNodeId(), container.getContainerToken());
- // verify the container is started and in good shape
- assertEquals(container.getId(), status.getContainerId());
- assertEquals(ContainerState.RUNNING, status.getState());
- assertEquals("", status.getDiagnostics());
- assertEquals(-1000, status.getExitStatus());
- } catch (YarnRemoteException e) {
- fail("Exception is not expected");
- }
+ // NodeManager may still need some time to make the container started
+ testGetContainerStatus(container, i, ContainerState.RUNNING, "",
+ -1000);
try {
nmClient.stopContainer(container.getId(), container.getNodeId(),
@@ -291,18 +283,8 @@ private void testContainerManagement(NMClientImpl nmClient,
}
// getContainerStatus can be called after stopContainer
- try {
- ContainerStatus status = nmClient.getContainerStatus(
- container.getId(), container.getNodeId(),
- container.getContainerToken());
- assertEquals(container.getId(), status.getContainerId());
- assertEquals(ContainerState.RUNNING, status.getState());
- assertTrue("" + i, status.getDiagnostics().contains(
- "Container killed by the ApplicationMaster."));
- assertEquals(-1000, status.getExitStatus());
- } catch (YarnRemoteException e) {
- fail("Exception is not expected");
- }
+ testGetContainerStatus(container, i, ContainerState.COMPLETE,
+ "Container killed by the ApplicationMaster.", 143);
}
}
}
@@ -315,4 +297,28 @@ private void sleep(int sleepTime) {
}
}
+ private void testGetContainerStatus(Container container, int index,
+ ContainerState state, String diagnostics, int exitStatus)
+ throws YarnRemoteException, IOException {
+ while (true) {
+ try {
+ ContainerStatus status = nmClient.getContainerStatus(
+ container.getId(), container.getNodeId(),
+ container.getContainerToken());
+ // NodeManager may still need some time to get the stable
+ // container status
+ if (status.getState() == state) {
+ assertEquals(container.getId(), status.getContainerId());
+ assertTrue("" + index + ": " + status.getDiagnostics(),
+ status.getDiagnostics().contains(diagnostics));
+ assertEquals(exitStatus, status.getExitStatus());
+ break;
+ }
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
}