HDFS-11919. Ozone: SCM: TestNodeManager takes too long to execute. Contributed by Yiqun Lin.

2017-06-05 21:34:32 -07:00 · 2017-06-05 21:34:32 -07:00 · f5d17b8f7c
commit f5d17b8f7c
parent d67542c115
1 changed files with 37 additions and 68 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/ozone/scm/node/TestNodeManager.java
@ -48,6 +48,7 @@
 import java.util.UUID;
 import java.util.concurrent.TimeoutException;

+import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.apache.hadoop.ozone.protocol.proto
    .StorageContainerDatanodeProtocolProtos.Type;
 import static org.apache.hadoop.ozone.scm.node.NodeManager.NODESTATE.DEAD;
@ -99,6 +100,7 @@ OzoneConfiguration getConf() {
    OzoneConfiguration conf = new OzoneConfiguration();
    conf.set(OzoneConfigKeys.OZONE_CONTAINER_METADATA_DIRS,
        testDir.getAbsolutePath());
+    conf.setLong(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
    return conf;
  }

@ -367,28 +369,30 @@ public void testScmSanityOfUserConfig2() throws IOException,
  }

  /**
-   * Asserts that a single node moves from Healthy to stale node if it misses
-   * the heartbeat.
+   * Asserts that a single node moves from Healthy to stale node, then from
+   * stale node to dead node if it misses enough heartbeats.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws TimeoutException
   */
  @Test
-  public void testScmDetectStaleNode() throws IOException,
+  public void testScmDetectStaleAndDeadNode() throws IOException,
      InterruptedException, TimeoutException {
-    OzoneConfiguration conf = getConf();
    final int interval = 100;
    final int nodeCount = 10;
+
+    OzoneConfiguration conf = getConf();
    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
-    // This should be 5 times more than  OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS
-    // and 3 times more than OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
+    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
+

    try (SCMNodeManager nodeManager = createNodeManager(conf)) {
      List<DatanodeID> nodeList = createNodeSet(nodeManager, nodeCount,
-          "staleNode");
+          "Node");
+
      DatanodeID staleNode = SCMTestUtils.getDatanodeID(nodeManager);

      // Heartbeat once
@ -401,13 +405,14 @@ public void testScmDetectStaleNode() throws IOException,

      // Wait for 2 seconds .. and heartbeat good nodes again.
      Thread.sleep(2 * 1000);
+
      for (DatanodeID dn : nodeList) {
        nodeManager.sendHeartbeat(dn, null);
      }

-      // Wait for 2 more seconds, 3 seconds is the stale window for this test
+      // Wait for 2 more seconds (4 seconds in total) to make sure that the
+      // node moves into stale state.
      Thread.sleep(2 * 1000);
-
      List<DatanodeID> staleNodeList = nodeManager.getNodes(NodeManager
          .NODESTATE.STALE);
      assertEquals("Expected to find 1 stale node",
@ -416,51 +421,7 @@ public void testScmDetectStaleNode() throws IOException,
          1, staleNodeList.size());
      assertEquals("Stale node is not the expected ID", staleNode
          .getDatanodeUuid(), staleNodeList.get(0).getDatanodeUuid());
-    }
-  }
-
-  /**
-   * Asserts that a single node moves from Healthy to dead node if it misses
-   * enough heartbeats.
-   *
-   * @throws IOException
-   * @throws InterruptedException
-   * @throws TimeoutException
-   */
-  @Test
-  public void testScmDetectDeadNode() throws IOException,
-      InterruptedException, TimeoutException {
-    final int interval = 100;
-    final int nodeCount = 10;
-
-    OzoneConfiguration conf = getConf();
-    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
-    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
-    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
-
-
-    try (SCMNodeManager nodeManager = createNodeManager(conf)) {
-      List<DatanodeID> nodeList = createNodeSet(nodeManager, nodeCount,
-          "Node");
-
-      DatanodeID deadNode = SCMTestUtils.getDatanodeID(nodeManager);
-
-      // Heartbeat once
-      nodeManager.sendHeartbeat(deadNode, null);
-
-      // Heartbeat all other nodes.
-      for (DatanodeID dn : nodeList) {
-        nodeManager.sendHeartbeat(dn, null);
-      }
-
-      // Wait for 2 seconds .. and heartbeat good nodes again.
-      Thread.sleep(2 * 1000);
-
-      for (DatanodeID dn : nodeList) {
-        nodeManager.sendHeartbeat(dn, null);
-      }
-      Thread.sleep(3 * 1000);
+      Thread.sleep(1000);

      // heartbeat good nodes again.
      for (DatanodeID dn : nodeList) {
@ -471,13 +432,21 @@ public void testScmDetectDeadNode() throws IOException,
      // 7 seconds to make sure that the node moves into dead state.
      Thread.sleep(2 * 1000);

+      // The node has moved from stale to dead, so no stale nodes remain.
+      staleNodeList = nodeManager.getNodes(NodeManager
+          .NODESTATE.STALE);
+      assertEquals("Expected to find 1 stale node",
+          0, nodeManager.getNodeCount(STALE));
+      assertEquals("Expected to find 1 stale node",
+          0, staleNodeList.size());
+
      // Check for the dead node now.
      List<DatanodeID> deadNodeList = nodeManager.getNodes(DEAD);
      assertEquals("Expected to find 1 dead node", 1,
          nodeManager.getNodeCount(DEAD));
      assertEquals("Expected to find 1 dead node",
          1, deadNodeList.size());
-      assertEquals("Dead node is not the expected ID", deadNode
+      assertEquals("Dead node is not the expected ID", staleNode
          .getDatanodeUuid(), deadNodeList.get(0).getDatanodeUuid());
    }
  }
@ -556,7 +525,7 @@ public void testScmClusterIsInExpectedState1() throws IOException,

    OzoneConfiguration conf = getConf();
    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);

@ -733,7 +702,7 @@ public void testScmClusterIsInExpectedState2() throws IOException,

    OzoneConfiguration conf = getConf();
    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
    conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 7000);
@ -822,7 +791,7 @@ public void testScmCanHandleScale() throws IOException,
    final int staleCount = 3000;
    OzoneConfiguration conf = getConf();
    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);

@ -873,16 +842,17 @@ public void testScmCanHandleScale() throws IOException,
   * lead to many nodes becoming stale or dead due to the fact that SCM is not
   * able to keep up with heartbeat processing. This test just verifies that SCM
   * will log that information.
+   * @throws TimeoutException
   */
  @Test
  public void testScmLogsHeartbeatFlooding() throws IOException,
-      InterruptedException {
+      InterruptedException, TimeoutException {
    final int healthyCount = 3000;

    // Make the HB process thread run slower.
    OzoneConfiguration conf = getConf();
    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 500);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 500);

    try (SCMNodeManager nodeManager = createNodeManager(conf)) {
@ -901,15 +871,14 @@ public void testScmLogsHeartbeatFlooding() throws IOException,
      thread1.setDaemon(true);
      thread1.start();

-      Thread.sleep(6 * 1000);
-
+      GenericTestUtils.waitFor(() -> logCapturer.getOutput()
+          .contains("SCM is being "
+              + "flooded by heartbeats. Not able to keep up"
+              + " with the heartbeat counts."),
+          500, 20 * 1000);

      thread1.interrupt();
      logCapturer.stopCapturing();
-
-      assertThat(logCapturer.getOutput(), containsString("SCM is being " +
-          "flooded by heartbeats. Not able to keep up with the heartbeat " +
-          "counts."));
    }
  }

@ -1025,7 +994,7 @@ public void testScmNodeReportUpdate() throws IOException,
    final int interval = 100;

    conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
-    conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
    conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
    conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);