HADOOP-11149. TestZKFailoverController times out. Contributed by Steve Loughran. closes apache/hadoop#51

2015-11-23 05:38:42 +09:00 · 2015-11-23 05:38:42 +09:00 · 053a511919
commit 053a511919
parent a4bd54f9d7
3 changed files with 325 additions and 357 deletions
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@ -1355,6 +1355,9 @@ Release 2.8.0 - UNRELEASED

    HADOOP-8419. Fixed GzipCode NPE reset for IBM JDK. (Yu Li via eyang)

+    HADOOP-11149. TestZKFailoverController times out. (Steve Loughran
+    via ozawa)
+
  OPTIMIZATIONS

    HADOOP-12051. ProtobufRpcEngine.invoke() should use Exception.toString()
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java
@ -133,11 +133,13 @@ public void start(int count) throws Exception {
   * @throws Exception if either of the services had encountered a fatal error
   */
  public void stop() throws Exception {
+    if (thrs != null) {
      for (DummyZKFCThread thr : thrs) {
        if (thr != null) {
          thr.interrupt();
        }
      }
+    }
    if (ctx != null) {
      ctx.stop();
    }
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
@ -34,14 +34,23 @@
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
+import org.junit.After;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.Timeout;
 import org.mockito.Mockito;

 public class TestZKFailoverController extends ClientBaseWithFixes {
  private Configuration conf;
  private MiniZKFCCluster cluster;

+  /**
+   * Per-test timeout of three minutes, applied to every test in this class.
+   */
+  @Rule
+  public Timeout testTimeout = new Timeout(3 * 60 * 1000);
+
  // Set up ZK digest-based credentials for the purposes of the tests,
  // to make sure all of our functionality works with auth and ACLs
  // present.
@ -74,11 +83,21 @@ public void setupConfAndServices() {
    this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory));
  }

+  @After
+  public void teardown() {
+    if (cluster != null) {
+      try {
+        cluster.stop();
+      } catch (Exception e) {
+        LOG.warn("When stopping the cluster", e);
+      }
+    }
+  }
  /**
   * Test that the various command lines for formatting the ZK directory
   * function correctly.
   */
-  @Test(timeout=15000)
+  @Test
  public void testFormatZK() throws Exception {
    DummyHAService svc = cluster.getService(1);
    // Run without formatting the base dir,
@ -101,7 +120,7 @@ public void testFormatZK() throws Exception {
   * Test that if ZooKeeper is not running, the correct error
   * code is returned.
   */
-  @Test(timeout=15000)
+  @Test
  public void testNoZK() throws Exception {
    stopServer();
    DummyHAService svc = cluster.getService(1);
@ -146,7 +165,7 @@ protected String getScopeInsideParentNode() {
   * Test that automatic failover won't run against a target that hasn't
   * explicitly enabled the feature.
   */
-  @Test(timeout=10000)
+  @Test
  public void testWontRunWhenAutoFailoverDisabled() throws Exception {
    DummyHAService svc = cluster.getService(1);
    svc = Mockito.spy(svc);
@ -162,7 +181,7 @@ public void testWontRunWhenAutoFailoverDisabled() throws Exception {
   * Test that, if ACLs are specified in the configuration, that
   * it sets the ACLs when formatting the parent node.
   */
-  @Test(timeout=15000)
+  @Test
  public void testFormatSetsAcls() throws Exception {
    // Format the base dir, should succeed
    DummyHAService svc = cluster.getService(1);
@ -184,7 +203,7 @@ public void testFormatSetsAcls() throws Exception {
   * Test that the ZKFC won't run if fencing is not configured for the
   * local service.
   */
-  @Test(timeout=15000)
+  @Test
  public void testFencingMustBeConfigured() throws Exception {
    DummyHAService svc = Mockito.spy(cluster.getService(0));
    Mockito.doThrow(new BadFencingConfigurationException("no fencing"))
@ -202,9 +221,8 @@ public void testFencingMustBeConfigured() throws Exception {
   * transition is used when possible, falling back to fencing when
   * the graceful approach fails.
   */
-  @Test(timeout=15000)
+  @Test
  public void testAutoFailoverOnBadHealth() throws Exception {
-    try {
    cluster.start();
    DummyHAService svc1 = cluster.getService(1);

@ -224,9 +242,6 @@ public void testAutoFailoverOnBadHealth() throws Exception {
    cluster.waitForHAState(0, HAServiceState.ACTIVE);
    // and fence svc1
    Mockito.verify(svc1.fencer).fence(Mockito.same(svc1));
-    } finally {
-      cluster.stop();
-    }
  }

  /**
@ -235,9 +250,8 @@ public void testAutoFailoverOnBadHealth() throws Exception {
   * transition is used when possible, falling back to fencing when
   * the graceful approach fails.
   */
-  @Test(timeout=15000)
+  @Test
  public void testAutoFailoverOnBadState() throws Exception {
-    try {
    cluster.start();
    DummyHAService svc0 = cluster.getService(0);
    LOG.info("Faking svc0 to change the state, should failover to svc1");
@ -245,14 +259,10 @@ public void testAutoFailoverOnBadState() throws Exception {

    // Should fail back to svc0 at this point
    cluster.waitForHAState(1, HAServiceState.ACTIVE);
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=15000)
+  @Test
  public void testAutoFailoverOnLostZKSession() throws Exception {
-    try {
    cluster.start();

    // Expire svc0, it should fail over to svc1
@ -268,18 +278,14 @@ public void testAutoFailoverOnLostZKSession() throws Exception {

    // Expire svc1, it should fail back to svc0
    cluster.expireAndVerifyFailover(1, 0);
-    } finally {
-      cluster.stop();
-    }
  }

  /**
   * Test that, if the standby node is unhealthy, it doesn't try to become
   * active
   */
-  @Test(timeout=15000)
+  @Test
  public void testDontFailoverToUnhealthyNode() throws Exception {
-    try {
    cluster.start();

    // Make svc1 unhealthy, and wait for its FC to notice the bad health.
@ -304,18 +310,14 @@ public void testDontFailoverToUnhealthyNode() throws Exception {
    }
    // svc0 should get the lock again
    cluster.waitForActiveLockHolder(0);
-    } finally {
-      cluster.stop();
-    }
  }

  /**
   * Test that the ZKFC successfully quits the election when it fails to
   * become active. This allows the old node to successfully fail back.
   */
-  @Test(timeout=15000)
+  @Test
  public void testBecomingActiveFails() throws Exception {
-    try {
    cluster.start();
    DummyHAService svc1 = cluster.getService(1);

@ -346,9 +348,6 @@ public void testBecomingActiveFails() throws Exception {
    LOG.info("Allowing svc1 to become active, expiring svc0");
    svc1.failToBecomeActive = false;
    cluster.expireAndVerifyFailover(0, 1);
-    } finally {
-      cluster.stop();
-    }
  }
  
  /**
@ -356,9 +355,8 @@ public void testBecomingActiveFails() throws Exception {
   * current state, without triggering any failovers, and without
   * causing the active node to enter standby state.
   */
-  @Test(timeout=15000)
+  @Test
  public void testZooKeeperFailure() throws Exception {
-    try {
    cluster.start();

    // Record initial ZK sessions
@ -396,17 +394,13 @@ public void testZooKeeperFailure() throws Exception {
        cluster.getElector(0).getZKSessionIdForTests());
    assertEquals(session1,
        cluster.getElector(1).getZKSessionIdForTests());
-    } finally {
-      cluster.stop();
-    }
  }
  
  /**
   * Test that the ZKFC can gracefully cede its active status.
   */
-  @Test(timeout=15000)
+  @Test
  public void testCedeActive() throws Exception {
-    try {
    cluster.start();
    DummyZKFC zkfc = cluster.getZkfc(0);
    // It should be in active to start.
@ -434,14 +428,10 @@ public void testCedeActive() throws Exception {
    assertTrue("Should take ~3 seconds to rejoin. Only took " + (et2 - et) +
        "ms before rejoining.",
        et2 - et > 2800);
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=25000)
+  @Test
  public void testGracefulFailover() throws Exception {
-    try {
    cluster.start();

    cluster.waitForActiveLockHolder(0);
@ -457,14 +447,10 @@ public void testGracefulFailover() throws Exception {
    assertEquals(0, cluster.getService(1).fenceCount);
    assertEquals(2, cluster.getService(0).activeTransitionCount);
    assertEquals(1, cluster.getService(1).activeTransitionCount);
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=15000)
+  @Test
  public void testGracefulFailoverToUnhealthy() throws Exception {
-    try {
    cluster.start();

    cluster.waitForActiveLockHolder(0);
@ -482,14 +468,10 @@ public void testGracefulFailoverToUnhealthy() throws Exception {
          cluster.getService(1).toString() +
          " is not currently healthy.", sfe);
    }
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=15000)
+  @Test
  public void testGracefulFailoverFailBecomingActive() throws Exception {
-    try {
    cluster.start();

    cluster.waitForActiveLockHolder(0);
@ -513,14 +495,10 @@ public void testGracefulFailoverFailBecomingActive() throws Exception {

    // Service 0 should go back to being active after the failed failover
    cluster.waitForActiveLockHolder(0);
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=15000)
+  @Test
  public void testGracefulFailoverFailBecomingStandby() throws Exception {
-    try {
    cluster.start();

    cluster.waitForActiveLockHolder(0);
@ -533,15 +511,11 @@ public void testGracefulFailoverFailBecomingStandby() throws Exception {

    // Check that the old node was fenced
    assertEquals(1, cluster.getService(0).fenceCount);
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout=15000)
+  @Test
  public void testGracefulFailoverFailBecomingStandbyAndFailFence()
    throws Exception {
-    try {
    cluster.start();

    cluster.waitForActiveLockHolder(0);
@ -559,18 +533,14 @@ public void testGracefulFailoverFailBecomingStandbyAndFailFence()
      GenericTestUtils.assertExceptionContains(
          "Unable to fence " + cluster.getService(0), sfe);
    }
-    } finally {
-      cluster.stop();
-    }
  }

  /**
   * Test which exercises all of the inputs into ZKFC. This is particularly
   * useful for running under jcarder to check for lock order violations.
   */
-  @Test(timeout=30000)
+  @Test
  public void testOneOfEverything() throws Exception {
-    try {
    cluster.start();

    // Failover by session expiration
@ -600,14 +570,10 @@ public void testOneOfEverything() throws Exception {
    // Graceful failovers
    cluster.getZkfc(1).gracefulFailoverToYou();
    cluster.getZkfc(0).gracefulFailoverToYou();
-    } finally {
-      cluster.stop();
-    }
  }

-  @Test(timeout = 25000)
+  @Test
  public void testGracefulFailoverMultipleZKfcs() throws Exception {
-    try {
    cluster.start(3);

    cluster.waitForActiveLockHolder(0);
@ -632,9 +598,6 @@ public void testGracefulFailoverMultipleZKfcs() throws Exception {
    assertEquals(2, cluster.getService(0).activeTransitionCount);
    assertEquals(1, cluster.getService(1).activeTransitionCount);
    assertEquals(1, cluster.getService(2).activeTransitionCount);
-    } finally {
-      cluster.stop();
-    }
  }

  private int runFC(DummyHAService target, String ... args) throws Exception {