HADOOP-10251. Both NameNodes could be in STANDBY State if SNN network is unstable. Contributed by Vinayakumar B.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1589494 13f79535-47bb-0310-9956-ffa450edef68
2014-04-23 18:55:30 +00:00 · 2014-04-23 18:55:30 +00:00 · 6eba48cbde
commit 6eba48cbde
parent ee8a152520
4 changed files with 107 additions and 6 deletions
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@ -401,6 +401,9 @@ Release 2.5.0 - UNRELEASED
    HADOOP-10526. Chance for Stream leakage in CompressorStream. (Rushabh 
    Shah via kihwal)

+    HADOOP-10251. Both NameNodes could be in STANDBY State if SNN network is unstable.
+    (Vinayakumar B via umamahesh)
+
 Release 2.4.1 - UNRELEASED

  INCOMPATIBLE CHANGES
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthMonitor.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthMonitor.java
@ -74,6 +74,9 @@ public class HealthMonitor {
  private List<Callback> callbacks = Collections.synchronizedList(
      new LinkedList<Callback>());

+  private List<ServiceStateCallback> serviceStateCallbacks = Collections
+      .synchronizedList(new LinkedList<ServiceStateCallback>());
+
  private HAServiceStatus lastServiceState = new HAServiceStatus(
      HAServiceState.INITIALIZING);
  
@ -135,6 +138,14 @@ public void removeCallback(Callback cb) {
    callbacks.remove(cb);
  }

+  /**
+   * Registers a listener that is handed the service's HAServiceStatus each
+   * time this monitor records one.
+   *
+   * @param cb the listener to append to the notification list
+   */
+  public synchronized void addServiceStateCallback(ServiceStateCallback cb) {
+    serviceStateCallbacks.add(cb);
+  }
+
+  /**
+   * Unregisters a listener that was previously added to the service-state
+   * notification list.
+   *
+   * @param cb the listener to take out of the notification list
+   */
+  public synchronized void removeServiceStateCallback(ServiceStateCallback cb) {
+    this.serviceStateCallbacks.remove(cb);
+  }
+
  public void shutdown() {
    LOG.info("Stopping HealthMonitor thread");
    shouldRun = false;
@ -217,6 +228,9 @@ private void doHealthChecks() throws InterruptedException {
  
  /**
   * Records the most recently observed service status and then passes it to
   * every registered ServiceStateCallback, in list order.
   *
   * The method is synchronized, so the callbacks run while this
   * HealthMonitor's monitor is held.
   *
   * @param status the status to store as the last known service state
   */
  private synchronized void setLastServiceStatus(HAServiceStatus status) {
    this.lastServiceState = status;
+    for (ServiceStateCallback cb : serviceStateCallbacks) {
+      cb.reportServiceStatus(lastServiceState);
+    }
  }

  private synchronized void enterState(State newState) {
@ -293,4 +307,11 @@ public void run() {
  /**
   * Listener interface whose single method receives a State value.
   * NOTE(review): presumably invoked when the monitor enters a new health
   * state; the call site is not visible in this hunk -- confirm.
   */
  static interface Callback {
    void enteredState(State newState);
  }
+
+  /**
+   * Callback interface for service states.
+   *
+   * Implementations are registered through addServiceStateCallback and are
+   * called from setLastServiceStatus with the status that was just recorded.
+   * That call is made while the HealthMonitor's monitor is held, because
+   * setLastServiceStatus is a synchronized method.
+   */
+  static interface ServiceStateCallback {
+    void reportServiceStatus(HAServiceStatus status);
+  }
 }
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
@ -34,6 +34,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
 import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
 import org.apache.hadoop.util.ZKUtil;
@ -105,6 +106,8 @@ public abstract class ZKFailoverController {

  private State lastHealthState = State.INITIALIZING;

+  private volatile HAServiceState serviceState = HAServiceState.INITIALIZING;
+
  /** Set if a fatal error occurs */
  private String fatalError = null;

@ -294,6 +297,7 @@ private boolean confirmFormat() {
  /**
   * Creates the HealthMonitor for the local target, registers both the
   * health-state callback and the service-state callback, and then starts it.
   * Both callbacks are added before start() is called.
   */
  private void initHM() {
    healthMonitor = new HealthMonitor(conf, localTarget);
    healthMonitor.addCallback(new HealthCallbacks());
+    healthMonitor.addServiceStateCallback(new ServiceStateCallBacks());
    healthMonitor.start();
  }
  
@ -376,6 +380,7 @@ private synchronized void becomeActive() throws ServiceFailedException {
      String msg = "Successfully transitioned " + localTarget +
          " to active state";
      LOG.info(msg);
+      serviceState = HAServiceState.ACTIVE;
      recordActiveAttempt(new ActiveAttemptRecord(true, msg));

    } catch (Throwable t) {
@ -484,6 +489,7 @@ private synchronized void becomeStandby() {
      // TODO handle this. It's a likely case since we probably got fenced
      // at the same time.
    }
+    serviceState = HAServiceState.STANDBY;
  }
  

@ -574,6 +580,7 @@ private void doCedeActive(int millisToCede)
        delayJoiningUntilNanotime = System.nanoTime() +
            TimeUnit.MILLISECONDS.toNanos(millisToCede);
        elector.quitElection(needFence);
+        serviceState = HAServiceState.INITIALIZING;
      }
    }
    recheckElectability();
@ -739,12 +746,16 @@ private void recheckElectability() {
        switch (lastHealthState) {
        case SERVICE_HEALTHY:
          elector.joinElection(targetToData(localTarget));
+          if (quitElectionOnBadState) {
+            quitElectionOnBadState = false;
+          }
          break;
          
        case INITIALIZING:
          LOG.info("Ensuring that " + localTarget + " does not " +
              "participate in active master election");
          elector.quitElection(false);
+          serviceState = HAServiceState.INITIALIZING;
          break;
    
        case SERVICE_UNHEALTHY:
@ -752,6 +763,7 @@ private void recheckElectability() {
          LOG.info("Quitting master election for " + localTarget +
              " and marking that fencing is necessary");
          elector.quitElection(true);
+          serviceState = HAServiceState.INITIALIZING;
          break;
          
        case HEALTH_MONITOR_FAILED:
@ -784,6 +796,44 @@ public void run() {
        whenNanos, TimeUnit.NANOSECONDS);
  }

+  int serviceStateMismatchCount = 0;
+  boolean quitElectionOnBadState = false;
+
+  /**
+   * Compares the HA state reported by the local service with the state this
+   * controller last recorded in serviceState, and quits the election when
+   * the two disagree on two consecutive reports.
+   *
+   * While serviceState is INITIALIZING nothing is compared; if the election
+   * was previously quit because of a mismatch (quitElectionOnBadState),
+   * electability is rechecked instead. A matching report resets the mismatch
+   * counter. The first mismatch is only counted, since it may be caused by a
+   * transition running in parallel. A second consecutive mismatch quits the
+   * election with the fencing flag set, sets delayJoiningUntilNanotime to one
+   * second from now and resets serviceState to INITIALIZING.
+   *
+   * Locks are taken on the elector first and then on this controller.
+   *
+   * @param changedState the state most recently reported by the local service
+   */
+  void verifyChangedServiceState(HAServiceState changedState) {
+    synchronized (elector) {
+      synchronized (this) {
+        if (serviceState == HAServiceState.INITIALIZING) {
+          // No expected state to compare against; only retry electability
+          // when an earlier mismatch made us leave the election.
+          if (quitElectionOnBadState) {
+            LOG.debug("rechecking for electability from bad state");
+            recheckElectability();
+          }
+        } else if (changedState == serviceState) {
+          // Reported and expected states agree: forget any earlier mismatch.
+          serviceStateMismatchCount = 0;
+        } else if (serviceStateMismatchCount == 0) {
+          // First mismatch: tolerate it once, as it might be due to a
+          // parallel transition, and decide on the next report.
+          serviceStateMismatchCount++;
+        } else {
+          // Second consecutive mismatch: quit the election because the
+          // expected state and the reported state do not match.
+          LOG.error("Local service " + localTarget
+              + " has changed the serviceState to " + changedState
+              + ". Expected was " + serviceState
+              + ". Quitting election marking fencing necessary.");
+          delayJoiningUntilNanotime = System.nanoTime()
+              + TimeUnit.MILLISECONDS.toNanos(1000);
+          elector.quitElection(true);
+          quitElectionOnBadState = true;
+          serviceStateMismatchCount = 0;
+          serviceState = HAServiceState.INITIALIZING;
+        }
+      }
+    }
+  }
+
  /**
   * @return the last health state passed to the FC
   * by the HealthMonitor.
@ -856,6 +906,16 @@ public void enteredState(HealthMonitor.State newState) {
    }
  }

+  /**
+   * Callbacks for HAServiceStatus.
+   *
+   * An instance is registered with the HealthMonitor in initHM(). Each
+   * reported status has its state extracted and passed to
+   * verifyChangedServiceState().
+   */
+  class ServiceStateCallBacks implements HealthMonitor.ServiceStateCallback {
+    @Override
+    public void reportServiceStatus(HAServiceStatus status) {
+      verifyChangedServiceState(status.getState());
+    }
+  }
+
  private static class ActiveAttemptRecord {
    private final boolean succeeded;
    private final String status;
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java
@ -18,7 +18,6 @@
 package org.apache.hadoop.ha;

 import static org.junit.Assert.*;
-import static org.junit.Assume.assumeTrue;

 import java.security.NoSuchAlgorithmException;

@ -29,7 +28,6 @@
 import org.apache.hadoop.ha.HealthMonitor.State;
 import org.apache.hadoop.ha.MiniZKFCCluster.DummyZKFC;
 import org.apache.hadoop.test.GenericTestUtils;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.Time;
 import org.apache.log4j.Level;
 import org.apache.zookeeper.KeeperException;
@ -68,8 +66,6 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
  
  @Before
  public void setupConfAndServices() {
-    // skip tests on Windows until after resolution of ZooKeeper client bug
-    assumeTrue(!Shell.WINDOWS);
    conf = new Configuration();
    conf.set(ZKFailoverController.ZK_ACL_KEY, TEST_ACL);
    conf.set(ZKFailoverController.ZK_AUTH_KEY, TEST_AUTH_GOOD);
@ -233,6 +229,27 @@ public void testAutoFailoverOnBadHealth() throws Exception {
    }
  }

+  /**
+   * Test that, when the local service reports an HA state that differs from
+   * the one its failover controller expects, failover is triggered.
+   * svc0's state field is forced to STANDBY directly (not through its ZKFC),
+   * and the test then waits for svc1 to become ACTIVE.
+   * NOTE(review): presumably svc0 is the active service after
+   * cluster.start() -- confirm against MiniZKFCCluster.
+   */
+  @Test(timeout=15000)
+  public void testAutoFailoverOnBadState() throws Exception {
+    try {
+      cluster.start();
+      DummyHAService svc0 = cluster.getService(0);
+      LOG.info("Faking svc0 to change the state, should failover to svc1");
+      svc0.state = HAServiceState.STANDBY;
+      
+      // Should fail over to svc1 at this point
+      cluster.waitForHAState(1, HAServiceState.ACTIVE);
+    } finally {
+      cluster.stop();
+    }
+  }
+  
  @Test(timeout=15000)
  public void testAutoFailoverOnLostZKSession() throws Exception {
    try {