From 9d6842501c88304ca24062d2463480bc7fbe5e57 Mon Sep 17 00:00:00 2001
From: supratimdeka <46919641+supratimdeka@users.noreply.github.com>
Date: Wed, 19 Jun 2019 20:11:16 +0530
Subject: [PATCH] HDDS-1454. GC other system pause events can trigger pipeline
 destroy for all the nodes in the cluster. Contributed by Supratim Deka (#852)

---
 .../hdds/scm/node/NodeStateManager.java       | 167 ++++++++++++++----
 .../hadoop/hdds/scm/node/SCMNodeManager.java  |  29 +++
 .../hdds/scm/node/TestSCMNodeManager.java     |  92 ++++++++++
 3 files changed, 258 insertions(+), 30 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java
index c54944bc35..08a68be2e0 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hdds.scm.node;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import org.apache.hadoop.conf.Configuration;
@@ -43,6 +44,7 @@
 import java.io.Closeable;
 import java.util.*;
 import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Predicate;
 
@@ -116,6 +118,26 @@ private enum NodeLifeCycleEvent {
    */
   private final long deadNodeIntervalMs;
 
+  /**
+   * The future is used to pause/unpause the scheduled checks.
+   */
+  private ScheduledFuture<?> healthCheckFuture;
+
+  /**
+   * Test utility - tracks if health check has been paused (unit tests).
+   */
+  private boolean checkPaused;
+
+  /**
+   * timestamp of the latest heartbeat check process.
+   */
+  private long lastHealthCheck;
+
+  /**
+   * number of times the heart beat check was skipped.
+   */
+  private long skippedHealthChecks;
+
   /**
    * Constructs a NodeStateManager instance with the given configuration.
    *
@@ -143,10 +165,11 @@ public NodeStateManager(Configuration conf, EventPublisher eventPublisher) {
     executorService = HadoopExecutors.newScheduledThreadPool(1,
         new ThreadFactoryBuilder().setDaemon(true)
             .setNameFormat("SCM Heartbeat Processing Thread - %d").build());
-    //BUG:BUG TODO: The return value is ignored, if an exception is thrown in
-    // the executing funtion, it will be ignored.
-    executorService.schedule(this, heartbeatCheckerIntervalMs,
-        TimeUnit.MILLISECONDS);
+
+    skippedHealthChecks = 0;
+    checkPaused = false; // accessed only from test functions
+
+    scheduleNextHealthCheck();
   }
 
   /**
@@ -464,6 +487,42 @@ public Set<ContainerID> getContainers(UUID uuid)
   @Override
   public void run() {
 
+    if (shouldSkipCheck()) {
+      skippedHealthChecks++;
+      LOG.info("Detected long delay in scheduling HB processing thread. "
+          + "Skipping heartbeat checks for one iteration.");
+    } else {
+      checkNodesHealth();
+    }
+
+    // we purposefully make this non-deterministic. Instead of using a
+    // scheduleAtFixedFrequency  we will just go to sleep
+    // and wake up at the next rendezvous point, which is currentTime +
+    // heartbeatCheckerIntervalMs. This leads to the issue that we are now
+    // heart beating not at a fixed cadence, but clock tick + time taken to
+    // work.
+    //
+    // This time taken to work can skew the heartbeat processor thread.
+    // The reason why we don't care is because of the following reasons.
+    //
+    // 1. checkerInterval is general many magnitudes faster than datanode HB
+    // frequency.
+    //
+    // 2. if we have too much nodes, the SCM would be doing only HB
+    // processing, this could lead to SCM's CPU starvation. With this
+    // approach we always guarantee that  HB thread sleeps for a little while.
+    //
+    // 3. It is possible that we will never finish processing the HB's in the
+    // thread. But that means we have a mis-configured system. We will warn
+    // the users by logging that information.
+    //
+    // 4. And the most important reason, heartbeats are not blocked even if
+    // this thread does not run, they will go into the processing queue.
+    scheduleNextHealthCheck();
+  }
+
+  private void checkNodesHealth() {
+
     /*
      *
      *          staleNodeDeadline                healthyNodeDeadline
@@ -558,41 +617,36 @@ public void run() {
           heartbeatCheckerIntervalMs);
     }
 
-    // we purposefully make this non-deterministic. Instead of using a
-    // scheduleAtFixedFrequency  we will just go to sleep
-    // and wake up at the next rendezvous point, which is currentTime +
-    // heartbeatCheckerIntervalMs. This leads to the issue that we are now
-    // heart beating not at a fixed cadence, but clock tick + time taken to
-    // work.
-    //
-    // This time taken to work can skew the heartbeat processor thread.
-    // The reason why we don't care is because of the following reasons.
-    //
-    // 1. checkerInterval is general many magnitudes faster than datanode HB
-    // frequency.
-    //
-    // 2. if we have too much nodes, the SCM would be doing only HB
-    // processing, this could lead to SCM's CPU starvation. With this
-    // approach we always guarantee that  HB thread sleeps for a little while.
-    //
-    // 3. It is possible that we will never finish processing the HB's in the
-    // thread. But that means we have a mis-configured system. We will warn
-    // the users by logging that information.
-    //
-    // 4. And the most important reason, heartbeats are not blocked even if
-    // this thread does not run, they will go into the processing queue.
+  }
+
+  private void scheduleNextHealthCheck() {
 
     if (!Thread.currentThread().isInterrupted() &&
         !executorService.isShutdown()) {
       //BUGBUG: The return future needs to checked here to make sure the
       // exceptions are handled correctly.
-      executorService.schedule(this, heartbeatCheckerIntervalMs,
-          TimeUnit.MILLISECONDS);
+      healthCheckFuture = executorService.schedule(this,
+          heartbeatCheckerIntervalMs, TimeUnit.MILLISECONDS);
     } else {
-      LOG.info("Current Thread is interrupted, shutting down HB processing " +
+      LOG.warn("Current Thread is interrupted, shutting down HB processing " +
           "thread for Node Manager.");
     }
 
+    lastHealthCheck = Time.monotonicNow();
+  }
+
+  /**
+   * if the time since last check exceeds the stale|dead node interval, skip.
+   * such long delays might be caused by a JVM pause. SCM cannot make reliable
+   * conclusions about datanode health in such situations.
+   * @return : true indicates skip HB checks
+   */
+  private boolean shouldSkipCheck() {
+
+    long currentTime = Time.monotonicNow();
+    long minInterval = Math.min(staleNodeIntervalMs, deadNodeIntervalMs);
+
+    return ((currentTime - lastHealthCheck) >= minInterval);
   }
 
   /**
@@ -640,4 +694,57 @@ public void close() {
       Thread.currentThread().interrupt();
     }
   }
+
+  /**
+   * Test Utility : return number of times heartbeat check was skipped.
+   * @return : count of times HB process was skipped
+   */
+  @VisibleForTesting
+  long getSkippedHealthChecks() {
+    return skippedHealthChecks;
+  }
+
+  /**
+   * Test Utility : Pause the periodic node hb check.
+   * @return ScheduledFuture for the scheduled check that got cancelled.
+   */
+  @VisibleForTesting
+  ScheduledFuture pause() {
+
+    if (executorService.isShutdown() || checkPaused) {
+      return null;
+    }
+
+    checkPaused = healthCheckFuture.cancel(false);
+
+    return healthCheckFuture;
+  }
+
+  /**
+   * Test utility : unpause the periodic node hb check.
+   * @return ScheduledFuture for the next scheduled check
+   */
+  @VisibleForTesting
+  ScheduledFuture unpause() {
+
+    if (executorService.isShutdown()) {
+      return null;
+    }
+
+    if (checkPaused) {
+      Preconditions.checkState(((healthCheckFuture == null)
+          || healthCheckFuture.isCancelled()
+          || healthCheckFuture.isDone()));
+
+      checkPaused = false;
+      /**
+       * We do not call scheduleNextHealthCheck because we are
+       * not updating the lastHealthCheck timestamp.
+       */
+      healthCheckFuture = executorService.schedule(this,
+          heartbeatCheckerIntervalMs, TimeUnit.MILLISECONDS);
+    }
+
+    return healthCheckFuture;
+  }
 }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
index eaa2255cb0..a85271e270 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdds.scm.node;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import org.apache.hadoop.hdds.protocol.proto
     .StorageContainerDatanodeProtocolProtos;
@@ -72,6 +73,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
+import java.util.concurrent.ScheduledFuture;
 import java.util.stream.Collectors;
 
 /**
@@ -580,4 +582,31 @@ private String nodeResolve(String hostname) {
       return null;
     }
   }
+
+  /**
+   * Test utility to stop heartbeat check process.
+   * @return ScheduledFuture of next scheduled check that got cancelled.
+   */
+  @VisibleForTesting
+  ScheduledFuture pauseHealthCheck() {
+    return nodeStateManager.pause();
+  }
+
+  /**
+   * Test utility to resume the paused heartbeat check process.
+   * @return ScheduledFuture of the next scheduled check
+   */
+  @VisibleForTesting
+  ScheduledFuture unpauseHealthCheck() {
+    return nodeStateManager.unpause();
+  }
+
+  /**
+   * Test utility to get the count of skipped heartbeat check iterations.
+   * @return count of skipped heartbeat check iterations
+   */
+  @VisibleForTesting
+  long getSkippedHealthChecks() {
+    return nodeStateManager.getSkippedHealthChecks();
+  }
 }
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java
index 60fc2045b0..ae810716da 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java
@@ -53,6 +53,8 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
@@ -345,6 +347,96 @@ public void testScmDetectStaleAndDeadNode()
     }
   }
 
+  /**
+   * Simulate a JVM Pause by pausing the health check process
+   * Ensure that none of the nodes with heartbeats become Dead or Stale.
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws AuthenticationException
+   */
+  @Test
+  public void testScmHandleJvmPause()
+      throws IOException, InterruptedException, AuthenticationException {
+    final int healthCheckInterval = 200; // milliseconds
+    final int heartbeatInterval = 1; // seconds
+    final int staleNodeInterval = 3; // seconds
+    final int deadNodeInterval = 6; // seconds
+    ScheduledFuture schedFuture;
+
+    OzoneConfiguration conf = getConf();
+    conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
+        healthCheckInterval, MILLISECONDS);
+    conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL,
+        heartbeatInterval, SECONDS);
+    conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL,
+        staleNodeInterval, SECONDS);
+    conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL,
+        deadNodeInterval, SECONDS);
+
+    try (SCMNodeManager nodeManager = createNodeManager(conf)) {
+      DatanodeDetails node1 =
+          TestUtils.createRandomDatanodeAndRegister(nodeManager);
+      DatanodeDetails node2 =
+          TestUtils.createRandomDatanodeAndRegister(nodeManager);
+
+      nodeManager.processHeartbeat(node1);
+      nodeManager.processHeartbeat(node2);
+
+      // Sleep so that heartbeat processing thread gets to run.
+      Thread.sleep(1000);
+
+      //Assert all nodes are healthy.
+      assertEquals(2, nodeManager.getAllNodes().size());
+      assertEquals(2, nodeManager.getNodeCount(HEALTHY));
+
+      /**
+       * Simulate a JVM Pause and subsequent handling in following steps:
+       * Step 1 : stop heartbeat check process for stale node interval
+       * Step 2 : resume heartbeat check
+       * Step 3 : wait for 1 iteration of heartbeat check thread
+       * Step 4 : retrieve the state of all nodes - assert all are HEALTHY
+       * Step 5 : heartbeat for node1
+       * [TODO : what if there is scheduling delay of test thread in Step 5?]
+       * Step 6 : wait for some time to allow iterations of check process
+       * Step 7 : retrieve the state of all nodes -  assert node2 is STALE
+       * and node1 is HEALTHY
+       */
+
+      // Step 1 : stop health check process (simulate JVM pause)
+      nodeManager.pauseHealthCheck();
+      Thread.sleep(MILLISECONDS.convert(staleNodeInterval, SECONDS));
+
+      // Step 2 : resume health check
+      assertTrue("Unexpected, already skipped heartbeat checks",
+          (nodeManager.getSkippedHealthChecks() == 0));
+      schedFuture = nodeManager.unpauseHealthCheck();
+
+      // Step 3 : wait for 1 iteration of health check
+      try {
+        schedFuture.get();
+        assertTrue("We did not skip any heartbeat checks",
+            nodeManager.getSkippedHealthChecks() > 0);
+      } catch (ExecutionException e) {
+        assertEquals("Unexpected exception waiting for Scheduled Health Check",
+            0, 1);
+      }
+
+      // Step 4 : all nodes should still be HEALTHY
+      assertEquals(2, nodeManager.getAllNodes().size());
+      assertEquals(2, nodeManager.getNodeCount(HEALTHY));
+
+      // Step 5 : heartbeat for node1
+      nodeManager.processHeartbeat(node1);
+
+      // Step 6 : wait for health check process to run
+      Thread.sleep(1000);
+
+      // Step 7 : node2 should transition to STALE
+      assertEquals(1, nodeManager.getNodeCount(HEALTHY));
+      assertEquals(1, nodeManager.getNodeCount(STALE));
+    }
+  }
+
   /**
    * Check for NPE when datanodeDetails is passed null for sendHeartbeat.
    *