YARN-8775. TestDiskFailures.testLocalDirsFailures sometimes can fail on concurrent File modifications. (Contributed by Antal Bálint Steinbach)

2018-10-15 09:37:20 -07:00 · 2018-10-15 09:37:20 -07:00 · f880ff418c
commit f880ff418c
parent fa94d370b6
2 changed files with 19 additions and 24 deletions
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
@ -27,6 +27,8 @@
 import java.util.Set;
 import java.util.Timer;
 import java.util.TimerTask;
+
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.DiskValidator;
 import org.apache.hadoop.util.DiskValidatorFactory;
@ -493,7 +495,8 @@ private void logDiskStatus(boolean newDiskFailure, boolean diskTurnedGood) {

  }

-  private void checkDirs() {
+  @VisibleForTesting
+  public void checkDirs() {
    boolean disksStatusChange = false;
    Set<String> failedLocalDirsPreCheck =
        new HashSet<String>(localDirs.getFailedDirs());
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java
@ -27,7 +27,6 @@
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.records.NodeState;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.server.MiniYARNCluster;
 import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
@ -56,7 +55,12 @@ public class TestDiskFailures {

  private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class);

-  private static final long DISK_HEALTH_CHECK_INTERVAL = 1000;//1 sec
+  /*
+   * Set disk check interval high enough so that it never runs during the test.
+   * Checks will be called manually if necessary.
+   */
+  private static final long TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL =
+      1000 * 60 * 60 * 24;

  private static FileContext localFS = null;
  private static final File testDir = new File("target",
@ -146,9 +150,10 @@ private void testDirsFailures(boolean localORLogDirs) throws IOException {
                                         : YarnConfiguration.NM_LOG_DIRS;

    Configuration conf = new Configuration();
-    // set disk health check interval to a small value (say 1 sec).
+    // set disk health check interval to a large value to effectively disable
+    // disk health check done internally in LocalDirsHandlerService.
    conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
-                 DISK_HEALTH_CHECK_INTERVAL);
+        TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL);

    // If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
    // log-dirs fail, then the node's health status should become unhealthy.
@ -202,22 +207,6 @@ private void testDirsFailures(boolean localORLogDirs) throws IOException {
    verifyDisksHealth(localORLogDirs, expectedDirs, false);
  }

-  /**
-   * Wait for the NodeManger to go for the disk-health-check at least once.
-   */
-  private void waitForDiskHealthCheck() {
-    long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
-    long time = lastDisksCheckTime;
-    for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
-      try {
-        Thread.sleep(1000);
-      } catch(InterruptedException e) {
-        LOG.error(
-            "Interrupted while waiting for NodeManager's disk health check.");
-      }
-      time = dirsHandler.getLastDisksCheckTime();
-    }
-  }

  /**
   * Verify if the NodeManager could identify disk failures.
@ -228,8 +217,8 @@ private void waitForDiskHealthCheck() {
   */
  private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
      boolean isHealthy) {
-    // Wait for the NodeManager to identify disk failures.
-    waitForDiskHealthCheck();
+    // Run the disk check manually so that disk failures are identified now.
+    dirsHandler.checkDirs();

    List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
                                       : dirsHandler.getLogDirs();
@ -272,7 +261,10 @@ private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
   */
  private void prepareDirToFail(String dir) throws IOException {
    File file = new File(dir);
-    FileUtil.fullyDelete(file);
+    if(!FileUtil.fullyDelete(file)) {
+      throw new IOException("Delete of file was unsuccessful! Path: " +
+          file.getAbsolutePath());
+    }
    file.createNewFile();
    LOG.info("Prepared " + dir + " to fail.");
  }