From 134dcf166f51a3bd47923f3a0fbad7954135cb6d Mon Sep 17 00:00:00 2001
From: K0K0V0K <109747532+K0K0V0K@users.noreply.github.com>
Date: Thu, 27 Jun 2024 16:21:28 +0200
Subject: [PATCH] YARN-11703. Validate accessibility of Node Manager working
 directories (#6903)

---
 .../hadoop/yarn/conf/YarnConfiguration.java   |  19 ++-
 .../src/main/resources/yarn-default.xml       |   6 +
 .../nodemanager/DirectoryCollection.java      | 155 +++++++++++-------
 .../nodemanager/TestDirectoryCollection.java  |  35 +++-
 4 files changed, 141 insertions(+), 74 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 7747d4cb73..9503d47537 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -2157,16 +2157,19 @@ public static boolean isAclEnabled(Configuration conf) {
   public static final String NM_MIN_PER_DISK_FREE_SPACE_MB =
       NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
 
+  /**
+   * By default, all of the disk can be used before it is marked as offline.
+   */
+  public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
+
   /**
    * Enable/Disable the minimum disk free
    * space threshold for disk health checker.
    */
   public static final String NM_DISK_FREE_SPACE_THRESHOLD_ENABLED =
-      NM_DISK_HEALTH_CHECK_PREFIX +
-          "disk-free-space-threshold.enabled";
+      NM_DISK_HEALTH_CHECK_PREFIX + "disk-free-space-threshold.enabled";
 
-  public static final boolean
-      DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
+  public static final boolean DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
 
   /**
    * The minimum space that must be available on an offline
@@ -2180,9 +2183,13 @@ public static boolean isAclEnabled(Configuration conf) {
       NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-watermark-high-mb";
 
   /**
-   * By default, all of the disk can be used before it is marked as offline.
+   * Validate that the content of the node manager directories can be accessed.
    */
-  public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
+  public static final String NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
+      NM_DISK_HEALTH_CHECK_PREFIX + "working-dir-content-accessibility-validation.enabled";
+
+  public static final boolean DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
+      true;
 
   /** The health checker scripts. */
   public static final String NM_HEALTH_CHECK_SCRIPTS =
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 927d0c1aa4..ac976b7472 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -1995,6 +1995,12 @@
     <value>true</value>
   </property>
 
+  <property>
+    <description>Validate that the content of the node manager directories can be accessed.</description>
+    <name>yarn.nodemanager.disk-health-checker.working-dir-content-accessibility-validation.enabled</name>
+    <value>true</value>
+  </property>
+
   <property>
     <description>The maximum percentage of disk space utilization allowed after
     which a disk is marked as bad. Values can range from 0.0 to 100.0.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
index 8ecaa6d959..a5657ab48b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
@@ -21,6 +21,8 @@
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -28,22 +30,27 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.DiskChecker;
 import org.apache.hadoop.util.DiskValidator;
 import org.apache.hadoop.util.DiskValidatorFactory;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 
@@ -62,6 +69,7 @@ public class DirectoryCollection {
   private boolean diskUtilizationThresholdEnabled;
   private boolean diskFreeSpaceThresholdEnabled;
+  private boolean subAccessibilityValidationEnabled;
 
   /**
    * The enum defines disk failure type.
   */
@@ -242,16 +250,15 @@ public DirectoryCollection(String[] dirs,
       throw new YarnRuntimeException(e);
     }
 
-    diskUtilizationThresholdEnabled = conf.
-        getBoolean(YarnConfiguration.
-            NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
-            YarnConfiguration.
-                DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
-    diskFreeSpaceThresholdEnabled = conf.
-        getBoolean(YarnConfiguration.
-            NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
-            YarnConfiguration.
-                DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
+    diskUtilizationThresholdEnabled = conf.getBoolean(
+        YarnConfiguration.NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
+        YarnConfiguration.DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
+    diskFreeSpaceThresholdEnabled = conf.getBoolean(
+        YarnConfiguration.NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
+        YarnConfiguration.DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
+    subAccessibilityValidationEnabled = conf.getBoolean(
+        YarnConfiguration.NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED,
+        YarnConfiguration.DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED);
 
     localDirs = new ArrayList<>(Arrays.asList(dirs));
     errorDirs = new ArrayList<>();
@@ -448,8 +455,7 @@ boolean checkDirs() {
 
     // move testDirs out of any lock as it could wait for very long time in
     // case of busy IO
-    Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
-        preCheckGoodDirs);
+    Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs, preCheckGoodDirs);
 
     this.writeLock.lock();
     try {
@@ -521,60 +527,89 @@ boolean checkDirs() {
     }
   }
 
-  Map<String, DiskErrorInformation> testDirs(List<String> dirs,
-      Set<String> goodDirs) {
-    HashMap<String, DiskErrorInformation> ret =
-        new HashMap<String, DiskErrorInformation>();
-    for (final String dir : dirs) {
-      String msg;
-      try {
-        File testDir = new File(dir);
-        diskValidator.checkStatus(testDir);
-        float diskUtilizationPercentageCutoff = goodDirs.contains(dir) ?
-            diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
-        long diskFreeSpaceCutoff = goodDirs.contains(dir) ?
-            diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
-
-        if (diskUtilizationThresholdEnabled
-            && isDiskUsageOverPercentageLimit(testDir,
-                diskUtilizationPercentageCutoff)) {
-          msg =
-              "used space above threshold of "
-                  + diskUtilizationPercentageCutoff
-                  + "%";
-          ret.put(dir,
-              new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
-          continue;
-        } else if (diskFreeSpaceThresholdEnabled
            && isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) {
-          msg =
-              "free space below limit of " + diskFreeSpaceCutoff
-                  + "MB";
-          ret.put(dir,
-              new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
-          continue;
-        }
-      } catch (IOException ie) {
-        ret.put(dir,
-            new DiskErrorInformation(DiskErrorCause.OTHER, ie.getMessage()));
-      }
+  Map<String, DiskErrorInformation> testDirs(List<String> dirs, Set<String> goodDirs) {
+    final Map<String, DiskErrorInformation> ret = new HashMap<>(0);
+    for (String dir : dirs) {
+      LOG.debug("Start testing dir accessibility: {}", dir);
+      File testDir = new File(dir);
+      boolean goodDir = goodDirs.contains(dir);
+      Stream.of(
+          validateDisk(testDir),
+          validateUsageOverPercentageLimit(testDir, goodDir),
+          validateDiskFreeSpaceUnderLimit(testDir, goodDir),
+          validateSubsAccessibility(testDir)
+      )
+          .filter(Objects::nonNull)
+          .findFirst()
+          .ifPresent(diskErrorInformation -> ret.put(dir, diskErrorInformation));
     }
     return ret;
   }
 
-  private boolean isDiskUsageOverPercentageLimit(File dir,
-      float diskUtilizationPercentageCutoff) {
-    float freePercentage =
-        100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
-    float usedPercentage = 100.0F - freePercentage;
-    return (usedPercentage > diskUtilizationPercentageCutoff
-        || usedPercentage >= 100.0F);
+  private DiskErrorInformation validateDisk(File dir) {
+    try {
+      diskValidator.checkStatus(dir);
+      LOG.debug("Dir {} passed the disk validation", dir);
+      return null;
+    } catch (IOException | UncheckedIOException | SecurityException e) {
+      return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
+    }
   }
 
-  private boolean isDiskFreeSpaceUnderLimit(File dir,
-      long freeSpaceCutoff) {
+  private DiskErrorInformation validateUsageOverPercentageLimit(File dir, boolean isGoodDir) {
+    if (!diskUtilizationThresholdEnabled) {
+      return null;
+    }
+    float diskUtilizationPercentageCutoff = isGoodDir
+        ? diskUtilizationPercentageCutoffHigh
+        : diskUtilizationPercentageCutoffLow;
+    float freePercentage = 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
+    float usedPercentage = 100.0F - freePercentage;
+    if (usedPercentage > diskUtilizationPercentageCutoff || usedPercentage >= 100.0F) {
+      return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
+          "used space above threshold of " + diskUtilizationPercentageCutoff + "%");
+    } else {
+      LOG.debug("Dir {} passed the usage over percentage validation", dir);
+      return null;
+    }
+  }
+
+  private DiskErrorInformation validateDiskFreeSpaceUnderLimit(File dir, boolean isGoodDir) {
+    if (!diskFreeSpaceThresholdEnabled) {
+      return null;
+    }
+    long freeSpaceCutoff = isGoodDir ? diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
     long freeSpace = dir.getUsableSpace() / (1024 * 1024);
-    return freeSpace < freeSpaceCutoff;
+    if (freeSpace < freeSpaceCutoff) {
+      return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
+          "free space below limit of " + freeSpaceCutoff + "MB");
+    } else {
+      LOG.debug("Dir {} passed the free space validation", dir);
+      return null;
+    }
+  }
+
+  private DiskErrorInformation validateSubsAccessibility(File dir) {
+    if (!subAccessibilityValidationEnabled) {
+      return null;
+    }
+    try (Stream<java.nio.file.Path> walk = Files.walk(dir.toPath())) {
+      List<File> subs = walk
+          .map(java.nio.file.Path::toFile)
+          .collect(Collectors.toList());
+      for (File sub : subs) {
+        if (sub.isDirectory()) {
+          DiskChecker.checkDir(sub);
+        } else if (!Files.isReadable(sub.toPath())) {
+          return new DiskErrorInformation(DiskErrorCause.OTHER, "Cannot read " + sub);
+        } else {
+          LOG.debug("{} under {} is accessible", sub, dir);
+        }
+      }
+    } catch (IOException | UncheckedIOException | SecurityException e) {
+      return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
+    }
+    return null;
   }
 
   private void createDir(FileContext localFs, Path dir, FsPermission perm)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
index 33bd4d9234..0193f844ac 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
@@ -20,8 +20,17 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.attribute.PosixFilePermissions;
+import java.util.Collections;
 import java.util.List;
 import java.util.ListIterator;
+import java.util.Map;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
@@ -32,16 +41,11 @@
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.DirsChangeListener;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
 
 public class TestDirectoryCollection {
 
-  private static final File testDir = new File("target",
-      TestDirectoryCollection.class.getName()).getAbsoluteFile();
-  private static final File testFile = new File(testDir, "testfile");
+  private File testDir;
+  private File testFile;
 
   private Configuration conf;
   private FileContext localFs;
@@ -50,7 +54,8 @@ public class TestDirectoryCollection {
   public void setupForTests() throws IOException {
     conf = new Configuration();
     localFs = FileContext.getLocalFSFileContext(conf);
-    testDir.mkdirs();
+    testDir = Files.createTempDirectory(TestDirectoryCollection.class.getName()).toFile();
+    testFile = new File(testDir, "testfile");
     testFile.createNewFile();
   }
 
@@ -516,6 +521,20 @@ public void testDirsChangeListener() {
     Assert.assertEquals(listener3.num, 1);
   }
 
+  @Test
+  public void testNonAccessibleSub() throws IOException {
+    Files.setPosixFilePermissions(testDir.toPath(),
+        PosixFilePermissions.fromString("rwx------"));
+    Files.setPosixFilePermissions(testFile.toPath(),
+        PosixFilePermissions.fromString("-w--w--w-"));
+    DirectoryCollection dc = new DirectoryCollection(new String[]{testDir.toString()});
+    Map<String, DirectoryCollection.DiskErrorInformation> diskErrorInformationMap =
+        dc.testDirs(Collections.singletonList(testDir.toString()), Collections.emptySet());
+    Assert.assertEquals(1, diskErrorInformationMap.size());
+    Assert.assertTrue(diskErrorInformationMap.values().iterator().next()
+        .message.contains(testFile.getName()));
+  }
+
   static class DirsChangeListenerTest implements DirsChangeListener {
     public int num = 0;
     public DirsChangeListenerTest() {
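-- 

The new sub-directory accessibility check is enabled by default and walks every
file under each NM working directory (Files.walk), so it adds some I/O to every
disk health check. Below is a minimal yarn-site.xml sketch for opting a cluster
out; the property name and its default of true are taken from the patch above,
while the <configuration> wrapper is standard Hadoop configuration boilerplate:

<configuration>
  <property>
    <!-- Skip the working-dir content accessibility walk during NM disk health
         checks. DirectoryCollection reads this flag via conf.getBoolean(...)
         as shown in the patch. -->
    <name>yarn.nodemanager.disk-health-checker.working-dir-content-accessibility-validation.enabled</name>
    <value>false</value>
  </property>
</configuration>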