YARN-11703. Validate accessibility of Node Manager working directories (#6903)
This commit is contained in:
parent
b4ddb2d3bb
commit
134dcf166f
@ -2157,16 +2157,19 @@ public static boolean isAclEnabled(Configuration conf) {
|
||||
public static final String NM_MIN_PER_DISK_FREE_SPACE_MB =
|
||||
NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
|
||||
|
||||
/**
|
||||
* By default, all the disk can be used before it is marked as offline.
|
||||
*/
|
||||
public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
|
||||
|
||||
/**
|
||||
* Enable/Disable the minimum disk free
|
||||
* space threshold for disk health checker.
|
||||
*/
|
||||
public static final String NM_DISK_FREE_SPACE_THRESHOLD_ENABLED =
|
||||
NM_DISK_HEALTH_CHECK_PREFIX +
|
||||
"disk-free-space-threshold.enabled";
|
||||
NM_DISK_HEALTH_CHECK_PREFIX + "disk-free-space-threshold.enabled";
|
||||
|
||||
public static final boolean
|
||||
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
|
||||
public static final boolean DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
|
||||
|
||||
/**
|
||||
* The minimum space that must be available on an offline
|
||||
@ -2180,9 +2183,13 @@ public static boolean isAclEnabled(Configuration conf) {
|
||||
NM_DISK_HEALTH_CHECK_PREFIX +
|
||||
"min-free-space-per-disk-watermark-high-mb";
|
||||
/**
|
||||
* By default, all of the disk can be used before it is marked as offline.
|
||||
* Validate content of the node manager directories can be accessed.
|
||||
*/
|
||||
public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
|
||||
public static final String NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
|
||||
NM_DISK_HEALTH_CHECK_PREFIX + "working-dir-content-accessibility-validation.enabled";
|
||||
|
||||
public static final boolean DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
|
||||
true;
|
||||
|
||||
/** The health checker scripts. */
|
||||
public static final String NM_HEALTH_CHECK_SCRIPTS =
|
||||
|
@ -1995,6 +1995,12 @@
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Validate content of the node manager directories can be accessed</description>
|
||||
<name>yarn.nodemanager.disk-health-checker.working-dir-content-accessibility-validation.enabled</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>The maximum percentage of disk space utilization allowed after
|
||||
which a disk is marked as bad. Values can range from 0.0 to 100.0.
|
||||
|
@ -21,6 +21,8 @@
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
@ -28,22 +30,27 @@
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileAlreadyExistsException;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.util.DiskChecker;
|
||||
import org.apache.hadoop.util.DiskValidator;
|
||||
import org.apache.hadoop.util.DiskValidatorFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
|
||||
@ -62,6 +69,7 @@ public class DirectoryCollection {
|
||||
|
||||
private boolean diskUtilizationThresholdEnabled;
|
||||
private boolean diskFreeSpaceThresholdEnabled;
|
||||
private boolean subAccessibilityValidationEnabled;
|
||||
/**
|
||||
* The enum defines disk failure type.
|
||||
*/
|
||||
@ -242,16 +250,15 @@ public DirectoryCollection(String[] dirs,
|
||||
throw new YarnRuntimeException(e);
|
||||
}
|
||||
|
||||
diskUtilizationThresholdEnabled = conf.
|
||||
getBoolean(YarnConfiguration.
|
||||
NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
|
||||
YarnConfiguration.
|
||||
DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
|
||||
diskFreeSpaceThresholdEnabled = conf.
|
||||
getBoolean(YarnConfiguration.
|
||||
NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
|
||||
YarnConfiguration.
|
||||
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
|
||||
diskUtilizationThresholdEnabled = conf.getBoolean(
|
||||
YarnConfiguration.NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
|
||||
YarnConfiguration.DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
|
||||
diskFreeSpaceThresholdEnabled = conf.getBoolean(
|
||||
YarnConfiguration.NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
|
||||
YarnConfiguration.DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
|
||||
subAccessibilityValidationEnabled = conf.getBoolean(
|
||||
YarnConfiguration.NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED,
|
||||
YarnConfiguration.DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED);
|
||||
|
||||
localDirs = new ArrayList<>(Arrays.asList(dirs));
|
||||
errorDirs = new ArrayList<>();
|
||||
@ -448,8 +455,7 @@ boolean checkDirs() {
|
||||
|
||||
// move testDirs out of any lock as it could wait for very long time in
|
||||
// case of busy IO
|
||||
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
|
||||
preCheckGoodDirs);
|
||||
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs, preCheckGoodDirs);
|
||||
|
||||
this.writeLock.lock();
|
||||
try {
|
||||
@ -521,60 +527,89 @@ boolean checkDirs() {
|
||||
}
|
||||
}
|
||||
|
||||
Map<String, DiskErrorInformation> testDirs(List<String> dirs,
|
||||
Set<String> goodDirs) {
|
||||
HashMap<String, DiskErrorInformation> ret =
|
||||
new HashMap<String, DiskErrorInformation>();
|
||||
for (final String dir : dirs) {
|
||||
String msg;
|
||||
try {
|
||||
File testDir = new File(dir);
|
||||
diskValidator.checkStatus(testDir);
|
||||
float diskUtilizationPercentageCutoff = goodDirs.contains(dir) ?
|
||||
diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
|
||||
long diskFreeSpaceCutoff = goodDirs.contains(dir) ?
|
||||
diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
|
||||
|
||||
if (diskUtilizationThresholdEnabled
|
||||
&& isDiskUsageOverPercentageLimit(testDir,
|
||||
diskUtilizationPercentageCutoff)) {
|
||||
msg =
|
||||
"used space above threshold of "
|
||||
+ diskUtilizationPercentageCutoff
|
||||
+ "%";
|
||||
ret.put(dir,
|
||||
new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
|
||||
continue;
|
||||
} else if (diskFreeSpaceThresholdEnabled
|
||||
&& isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) {
|
||||
msg =
|
||||
"free space below limit of " + diskFreeSpaceCutoff
|
||||
+ "MB";
|
||||
ret.put(dir,
|
||||
new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
|
||||
continue;
|
||||
}
|
||||
} catch (IOException ie) {
|
||||
ret.put(dir,
|
||||
new DiskErrorInformation(DiskErrorCause.OTHER, ie.getMessage()));
|
||||
}
|
||||
Map<String, DiskErrorInformation> testDirs(List<String> dirs, Set<String> goodDirs) {
|
||||
final Map<String, DiskErrorInformation> ret = new HashMap<>(0);
|
||||
for (String dir : dirs) {
|
||||
LOG.debug("Start testing dir accessibility: {}", dir);
|
||||
File testDir = new File(dir);
|
||||
boolean goodDir = goodDirs.contains(dir);
|
||||
Stream.of(
|
||||
validateDisk(testDir),
|
||||
validateUsageOverPercentageLimit(testDir, goodDir),
|
||||
validateDiskFreeSpaceUnderLimit(testDir, goodDir),
|
||||
validateSubsAccessibility(testDir)
|
||||
)
|
||||
.filter(Objects::nonNull)
|
||||
.findFirst()
|
||||
.ifPresent(diskErrorInformation -> ret.put(dir, diskErrorInformation));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private boolean isDiskUsageOverPercentageLimit(File dir,
|
||||
float diskUtilizationPercentageCutoff) {
|
||||
float freePercentage =
|
||||
100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
|
||||
float usedPercentage = 100.0F - freePercentage;
|
||||
return (usedPercentage > diskUtilizationPercentageCutoff
|
||||
|| usedPercentage >= 100.0F);
|
||||
private DiskErrorInformation validateDisk(File dir) {
|
||||
try {
|
||||
diskValidator.checkStatus(dir);
|
||||
LOG.debug("Dir {} pass throw the disk validation", dir);
|
||||
return null;
|
||||
} catch (IOException | UncheckedIOException | SecurityException e) {
|
||||
return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isDiskFreeSpaceUnderLimit(File dir,
|
||||
long freeSpaceCutoff) {
|
||||
private DiskErrorInformation validateUsageOverPercentageLimit(File dir, boolean isGoodDir) {
|
||||
if (!diskUtilizationThresholdEnabled) {
|
||||
return null;
|
||||
}
|
||||
float diskUtilizationPercentageCutoff = isGoodDir
|
||||
? diskUtilizationPercentageCutoffHigh
|
||||
: diskUtilizationPercentageCutoffLow;
|
||||
float freePercentage = 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
|
||||
float usedPercentage = 100.0F - freePercentage;
|
||||
if (usedPercentage > diskUtilizationPercentageCutoff || usedPercentage >= 100.0F) {
|
||||
return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
|
||||
"used space above threshold of " + diskUtilizationPercentageCutoff + "%");
|
||||
} else {
|
||||
LOG.debug("Dir {} pass throw the usage over percentage validation", dir);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private DiskErrorInformation validateDiskFreeSpaceUnderLimit(File dir, boolean isGoodDir) {
|
||||
if (!diskFreeSpaceThresholdEnabled) {
|
||||
return null;
|
||||
}
|
||||
long freeSpaceCutoff = isGoodDir ? diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
|
||||
long freeSpace = dir.getUsableSpace() / (1024 * 1024);
|
||||
return freeSpace < freeSpaceCutoff;
|
||||
if (freeSpace < freeSpaceCutoff) {
|
||||
return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
|
||||
"free space below limit of " + freeSpaceCutoff + "MB");
|
||||
} else {
|
||||
LOG.debug("Dir {} pass throw the free space validation", dir);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private DiskErrorInformation validateSubsAccessibility(File dir) {
|
||||
if (!subAccessibilityValidationEnabled) {
|
||||
return null;
|
||||
}
|
||||
try (Stream<java.nio.file.Path> walk = Files.walk(dir.toPath())) {
|
||||
List<File> subs = walk
|
||||
.map(java.nio.file.Path::toFile)
|
||||
.collect(Collectors.toList());
|
||||
for (File sub : subs) {
|
||||
if (sub.isDirectory()) {
|
||||
DiskChecker.checkDir(sub);
|
||||
} else if (!Files.isReadable(sub.toPath())) {
|
||||
return new DiskErrorInformation(DiskErrorCause.OTHER, "Can not read " + sub);
|
||||
} else {
|
||||
LOG.debug("{} under {} is accessible", sub, dir);
|
||||
}
|
||||
}
|
||||
} catch (IOException | UncheckedIOException | SecurityException e) {
|
||||
return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private void createDir(FileContext localFs, Path dir, FsPermission perm)
|
||||
|
@ -20,8 +20,17 @@
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||
@ -32,16 +41,11 @@
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.DirsChangeListener;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestDirectoryCollection {
|
||||
|
||||
private static final File testDir = new File("target",
|
||||
TestDirectoryCollection.class.getName()).getAbsoluteFile();
|
||||
private static final File testFile = new File(testDir, "testfile");
|
||||
private File testDir;
|
||||
private File testFile;
|
||||
|
||||
private Configuration conf;
|
||||
private FileContext localFs;
|
||||
@ -50,7 +54,8 @@ public class TestDirectoryCollection {
|
||||
public void setupForTests() throws IOException {
|
||||
conf = new Configuration();
|
||||
localFs = FileContext.getLocalFSFileContext(conf);
|
||||
testDir.mkdirs();
|
||||
testDir = Files.createTempDirectory(TestDirectoryCollection.class.getName()).toFile();
|
||||
testFile = new File(testDir, "testfile");
|
||||
testFile.createNewFile();
|
||||
}
|
||||
|
||||
@ -516,6 +521,20 @@ public void testDirsChangeListener() {
|
||||
Assert.assertEquals(listener3.num, 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNonAccessibleSub() throws IOException {
|
||||
Files.setPosixFilePermissions(testDir.toPath(),
|
||||
PosixFilePermissions.fromString("rwx------"));
|
||||
Files.setPosixFilePermissions(testFile.toPath(),
|
||||
PosixFilePermissions.fromString("-w--w--w-"));
|
||||
DirectoryCollection dc = new DirectoryCollection(new String[]{testDir.toString()});
|
||||
Map<String, DirectoryCollection.DiskErrorInformation> diskErrorInformationMap =
|
||||
dc.testDirs(Collections.singletonList(testDir.toString()), Collections.emptySet());
|
||||
Assert.assertEquals(1, diskErrorInformationMap.size());
|
||||
Assert.assertTrue(diskErrorInformationMap.values().iterator().next()
|
||||
.message.contains(testFile.getName()));
|
||||
}
|
||||
|
||||
static class DirsChangeListenerTest implements DirsChangeListener {
|
||||
public int num = 0;
|
||||
public DirsChangeListenerTest() {
|
||||
|
Loading…
Reference in New Issue
Block a user