[YARN-10687] Add option to disable/enable free disk space checking and percentage checking for full and not-full disks. Contributed by Qi Zhu.

This commit is contained in:
Jim Brennan 2021-03-12 17:17:31 +00:00
parent 5db4c0bf70
commit 5f067cf0f3
4 changed files with 185 additions and 4 deletions

View File

@ -2027,6 +2027,8 @@ public static boolean isAclEnabled(Configuration conf) {
* marked as offline. Values can range from 0.0 to 100.0. If the value is * marked as offline. Values can range from 0.0 to 100.0. If the value is
* greater than or equal to 100, NM will check for full disk. This applies to * greater than or equal to 100, NM will check for full disk. This applies to
* nm-local-dirs and nm-log-dirs. * nm-local-dirs and nm-log-dirs.
*
* This applies when disk-utilization-threshold.enabled is true.
*/ */
public static final String NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE = public static final String NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
NM_DISK_HEALTH_CHECK_PREFIX + "max-disk-utilization-per-disk-percentage"; NM_DISK_HEALTH_CHECK_PREFIX + "max-disk-utilization-per-disk-percentage";
@ -2036,6 +2038,17 @@ public static boolean isAclEnabled(Configuration conf) {
public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE = public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
90.0F; 90.0F;
/**
* Configuration key that enables or disables the disk utilization
* percentage threshold of the disk health checker, i.e. whether
* {@link #NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE} is applied.
*/
public static final String NM_DISK_UTILIZATION_THRESHOLD_ENABLED =
NM_DISK_HEALTH_CHECK_PREFIX +
"disk-utilization-threshold.enabled";
/** Default: the disk utilization percentage threshold is enabled. */
public static final
boolean DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED = true;
/** /**
* The low threshold percentage of disk space used when an offline disk is * The low threshold percentage of disk space used when an offline disk is
* marked as online. Values can range from 0.0 to 100.0. The value shouldn't * marked as online. Values can range from 0.0 to 100.0. The value shouldn't
@ -2051,9 +2064,23 @@ public static boolean isAclEnabled(Configuration conf) {
/** /**
* The minimum space that must be available on a local dir for it to be used. * The minimum space that must be available on a local dir for it to be used.
* This applies to nm-local-dirs and nm-log-dirs. * This applies to nm-local-dirs and nm-log-dirs.
*
* This applies when disk-free-space-threshold.enabled is true.
*/ */
public static final String NM_MIN_PER_DISK_FREE_SPACE_MB = public static final String NM_MIN_PER_DISK_FREE_SPACE_MB =
NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb"; NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
/**
* Configuration key that enables or disables the minimum disk free space
* threshold of the disk health checker, i.e. whether
* {@link #NM_MIN_PER_DISK_FREE_SPACE_MB} is applied.
*/
public static final String NM_DISK_FREE_SPACE_THRESHOLD_ENABLED =
NM_DISK_HEALTH_CHECK_PREFIX +
"disk-free-space-threshold.enabled";
/** Default: the minimum disk free space threshold is enabled. */
public static final boolean
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
/** /**
* The minimum space that must be available on an offline * The minimum space that must be available on an offline
* disk for it to be marked as online. The value should not be less * disk for it to be marked as online. The value should not be less

View File

@ -1809,12 +1809,27 @@
<value>0.25</value> <value>0.25</value>
</property> </property>
<property>
<description>Enable/Disable the disk utilization percentage
threshold for disk health checker.</description>
<name>yarn.nodemanager.disk-health-checker.disk-utilization-threshold.enabled</name>
<value>true</value>
</property>
<property>
<description>Enable/Disable the minimum disk free
space threshold for disk health checker.</description>
<name>yarn.nodemanager.disk-health-checker.disk-free-space-threshold.enabled</name>
<value>true</value>
</property>
<property> <property>
<description>The maximum percentage of disk space utilization allowed after <description>The maximum percentage of disk space utilization allowed after
which a disk is marked as bad. Values can range from 0.0 to 100.0. which a disk is marked as bad. Values can range from 0.0 to 100.0.
If the value is greater than or equal to 100, the nodemanager will check If the value is greater than or equal to 100, the nodemanager will check
for full disk. This applies to yarn.nodemanager.local-dirs and for full disk. This applies to yarn.nodemanager.local-dirs and
yarn.nodemanager.log-dirs.</description> yarn.nodemanager.log-dirs when
yarn.nodemanager.disk-health-checker.disk-utilization-threshold.enabled is true.</description>
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name> <name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>90.0</value> <value>90.0</value>
</property> </property>
@ -1834,7 +1849,8 @@
<description>The minimum space in megabytes that must be available on a disk for <description>The minimum space in megabytes that must be available on a disk for
it to be used. If space on a disk falls below this threshold, it will be marked it to be used. If space on a disk falls below this threshold, it will be marked
as bad. This applies to yarn.nodemanager.local-dirs and as bad. This applies to yarn.nodemanager.local-dirs and
yarn.nodemanager.log-dirs.</description> yarn.nodemanager.log-dirs when
yarn.nodemanager.disk-health-checker.disk-free-space-threshold.enabled is true.</description>
<name>yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb</name> <name>yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb</name>
<value>0</value> <value>0</value>
</property> </property>

View File

@ -59,6 +59,9 @@ public class DirectoryCollection {
private final Configuration conf; private final Configuration conf;
private final DiskValidator diskValidator; private final DiskValidator diskValidator;
// Read from YarnConfiguration.NM_DISK_UTILIZATION_THRESHOLD_ENABLED in the
// constructor; when false, testDirs skips the utilization percentage check.
private boolean diskUtilizationThresholdEnabled;
// Read from YarnConfiguration.NM_DISK_FREE_SPACE_THRESHOLD_ENABLED in the
// constructor; when false, testDirs skips the minimum free space check.
private boolean diskFreeSpaceThresholdEnabled;
/** /**
* The enum defines disk failure type. * The enum defines disk failure type.
*/ */
@ -239,6 +242,17 @@ public DirectoryCollection(String[] dirs,
throw new YarnRuntimeException(e); throw new YarnRuntimeException(e);
} }
diskUtilizationThresholdEnabled = conf.
getBoolean(YarnConfiguration.
NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
YarnConfiguration.
DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
diskFreeSpaceThresholdEnabled = conf.
getBoolean(YarnConfiguration.
NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
YarnConfiguration.
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
localDirs = new ArrayList<>(Arrays.asList(dirs)); localDirs = new ArrayList<>(Arrays.asList(dirs));
errorDirs = new ArrayList<>(); errorDirs = new ArrayList<>();
fullDirs = new ArrayList<>(); fullDirs = new ArrayList<>();
@ -520,7 +534,9 @@ Map<String, DiskErrorInformation> testDirs(List<String> dirs,
diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow; diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
long diskFreeSpaceCutoff = goodDirs.contains(dir) ? long diskFreeSpaceCutoff = goodDirs.contains(dir) ?
diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh; diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
if (isDiskUsageOverPercentageLimit(testDir,
if (diskUtilizationThresholdEnabled
&& isDiskUsageOverPercentageLimit(testDir,
diskUtilizationPercentageCutoff)) { diskUtilizationPercentageCutoff)) {
msg = msg =
"used space above threshold of " "used space above threshold of "
@ -529,7 +545,8 @@ Map<String, DiskErrorInformation> testDirs(List<String> dirs,
ret.put(dir, ret.put(dir,
new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg)); new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
continue; continue;
} else if (isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) { } else if (diskFreeSpaceThresholdEnabled
&& isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) {
msg = msg =
"free space below limit of " + diskFreeSpaceCutoff "free space below limit of " + diskFreeSpaceCutoff
+ "MB"; + "MB";
@ -613,6 +630,28 @@ long getDiskUtilizationSpaceCutoffHigh() {
return diskFreeSpaceCutoffHigh; return diskFreeSpaceCutoffHigh;
} }
/**
 * Reports whether the disk utilization percentage check is switched on.
 *
 * @return the current value of the utilization threshold switch
 */
@VisibleForTesting
boolean getDiskUtilizationThresholdEnabled() {
  return this.diskUtilizationThresholdEnabled;
}
/**
 * Reports whether the minimum free space check is switched on.
 *
 * @return the current value of the free space threshold switch
 */
@VisibleForTesting
boolean getDiskFreeSpaceThresholdEnabled() {
  return this.diskFreeSpaceThresholdEnabled;
}
/**
 * Switches the disk utilization percentage check on or off.
 *
 * @param utilizationEnabled new value of the utilization threshold switch
 */
@VisibleForTesting
void setDiskUtilizationThresholdEnabled(boolean utilizationEnabled) {
  this.diskUtilizationThresholdEnabled = utilizationEnabled;
}
/**
 * Switches the minimum free space check on or off.
 *
 * @param freeSpaceEnabled new value of the free space threshold switch
 */
@VisibleForTesting
void setDiskFreeSpaceThresholdEnabled(boolean freeSpaceEnabled) {
  this.diskFreeSpaceThresholdEnabled = freeSpaceEnabled;
}
public void setDiskUtilizationSpaceCutoff(long freeSpaceCutoff) { public void setDiskUtilizationSpaceCutoff(long freeSpaceCutoff) {
setDiskUtilizationSpaceCutoff(freeSpaceCutoff, setDiskUtilizationSpaceCutoff(freeSpaceCutoff,
freeSpaceCutoff); freeSpaceCutoff);

View File

@ -176,6 +176,105 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
dc.getGoodDirsDiskUtilizationPercentage()); dc.getGoodDirsDiskUtilizationPercentage());
} }
/**
 * Exercises the enable/disable switches for the utilization percentage and
 * free space thresholds and checks how the single test directory is
 * classified after each {@code checkDirs()} call.
 */
@Test
public void testDiskSpaceUtilizationThresholdEnabled() throws IOException {
  final String onlyDir = new File(testDir, "dirA").getPath();
  final String[] dirs = {onlyDir};

  // Collection built with 0.0F: with the utilization threshold switched
  // off the directory is expected to stay good.
  DirectoryCollection zeroCutoff = new DirectoryCollection(dirs, 0.0F);
  zeroCutoff.setDiskUtilizationThresholdEnabled(false);
  Assert.assertFalse(zeroCutoff.getDiskUtilizationThresholdEnabled());
  zeroCutoff.checkDirs();
  assertSoleDirIsGood(zeroCutoff, onlyDir);

  // Same collection with the utilization threshold switched back on: the
  // directory is expected to be reported as full.
  zeroCutoff.setDiskUtilizationThresholdEnabled(true);
  Assert.assertTrue(zeroCutoff.getDiskUtilizationThresholdEnabled());
  zeroCutoff.checkDirs();
  assertSoleDirIsFull(zeroCutoff, onlyDir);
  Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL,
      zeroCutoff.getDirectoryErrorInfo(onlyDir).cause);
  // no good dirs
  Assert.assertEquals(0,
      zeroCutoff.getGoodDirsDiskUtilizationPercentage());

  // Collection built with 100.0F and both switches left untouched.
  DirectoryCollection fullCutoff = new DirectoryCollection(dirs, 100.0F);
  int expectedUsedPercent = currentUsedSpacePercent();
  fullCutoff.checkDirs();
  assertSoleDirIsGood(fullCutoff, onlyDir);
  Assert.assertEquals(expectedUsedPercent,
      fullCutoff.getGoodDirsDiskUtilizationPercentage());

  // Collection built with the volume's total size in MB; both thresholds
  // switched off, so the directory is expected to stay good.
  DirectoryCollection bothDisabled = new DirectoryCollection(dirs,
      testDir.getTotalSpace() / (1024 * 1024));
  bothDisabled.setDiskUtilizationThresholdEnabled(false);
  Assert.assertFalse(bothDisabled.getDiskUtilizationThresholdEnabled());
  bothDisabled.setDiskFreeSpaceThresholdEnabled(false);
  Assert.assertFalse(bothDisabled.getDiskFreeSpaceThresholdEnabled());
  bothDisabled.checkDirs();
  assertSoleDirIsGood(bothDisabled, onlyDir);

  // Same construction with the free space threshold explicitly switched
  // on: the directory is expected to be reported as full.
  DirectoryCollection freeSpaceEnabled = new DirectoryCollection(dirs,
      testDir.getTotalSpace() / (1024 * 1024));
  freeSpaceEnabled.setDiskFreeSpaceThresholdEnabled(true);
  Assert.assertTrue(freeSpaceEnabled.getDiskFreeSpaceThresholdEnabled());
  freeSpaceEnabled.checkDirs();
  assertSoleDirIsFull(freeSpaceEnabled, onlyDir);
  // no good dirs
  Assert.assertEquals(0,
      freeSpaceEnabled.getGoodDirsDiskUtilizationPercentage());

  // Collection built with (100.0F, 100.0F, 0) and both switches untouched.
  DirectoryCollection relaxed =
      new DirectoryCollection(dirs, 100.0F, 100.0F, 0);
  expectedUsedPercent = currentUsedSpacePercent();
  relaxed.checkDirs();
  assertSoleDirIsGood(relaxed, onlyDir);
  Assert.assertEquals(expectedUsedPercent,
      relaxed.getGoodDirsDiskUtilizationPercentage());
}

/** Computes the used-space percentage of the volume holding testDir. */
private int currentUsedSpacePercent() {
  return (int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
      testDir.getTotalSpace());
}

/** Asserts that the collection holds one good dir and no error for dir. */
private void assertSoleDirIsGood(DirectoryCollection collection,
    String dir) {
  Assert.assertEquals(1, collection.getGoodDirs().size());
  Assert.assertEquals(0, collection.getErroredDirs().size());
  Assert.assertEquals(0, collection.getFailedDirs().size());
  Assert.assertEquals(0, collection.getFullDirs().size());
  Assert.assertNull(collection.getDirectoryErrorInfo(dir));
}

/** Asserts that the collection holds one failed, full dir with error info. */
private void assertSoleDirIsFull(DirectoryCollection collection,
    String dir) {
  Assert.assertEquals(0, collection.getGoodDirs().size());
  Assert.assertEquals(0, collection.getErroredDirs().size());
  Assert.assertEquals(1, collection.getFailedDirs().size());
  Assert.assertEquals(1, collection.getFullDirs().size());
  Assert.assertNotNull(collection.getDirectoryErrorInfo(dir));
}
@Test @Test
public void testDiskLimitsCutoffSetters() throws IOException { public void testDiskLimitsCutoffSetters() throws IOException {