YARN-3943. Use separate threshold configurations for disk-full detection and disk-not-full detection. Contributed by Zhihai Xu

This commit is contained in:
Jason Lowe 2015-10-08 22:25:34 +00:00
parent 118a35bc2e
commit 8d226225d0
6 changed files with 142 additions and 50 deletions

View File

@ -492,6 +492,9 @@ Release 2.8.0 - UNRELEASED
YARN-4215. RMNodeLabels Manager Need to verify and replace node labels for the YARN-4215. RMNodeLabels Manager Need to verify and replace node labels for the
only modified Node Label Mappings in the request. (Naganarasimha G R via wangda) only modified Node Label Mappings in the request. (Naganarasimha G R via wangda)
YARN-3943. Use separate threshold configurations for disk-full detection
and disk-not-full detection. (Zhihai Xu via jlowe)
OPTIMIZATIONS OPTIMIZATIONS
YARN-3339. TestDockerContainerExecutor should pull a single image and not YARN-3339. TestDockerContainerExecutor should pull a single image and not

View File

@ -1059,6 +1059,18 @@ private static void addDeprecatedKeys() {
public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE = public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
90.0F; 90.0F;
/**
* The low threshold percentage of disk space used. An offline disk is marked
* as online again only once its utilization no longer exceeds this value.
* Values can range from 0.0 to 100.0. The value shouldn't
* be more than NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE. If its value is
* more than NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE or not set, it will be
* set to the same value as NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE.
* This applies to nm-local-dirs and nm-log-dirs.
*/
public static final String NM_WM_LOW_PER_DISK_UTILIZATION_PERCENTAGE =
NM_DISK_HEALTH_CHECK_PREFIX +
"disk-utilization-watermark-low-per-disk-percentage";
/** /**
* The minimum space that must be available on a local dir for it to be used. * The minimum space that must be available on a local dir for it to be used.
* This applies to nm-local-dirs and nm-log-dirs. * This applies to nm-local-dirs and nm-log-dirs.

View File

@ -1317,6 +1317,17 @@
<value>90.0</value> <value>90.0</value>
</property> </property>
<property>
<description>The low threshold percentage of disk space used. A bad disk is
marked as good again only once its utilization no longer exceeds this value.
Values can range from 0.0 to 100.0. This applies to
yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs.
Note that if its value is more than
yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage
or is not set, it will be set to the same value as
yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage.</description>
<name>yarn.nodemanager.disk-health-checker.disk-utilization-watermark-low-per-disk-percentage</name>
<value></value>
</property>
<property> <property>
<description>The minimum space that must be available on a disk for <description>The minimum space that must be available on a disk for
it to be used. This applies to yarn-nodemanager.local-dirs and it to be used. This applies to yarn-nodemanager.local-dirs and

View File

@ -39,6 +39,8 @@
import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.DiskChecker; import org.apache.hadoop.util.DiskChecker;
import com.google.common.annotations.VisibleForTesting;
/** /**
* Manages a list of local storage directories. * Manages a list of local storage directories.
*/ */
@ -89,7 +91,8 @@ static List<String> concat(List<String> l1, List<String> l2) {
private int numFailures; private int numFailures;
private float diskUtilizationPercentageCutoff; private float diskUtilizationPercentageCutoffHigh;
private float diskUtilizationPercentageCutoffLow;
private long diskUtilizationSpaceCutoff; private long diskUtilizationSpaceCutoff;
private int goodDirsDiskUtilizationPercentage; private int goodDirsDiskUtilizationPercentage;
@ -103,7 +106,7 @@ static List<String> concat(List<String> l1, List<String> l2) {
* directories to be monitored * directories to be monitored
*/ */
public DirectoryCollection(String[] dirs) { public DirectoryCollection(String[] dirs) {
this(dirs, 100.0F, 0); this(dirs, 100.0F, 100.0F, 0);
} }
/** /**
@ -119,7 +122,7 @@ public DirectoryCollection(String[] dirs) {
* *
*/ */
public DirectoryCollection(String[] dirs, float utilizationPercentageCutOff) { public DirectoryCollection(String[] dirs, float utilizationPercentageCutOff) {
this(dirs, utilizationPercentageCutOff, 0); this(dirs, utilizationPercentageCutOff, utilizationPercentageCutOff, 0);
} }
/** /**
@ -134,7 +137,7 @@ public DirectoryCollection(String[] dirs, float utilizationPercentageCutOff) {
* *
*/ */
public DirectoryCollection(String[] dirs, long utilizationSpaceCutOff) { public DirectoryCollection(String[] dirs, long utilizationSpaceCutOff) {
this(dirs, 100.0F, utilizationSpaceCutOff); this(dirs, 100.0F, 100.0F, utilizationSpaceCutOff);
} }
/** /**
@ -145,25 +148,29 @@ public DirectoryCollection(String[] dirs, long utilizationSpaceCutOff) {
* *
* @param dirs * @param dirs
* directories to be monitored * directories to be monitored
* @param utilizationPercentageCutOff * @param utilizationPercentageCutOffHigh
* percentage of disk that can be used before the dir is taken out of * percentage of disk that can be used before the dir is taken out of
* the good dirs list * the good dirs list
* @param utilizationPercentageCutOffLow
* maximum percentage of disk that may be in use for the dir to be
* moved from the bad dirs list back to the good dirs list; values
* above utilizationPercentageCutOffHigh are capped to it
* @param utilizationSpaceCutOff * @param utilizationSpaceCutOff
* minimum space, in MB, that must be available on the disk for the * minimum space, in MB, that must be available on the disk for the
* dir to be marked as good * dir to be marked as good
* *
*/ */
public DirectoryCollection(String[] dirs, public DirectoryCollection(String[] dirs,
float utilizationPercentageCutOff, float utilizationPercentageCutOffHigh,
float utilizationPercentageCutOffLow,
long utilizationSpaceCutOff) { long utilizationSpaceCutOff) {
localDirs = new CopyOnWriteArrayList<String>(dirs); localDirs = new CopyOnWriteArrayList<String>(dirs);
errorDirs = new CopyOnWriteArrayList<String>(); errorDirs = new CopyOnWriteArrayList<String>();
fullDirs = new CopyOnWriteArrayList<String>(); fullDirs = new CopyOnWriteArrayList<String>();
diskUtilizationPercentageCutoff = diskUtilizationPercentageCutoffHigh = Math.max(0.0F, Math.min(100.0F,
utilizationPercentageCutOff < 0.0F ? 0.0F utilizationPercentageCutOffHigh));
: (utilizationPercentageCutOff > 100.0F ? 100.0F diskUtilizationPercentageCutoffLow = Math.max(0.0F, Math.min(
: utilizationPercentageCutOff); diskUtilizationPercentageCutoffHigh, utilizationPercentageCutOffLow));
diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff =
utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff; utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff;
@ -254,7 +261,8 @@ synchronized boolean checkDirs() {
List<String> allLocalDirs = List<String> allLocalDirs =
DirectoryCollection.concat(localDirs, failedDirs); DirectoryCollection.concat(localDirs, failedDirs);
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs); Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
preCheckGoodDirs);
localDirs.clear(); localDirs.clear();
errorDirs.clear(); errorDirs.clear();
@ -314,7 +322,8 @@ synchronized boolean checkDirs() {
return setChanged; return setChanged;
} }
Map<String, DiskErrorInformation> testDirs(List<String> dirs) { Map<String, DiskErrorInformation> testDirs(List<String> dirs,
Set<String> goodDirs) {
HashMap<String, DiskErrorInformation> ret = HashMap<String, DiskErrorInformation> ret =
new HashMap<String, DiskErrorInformation>(); new HashMap<String, DiskErrorInformation>();
for (final String dir : dirs) { for (final String dir : dirs) {
@ -322,7 +331,10 @@ Map<String, DiskErrorInformation> testDirs(List<String> dirs) {
try { try {
File testDir = new File(dir); File testDir = new File(dir);
DiskChecker.checkDir(testDir); DiskChecker.checkDir(testDir);
if (isDiskUsageOverPercentageLimit(testDir)) { float diskUtilizationPercentageCutoff = goodDirs.contains(dir) ?
diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
if (isDiskUsageOverPercentageLimit(testDir,
diskUtilizationPercentageCutoff)) {
msg = msg =
"used space above threshold of " "used space above threshold of "
+ diskUtilizationPercentageCutoff + diskUtilizationPercentageCutoff
@ -374,7 +386,8 @@ private void verifyDirUsingMkdir(File dir) throws IOException {
} }
} }
private boolean isDiskUsageOverPercentageLimit(File dir) { private boolean isDiskUsageOverPercentageLimit(File dir,
float diskUtilizationPercentageCutoff) {
float freePercentage = float freePercentage =
100 * (dir.getUsableSpace() / (float) dir.getTotalSpace()); 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
float usedPercentage = 100.0F - freePercentage; float usedPercentage = 100.0F - freePercentage;
@ -403,16 +416,23 @@ private void createDir(FileContext localFs, Path dir, FsPermission perm)
} }
} }
public float getDiskUtilizationPercentageCutoff() { @VisibleForTesting
return diskUtilizationPercentageCutoff; float getDiskUtilizationPercentageCutoffHigh() {
return diskUtilizationPercentageCutoffHigh;
}
@VisibleForTesting
float getDiskUtilizationPercentageCutoffLow() {
return diskUtilizationPercentageCutoffLow;
} }
public void setDiskUtilizationPercentageCutoff( public void setDiskUtilizationPercentageCutoff(
float diskUtilizationPercentageCutoff) { float utilizationPercentageCutOffHigh,
this.diskUtilizationPercentageCutoff = float utilizationPercentageCutOffLow) {
diskUtilizationPercentageCutoff < 0.0F ? 0.0F diskUtilizationPercentageCutoffHigh = Math.max(0.0F, Math.min(100.0F,
: (diskUtilizationPercentageCutoff > 100.0F ? 100.0F utilizationPercentageCutOffHigh));
: diskUtilizationPercentageCutoff); diskUtilizationPercentageCutoffLow = Math.max(0.0F, Math.min(
diskUtilizationPercentageCutoffHigh, utilizationPercentageCutOffLow));
} }
public long getDiskUtilizationSpaceCutoff() { public long getDiskUtilizationSpaceCutoff() {

View File

@ -114,22 +114,40 @@ public class LocalDirsHandlerService extends AbstractService {
private final class MonitoringTimerTask extends TimerTask { private final class MonitoringTimerTask extends TimerTask {
public MonitoringTimerTask(Configuration conf) throws YarnRuntimeException { public MonitoringTimerTask(Configuration conf) throws YarnRuntimeException {
float maxUsableSpacePercentagePerDisk = float highUsableSpacePercentagePerDisk =
conf.getFloat( conf.getFloat(
YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE, YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE); YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE);
float lowUsableSpacePercentagePerDisk =
conf.getFloat(
YarnConfiguration.NM_WM_LOW_PER_DISK_UTILIZATION_PERCENTAGE,
highUsableSpacePercentagePerDisk);
if (lowUsableSpacePercentagePerDisk > highUsableSpacePercentagePerDisk) {
LOG.warn("Using " + YarnConfiguration.
NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE + " as " +
YarnConfiguration.NM_WM_LOW_PER_DISK_UTILIZATION_PERCENTAGE +
", because " + YarnConfiguration.
NM_WM_LOW_PER_DISK_UTILIZATION_PERCENTAGE +
" is not configured properly.");
lowUsableSpacePercentagePerDisk = highUsableSpacePercentagePerDisk;
}
long minFreeSpacePerDiskMB = long minFreeSpacePerDiskMB =
conf.getLong(YarnConfiguration.NM_MIN_PER_DISK_FREE_SPACE_MB, conf.getLong(YarnConfiguration.NM_MIN_PER_DISK_FREE_SPACE_MB,
YarnConfiguration.DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB); YarnConfiguration.DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB);
localDirs = localDirs =
new DirectoryCollection( new DirectoryCollection(
validatePaths(conf validatePaths(conf
.getTrimmedStrings(YarnConfiguration.NM_LOCAL_DIRS)), .getTrimmedStrings(YarnConfiguration.NM_LOCAL_DIRS)),
maxUsableSpacePercentagePerDisk, minFreeSpacePerDiskMB); highUsableSpacePercentagePerDisk,
lowUsableSpacePercentagePerDisk,
minFreeSpacePerDiskMB);
logDirs = logDirs =
new DirectoryCollection( new DirectoryCollection(
validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOG_DIRS)), validatePaths(conf
maxUsableSpacePercentagePerDisk, minFreeSpacePerDiskMB); .getTrimmedStrings(YarnConfiguration.NM_LOG_DIRS)),
highUsableSpacePercentagePerDisk,
lowUsableSpacePercentagePerDisk,
minFreeSpacePerDiskMB);
String local = conf.get(YarnConfiguration.NM_LOCAL_DIRS); String local = conf.get(YarnConfiguration.NM_LOCAL_DIRS);
conf.set(NM_GOOD_LOCAL_DIRS, conf.set(NM_GOOD_LOCAL_DIRS,

View File

@ -152,7 +152,7 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
// no good dirs // no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage()); Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, 100.0F, 0); dc = new DirectoryCollection(dirs, 100.0F, 100.0F, 0);
utilizedSpacePerc = utilizedSpacePerc =
(int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 / (int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
testDir.getTotalSpace()); testDir.getTotalSpace());
@ -168,18 +168,28 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
public void testDiskLimitsCutoffSetters() throws IOException { public void testDiskLimitsCutoffSetters() throws IOException {
String[] dirs = { "dir" }; String[] dirs = { "dir" };
DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F, 100); DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F, 0.0F, 100);
float testValue = 57.5F; float testValue = 57.5F;
float delta = 0.1F; float delta = 0.1F;
dc.setDiskUtilizationPercentageCutoff(testValue); dc.setDiskUtilizationPercentageCutoff(testValue, 50.0F);
Assert.assertEquals(testValue, dc.getDiskUtilizationPercentageCutoff(), Assert.assertEquals(testValue, dc.getDiskUtilizationPercentageCutoffHigh(),
delta); delta);
Assert.assertEquals(50.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
testValue = -57.5F; testValue = -57.5F;
dc.setDiskUtilizationPercentageCutoff(testValue); dc.setDiskUtilizationPercentageCutoff(testValue, testValue);
Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
testValue = 157.5F; testValue = 157.5F;
dc.setDiskUtilizationPercentageCutoff(testValue); dc.setDiskUtilizationPercentageCutoff(testValue, testValue);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
long spaceValue = 57; long spaceValue = 57;
dc.setDiskUtilizationSpaceCutoff(spaceValue); dc.setDiskUtilizationSpaceCutoff(spaceValue);
@ -200,7 +210,7 @@ public void testFailedDisksBecomingGoodAgain() throws Exception {
Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size());
dc.setDiskUtilizationPercentageCutoff(100.0F); dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
dc.checkDirs(); dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size());
@ -236,27 +246,45 @@ public void testConstructors() {
String[] dirs = { "dir" }; String[] dirs = { "dir" };
float delta = 0.1F; float delta = 0.1F;
DirectoryCollection dc = new DirectoryCollection(dirs); DirectoryCollection dc = new DirectoryCollection(dirs);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
dc = new DirectoryCollection(dirs, 57.5F); dc = new DirectoryCollection(dirs, 57.5F);
Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
dc = new DirectoryCollection(dirs, 57); dc = new DirectoryCollection(dirs, 57);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(57, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(57, dc.getDiskUtilizationSpaceCutoff());
dc = new DirectoryCollection(dirs, 57.5F, 67); dc = new DirectoryCollection(dirs, 57.5F, 50.5F, 67);
Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(50.5F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(67, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(67, dc.getDiskUtilizationSpaceCutoff());
dc = new DirectoryCollection(dirs, -57.5F, -67); dc = new DirectoryCollection(dirs, -57.5F, -57.5F, -67);
Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
dc = new DirectoryCollection(dirs, 157.5F, -67); dc = new DirectoryCollection(dirs, 157.5F, 157.5F, -67);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta); Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffHigh(),
delta);
Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoffLow(),
delta);
Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff()); Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
} }
@ -288,7 +316,7 @@ public void testDirsChangeListener() {
Assert.assertEquals(listener3.num, 1); Assert.assertEquals(listener3.num, 1);
dc.deregisterDirsChangeListener(listener2); dc.deregisterDirsChangeListener(listener2);
dc.setDiskUtilizationPercentageCutoff(100.0F); dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
dc.checkDirs(); dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(listener1.num, 3); Assert.assertEquals(listener1.num, 3);