diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 75c90c9a35..980a6014a7 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -271,6 +271,10 @@ Release 2.4.0 - UNRELEASED
     YARN-1525. Web UI should redirect to active RM when HA is enabled. (Cindy
     Li via kasha)
 
+    YARN-1781. Modified NodeManagers to allow admins to specify max disk
+    utilization for local disks so as to be able to offline full disks. (Varun
+    Vasudev via vinodkv)
+
   OPTIMIZATIONS
 
   BUG FIXES
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 9b7dda5ff3..dbb8465e35 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -717,32 +717,59 @@ public class YarnConfiguration extends Configuration {
   /** Class that calculates process tree resource utilization.*/
   public static final String NM_CONTAINER_MON_PROCESS_TREE =
     NM_PREFIX + "container-monitor.process-tree.class";
-  
+
+  /** Prefix for all node manager disk health checker configs. */
+  private static final String NM_DISK_HEALTH_CHECK_PREFIX =
+      "yarn.nodemanager.disk-health-checker.";
   /**
-   * Enable/Disable disks' health checker. Default is true.
-   * An expert level configuration property.
+   * Enable/Disable disks' health checker. Default is true. An expert level
+   * configuration property.
    */
   public static final String NM_DISK_HEALTH_CHECK_ENABLE =
-    NM_PREFIX + "disk-health-checker.enable";
-  /** Frequency of running disks' health checker.*/
+      NM_DISK_HEALTH_CHECK_PREFIX + "enable";
+  /** Frequency of running disks' health checker. */
   public static final String NM_DISK_HEALTH_CHECK_INTERVAL_MS =
-    NM_PREFIX + "disk-health-checker.interval-ms";
+      NM_DISK_HEALTH_CHECK_PREFIX + "interval-ms";
   /** By default, disks' health is checked every 2 minutes. */
   public static final long DEFAULT_NM_DISK_HEALTH_CHECK_INTERVAL_MS =
-    2 * 60 * 1000;
+      2 * 60 * 1000;
 
   /**
    * The minimum fraction of number of disks to be healthy for the nodemanager
    * to launch new containers. This applies to nm-local-dirs and nm-log-dirs.
    */
   public static final String NM_MIN_HEALTHY_DISKS_FRACTION =
-    NM_PREFIX + "disk-health-checker.min-healthy-disks";
+      NM_DISK_HEALTH_CHECK_PREFIX + "min-healthy-disks";
   /**
-   * By default, at least 25% of disks are to be healthy to say that the node
-   * is healthy in terms of disks.
+   * By default, at least 25% of disks are to be healthy to say that the node is
+   * healthy in terms of disks.
    */
-  public static final float DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION
-    = 0.25F;
+  public static final float DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION = 0.25F;
+
+  /**
+   * The maximum percentage of disk space that may be utilized before a disk
+   * is marked as offline. Values can range from 0.0 to 100.0. If the value
+   * is greater than or equal to 100, the NM only checks whether the disk is
+   * completely full. This applies to nm-local-dirs and nm-log-dirs.
+   */
+  public static final String NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
+      NM_DISK_HEALTH_CHECK_PREFIX + "max-disk-utilization-per-disk-percentage";
+  /**
+   * By default, 100% of the disk can be used before it is marked as offline.
+   */
+  public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
+      100.0F;
+
+  /**
+   * The minimum space that must be available on a local dir for it to be used.
+   * This applies to nm-local-dirs and nm-log-dirs.
+   */
+  public static final String NM_MIN_PER_DISK_FREE_SPACE_MB =
+      NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
+  /**
+   * By default, no minimum free space is required; the entire disk may be used.
+   */
+  public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
 
   /** Frequency of running node health script.*/
   public static final String NM_HEALTH_CHECK_INTERVAL_MS =
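The snippet below is a minimal, hypothetical sketch (not part of the patch) of how the two new settings resolve against their defaults; the class name DiskCutoffDefaults is invented for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class DiskCutoffDefaults {
  public static void main(String[] args) {
    // new YarnConfiguration() loads yarn-default.xml plus any yarn-site.xml.
    Configuration conf = new YarnConfiguration();
    float maxUtilizationPercent =
        conf.getFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
            YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE);
    long minFreeSpaceMB =
        conf.getLong(YarnConfiguration.NM_MIN_PER_DISK_FREE_SPACE_MB,
            YarnConfiguration.DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB);
    // With no overrides this prints "100.0% / 0 MB": any disk short of
    // completely full stays in service, and no minimum free space is enforced.
    System.out.println(maxUtilizationPercent + "% / " + minFreeSpaceMB + " MB");
  }
}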
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 5b3ede7d14..c1344679bc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -870,6 +870,24 @@
     <value>0.25</value>
   </property>
 
+  <property>
+    <description>The maximum percentage of disk space utilization allowed
+    after which a disk is marked as bad. Values can range from 0.0 to 100.0.
+    If the value is greater than or equal to 100, the nodemanager only checks
+    whether the disk is completely full. This applies to
+    yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs.</description>
+    <name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
+    <value>100.0</value>
+  </property>
+
+  <property>
+    <description>The minimum space that must be available on a disk for
+    it to be used. This applies to yarn.nodemanager.local-dirs and
+    yarn.nodemanager.log-dirs.</description>
+    <name>yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb</name>
+    <value>0</value>
+  </property>
+
   <property>
     <description>The path to the Linux container executor.</description>
     <name>yarn.nodemanager.linux-container-executor.path</name>
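Admins would normally override these two properties in yarn-site.xml; a hypothetical programmatic equivalent, with illustrative values only (90% utilization cap, 1 GB minimum free), looks like this:

import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class DiskCutoffOverride {
  public static void main(String[] args) {
    YarnConfiguration conf = new YarnConfiguration();
    // Take a disk out of service once it is more than 90% full...
    conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
        90.0F);
    // ...or once it has less than 1024 MB of free space left.
    conf.setLong(YarnConfiguration.NM_MIN_PER_DISK_FREE_SPACE_MB, 1024L);
  }
}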
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
index 10362d2294..f6ee128992 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
@@ -22,6 +22,7 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.concurrent.CopyOnWriteArrayList;
 
@@ -43,10 +44,77 @@ class DirectoryCollection {
   private List<String> localDirs;
   private List<String> failedDirs;
   private int numFailures;
+
+  private float diskUtilizationPercentageCutoff;
+  private long diskUtilizationSpaceCutoff;
 
+  /**
+   * Create collection for the directories specified. No check for free space.
+   *
+   * @param dirs
+   *          directories to be monitored
+   */
   public DirectoryCollection(String[] dirs) {
+    this(dirs, 100.0F, 0);
+  }
+
+  /**
+   * Create collection for the directories specified. Users must specify the
+   * maximum percentage of disk utilization allowed. The minimum amount of
+   * free disk space is not checked.
+   *
+   * @param dirs
+   *          directories to be monitored
+   * @param utilizationPercentageCutOff
+   *          percentage of disk that can be used before the dir is taken out
+   *          of the good dirs list
+   */
+  public DirectoryCollection(String[] dirs, float utilizationPercentageCutOff) {
+    this(dirs, utilizationPercentageCutOff, 0);
+  }
+
+  /**
+   * Create collection for the directories specified. Users must specify the
+   * minimum amount of free space that must be available for the dir to be
+   * used. The maximum percentage of disk utilization is not checked.
+   *
+   * @param dirs
+   *          directories to be monitored
+   * @param utilizationSpaceCutOff
+   *          minimum space, in MB, that must be available on the disk for the
+   *          dir to be marked as good
+   */
+  public DirectoryCollection(String[] dirs, long utilizationSpaceCutOff) {
+    this(dirs, 100.0F, utilizationSpaceCutOff);
+  }
+
+  /**
+   * Create collection for the directories specified. Users must specify the
+   * maximum percentage of disk utilization allowed and the minimum amount of
+   * free space that must be available for the dir to be used. If either check
+   * fails the dir is removed from the good dirs list.
+   *
+   * @param dirs
+   *          directories to be monitored
+   * @param utilizationPercentageCutOff
+   *          percentage of disk that can be used before the dir is taken out
+   *          of the good dirs list
+   * @param utilizationSpaceCutOff
+   *          minimum space, in MB, that must be available on the disk for the
+   *          dir to be marked as good
+   */
+  public DirectoryCollection(String[] dirs,
+      float utilizationPercentageCutOff,
+      long utilizationSpaceCutOff) {
     localDirs = new CopyOnWriteArrayList<String>(dirs);
     failedDirs = new CopyOnWriteArrayList<String>();
+    // Clamp the cutoffs to sane ranges: [0.0F, 100.0F] percent, >= 0 MB.
+    diskUtilizationPercentageCutoff =
+        utilizationPercentageCutOff < 0.0F ? 0.0F
+            : (utilizationPercentageCutOff > 100.0F ? 100.0F
+                : utilizationPercentageCutOff);
+    diskUtilizationSpaceCutoff =
+        utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff;
   }
 
   /**
@@ -103,19 +171,54 @@ synchronized boolean createNonExistentDirs(FileContext localFs,
    */
   synchronized boolean checkDirs() {
     int oldNumFailures = numFailures;
+    HashSet<String> checkFailedDirs = new HashSet<String>();
     for (final String dir : localDirs) {
       try {
-        DiskChecker.checkDir(new File(dir));
+        File testDir = new File(dir);
+        DiskChecker.checkDir(testDir);
+        if (isDiskUsageOverPercentageLimit(testDir)) {
+          LOG.warn("Directory " + dir
+              + " error, used space above threshold of "
+              + diskUtilizationPercentageCutoff
+              + "%, removing from the list of valid directories.");
+          checkFailedDirs.add(dir);
+        } else if (isDiskFreeSpaceUnderLimit(testDir)) {
+          LOG.warn("Directory " + dir + " error, free space below limit of "
+              + diskUtilizationSpaceCutoff
+              + "MB, removing from the list of valid directories.");
+          checkFailedDirs.add(dir);
+        }
       } catch (DiskErrorException de) {
-        LOG.warn("Directory " + dir + " error " +
-            de.getMessage() + ", removing from the list of valid directories.");
-        localDirs.remove(dir);
-        failedDirs.add(dir);
-        numFailures++;
+        LOG.warn("Directory " + dir + " error " + de.getMessage()
+            + ", removing from the list of valid directories.");
+        checkFailedDirs.add(dir);
       }
     }
+    // Remove failed dirs outside the loop above so localDirs is not mutated
+    // while it is being traversed.
+    for (String dir : checkFailedDirs) {
+      localDirs.remove(dir);
+      failedDirs.add(dir);
+      numFailures++;
+    }
     return numFailures > oldNumFailures;
   }
+
+  // True when the dir's disk is more full than the percentage cutoff allows.
+  private boolean isDiskUsageOverPercentageLimit(File dir) {
+    float freePercentage =
+        100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
+    float usedPercentage = 100.0F - freePercentage;
+    return usedPercentage > diskUtilizationPercentageCutoff
+        || usedPercentage >= 100.0F;
+  }
+
+  // True when the dir's disk has less free space, in MB, than the minimum
+  // free-space cutoff requires.
+  private boolean isDiskFreeSpaceUnderLimit(File dir) {
+    long freeSpace = dir.getUsableSpace() / (1024 * 1024);
+    return freeSpace < this.diskUtilizationSpaceCutoff;
+  }
 
   private void createDir(FileContext localFs, Path dir, FsPermission perm)
       throws IOException {
@@ -132,4 +235,25 @@ private void createDir(FileContext localFs, Path dir, FsPermission perm)
       }
     }
   }
+
+  public float getDiskUtilizationPercentageCutoff() {
+    return diskUtilizationPercentageCutoff;
+  }
+
+  public void setDiskUtilizationPercentageCutoff(
+      float diskUtilizationPercentageCutoff) {
+    this.diskUtilizationPercentageCutoff =
+        diskUtilizationPercentageCutoff < 0.0F ? 0.0F
+            : (diskUtilizationPercentageCutoff > 100.0F ? 100.0F
+                : diskUtilizationPercentageCutoff);
+  }
+
+  public long getDiskUtilizationSpaceCutoff() {
+    return diskUtilizationSpaceCutoff;
+  }
+
+  public void setDiskUtilizationSpaceCutoff(long diskUtilizationSpaceCutoff) {
+    this.diskUtilizationSpaceCutoff =
+        diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
+  }
 }
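A hypothetical harness showing how the collection is exercised end to end; DirectoryCollection and checkDirs() are package-private, so code like this has to live in org.apache.hadoop.yarn.server.nodemanager, and the class name and path below are invented:

package org.apache.hadoop.yarn.server.nodemanager;

public class DirCheckSketch {
  public static void main(String[] args) {
    // Fail a dir once its disk is over 90% full or has under 500 MB free.
    DirectoryCollection dc =
        new DirectoryCollection(new String[] { "/tmp/nm-local-dir" },
            90.0F, 500L);
    // checkDirs() returns true when new failures were recorded on this pass.
    boolean newFailures = dc.checkDirs();
    System.out.println("new failures: " + newFailures);
    System.out.println("good dirs:   " + dc.getGoodDirs());
    System.out.println("failed dirs: " + dc.getFailedDirs());
  }
}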
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
index b9b46ebedf..b0539414de 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java
@@ -89,10 +89,23 @@ public class LocalDirsHandlerService extends AbstractService {
   private final class MonitoringTimerTask extends TimerTask {
 
     public MonitoringTimerTask(Configuration conf) throws YarnRuntimeException {
-      localDirs = new DirectoryCollection(
-          validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOCAL_DIRS)));
-      logDirs = new DirectoryCollection(
-          validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOG_DIRS)));
+      // The local dirs and the log dirs share the same disk cutoffs.
+      float maxUsableSpacePercentagePerDisk =
+          conf.getFloat(
+            YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
+            YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE);
+      long minFreeSpacePerDiskMB =
+          conf.getLong(YarnConfiguration.NM_MIN_PER_DISK_FREE_SPACE_MB,
+            YarnConfiguration.DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB);
+      localDirs =
+          new DirectoryCollection(
+            validatePaths(conf
+              .getTrimmedStrings(YarnConfiguration.NM_LOCAL_DIRS)),
+            maxUsableSpacePercentagePerDisk, minFreeSpacePerDiskMB);
+      logDirs =
+          new DirectoryCollection(
+            validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOG_DIRS)),
+            maxUsableSpacePercentagePerDisk, minFreeSpacePerDiskMB);
       localDirsAllocator = new LocalDirAllocator(
           YarnConfiguration.NM_LOCAL_DIRS);
       logDirsAllocator = new LocalDirAllocator(YarnConfiguration.NM_LOG_DIRS);
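To see what the cutoffs mean for a concrete disk, here is a small standalone sketch mirroring the arithmetic in the DirectoryCollection helpers above; the class name, the path, and the 90%/500 MB values are illustrative only:

import java.io.File;

public class UsageMath {
  public static void main(String[] args) {
    File dir = new File("/tmp"); // any path on the disk of interest
    float freePercentage =
        100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
    float usedPercentage = 100.0F - freePercentage;
    long freeSpaceMB = dir.getUsableSpace() / (1024 * 1024);
    // The same predicates the disk health check applies per directory.
    System.out.println("over a 90% cutoff? "
        + (usedPercentage > 90.0F || usedPercentage >= 100.0F));
    System.out.println("under 500 MB free? " + (freeSpaceMB < 500));
  }
}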
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
index 4ab61c9ade..8528721988 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
@@ -30,6 +30,7 @@
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -55,8 +56,11 @@ public static void teardown() {
   @Test
   public void testConcurrentAccess() throws IOException {
     // Initialize DirectoryCollection with a file instead of a directory
+    Configuration conf = new Configuration();
     String[] dirs = {testFile.getPath()};
-    DirectoryCollection dc = new DirectoryCollection(dirs);
+    DirectoryCollection dc = new DirectoryCollection(dirs,
+        conf.getFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
+          YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE));
 
     // Create an iterator before checkDirs is called to reliable test case
     List<String> list = dc.getGoodDirs();
@@ -88,7 +92,9 @@ public void testCreateDirectories() throws IOException {
     localFs.setPermission(pathC, permDirC);
 
     String[] dirs = { dirA, dirB, dirC };
-    DirectoryCollection dc = new DirectoryCollection(dirs);
+    DirectoryCollection dc = new DirectoryCollection(dirs,
+        conf.getFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
+          YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE));
     FsPermission defaultPerm = FsPermission.getDefault()
       .applyUMask(new FsPermission((short)FsPermission.DEFAULT_UMASK));
     boolean createResult = dc.createNonExistentDirs(localFs, defaultPerm);
@@ -104,4 +110,89 @@ public void testCreateDirectories() throws IOException {
     Assert.assertEquals("existing local directory permissions modified",
         permDirC, status.getPermission());
   }
+
+  @Test
+  public void testDiskSpaceUtilizationLimit() throws IOException {
+
+    String dirA = new File(testDir, "dirA").getPath();
+    String[] dirs = { dirA };
+    // A 0% utilization cap fails any dir on a disk with any space in use.
+    DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F);
+    dc.checkDirs();
+    Assert.assertEquals(0, dc.getGoodDirs().size());
+    Assert.assertEquals(1, dc.getFailedDirs().size());
+
+    dc = new DirectoryCollection(dirs, 100.0F);
+    dc.checkDirs();
+    Assert.assertEquals(1, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getFailedDirs().size());
+
+    // Requiring the disk's entire capacity to be free must fail the dir.
+    dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
+    dc.checkDirs();
+    Assert.assertEquals(0, dc.getGoodDirs().size());
+    Assert.assertEquals(1, dc.getFailedDirs().size());
+
+    dc = new DirectoryCollection(dirs, 100.0F, 0);
+    dc.checkDirs();
+    Assert.assertEquals(1, dc.getGoodDirs().size());
+    Assert.assertEquals(0, dc.getFailedDirs().size());
+  }
+
+  @Test
+  public void testDiskLimitsCutoffSetters() {
+
+    String[] dirs = { "dir" };
+    DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F, 100);
+    float testValue = 57.5F;
+    float delta = 0.1F;
+    dc.setDiskUtilizationPercentageCutoff(testValue);
+    Assert.assertEquals(testValue, dc.getDiskUtilizationPercentageCutoff(),
+        delta);
+    // Out-of-range percentages are clamped to [0.0, 100.0].
+    testValue = -57.5F;
+    dc.setDiskUtilizationPercentageCutoff(testValue);
+    Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    testValue = 157.5F;
+    dc.setDiskUtilizationPercentageCutoff(testValue);
+    Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+
+    long spaceValue = 57;
+    dc.setDiskUtilizationSpaceCutoff(spaceValue);
+    Assert.assertEquals(spaceValue, dc.getDiskUtilizationSpaceCutoff());
+    // Negative free-space cutoffs are clamped to 0.
+    spaceValue = -57;
+    dc.setDiskUtilizationSpaceCutoff(spaceValue);
+    Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
+  }
+
+  @Test
+  public void testConstructors() {
+
+    String[] dirs = { "dir" };
+    float delta = 0.1F;
+    DirectoryCollection dc = new DirectoryCollection(dirs);
+    Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
+
+    dc = new DirectoryCollection(dirs, 57.5F);
+    Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
+
+    dc = new DirectoryCollection(dirs, 57);
+    Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(57, dc.getDiskUtilizationSpaceCutoff());
+
+    dc = new DirectoryCollection(dirs, 57.5F, 67);
+    Assert.assertEquals(57.5F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(67, dc.getDiskUtilizationSpaceCutoff());
+
+    dc = new DirectoryCollection(dirs, -57.5F, -67);
+    Assert.assertEquals(0.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
+
+    dc = new DirectoryCollection(dirs, 157.5F, -67);
+    Assert.assertEquals(100.0F, dc.getDiskUtilizationPercentageCutoff(), delta);
+    Assert.assertEquals(0, dc.getDiskUtilizationSpaceCutoff());
+  }
 }