diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 9150372081..3bb6f8925d 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -154,6 +154,9 @@ Release 2.8.0 - UNRELEASED YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith Sharmaks via jianhe) + YARN-3503. Expose disk utilization percentage and bad local and log dir + counts in NM metrics. (Varun Vasudev via jianhe) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java index c019aa9a3f..26589188c4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java @@ -82,6 +82,8 @@ class DirectoryCollection { private float diskUtilizationPercentageCutoff; private long diskUtilizationSpaceCutoff; + private int goodDirsDiskUtilizationPercentage; + /** * Create collection for the directories specified. No check for free space. * @@ -277,6 +279,7 @@ class DirectoryCollection { + dirsFailedCheck.get(dir).message); } } + setGoodDirsDiskUtilizationPercentage(); return setChanged; } @@ -390,4 +393,32 @@ class DirectoryCollection { diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff; this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff; } + + private void setGoodDirsDiskUtilizationPercentage() { + + long totalSpace = 0; + long usableSpace = 0; + + for (String dir : localDirs) { + File f = new File(dir); + if (!f.isDirectory()) { + continue; + } + totalSpace += f.getTotalSpace(); + usableSpace += f.getUsableSpace(); + } + if (totalSpace != 0) { + long tmp = ((totalSpace - usableSpace) * 100) / totalSpace; + if (Integer.MIN_VALUE < tmp && Integer.MAX_VALUE > tmp) { + goodDirsDiskUtilizationPercentage = (int) tmp; + } + } else { + // got no good dirs + goodDirsDiskUtilizationPercentage = 0; + } + } + + public int getGoodDirsDiskUtilizationPercentage() { + return goodDirsDiskUtilizationPercentage; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 7d1aa534be..493571dc81 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -38,6 +38,7 @@ import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; +import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; /** * The class which provides functionality of checking the health of the local @@ -84,6 +85,8 @@ public class LocalDirsHandlerService extends AbstractService { private static String FILE_SCHEME = "file"; + private NodeManagerMetrics nodeManagerMetrics = null; + /** * Class which is used by the {@link Timer} class to periodically execute the * disks' health checker code. @@ -119,7 +122,12 @@ public class LocalDirsHandlerService extends AbstractService { } public LocalDirsHandlerService() { + this(null); + } + + public LocalDirsHandlerService(NodeManagerMetrics nodeManagerMetrics) { super(LocalDirsHandlerService.class.getName()); + this.nodeManagerMetrics = nodeManagerMetrics; } /** @@ -389,6 +397,8 @@ public class LocalDirsHandlerService extends AbstractService { updateDirsAfterTest(); } + updateMetrics(); + lastDisksCheckTime = System.currentTimeMillis(); } @@ -462,4 +472,15 @@ public class LocalDirsHandlerService extends AbstractService { validPaths.toArray(arrValidPaths); return arrValidPaths; } + + protected void updateMetrics() { + if (nodeManagerMetrics != null) { + nodeManagerMetrics.setBadLocalDirs(localDirs.getFailedDirs().size()); + nodeManagerMetrics.setBadLogDirs(logDirs.getFailedDirs().size()); + nodeManagerMetrics.setGoodLocalDirsDiskUtilizationPerc( + localDirs.getGoodDirsDiskUtilizationPercentage()); + nodeManagerMetrics.setGoodLogDirsDiskUtilizationPerc( + logDirs.getGoodDirsDiskUtilizationPercentage()); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 0bac8d7aa8..4a28c6f07c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -258,7 +258,7 @@ public class NodeManager extends CompositeService // NodeManager level dispatcher this.dispatcher = new AsyncDispatcher(); - dirsHandler = new LocalDirsHandlerService(); + dirsHandler = new LocalDirsHandlerService(metrics); nodeHealthChecker = new NodeHealthCheckerService( getNodeHealthScriptRunner(conf), dirsHandler); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index 3615feefa9..400f14bfcc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -48,6 +48,15 @@ public class NodeManagerMetrics { @Metric MutableGaugeInt availableVCores; @Metric("Container launch duration") MutableRate containerLaunchDuration; + @Metric("# of bad local dirs") + MutableGaugeInt badLocalDirs; + @Metric("# of bad log dirs") + MutableGaugeInt badLogDirs; + @Metric("Disk utilization % on good local dirs") + MutableGaugeInt goodLocalDirsDiskUtilizationPerc; + @Metric("Disk utilization % on good log dirs") + MutableGaugeInt goodLogDirsDiskUtilizationPerc; + private long allocatedMB; private long availableMB; @@ -125,6 +134,24 @@ public class NodeManagerMetrics { containerLaunchDuration.add(value); } + public void setBadLocalDirs(int badLocalDirs) { + this.badLocalDirs.set(badLocalDirs); + } + + public void setBadLogDirs(int badLogDirs) { + this.badLogDirs.set(badLogDirs); + } + + public void setGoodLocalDirsDiskUtilizationPerc( + int goodLocalDirsDiskUtilizationPerc) { + this.goodLocalDirsDiskUtilizationPerc.set(goodLocalDirsDiskUtilizationPerc); + } + + public void setGoodLogDirsDiskUtilizationPerc( + int goodLogDirsDiskUtilizationPerc) { + this.goodLogDirsDiskUtilizationPerc.set(goodLogDirsDiskUtilizationPerc); + } + public int getRunningContainers() { return containersRunning.value(); } @@ -143,4 +170,25 @@ public class NodeManagerMetrics { public int getCompletedContainers() { return containersCompleted.value(); } + + @VisibleForTesting + public int getBadLogDirs() { + return badLogDirs.value(); + } + + @VisibleForTesting + public int getBadLocalDirs() { + return badLocalDirs.value(); + } + + @VisibleForTesting + public int getGoodLogDirsDiskUtilizationPerc() { + return goodLogDirsDiskUtilizationPerc.value(); + } + + @VisibleForTesting + public int getGoodLocalDirsDiskUtilizationPerc() { + return goodLocalDirsDiskUtilizationPerc.value(); + } + } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java index e435375796..e4525a570f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java @@ -129,24 +129,38 @@ public class TestDirectoryCollection { Assert.assertEquals(0, dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size()); + // no good dirs + Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage()); dc = new DirectoryCollection(dirs, 100.0F); + int utilizedSpacePerc = + (int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 / + testDir.getTotalSpace()); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertEquals(utilizedSpacePerc, + dc.getGoodDirsDiskUtilizationPercentage()); dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024)); dc.checkDirs(); Assert.assertEquals(0, dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size()); + // no good dirs + Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage()); dc = new DirectoryCollection(dirs, 100.0F, 0); + utilizedSpacePerc = + (int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 / + testDir.getTotalSpace()); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertEquals(utilizedSpacePerc, + dc.getGoodDirsDiskUtilizationPercentage()); } @Test diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLocalDirsHandlerService.java index 84f2fad898..a045e62ba9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLocalDirsHandlerService.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.service.Service.STATE; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; +import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -106,12 +107,40 @@ public class TestLocalDirsHandlerService { conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2); conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE, 0.0f); - LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(); + NodeManagerMetrics nm = NodeManagerMetrics.create(); + LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(nm); dirSvc.init(conf); Assert.assertEquals(0, dirSvc.getLocalDirs().size()); Assert.assertEquals(0, dirSvc.getLogDirs().size()); Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size()); Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size()); + // check the metrics + Assert.assertEquals(2, nm.getBadLocalDirs()); + Assert.assertEquals(2, nm.getBadLogDirs()); + Assert.assertEquals(0, nm.getGoodLocalDirsDiskUtilizationPerc()); + Assert.assertEquals(0, nm.getGoodLogDirsDiskUtilizationPerc()); + + conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE, + 100.0f); + nm = NodeManagerMetrics.create(); + dirSvc = new LocalDirsHandlerService(nm); + dirSvc.init(conf); + Assert.assertEquals(1, dirSvc.getLocalDirs().size()); + Assert.assertEquals(1, dirSvc.getLogDirs().size()); + Assert.assertEquals(0, dirSvc.getDiskFullLocalDirs().size()); + Assert.assertEquals(0, dirSvc.getDiskFullLogDirs().size()); + // check the metrics + File dir = new File(localDir1); + int utilizationPerc = + (int) ((dir.getTotalSpace() - dir.getUsableSpace()) * 100 / + dir.getTotalSpace()); + Assert.assertEquals(1, nm.getBadLocalDirs()); + Assert.assertEquals(1, nm.getBadLogDirs()); + Assert.assertEquals(utilizationPerc, + nm.getGoodLocalDirsDiskUtilizationPerc()); + Assert + .assertEquals(utilizationPerc, nm.getGoodLogDirsDiskUtilizationPerc()); + FileUtils.deleteDirectory(new File(localDir1)); FileUtils.deleteDirectory(new File(localDir2)); FileUtils.deleteDirectory(new File(logDir1));