YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad.

This commit is contained in:
Sunil G 2017-08-17 15:07:15 +05:30
parent 1f04cb45f7
commit f9a0e23381
3 changed files with 130 additions and 13 deletions

View File

@ -38,6 +38,7 @@
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
@ -99,6 +100,7 @@ static List<String> concat(List<String> l1, List<String> l2) {
private List<String> localDirs;
private List<String> errorDirs;
private List<String> fullDirs;
private Map<String, DiskErrorInformation> directoryErrorInfo;
// read/write lock for accessing above directories.
private final ReadLock readLock;
@ -192,6 +194,7 @@ public DirectoryCollection(String[] dirs,
localDirs = new CopyOnWriteArrayList<>(dirs);
errorDirs = new CopyOnWriteArrayList<>();
fullDirs = new CopyOnWriteArrayList<>();
directoryErrorInfo = new ConcurrentHashMap<>();
ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
this.readLock = lock.readLock();
@ -248,11 +251,25 @@ List<String> getFailedDirs() {
/**
 * Returns the directories that have used all disk space.
 *
 * @return a read-only view of the full directories, obtained while holding
 *         the read lock
 */
List<String> getFullDirs() {
  this.readLock.lock();
  try {
    // Expose only an unmodifiable view so callers cannot mutate the
    // internal list; this mirrors what getErroredDirs() does.
    return Collections.unmodifiableList(fullDirs);
  } finally {
    this.readLock.unlock();
  }
}
/**
* @return the directories that have errors - they may not have appropriate permissions
* or other disk validation checks might have failed in {@link DiskValidator}
*
*/
@InterfaceStability.Evolving
List<String> getErroredDirs() {
this.readLock.lock();
try {
return Collections.unmodifiableList(errorDirs);
} finally {
this.readLock.unlock();
}
@ -270,6 +287,39 @@ int getNumFailures() {
}
}
/**
 * Looks up the error diagnostics recorded for a directory.
 *
 * @param dirName absolute path of the directory whose diagnostics are wanted
 * @return the {@code DiskErrorInformation} recorded for {@code dirName}, or
 *         {@code null} when nothing is recorded for it, i.e. the directory
 *         has no recorded disk utilization or {@link DiskValidator} failure
 */
@InterfaceStability.Evolving
DiskErrorInformation getDirectoryErrorInfo(String dirName) {
  final DiskErrorInformation recordedInfo;
  this.readLock.lock();
  try {
    // Read the entry while the read lock is held; return it afterwards.
    recordedInfo = directoryErrorInfo.get(dirName);
  } finally {
    this.readLock.unlock();
  }
  return recordedInfo;
}
/**
 * Tells whether error information is currently recorded for a directory.
 *
 * @param dirName absolute path of the directory to check
 * @return {@code true} if an error entry exists for {@code dirName}, which
 *         marks the disk backing it as unhealthy; {@code false} otherwise
 */
@InterfaceStability.Evolving
boolean isDiskUnHealthy(String dirName) {
  final boolean hasErrorEntry;
  this.readLock.lock();
  try {
    // Membership test is done under the read lock; result returned after.
    hasErrorEntry = directoryErrorInfo.containsKey(dirName);
  } finally {
    this.readLock.unlock();
  }
  return hasErrorEntry;
}
/**
* Create any non-existent directories and parent directories, updating the
* list of valid directories if necessary.
@ -297,6 +347,9 @@ boolean createNonExistentDirs(FileContext localFs,
try {
localDirs.remove(dir);
errorDirs.add(dir);
directoryErrorInfo.put(dir,
new DiskErrorInformation(DiskErrorCause.OTHER,
"Cannot create directory : " + dir + ", error " + e.getMessage()));
numFailures++;
} finally {
this.writeLock.unlock();
@ -343,11 +396,13 @@ boolean checkDirs() {
localDirs.clear();
errorDirs.clear();
fullDirs.clear();
directoryErrorInfo.clear();
for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
.entrySet()) {
String dir = entry.getKey();
DiskErrorInformation errorInformation = entry.getValue();
switch (entry.getValue().cause) {
case DISK_FULL:
fullDirs.add(entry.getKey());
@ -359,6 +414,8 @@ boolean checkDirs() {
LOG.warn(entry.getValue().cause + " is unknown for disk error.");
break;
}
directoryErrorInfo.put(entry.getKey(), errorInformation);
if (preCheckGoodDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error, " + errorInformation.message
+ ", removing from list of valid directories");

View File

@ -53,6 +53,8 @@ public class LocalDirsHandlerService extends AbstractService {
private static Log LOG = LogFactory.getLog(LocalDirsHandlerService.class);
private static final String diskCapacityExceededErrorMsg = "usable space is below configured utilization percentage/no more usable space";
/**
* Good local directories, use internally,
* initial value is the same as NM_LOCAL_DIRS.
@ -344,21 +346,36 @@ public String getDisksHealthReport(boolean listGoodDirs) {
}
StringBuilder report = new StringBuilder();
List<String> failedLocalDirsList = localDirs.getFailedDirs();
List<String> failedLogDirsList = logDirs.getFailedDirs();
List<String> erroredLocalDirsList = localDirs.getErroredDirs();
List<String> erroredLogDirsList = logDirs.getErroredDirs();
List<String> diskFullLocalDirsList = localDirs.getFullDirs();
List<String> diskFullLogDirsList = logDirs.getFullDirs();
List<String> goodLocalDirsList = localDirs.getGoodDirs();
List<String> goodLogDirsList = logDirs.getGoodDirs();
int numLocalDirs = goodLocalDirsList.size() + failedLocalDirsList.size();
int numLogDirs = goodLogDirsList.size() + failedLogDirsList.size();
int numLocalDirs = goodLocalDirsList.size() + erroredLocalDirsList.size() + diskFullLocalDirsList.size();
int numLogDirs = goodLogDirsList.size() + erroredLogDirsList.size() + diskFullLogDirsList.size();
if (!listGoodDirs) {
if (!failedLocalDirsList.isEmpty()) {
report.append(failedLocalDirsList.size() + "/" + numLocalDirs
+ " local-dirs are bad: "
+ StringUtils.join(",", failedLocalDirsList) + "; ");
if (!erroredLocalDirsList.isEmpty()) {
report.append(erroredLocalDirsList.size() + "/" + numLocalDirs
+ " local-dirs have errors: "
+ buildDiskErrorReport(erroredLocalDirsList, localDirs));
}
if (!failedLogDirsList.isEmpty()) {
report.append(failedLogDirsList.size() + "/" + numLogDirs
+ " log-dirs are bad: " + StringUtils.join(",", failedLogDirsList));
if (!diskFullLocalDirsList.isEmpty()) {
report.append(diskFullLocalDirsList.size() + "/" + numLocalDirs
+ " local-dirs " + diskCapacityExceededErrorMsg
+ buildDiskErrorReport(diskFullLocalDirsList, localDirs) + "; ");
}
if (!erroredLogDirsList.isEmpty()) {
report.append(erroredLogDirsList.size() + "/" + numLogDirs
+ " log-dirs have errors: "
+ buildDiskErrorReport(erroredLogDirsList, logDirs));
}
if (!diskFullLogDirsList.isEmpty()) {
report.append(diskFullLogDirsList.size() + "/" + numLogDirs
+ " log-dirs " + diskCapacityExceededErrorMsg
+ buildDiskErrorReport(diskFullLogDirsList, logDirs));
}
} else {
report.append(goodLocalDirsList.size() + "/" + numLocalDirs
@ -620,4 +637,24 @@ protected void updateMetrics() {
logDirs.getGoodDirsDiskUtilizationPercentage());
}
}
/**
 * Builds a bracketed, comma-separated report of the given directories, each
 * followed by the error message recorded for it in the directory collection.
 *
 * @param dirs directories to report on
 * @param directoryCollection collection queried for each directory's error
 *        information
 * @return text of the form {@code " [ dir : message , dir : message ] "};
 *         a directory with no recorded error information is reported with
 *         "Unknown cause for disk error"
 */
private String buildDiskErrorReport(List<String> dirs, DirectoryCollection directoryCollection) {
  StringBuilder sb = new StringBuilder();
  sb.append(" [ ");
  for (int i = 0; i < dirs.size(); i++) {
    final String dirName = dirs.get(i);
    // Look the entry up exactly once. Checking isDiskUnHealthy() and then
    // calling getDirectoryErrorInfo() are two separately locked operations;
    // the error map can be cleared in between, which would make the second
    // call return null and the ".message" dereference throw.
    final DirectoryCollection.DiskErrorInformation errorInfo =
        directoryCollection.getDirectoryErrorInfo(dirName);
    sb.append(dirName).append(" : ");
    if (errorInfo != null) {
      sb.append(errorInfo.message);
    } else {
      sb.append("Unknown cause for disk error");
    }

    if (i != (dirs.size() - 1)) {
      sb.append(" , ");
    }
  }
  sb.append(" ] ");
  return sb.toString();
}
}

View File

@ -128,8 +128,12 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F);
dc.checkDirs();
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
@ -139,16 +143,21 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
dc.checkDirs();
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
@ -158,8 +167,11 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
}
@ -209,12 +221,17 @@ public void testFailedDisksBecomingGoodAgain() throws Exception {
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
@ -232,12 +249,18 @@ public void testFailedDisksBecomingGoodAgain() throws Exception {
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertEquals(1, dc.getErroredDirs().size());
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirB));
Assert.assertEquals(DirectoryCollection.DiskErrorCause.OTHER, dc.getDirectoryErrorInfo(dirB).cause);
permDirB = new FsPermission((short) 0700);
localFs.setPermission(pathB, permDirB);
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertEquals(0, dc.getErroredDirs().size());
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
}
@Test