YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad.
This commit is contained in:
parent
1f04cb45f7
commit
f9a0e23381
@ -38,6 +38,7 @@
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.fs.FileAlreadyExistsException;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@ -99,6 +100,7 @@ static List<String> concat(List<String> l1, List<String> l2) {
|
||||
private List<String> localDirs;
|
||||
private List<String> errorDirs;
|
||||
private List<String> fullDirs;
|
||||
private Map<String, DiskErrorInformation> directoryErrorInfo;
|
||||
|
||||
// read/write lock for accessing above directories.
|
||||
private final ReadLock readLock;
|
||||
@ -192,6 +194,7 @@ public DirectoryCollection(String[] dirs,
|
||||
localDirs = new CopyOnWriteArrayList<>(dirs);
|
||||
errorDirs = new CopyOnWriteArrayList<>();
|
||||
fullDirs = new CopyOnWriteArrayList<>();
|
||||
directoryErrorInfo = new ConcurrentHashMap<>();
|
||||
|
||||
ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
|
||||
this.readLock = lock.readLock();
|
||||
@ -248,11 +251,25 @@ List<String> getFailedDirs() {
|
||||
/**
|
||||
* @return the directories that have used all disk space
|
||||
*/
|
||||
|
||||
List<String> getFullDirs() {
|
||||
this.readLock.lock();
|
||||
try {
|
||||
return fullDirs;
|
||||
return Collections.unmodifiableList(fullDirs);
|
||||
} finally {
|
||||
this.readLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the directories that have errors - may not have appropriate permissions
|
||||
* or other disk validation checks might have failed in {@link DiskValidator}
|
||||
*
|
||||
*/
|
||||
@InterfaceStability.Evolving
|
||||
List<String> getErroredDirs() {
|
||||
this.readLock.lock();
|
||||
try {
|
||||
return Collections.unmodifiableList(errorDirs);
|
||||
} finally {
|
||||
this.readLock.unlock();
|
||||
}
|
||||
@ -270,6 +287,39 @@ int getNumFailures() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param dirName Absolute path of Directory for which error diagnostics are needed
|
||||
* @return DiskErrorInformation - disk error diagnostics for the specified directory
|
||||
* null - the disk associated with the directory has passed disk utilization checks
|
||||
* /error validations in {@link DiskValidator}
|
||||
*
|
||||
*/
|
||||
@InterfaceStability.Evolving
|
||||
DiskErrorInformation getDirectoryErrorInfo(String dirName) {
|
||||
this.readLock.lock();
|
||||
try {
|
||||
return directoryErrorInfo.get(dirName);
|
||||
} finally {
|
||||
this.readLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param dirName Absolute path of Directory for which the disk has been marked as unhealthy
|
||||
* @return Check if disk associated with the directory is unhealthy
|
||||
*/
|
||||
@InterfaceStability.Evolving
|
||||
boolean isDiskUnHealthy(String dirName) {
|
||||
this.readLock.lock();
|
||||
try {
|
||||
return directoryErrorInfo.containsKey(dirName);
|
||||
} finally {
|
||||
this.readLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create any non-existent directories and parent directories, updating the
|
||||
* list of valid directories if necessary.
|
||||
@ -297,6 +347,9 @@ boolean createNonExistentDirs(FileContext localFs,
|
||||
try {
|
||||
localDirs.remove(dir);
|
||||
errorDirs.add(dir);
|
||||
directoryErrorInfo.put(dir,
|
||||
new DiskErrorInformation(DiskErrorCause.OTHER,
|
||||
"Cannot create directory : " + dir + ", error " + e.getMessage()));
|
||||
numFailures++;
|
||||
} finally {
|
||||
this.writeLock.unlock();
|
||||
@ -343,11 +396,13 @@ boolean checkDirs() {
|
||||
localDirs.clear();
|
||||
errorDirs.clear();
|
||||
fullDirs.clear();
|
||||
directoryErrorInfo.clear();
|
||||
|
||||
for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
|
||||
.entrySet()) {
|
||||
String dir = entry.getKey();
|
||||
DiskErrorInformation errorInformation = entry.getValue();
|
||||
|
||||
switch (entry.getValue().cause) {
|
||||
case DISK_FULL:
|
||||
fullDirs.add(entry.getKey());
|
||||
@ -359,6 +414,8 @@ boolean checkDirs() {
|
||||
LOG.warn(entry.getValue().cause + " is unknown for disk error.");
|
||||
break;
|
||||
}
|
||||
directoryErrorInfo.put(entry.getKey(), errorInformation);
|
||||
|
||||
if (preCheckGoodDirs.contains(dir)) {
|
||||
LOG.warn("Directory " + dir + " error, " + errorInformation.message
|
||||
+ ", removing from list of valid directories");
|
||||
|
@ -53,6 +53,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||
|
||||
private static Log LOG = LogFactory.getLog(LocalDirsHandlerService.class);
|
||||
|
||||
private static final String diskCapacityExceededErrorMsg = "usable space is below configured utilization percentage/no more usable space";
|
||||
|
||||
/**
|
||||
* Good local directories, use internally,
|
||||
* initial value is the same as NM_LOCAL_DIRS.
|
||||
@ -344,21 +346,36 @@ public String getDisksHealthReport(boolean listGoodDirs) {
|
||||
}
|
||||
|
||||
StringBuilder report = new StringBuilder();
|
||||
List<String> failedLocalDirsList = localDirs.getFailedDirs();
|
||||
List<String> failedLogDirsList = logDirs.getFailedDirs();
|
||||
List<String> erroredLocalDirsList = localDirs.getErroredDirs();
|
||||
List<String> erroredLogDirsList = logDirs.getErroredDirs();
|
||||
List<String> diskFullLocalDirsList = localDirs.getFullDirs();
|
||||
List<String> diskFullLogDirsList = logDirs.getFullDirs();
|
||||
List<String> goodLocalDirsList = localDirs.getGoodDirs();
|
||||
List<String> goodLogDirsList = logDirs.getGoodDirs();
|
||||
int numLocalDirs = goodLocalDirsList.size() + failedLocalDirsList.size();
|
||||
int numLogDirs = goodLogDirsList.size() + failedLogDirsList.size();
|
||||
|
||||
int numLocalDirs = goodLocalDirsList.size() + erroredLocalDirsList.size() + diskFullLocalDirsList.size();
|
||||
int numLogDirs = goodLogDirsList.size() + erroredLogDirsList.size() + diskFullLogDirsList.size();
|
||||
if (!listGoodDirs) {
|
||||
if (!failedLocalDirsList.isEmpty()) {
|
||||
report.append(failedLocalDirsList.size() + "/" + numLocalDirs
|
||||
+ " local-dirs are bad: "
|
||||
+ StringUtils.join(",", failedLocalDirsList) + "; ");
|
||||
if (!erroredLocalDirsList.isEmpty()) {
|
||||
report.append(erroredLocalDirsList.size() + "/" + numLocalDirs
|
||||
+ " local-dirs have errors: "
|
||||
+ buildDiskErrorReport(erroredLocalDirsList, localDirs));
|
||||
}
|
||||
if (!failedLogDirsList.isEmpty()) {
|
||||
report.append(failedLogDirsList.size() + "/" + numLogDirs
|
||||
+ " log-dirs are bad: " + StringUtils.join(",", failedLogDirsList));
|
||||
if (!diskFullLocalDirsList.isEmpty()) {
|
||||
report.append(diskFullLocalDirsList.size() + "/" + numLocalDirs
|
||||
+ " local-dirs " + diskCapacityExceededErrorMsg
|
||||
+ buildDiskErrorReport(diskFullLocalDirsList, localDirs) + "; ");
|
||||
}
|
||||
|
||||
if (!erroredLogDirsList.isEmpty()) {
|
||||
report.append(erroredLogDirsList.size() + "/" + numLogDirs
|
||||
+ " log-dirs have errors: "
|
||||
+ buildDiskErrorReport(erroredLogDirsList, logDirs));
|
||||
}
|
||||
if (!diskFullLogDirsList.isEmpty()) {
|
||||
report.append(diskFullLogDirsList.size() + "/" + numLogDirs
|
||||
+ " log-dirs " + diskCapacityExceededErrorMsg
|
||||
+ buildDiskErrorReport(diskFullLogDirsList, logDirs));
|
||||
}
|
||||
} else {
|
||||
report.append(goodLocalDirsList.size() + "/" + numLocalDirs
|
||||
@ -620,4 +637,24 @@ protected void updateMetrics() {
|
||||
logDirs.getGoodDirsDiskUtilizationPercentage());
|
||||
}
|
||||
}
|
||||
|
||||
private String buildDiskErrorReport(List<String> dirs, DirectoryCollection directoryCollection) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(" [ ");
|
||||
for (int i = 0; i < dirs.size(); i++) {
|
||||
final String dirName = dirs.get(i);
|
||||
if ( directoryCollection.isDiskUnHealthy(dirName)) {
|
||||
sb.append(dirName + " : " + directoryCollection.getDirectoryErrorInfo(dirName).message);
|
||||
} else {
|
||||
sb.append(dirName + " : " + "Unknown cause for disk error");
|
||||
}
|
||||
|
||||
if ( i != (dirs.size() - 1)) {
|
||||
sb.append(" , ");
|
||||
}
|
||||
}
|
||||
sb.append(" ] ");
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
@ -128,8 +128,12 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
|
||||
DirectoryCollection dc = new DirectoryCollection(dirs, 0.0F);
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
|
||||
Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
|
||||
|
||||
// no good dirs
|
||||
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
@ -139,16 +143,21 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
|
||||
testDir.getTotalSpace());
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
|
||||
|
||||
Assert.assertEquals(utilizedSpacePerc,
|
||||
dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
|
||||
// no good dirs
|
||||
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
@ -158,8 +167,11 @@ public void testDiskSpaceUtilizationLimit() throws IOException {
|
||||
testDir.getTotalSpace());
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
|
||||
|
||||
Assert.assertEquals(utilizedSpacePerc,
|
||||
dc.getGoodDirsDiskUtilizationPercentage());
|
||||
}
|
||||
@ -209,12 +221,17 @@ public void testFailedDisksBecomingGoodAgain() throws Exception {
|
||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA));
|
||||
Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause);
|
||||
|
||||
dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F);
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
|
||||
|
||||
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
|
||||
|
||||
@ -232,12 +249,18 @@ public void testFailedDisksBecomingGoodAgain() throws Exception {
|
||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertEquals(1, dc.getErroredDirs().size());
|
||||
Assert.assertNotNull(dc.getDirectoryErrorInfo(dirB));
|
||||
Assert.assertEquals(DirectoryCollection.DiskErrorCause.OTHER, dc.getDirectoryErrorInfo(dirB).cause);
|
||||
|
||||
permDirB = new FsPermission((short) 0700);
|
||||
localFs.setPermission(pathB, permDirB);
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertEquals(0, dc.getErroredDirs().size());
|
||||
Assert.assertNull(dc.getDirectoryErrorInfo(dirA));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
Reference in New Issue
Block a user