YARN-5214. Fixed locking in DirectoryCollection to avoid hanging NMs when various code-paths hit slow disks. Contributed by Junping Du.

This commit is contained in:
Vinod Kumar Vavilapalli 2016-07-05 16:07:28 -07:00
parent 9560f252cf
commit ce9c006430

View File

@ -28,12 +28,17 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
@ -89,6 +94,10 @@ static List<String> concat(List<String> l1, List<String> l2) {
private List<String> errorDirs;
private List<String> fullDirs;
// read/write lock for accessing above directories.
private final ReadLock readLock;
private final WriteLock writeLock;
private int numFailures;
private float diskUtilizationPercentageCutoffHigh;
@ -163,9 +172,13 @@ public DirectoryCollection(String[] dirs,
float utilizationPercentageCutOffHigh,
float utilizationPercentageCutOffLow,
long utilizationSpaceCutOff) {
localDirs = new CopyOnWriteArrayList<String>(dirs);
errorDirs = new CopyOnWriteArrayList<String>();
fullDirs = new CopyOnWriteArrayList<String>();
localDirs = new CopyOnWriteArrayList<>(dirs);
errorDirs = new CopyOnWriteArrayList<>();
fullDirs = new CopyOnWriteArrayList<>();
ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
this.readLock = lock.readLock();
this.writeLock = lock.writeLock();
diskUtilizationPercentageCutoffHigh = Math.max(0.0F, Math.min(100.0F,
utilizationPercentageCutOffHigh));
@ -174,17 +187,18 @@ public DirectoryCollection(String[] dirs,
diskUtilizationSpaceCutoff =
utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff;
dirsChangeListeners = new HashSet<DirsChangeListener>();
dirsChangeListeners = Collections.newSetFromMap(
new ConcurrentHashMap<DirsChangeListener, Boolean>());
}
synchronized void registerDirsChangeListener(
void registerDirsChangeListener(
DirsChangeListener listener) {
if (dirsChangeListeners.add(listener)) {
listener.onDirsChanged();
}
}
synchronized void deregisterDirsChangeListener(
void deregisterDirsChangeListener(
DirsChangeListener listener) {
dirsChangeListeners.remove(listener);
}
@ -192,31 +206,51 @@ synchronized void deregisterDirsChangeListener(
/**
* @return the current valid directories
*/
synchronized List<String> getGoodDirs() {
return Collections.unmodifiableList(localDirs);
List<String> getGoodDirs() {
this.readLock.lock();
try {
return Collections.unmodifiableList(localDirs);
} finally {
this.readLock.unlock();
}
}
/**
* @return the failed directories
*/
synchronized List<String> getFailedDirs() {
return Collections.unmodifiableList(
DirectoryCollection.concat(errorDirs, fullDirs));
List<String> getFailedDirs() {
this.readLock.lock();
try {
return Collections.unmodifiableList(
DirectoryCollection.concat(errorDirs, fullDirs));
} finally {
this.readLock.unlock();
}
}
/**
* @return the directories that have used all disk space
*/
synchronized List<String> getFullDirs() {
return fullDirs;
List<String> getFullDirs() {
this.readLock.lock();
try {
return fullDirs;
} finally {
this.readLock.unlock();
}
}
/**
* @return total the number of directory failures seen till now
*/
synchronized int getNumFailures() {
return numFailures;
int getNumFailures() {
this.readLock.lock();
try {
return numFailures;
}finally {
this.readLock.unlock();
}
}
/**
@ -226,18 +260,30 @@ synchronized int getNumFailures() {
* @param perm absolute permissions to use for any directories created
* @return true if there were no errors, false if at least one error occurred
*/
synchronized boolean createNonExistentDirs(FileContext localFs,
boolean createNonExistentDirs(FileContext localFs,
FsPermission perm) {
boolean failed = false;
for (final String dir : localDirs) {
List<String> localDirectories = null;
this.readLock.lock();
try {
localDirectories = new ArrayList<>(localDirs);
} finally {
this.readLock.unlock();
}
for (final String dir : localDirectories) {
try {
createDir(localFs, new Path(dir), perm);
} catch (IOException e) {
LOG.warn("Unable to create directory " + dir + " error " +
e.getMessage() + ", removing from the list of valid directories.");
localDirs.remove(dir);
errorDirs.add(dir);
numFailures++;
this.writeLock.lock();
try {
localDirs.remove(dir);
errorDirs.add(dir);
numFailures++;
} finally {
this.writeLock.unlock();
}
failed = true;
}
}
@ -252,74 +298,93 @@ synchronized boolean createNonExistentDirs(FileContext localFs,
* checking or a failed directory passes the disk check <em>false</em>
* otherwise.
*/
synchronized boolean checkDirs() {
boolean checkDirs() {
boolean setChanged = false;
Set<String> preCheckGoodDirs = new HashSet<String>(localDirs);
Set<String> preCheckFullDirs = new HashSet<String>(fullDirs);
Set<String> preCheckOtherErrorDirs = new HashSet<String>(errorDirs);
List<String> failedDirs = DirectoryCollection.concat(errorDirs, fullDirs);
List<String> allLocalDirs =
DirectoryCollection.concat(localDirs, failedDirs);
Set<String> preCheckGoodDirs = null;
Set<String> preCheckFullDirs = null;
Set<String> preCheckOtherErrorDirs = null;
List<String> failedDirs = null;
List<String> allLocalDirs = null;
this.readLock.lock();
try {
preCheckGoodDirs = new HashSet<String>(localDirs);
preCheckFullDirs = new HashSet<String>(fullDirs);
preCheckOtherErrorDirs = new HashSet<String>(errorDirs);
failedDirs = DirectoryCollection.concat(errorDirs, fullDirs);
allLocalDirs = DirectoryCollection.concat(localDirs, failedDirs);
} finally {
this.readLock.unlock();
}
// move testDirs out of any lock as it could wait for very long time in
// case of busy IO
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
preCheckGoodDirs);
localDirs.clear();
errorDirs.clear();
fullDirs.clear();
this.writeLock.lock();
try {
localDirs.clear();
errorDirs.clear();
fullDirs.clear();
for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
.entrySet()) {
String dir = entry.getKey();
DiskErrorInformation errorInformation = entry.getValue();
switch (entry.getValue().cause) {
case DISK_FULL:
fullDirs.add(entry.getKey());
break;
case OTHER:
errorDirs.add(entry.getKey());
break;
}
if (preCheckGoodDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error, " + errorInformation.message
+ ", removing from list of valid directories");
setChanged = true;
numFailures++;
}
}
for (String dir : allLocalDirs) {
if (!dirsFailedCheck.containsKey(dir)) {
localDirs.add(dir);
if (preCheckFullDirs.contains(dir)
|| preCheckOtherErrorDirs.contains(dir)) {
for (Map.Entry<String, DiskErrorInformation> entry : dirsFailedCheck
.entrySet()) {
String dir = entry.getKey();
DiskErrorInformation errorInformation = entry.getValue();
switch (entry.getValue().cause) {
case DISK_FULL:
fullDirs.add(entry.getKey());
break;
case OTHER:
errorDirs.add(entry.getKey());
break;
default:
LOG.warn(entry.getValue().cause + " is unknown for disk error.");
break;
}
if (preCheckGoodDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error, " + errorInformation.message
+ ", removing from list of valid directories");
setChanged = true;
LOG.info("Directory " + dir
+ " passed disk check, adding to list of valid directories.");
numFailures++;
}
}
}
Set<String> postCheckFullDirs = new HashSet<String>(fullDirs);
Set<String> postCheckOtherDirs = new HashSet<String>(errorDirs);
for (String dir : preCheckFullDirs) {
if (postCheckOtherDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error "
+ dirsFailedCheck.get(dir).message);
for (String dir : allLocalDirs) {
if (!dirsFailedCheck.containsKey(dir)) {
localDirs.add(dir);
if (preCheckFullDirs.contains(dir)
|| preCheckOtherErrorDirs.contains(dir)) {
setChanged = true;
LOG.info("Directory " + dir
+ " passed disk check, adding to list of valid directories.");
}
}
}
Set<String> postCheckFullDirs = new HashSet<String>(fullDirs);
Set<String> postCheckOtherDirs = new HashSet<String>(errorDirs);
for (String dir : preCheckFullDirs) {
if (postCheckOtherDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error "
+ dirsFailedCheck.get(dir).message);
}
}
}
for (String dir : preCheckOtherErrorDirs) {
if (postCheckFullDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error "
+ dirsFailedCheck.get(dir).message);
for (String dir : preCheckOtherErrorDirs) {
if (postCheckFullDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error "
+ dirsFailedCheck.get(dir).message);
}
}
}
setGoodDirsDiskUtilizationPercentage();
if (setChanged) {
for (DirsChangeListener listener : dirsChangeListeners) {
listener.onDirsChanged();
setGoodDirsDiskUtilizationPercentage();
if (setChanged) {
for (DirsChangeListener listener : dirsChangeListeners) {
listener.onDirsChanged();
}
}
return setChanged;
} finally {
this.writeLock.unlock();
}
return setChanged;
}
Map<String, DiskErrorInformation> testDirs(List<String> dirs,
@ -409,7 +474,11 @@ private void createDir(FileContext localFs, Path dir, FsPermission perm)
localFs.getFileStatus(dir);
} catch (FileNotFoundException e) {
createDir(localFs, dir.getParent(), perm);
localFs.mkdir(dir, perm, false);
try {
localFs.mkdir(dir, perm, false);
} catch (FileAlreadyExistsException ex) {
// do nothing as other threads could in creating the same directory.
}
if (!perm.equals(perm.applyUMask(localFs.getUMask()))) {
localFs.setPermission(dir, perm);
}