YARN-5214. Fixed locking in DirectoryCollection to avoid hanging NMs when various code-paths hit slow disks. Contributed by Junping Du.

This commit is contained in:
Vinod Kumar Vavilapalli 2016-07-05 16:07:28 -07:00
parent 9560f252cf
commit ce9c006430

View File

@ -28,12 +28,17 @@
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.FsPermission;
@ -89,6 +94,10 @@ static List<String> concat(List<String> l1, List<String> l2) {
private List<String> errorDirs; private List<String> errorDirs;
private List<String> fullDirs; private List<String> fullDirs;
// read/write lock for accessing above directories.
private final ReadLock readLock;
private final WriteLock writeLock;
private int numFailures; private int numFailures;
private float diskUtilizationPercentageCutoffHigh; private float diskUtilizationPercentageCutoffHigh;
@ -163,9 +172,13 @@ public DirectoryCollection(String[] dirs,
float utilizationPercentageCutOffHigh, float utilizationPercentageCutOffHigh,
float utilizationPercentageCutOffLow, float utilizationPercentageCutOffLow,
long utilizationSpaceCutOff) { long utilizationSpaceCutOff) {
localDirs = new CopyOnWriteArrayList<String>(dirs); localDirs = new CopyOnWriteArrayList<>(dirs);
errorDirs = new CopyOnWriteArrayList<String>(); errorDirs = new CopyOnWriteArrayList<>();
fullDirs = new CopyOnWriteArrayList<String>(); fullDirs = new CopyOnWriteArrayList<>();
ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
this.readLock = lock.readLock();
this.writeLock = lock.writeLock();
diskUtilizationPercentageCutoffHigh = Math.max(0.0F, Math.min(100.0F, diskUtilizationPercentageCutoffHigh = Math.max(0.0F, Math.min(100.0F,
utilizationPercentageCutOffHigh)); utilizationPercentageCutOffHigh));
@ -174,17 +187,18 @@ public DirectoryCollection(String[] dirs,
diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff =
utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff; utilizationSpaceCutOff < 0 ? 0 : utilizationSpaceCutOff;
dirsChangeListeners = new HashSet<DirsChangeListener>(); dirsChangeListeners = Collections.newSetFromMap(
new ConcurrentHashMap<DirsChangeListener, Boolean>());
} }
synchronized void registerDirsChangeListener( void registerDirsChangeListener(
DirsChangeListener listener) { DirsChangeListener listener) {
if (dirsChangeListeners.add(listener)) { if (dirsChangeListeners.add(listener)) {
listener.onDirsChanged(); listener.onDirsChanged();
} }
} }
synchronized void deregisterDirsChangeListener( void deregisterDirsChangeListener(
DirsChangeListener listener) { DirsChangeListener listener) {
dirsChangeListeners.remove(listener); dirsChangeListeners.remove(listener);
} }
@ -192,31 +206,51 @@ synchronized void deregisterDirsChangeListener(
/** /**
* @return the current valid directories * @return the current valid directories
*/ */
synchronized List<String> getGoodDirs() { List<String> getGoodDirs() {
this.readLock.lock();
try {
return Collections.unmodifiableList(localDirs); return Collections.unmodifiableList(localDirs);
} finally {
this.readLock.unlock();
}
} }
/** /**
* @return the failed directories * @return the failed directories
*/ */
synchronized List<String> getFailedDirs() { List<String> getFailedDirs() {
this.readLock.lock();
try {
return Collections.unmodifiableList( return Collections.unmodifiableList(
DirectoryCollection.concat(errorDirs, fullDirs)); DirectoryCollection.concat(errorDirs, fullDirs));
} finally {
this.readLock.unlock();
}
} }
/** /**
* @return the directories that have used all disk space * @return the directories that have used all disk space
*/ */
synchronized List<String> getFullDirs() { List<String> getFullDirs() {
this.readLock.lock();
try {
return fullDirs; return fullDirs;
} finally {
this.readLock.unlock();
}
} }
/** /**
* @return total the number of directory failures seen till now * @return total the number of directory failures seen till now
*/ */
synchronized int getNumFailures() { int getNumFailures() {
this.readLock.lock();
try {
return numFailures; return numFailures;
}finally {
this.readLock.unlock();
}
} }
/** /**
@ -226,18 +260,30 @@ synchronized int getNumFailures() {
* @param perm absolute permissions to use for any directories created * @param perm absolute permissions to use for any directories created
* @return true if there were no errors, false if at least one error occurred * @return true if there were no errors, false if at least one error occurred
*/ */
synchronized boolean createNonExistentDirs(FileContext localFs, boolean createNonExistentDirs(FileContext localFs,
FsPermission perm) { FsPermission perm) {
boolean failed = false; boolean failed = false;
for (final String dir : localDirs) { List<String> localDirectories = null;
this.readLock.lock();
try {
localDirectories = new ArrayList<>(localDirs);
} finally {
this.readLock.unlock();
}
for (final String dir : localDirectories) {
try { try {
createDir(localFs, new Path(dir), perm); createDir(localFs, new Path(dir), perm);
} catch (IOException e) { } catch (IOException e) {
LOG.warn("Unable to create directory " + dir + " error " + LOG.warn("Unable to create directory " + dir + " error " +
e.getMessage() + ", removing from the list of valid directories."); e.getMessage() + ", removing from the list of valid directories.");
this.writeLock.lock();
try {
localDirs.remove(dir); localDirs.remove(dir);
errorDirs.add(dir); errorDirs.add(dir);
numFailures++; numFailures++;
} finally {
this.writeLock.unlock();
}
failed = true; failed = true;
} }
} }
@ -252,18 +298,31 @@ synchronized boolean createNonExistentDirs(FileContext localFs,
* checking or a failed directory passes the disk check <em>false</em> * checking or a failed directory passes the disk check <em>false</em>
* otherwise. * otherwise.
*/ */
synchronized boolean checkDirs() { boolean checkDirs() {
boolean setChanged = false; boolean setChanged = false;
Set<String> preCheckGoodDirs = new HashSet<String>(localDirs); Set<String> preCheckGoodDirs = null;
Set<String> preCheckFullDirs = new HashSet<String>(fullDirs); Set<String> preCheckFullDirs = null;
Set<String> preCheckOtherErrorDirs = new HashSet<String>(errorDirs); Set<String> preCheckOtherErrorDirs = null;
List<String> failedDirs = DirectoryCollection.concat(errorDirs, fullDirs); List<String> failedDirs = null;
List<String> allLocalDirs = List<String> allLocalDirs = null;
DirectoryCollection.concat(localDirs, failedDirs); this.readLock.lock();
try {
preCheckGoodDirs = new HashSet<String>(localDirs);
preCheckFullDirs = new HashSet<String>(fullDirs);
preCheckOtherErrorDirs = new HashSet<String>(errorDirs);
failedDirs = DirectoryCollection.concat(errorDirs, fullDirs);
allLocalDirs = DirectoryCollection.concat(localDirs, failedDirs);
} finally {
this.readLock.unlock();
}
// move testDirs out of any lock as it could wait for very long time in
// case of busy IO
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs, Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
preCheckGoodDirs); preCheckGoodDirs);
this.writeLock.lock();
try {
localDirs.clear(); localDirs.clear();
errorDirs.clear(); errorDirs.clear();
fullDirs.clear(); fullDirs.clear();
@ -279,6 +338,9 @@ synchronized boolean checkDirs() {
case OTHER: case OTHER:
errorDirs.add(entry.getKey()); errorDirs.add(entry.getKey());
break; break;
default:
LOG.warn(entry.getValue().cause + " is unknown for disk error.");
break;
} }
if (preCheckGoodDirs.contains(dir)) { if (preCheckGoodDirs.contains(dir)) {
LOG.warn("Directory " + dir + " error, " + errorInformation.message LOG.warn("Directory " + dir + " error, " + errorInformation.message
@ -320,6 +382,9 @@ synchronized boolean checkDirs() {
} }
} }
return setChanged; return setChanged;
} finally {
this.writeLock.unlock();
}
} }
Map<String, DiskErrorInformation> testDirs(List<String> dirs, Map<String, DiskErrorInformation> testDirs(List<String> dirs,
@ -409,7 +474,11 @@ private void createDir(FileContext localFs, Path dir, FsPermission perm)
localFs.getFileStatus(dir); localFs.getFileStatus(dir);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
createDir(localFs, dir.getParent(), perm); createDir(localFs, dir.getParent(), perm);
try {
localFs.mkdir(dir, perm, false); localFs.mkdir(dir, perm, false);
} catch (FileAlreadyExistsException ex) {
// do nothing as other threads could in creating the same directory.
}
if (!perm.equals(perm.applyUMask(localFs.getUMask()))) { if (!perm.equals(perm.applyUMask(localFs.getUMask()))) {
localFs.setPermission(dir, perm); localFs.setPermission(dir, perm);
} }