YARN-2313. Livelock can occur in FairScheduler when there are lots of running apps (Tsuyoshi Ozawa via Sandy Ryza)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1612769 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sanford Ryza 2014-07-23 05:00:52 +00:00
parent e0f00e12ce
commit c88402f36d
6 changed files with 37 additions and 4 deletions

View File

@ -91,6 +91,9 @@ Release 2.6.0 - UNRELEASED
YARN-2273. NPE in ContinuousScheduling thread when we lose a node.
(Wei Yan via kasha)
YARN-2313. Livelock can occur in FairScheduler when there are lots of
running apps (Tsuyoshi Ozawa via Sandy Ryza)
Release 2.5.0 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -194,6 +194,12 @@
<Field name="scheduleAsynchronously" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
<!-- Inconsistent sync warning - updateInterval is only initialized once and never changed -->
<Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler" />
<Field name="updateInterval" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
<!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
<Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />

View File

@ -135,7 +135,7 @@ public class FairScheduler extends
public static final Resource CONTAINER_RESERVED = Resources.createResource(-1);
// How often fair shares are re-calculated (ms)
protected long UPDATE_INTERVAL = 500;
protected long updateInterval;
private final int UPDATE_DEBUG_FREQUENCY = 5;
private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;
@ -244,13 +244,13 @@ public QueueManager getQueueManager() {
/**
* A runnable which calls {@link FairScheduler#update()} every
* <code>UPDATE_INTERVAL</code> milliseconds.
* <code>updateInterval</code> milliseconds.
*/
private class UpdateThread implements Runnable {
public void run() {
while (true) {
try {
Thread.sleep(UPDATE_INTERVAL);
Thread.sleep(updateInterval);
update();
preemptTasksIfNecessary();
} catch (Exception e) {
@ -1206,6 +1206,15 @@ private synchronized void initScheduler(Configuration conf)
waitTimeBeforeKill = this.conf.getWaitTimeBeforeKill();
usePortForNodeName = this.conf.getUsePortForNodeName();
updateInterval = this.conf.getUpdateInterval();
if (updateInterval < 0) {
updateInterval = FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS;
LOG.warn(FairSchedulerConfiguration.UPDATE_INTERVAL_MS
+ " is invalid, so using default value " +
+ FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS
+ " ms instead");
}
rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
// This stores per-application scheduling information
this.applications =

View File

@ -123,6 +123,11 @@ public class FairSchedulerConfiguration extends Configuration {
protected static final String MAX_ASSIGN = CONF_PREFIX + "max.assign";
protected static final int DEFAULT_MAX_ASSIGN = -1;
/** The update interval for calculating resources in FairScheduler .*/
public static final String UPDATE_INTERVAL_MS =
CONF_PREFIX + "update-interval-ms";
public static final int DEFAULT_UPDATE_INTERVAL_MS = 500;
public FairSchedulerConfiguration() {
super();
}
@ -246,6 +251,10 @@ public static Resource parseResourceConfigValue(String val)
"Error reading resource config", ex);
}
}
public long getUpdateInterval() {
return getLong(UPDATE_INTERVAL_MS, DEFAULT_UPDATE_INTERVAL_MS);
}
private static int findResource(String val, String units)
throws AllocationConfigurationException {

View File

@ -94,7 +94,7 @@ private void startResourceManager(float utilizationThreshold) {
scheduler = (FairScheduler)resourceManager.getResourceScheduler();
scheduler.setClock(clock);
scheduler.UPDATE_INTERVAL = 60 * 1000;
scheduler.updateInterval = 60 * 1000;
}
private void registerNodeAndSubmitApp(

View File

@ -205,6 +205,12 @@ Properties that can be placed in yarn-site.xml
instead. Defaults to true. If a queue placement policy is given in the
allocations file, this property is ignored.
* <<<yarn.scheduler.fair.update-interval-ms>>>
* The interval at which to lock the scheduler and recalculate fair shares,
recalculate demand, and check whether anything is due for preemption.
Defaults to 500 ms.
Allocation file format
The allocation file must be in XML format. The format contains five types of