MAPREDUCE-4817. Hardcoded task ping timeout kills tasks localizing large amounts of data (tgraves)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1414873 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Thomas Graves 2012-11-28 19:19:16 +00:00
parent b43deb9af8
commit 5caef48947
3 changed files with 6 additions and 25 deletions

View File

@ -601,6 +601,9 @@ Release 0.23.6 - UNRELEASED
MAPREDUCE-4825. JobImpl.finished doesn't expect ERROR as a final job state
(jlowe via bobby)
MAPREDUCE-4817. Hardcoded task ping timeout kills tasks localizing large
amounts of data (tgraves)
Release 0.23.5 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -274,7 +274,6 @@ public MapTaskCompletionEventsUpdate getMapCompletionEvents(
/**
 * Handles a ping sent by a task attempt.
 *
 * Logs the ping, passes the attempt to the heartbeat handler, and acknowledges it.
 *
 * NOTE(review): this excerpt is a diff hunk shown without +/- markers. The hunk
 * header (7 lines before, 6 after) means one line in this region is deleted by
 * the commit -- presumably the pinged() call below; confirm against the real file.
 *
 * @param taskAttemptID the attempt that sent the ping
 * @return always {@code true}
 * @throws IOException declared by the signature; the visible body has no explicit throw
 */
@Override
public boolean ping(TaskAttemptID taskAttemptID) throws IOException {
LOG.info("Ping from " + taskAttemptID.toString());
// TypeConverter.toYarn is applied first because the handler is called with the
// converted ID rather than the TaskAttemptID received here.
taskHeartbeatHandler.pinged(TypeConverter.toYarn(taskAttemptID));
return true;
}

View File

@ -46,33 +46,22 @@
public class TaskHeartbeatHandler extends AbstractService {
/**
 * Holds the last-ping and last-progress timestamps recorded for one entry.
 * Every setter and getter is a synchronized instance method.
 *
 * NOTE(review): this excerpt is a diff hunk shown without +/- markers (33 lines
 * before, 22 after). Presumably the lastPing field and its accessors are the
 * members deleted by the commit -- confirm against the real file.
 */
private static class ReportTime {
// Time of the most recent ping; also refreshed by setLastProgress().
private long lastPing;
// Time of the most recent progress report.
private long lastProgress;
/**
 * Creates a record with both timestamps set to {@code time}.
 *
 * @param time initial value for the last-progress and last-ping timestamps
 */
public ReportTime(long time) {
// Goes through the setter, which initialises lastPing as well as lastProgress.
setLastProgress(time);
}
/**
 * Records a ping; lastProgress is left untouched.
 *
 * @param time the time of the ping
 */
public synchronized void setLastPing(long time) {
lastPing = time;
}
/**
 * Records a progress report.
 *
 * @param time the time of the progress report
 */
public synchronized void setLastProgress(long time) {
lastProgress = time;
// A progress report also refreshes the ping timestamp.
lastPing = time;
}
/** @return the time last stored by setLastPing() or setLastProgress() */
public synchronized long getLastPing() {
return lastPing;
}
/** @return the time last stored by setLastProgress() */
public synchronized long getLastProgress() {
return lastProgress;
}
}
// Logger for this class.
private static final Log LOG = LogFactory.getLog(TaskHeartbeatHandler.class);
// Fixed ping timeout: 5 * 60 * 1000 = 300000, presumably milliseconds (5 minutes) --
// the unit is that of clock.getTime(); confirm. Not configurable.
// NOTE(review): the commit title calls this timeout "hardcoded", so this constant is
// presumably one of the lines the commit deletes -- confirm against the real file.
private static final int PING_TIMEOUT = 5 * 60 * 1000;
//thread which runs periodically to check how long it has been since a
//heartbeat was last received from a task.
@ -127,14 +116,6 @@ public void progressing(TaskAttemptId attemptID) {
}
}
/**
 * Records the current clock time as the last-ping time of a running attempt.
 * An attempt with no entry in runningAttempts is ignored without any error.
 *
 * NOTE(review): this excerpt is a diff hunk shown without +/- markers (14 lines
 * before, 6 after). This method is 8 lines long, so presumably the commit deletes
 * it entirely -- confirm against the real file.
 *
 * @param attemptID the attempt that pinged
 */
public void pinged(TaskAttemptId attemptID) {
//only put for the registered attempts
//TODO throw an exception if the task isn't registered.
ReportTime time = runningAttempts.get(attemptID);
// A null lookup result means there is no entry for this attempt; nothing is recorded.
if(time != null) {
time.setLastPing(clock.getTime());
}
}
public void register(TaskAttemptId attemptID) {
runningAttempts.put(attemptID, new ReportTime(clock.getTime()));
@ -159,10 +140,8 @@ public void run() {
Map.Entry<TaskAttemptId, ReportTime> entry = iterator.next();
boolean taskTimedOut = (taskTimeOut > 0) &&
(currentTime > (entry.getValue().getLastProgress() + taskTimeOut));
boolean pingTimedOut =
(currentTime > (entry.getValue().getLastPing() + PING_TIMEOUT));
if(taskTimedOut || pingTimedOut) {
if(taskTimedOut) {
// task is lost, remove from the list and raise lost event
iterator.remove();
eventHandler.handle(new TaskAttemptDiagnosticsUpdateEvent(entry