MAPREDUCE-4817. Hardcoded task ping timeout kills tasks localizing large amounts of data (tgraves)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1414873 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b43deb9af8
commit
5caef48947
@ -601,6 +601,9 @@ Release 0.23.6 - UNRELEASED
|
||||
MAPREDUCE-4825. JobImpl.finished doesn't expect ERROR as a final job state
|
||||
(jlowe via bobby)
|
||||
|
||||
MAPREDUCE-4817. Hardcoded task ping timeout kills tasks localizing large
|
||||
amounts of data (tgraves)
|
||||
|
||||
Release 0.23.5 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -274,7 +274,6 @@ public MapTaskCompletionEventsUpdate getMapCompletionEvents(
|
||||
@Override
|
||||
public boolean ping(TaskAttemptID taskAttemptID) throws IOException {
|
||||
LOG.info("Ping from " + taskAttemptID.toString());
|
||||
taskHeartbeatHandler.pinged(TypeConverter.toYarn(taskAttemptID));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -46,33 +46,22 @@
|
||||
public class TaskHeartbeatHandler extends AbstractService {
|
||||
|
||||
private static class ReportTime {
|
||||
private long lastPing;
|
||||
private long lastProgress;
|
||||
|
||||
public ReportTime(long time) {
|
||||
setLastProgress(time);
|
||||
}
|
||||
|
||||
public synchronized void setLastPing(long time) {
|
||||
lastPing = time;
|
||||
}
|
||||
|
||||
public synchronized void setLastProgress(long time) {
|
||||
lastProgress = time;
|
||||
lastPing = time;
|
||||
}
|
||||
|
||||
public synchronized long getLastPing() {
|
||||
return lastPing;
|
||||
}
|
||||
|
||||
|
||||
public synchronized long getLastProgress() {
|
||||
return lastProgress;
|
||||
}
|
||||
}
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(TaskHeartbeatHandler.class);
|
||||
private static final int PING_TIMEOUT = 5 * 60 * 1000;
|
||||
|
||||
//thread which runs periodically to check how long it has been since a
|
||||
//heartbeat was last received from a task.
|
||||
@ -127,14 +116,6 @@ public void progressing(TaskAttemptId attemptID) {
|
||||
}
|
||||
}
|
||||
|
||||
public void pinged(TaskAttemptId attemptID) {
|
||||
//only put for the registered attempts
|
||||
//TODO throw an exception if the task isn't registered.
|
||||
ReportTime time = runningAttempts.get(attemptID);
|
||||
if(time != null) {
|
||||
time.setLastPing(clock.getTime());
|
||||
}
|
||||
}
|
||||
|
||||
public void register(TaskAttemptId attemptID) {
|
||||
runningAttempts.put(attemptID, new ReportTime(clock.getTime()));
|
||||
@ -159,10 +140,8 @@ public void run() {
|
||||
Map.Entry<TaskAttemptId, ReportTime> entry = iterator.next();
|
||||
boolean taskTimedOut = (taskTimeOut > 0) &&
|
||||
(currentTime > (entry.getValue().getLastProgress() + taskTimeOut));
|
||||
boolean pingTimedOut =
|
||||
(currentTime > (entry.getValue().getLastPing() + PING_TIMEOUT));
|
||||
|
||||
if(taskTimedOut || pingTimedOut) {
|
||||
|
||||
if(taskTimedOut) {
|
||||
// task is lost, remove from the list and raise lost event
|
||||
iterator.remove();
|
||||
eventHandler.handle(new TaskAttemptDiagnosticsUpdateEvent(entry
|
||||
|
Loading…
Reference in New Issue
Block a user