MAPREDUCE-2807. Fix AM restart and client redirection. Contributed by Sharad Agarwal.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1161408 13f79535-47bb-0310-9956-ffa450edef68
Sharad Agarwal 2011-08-25 06:35:58 +00:00
parent 6316e66279
commit f2b91a8367
23 changed files with 653 additions and 448 deletions

View File

@ -1138,6 +1138,8 @@ Trunk (unreleased changes)
MAPREDUCE-2846. Fix missing synchronization in the task log management.
(omalley)
MAPREDUCE-2807. Fix AM restart and client redirection. (sharad)
Release 0.22.0 - Unreleased
INCOMPATIBLE CHANGES

View File

@ -369,7 +369,6 @@ public static JobStatus.State fromYarn(ApplicationState state) {
case SUBMITTED:
return State.PREP;
case RUNNING:
case RESTARTING:
return State.RUNNING;
case SUCCEEDED:
return State.SUCCEEDED;

View File

@ -0,0 +1,87 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHConfig;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.SecurityInfo;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.client.ClientRMSecurityInfo;
public class ClientCache {
private final Configuration conf;
private final ResourceMgrDelegate rm;
private static final Log LOG = LogFactory.getLog(ClientCache.class);
private Map<JobID, ClientServiceDelegate> cache =
new HashMap<JobID, ClientServiceDelegate>();
private MRClientProtocol hsProxy;
ClientCache(Configuration conf, ResourceMgrDelegate rm) {
this.conf = conf;
this.rm = rm;
}
//TODO: evict from the cache on some threshold
synchronized ClientServiceDelegate getClient(JobID jobId) {
if (hsProxy == null) {
try {
hsProxy = instantiateHistoryProxy();
} catch (IOException e) {
LOG.warn("Could not connect to History server.", e);
throw new YarnException("Could not connect to History server.", e);
}
}
ClientServiceDelegate client = cache.get(jobId);
if (client == null) {
client = new ClientServiceDelegate(conf, rm, jobId, hsProxy);
cache.put(jobId, client);
}
return client;
}
private MRClientProtocol instantiateHistoryProxy()
throws IOException {
String serviceAddr = conf.get(JHConfig.HS_BIND_ADDRESS,
JHConfig.DEFAULT_HS_BIND_ADDRESS);
LOG.info("Connecting to HistoryServer at: " + serviceAddr);
Configuration myConf = new Configuration(conf);
//TODO This should ideally be using its own class (instead of ClientRMSecurityInfo)
myConf.setClass(YarnConfiguration.YARN_SECURITY_INFO,
ClientRMSecurityInfo.class, SecurityInfo.class);
YarnRPC rpc = YarnRPC.create(myConf);
LOG.info("Connected to HistoryServer at: " + serviceAddr);
return (MRClientProtocol) rpc.getProxy(MRClientProtocol.class,
NetUtils.createSocketAddr(serviceAddr), myConf);
}
}
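
The //TODO above leaves eviction unimplemented, so a long-lived client keeps one ClientServiceDelegate for every job it has ever touched. A minimal sketch of one way to bound the map, not part of this patch, assuming an invented MAX_CACHED_JOBS threshold and relying on LinkedHashMap's access-order eviction:

import java.util.LinkedHashMap;
import java.util.Map;

// Sketch only: drops the least recently used delegate once the
// (invented) threshold is exceeded.
class BoundedClientMap<K, V> extends LinkedHashMap<K, V> {
  private static final int MAX_CACHED_JOBS = 100; // hypothetical limit
  BoundedClientMap() {
    super(16, 0.75f, true); // true = access order, so LRU entries age out first
  }
  @Override
  protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
    return size() > MAX_CACHED_JOBS;
  }
}

The cache field above could then be declared as new BoundedClientMap<JobID, ClientServiceDelegate>() with getClient() left unchanged.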

View File

@ -19,14 +19,13 @@
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.net.UnknownHostException;
import java.lang.reflect.Method;
import java.security.PrivilegedAction;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.JobID;
@ -37,16 +36,21 @@
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillJobRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.records.Counters;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHConfig;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.SecurityInfo;
import org.apache.hadoop.security.UserGroupInformation;
@ -65,48 +69,47 @@
import org.apache.hadoop.yarn.security.SchedulerSecurityInfo;
import org.apache.hadoop.yarn.security.client.ClientRMSecurityInfo;
public class ClientServiceDelegate {
class ClientServiceDelegate {
private static final Log LOG = LogFactory.getLog(ClientServiceDelegate.class);
private static final NotRunningJob NOTSTARTEDJOB =
new NotRunningJob(JobState.NEW);
private static final NotRunningJob FAILEDJOB =
new NotRunningJob(JobState.FAILED);
private static final NotRunningJob KILLEDJOB =
new NotRunningJob(JobState.KILLED);
private Configuration conf;
private ApplicationId currentAppId;
private ApplicationState currentAppState = ApplicationState.NEW;
private final Configuration conf;
private final JobID jobId;
private final ApplicationId appId;
private final ResourceMgrDelegate rm;
private final MRClientProtocol historyServerProxy;
private boolean forceRefresh;
private MRClientProtocol realProxy = null;
private String serviceAddr = "";
private String serviceHttpAddr = "";
private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
ClientServiceDelegate(Configuration conf, ResourceMgrDelegate rm) {
ClientServiceDelegate(Configuration conf, ResourceMgrDelegate rm,
JobID jobId, MRClientProtocol historyServerProxy) {
this.conf = new Configuration(conf); // Cloning for modifying.
// For faster redirects from AM to HS.
this.conf.setInt(
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 3);
this.rm = rm;
this.jobId = jobId;
this.historyServerProxy = historyServerProxy;
this.appId = TypeConverter.toYarn(jobId).getAppId();
}
private MRClientProtocol getProxy(JobID jobId) throws YarnRemoteException {
return getProxy(TypeConverter.toYarn(jobId).getAppId(), false);
}
private MRClientProtocol getRefreshedProxy(JobID jobId) throws YarnRemoteException {
return getProxy(TypeConverter.toYarn(jobId).getAppId(), true);
}
private MRClientProtocol getProxy(ApplicationId appId,
boolean forceRefresh) throws YarnRemoteException {
if (!appId.equals(currentAppId) || forceRefresh || realProxy == null) {
currentAppId = appId;
refreshProxy();
private MRClientProtocol getProxy() throws YarnRemoteException {
if (!forceRefresh && realProxy != null) {
return realProxy;
}
return realProxy;
}
private void refreshProxy() throws YarnRemoteException {
//TODO RM NPEs for unknown jobs. History may still be aware.
//TODO RM NPEs for unknown jobs. History may still be aware.
// Possibly allow nulls through the PB tunnel, otherwise deal with an exception
// and redirect to the history server.
ApplicationReport application = rm.getApplicationReport(currentAppId);
ApplicationReport application = rm.getApplicationReport(appId);
String serviceAddr = null;
while (ApplicationState.RUNNING.equals(application.getState())) {
try {
if (application.getHost() == null || "".equals(application.getHost())) {
@ -114,12 +117,10 @@ private void refreshProxy() throws YarnRemoteException {
Thread.sleep(2000);
LOG.debug("Application state is " + application.getState());
application = rm.getApplicationReport(currentAppId);
application = rm.getApplicationReport(appId);
continue;
}
serviceAddr = application.getHost() + ":" + application.getRpcPort();
serviceHttpAddr = application.getTrackingUrl();
currentAppState = application.getState();
if (UserGroupInformation.isSecurityEnabled()) {
String clientTokenEncoded = application.getClientToken();
Token<ApplicationTokenIdentifier> clientToken =
@ -129,9 +130,10 @@ private void refreshProxy() throws YarnRemoteException {
+ application.getRpcPort()));
UserGroupInformation.getCurrentUser().addToken(clientToken);
}
LOG.info("Tracking Url of JOB is " + application.getTrackingUrl());
LOG.info("Connecting to " + serviceAddr);
instantiateAMProxy(serviceAddr);
return;
return realProxy;
} catch (Exception e) {
//possibly
//possibly the AM has crashed
@ -143,41 +145,39 @@ private void refreshProxy() throws YarnRemoteException {
Thread.sleep(2000);
} catch (InterruptedException e1) {
}
application = rm.getApplicationReport(currentAppId);
application = rm.getApplicationReport(appId);
}
}
currentAppState = application.getState();
/** We just want to return if it's allocating, so that we don't
* block on it. This is to be able to return job status
* on an allocating Application.
*/
if (currentAppState == ApplicationState.NEW) {
if (application.getState() == ApplicationState.NEW ||
application.getState() == ApplicationState.SUBMITTED) {
realProxy = null;
return;
return NOTSTARTEDJOB;
}
if (currentAppState == ApplicationState.SUCCEEDED
|| currentAppState == ApplicationState.FAILED
|| currentAppState == ApplicationState.KILLED) {
serviceAddr = conf.get(JHConfig.HS_BIND_ADDRESS,
JHConfig.DEFAULT_HS_BIND_ADDRESS);
if (application.getState() == ApplicationState.FAILED) {
realProxy = null;
return FAILEDJOB;
}
if (application.getState() == ApplicationState.KILLED) {
realProxy = null;
return KILLEDJOB;
}
//History server can serve a job only if application
//succeeded.
if (application.getState() == ApplicationState.SUCCEEDED) {
LOG.info("Application state is completed. " +
"Redirecting to job history server " + serviceAddr);
try {
serviceHttpAddr = JobHistoryUtils.getHistoryUrl(conf, currentAppId);
} catch (UnknownHostException e) {
LOG.warn("Unable to get history url", e);
serviceHttpAddr = "UNKNOWN";
}
try {
instantiateHistoryProxy(serviceAddr);
return;
} catch (IOException e) {
throw new YarnException(e);
}
realProxy = historyServerProxy;
}
return realProxy;
}
private void instantiateAMProxy(final String serviceAddr) throws IOException {
@ -198,129 +198,70 @@ public MRClientProtocol run() {
LOG.trace("Connected to ApplicationMaster at: " + serviceAddr);
}
private void instantiateHistoryProxy(final String serviceAddr)
throws IOException {
LOG.trace("Connecting to HistoryServer at: " + serviceAddr);
Configuration myConf = new Configuration(conf);
//TODO This should ideally be using its own class (instead of ClientRMSecurityInfo)
myConf.setClass(YarnConfiguration.YARN_SECURITY_INFO,
ClientRMSecurityInfo.class, SecurityInfo.class);
YarnRPC rpc = YarnRPC.create(myConf);
realProxy = (MRClientProtocol) rpc.getProxy(MRClientProtocol.class,
NetUtils.createSocketAddr(serviceAddr), myConf);
LOG.trace("Connected to HistoryServer at: " + serviceAddr);
private synchronized Object invoke(String method, Class argClass,
Object args) throws YarnRemoteException {
Method methodOb = null;
try {
methodOb = MRClientProtocol.class.getMethod(method, argClass);
} catch (SecurityException e) {
throw new YarnException(e);
} catch (NoSuchMethodException e) {
throw new YarnException("Method name mismatch", e);
}
while (true) {
try {
return methodOb.invoke(getProxy(), args);
} catch (YarnRemoteException yre) {
LOG.warn("Exception thrown by remote end.");
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch (Exception e) {
LOG.info("Failed to contact AM for job " + jobId + " Will retry..");
LOG.debug("Failing to contact application master", e);
forceRefresh = true;
}
}
}
public org.apache.hadoop.mapreduce.Counters getJobCounters(JobID arg0) throws IOException,
org.apache.hadoop.mapreduce.Counters getJobCounters(JobID arg0) throws IOException,
InterruptedException {
org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter.toYarn(arg0);
try {
GetCountersRequest request = recordFactory.newRecordInstance(GetCountersRequest.class);
request.setJobId(jobID);
MRClientProtocol protocol = getProxy(arg0);
if (protocol == null) {
/* no AM to connect to, fake counters */
return new org.apache.hadoop.mapreduce.Counters();
}
return TypeConverter.fromYarn(protocol.getCounters(request).getCounters());
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
LOG.debug("Failing to contact application master", e);
try {
GetCountersRequest request = recordFactory.newRecordInstance(GetCountersRequest.class);
request.setJobId(jobID);
MRClientProtocol protocol = getRefreshedProxy(arg0);
if (protocol == null) {
/* no History to connect to, fake counters */
return new org.apache.hadoop.mapreduce.Counters();
}
return TypeConverter.fromYarn(protocol.getCounters(request).getCounters());
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
Counters cnt = ((GetCountersResponse)
invoke("getCounters", GetCountersRequest.class, request)).getCounters();
return TypeConverter.fromYarn(cnt);
}
public String getJobHistoryDir() throws IOException, InterruptedException {
return JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
TaskCompletionEvent[] getTaskCompletionEvents(JobID arg0, int arg1, int arg2)
throws IOException, InterruptedException {
org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter
.toYarn(arg0);
GetTaskAttemptCompletionEventsRequest request = recordFactory
.newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
request.setJobId(jobID);
request.setFromEventId(arg1);
request.setMaxEvents(arg2);
List<org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent> list =
((GetTaskAttemptCompletionEventsResponse) invoke(
"getTaskAttemptCompletionEvents", GetTaskAttemptCompletionEventsRequest.class, request)).
getCompletionEventList();
return TypeConverter
.fromYarn(list
.toArray(new org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent[0]));
}
public TaskCompletionEvent[] getTaskCompletionEvents(JobID arg0, int arg1,
int arg2) throws IOException, InterruptedException {
org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter.toYarn(arg0);
List<org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent> list = null;
GetTaskAttemptCompletionEventsRequest request = recordFactory.newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
MRClientProtocol protocol;
try {
request.setJobId(jobID);
request.setFromEventId(arg1);
request.setMaxEvents(arg2);
protocol = getProxy(arg0);
/** This is hack to get around the issue of faking jobstatus while the AM
* is coming up.
*/
if (protocol == null) {
return new TaskCompletionEvent[0];
}
list = getProxy(arg0).getTaskAttemptCompletionEvents(request).getCompletionEventList();
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
LOG.debug("Failed to contact application master ", e);
try {
request.setJobId(jobID);
request.setFromEventId(arg1);
request.setMaxEvents(arg2);
protocol = getRefreshedProxy(arg0);
if (protocol == null) {
return new TaskCompletionEvent[0];
}
list = protocol.getTaskAttemptCompletionEvents(request).getCompletionEventList();
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
return TypeConverter.fromYarn(
list.toArray(new org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent[0]));
}
String[] getTaskDiagnostics(org.apache.hadoop.mapreduce.TaskAttemptID arg0)
throws IOException, InterruptedException {
public String[] getTaskDiagnostics(org.apache.hadoop.mapreduce.TaskAttemptID
arg0)
throws IOException,
InterruptedException {
List<String> list = null;
org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter.toYarn(arg0);
GetDiagnosticsRequest request = recordFactory.newRecordInstance(GetDiagnosticsRequest.class);
MRClientProtocol protocol;
try {
request.setTaskAttemptId(attemptID);
protocol = getProxy(arg0.getJobID());
if (protocol == null) {
return new String[0];
}
list = getProxy(arg0.getJobID()).getDiagnostics(request).getDiagnosticsList();
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
LOG.debug("Failed to contact application master ", e);
try {
protocol = getRefreshedProxy(arg0.getJobID());
if (protocol == null) {
return new String[0];
}
list = protocol.getDiagnostics(request).getDiagnosticsList();
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter
.toYarn(arg0);
GetDiagnosticsRequest request = recordFactory
.newRecordInstance(GetDiagnosticsRequest.class);
request.setTaskAttemptId(attemptID);
List<String> list = ((GetDiagnosticsResponse) invoke("getDiagnostics",
GetDiagnosticsRequest.class, request)).getDiagnosticsList();
String[] result = new String[list.size()];
int i = 0;
for (String c : list) {
@ -329,180 +270,59 @@ public String[] getTaskDiagnostics(org.apache.hadoop.mapreduce.TaskAttemptID
return result;
}
private JobStatus createFakeJobReport(ApplicationState state,
org.apache.hadoop.mapreduce.v2.api.records.JobId jobId, String jobFile) {
JobReport jobreport = recordFactory.newRecordInstance(JobReport.class);
jobreport.setCleanupProgress(0);
jobreport.setFinishTime(0);
jobreport.setJobId(jobId);
jobreport.setMapProgress(0);
/** fix this, the start time should be fixed */
jobreport.setStartTime(0);
jobreport.setReduceProgress(0);
jobreport.setSetupProgress(0);
if (currentAppState == ApplicationState.NEW) {
/* the protocol wasnt instantiated because the applicaton wasnt launched
* return a fake report.
*/
jobreport.setJobState(JobState.NEW);
} else if (currentAppState == ApplicationState.SUBMITTED) {
jobreport.setJobState(JobState.NEW);
} else if (currentAppState == ApplicationState.KILLED) {
jobreport.setJobState(JobState.KILLED);
} else if (currentAppState == ApplicationState.FAILED) {
jobreport.setJobState(JobState.FAILED);
}
return TypeConverter.fromYarn(jobreport, jobFile, serviceHttpAddr);
}
public JobStatus getJobStatus(JobID oldJobID) throws YarnRemoteException,
YarnRemoteException {
JobStatus getJobStatus(JobID oldJobID) throws YarnRemoteException,
YarnRemoteException {
org.apache.hadoop.mapreduce.v2.api.records.JobId jobId =
TypeConverter.toYarn(oldJobID);
String stagingDir = conf.get("yarn.apps.stagingDir");
String jobFile = stagingDir + "/" + jobId.toString();
JobReport report = null;
MRClientProtocol protocol;
GetJobReportRequest request = recordFactory.newRecordInstance(GetJobReportRequest.class);
try {
request.setJobId(jobId);
protocol = getProxy(oldJobID);
if (protocol == null) {
return createFakeJobReport(currentAppState, jobId, jobFile);
}
report = getProxy(oldJobID).getJobReport(request).getJobReport();
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch (Exception e) {
try {
request.setJobId(jobId);
protocol = getRefreshedProxy(oldJobID);
/* this is possible if an application that was running is killed */
if (protocol == null) {
return createFakeJobReport(currentAppState, jobId, jobFile);
}
report = protocol.getJobReport(request).getJobReport();
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
return TypeConverter.fromYarn(report, jobFile, serviceHttpAddr);
request.setJobId(jobId);
JobReport report = ((GetJobReportResponse) invoke("getJobReport",
GetJobReportRequest.class, request)).getJobReport();
//TODO: add tracking url in JobReport
return TypeConverter.fromYarn(report, jobFile, "");
}
public org.apache.hadoop.mapreduce.TaskReport[] getTaskReports(JobID jobID, TaskType taskType)
throws YarnRemoteException, YarnRemoteException {
List<org.apache.hadoop.mapreduce.v2.api.records.TaskReport> taskReports = null;
org.apache.hadoop.mapreduce.TaskReport[] getTaskReports(JobID jobID, TaskType taskType)
throws YarnRemoteException, YarnRemoteException {
org.apache.hadoop.mapreduce.v2.api.records.JobId nJobID = TypeConverter.toYarn(jobID);
GetTaskReportsRequest request = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
MRClientProtocol protocol = null;
try {
request.setJobId(nJobID);
request.setTaskType(TypeConverter.toYarn(taskType));
protocol = getProxy(jobID);
if (protocol == null) {
return new org.apache.hadoop.mapreduce.TaskReport[0];
}
taskReports = getProxy(jobID).getTaskReports(request).getTaskReportList();
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
LOG.debug("Failed to contact application master ", e);
try {
request.setJobId(nJobID);
request.setTaskType(TypeConverter.toYarn(taskType));
protocol = getRefreshedProxy(jobID);
if (protocol == null) {
return new org.apache.hadoop.mapreduce.TaskReport[0];
}
taskReports = protocol.getTaskReports(request).getTaskReportList();
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
List<org.apache.hadoop.mapreduce.v2.api.records.TaskReport> taskReports =
((GetTaskReportsResponse) invoke("getTaskReports", GetTaskReportsRequest.class,
request)).getTaskReportList();
return TypeConverter.fromYarn
(taskReports).toArray(new org.apache.hadoop.mapreduce.TaskReport[0]);
}
public boolean killTask(TaskAttemptID taskAttemptID, boolean fail)
throws YarnRemoteException {
boolean killTask(TaskAttemptID taskAttemptID, boolean fail)
throws YarnRemoteException {
org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID
= TypeConverter.toYarn(taskAttemptID);
KillTaskAttemptRequest killRequest = recordFactory.newRecordInstance(KillTaskAttemptRequest.class);
FailTaskAttemptRequest failRequest = recordFactory.newRecordInstance(FailTaskAttemptRequest.class);
MRClientProtocol protocol = getProxy(taskAttemptID.getJobID());
if (protocol == null) {
return false;
}
try {
if (fail) {
failRequest.setTaskAttemptId(attemptID);
getProxy(taskAttemptID.getJobID()).failTaskAttempt(failRequest);
} else {
killRequest.setTaskAttemptId(attemptID);
getProxy(taskAttemptID.getJobID()).killTaskAttempt(killRequest);
}
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
LOG.debug("Failed to contact application master ", e);
MRClientProtocol proxy = getRefreshedProxy(taskAttemptID.getJobID());
if (proxy == null) {
return false;
}
try {
if (fail) {
failRequest.setTaskAttemptId(attemptID);
proxy.failTaskAttempt(failRequest);
} else {
killRequest.setTaskAttemptId(attemptID);
proxy.killTaskAttempt(killRequest);
}
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
= TypeConverter.toYarn(taskAttemptID);
if (fail) {
FailTaskAttemptRequest failRequest = recordFactory.newRecordInstance(FailTaskAttemptRequest.class);
failRequest.setTaskAttemptId(attemptID);
invoke("failTaskAttempt", FailTaskAttemptRequest.class, failRequest);
} else {
KillTaskAttemptRequest killRequest = recordFactory.newRecordInstance(KillTaskAttemptRequest.class);
killRequest.setTaskAttemptId(attemptID);
invoke("killTaskAttempt", KillTaskAttemptRequest.class, killRequest);
}
return true;
}
public boolean killJob(JobID oldJobID)
throws YarnRemoteException {
boolean killJob(JobID oldJobID)
throws YarnRemoteException {
org.apache.hadoop.mapreduce.v2.api.records.JobId jobId
= TypeConverter.toYarn(oldJobID);
KillJobRequest killRequest = recordFactory.newRecordInstance(KillJobRequest.class);
MRClientProtocol protocol = getProxy(oldJobID);
if (protocol == null) {
return false;
}
try {
killRequest.setJobId(jobId);
protocol.killJob(killRequest);
return true;
} catch(YarnRemoteException yre) {//thrown by remote server, no need to redirect
LOG.warn(RPCUtil.toString(yre));
throw yre;
} catch(Exception e) {
// Not really requied - if this is always the history context.
LOG.debug("Failed to contact application master ", e);
MRClientProtocol proxy = getRefreshedProxy(oldJobID);
if (proxy == null) {
return false;
}
try {
killRequest.setJobId(jobId);
protocol.killJob(killRequest);
return true;
} catch(YarnRemoteException yre) {
LOG.warn(RPCUtil.toString(yre));
throw yre;
}
}
killRequest.setJobId(jobId);
invoke("killJob", KillJobRequest.class, killRequest);
return true;
}
}
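
Every operation above now funnels through the reflective invoke() helper, which resolves the proxy (live AM, NotRunningJob stub, or history server) and retries with a refreshed proxy when the AM has moved or exited. As an illustration of that pattern only, a hypothetical wrapper for the task-level killTask call, which this class does not expose, might look like the sketch below (assuming an import for KillTaskRequest and a TypeConverter.toYarn(TaskID) overload):

  // Sketch only, not part of the patch: shows how a new call would plug into invoke().
  boolean killTask(org.apache.hadoop.mapreduce.TaskID taskID)
      throws YarnRemoteException {
    org.apache.hadoop.mapreduce.v2.api.records.TaskId yarnTaskId =
        TypeConverter.toYarn(taskID);
    KillTaskRequest request =
        recordFactory.newRecordInstance(KillTaskRequest.class);
    request.setTaskId(yarnTaskId);
    // invoke() loops until it reaches a proxy that can answer, forcing a
    // proxy refresh whenever the call fails with a non-remote exception.
    invoke("killTask", KillTaskRequest.class, request);
    return true;
  }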

View File

@ -0,0 +1,177 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillJobRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillJobResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskAttemptResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskResponse;
import org.apache.hadoop.mapreduce.v2.api.records.CounterGroup;
import org.apache.hadoop.mapreduce.v2.api.records.Counters;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskReport;
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
public class NotRunningJob implements MRClientProtocol {
private RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null);
private final JobState jobState;
NotRunningJob(JobState jobState) {
this.jobState = jobState;
}
@Override
public FailTaskAttemptResponse failTaskAttempt(
FailTaskAttemptRequest request) throws YarnRemoteException {
FailTaskAttemptResponse resp =
recordFactory.newRecordInstance(FailTaskAttemptResponse.class);
return resp;
}
@Override
public GetCountersResponse getCounters(GetCountersRequest request)
throws YarnRemoteException {
GetCountersResponse resp =
recordFactory.newRecordInstance(GetCountersResponse.class);
Counters counters = recordFactory.newRecordInstance(Counters.class);
counters.addAllCounterGroups(new HashMap<String, CounterGroup>());
resp.setCounters(counters);
return resp;
}
@Override
public GetDiagnosticsResponse getDiagnostics(GetDiagnosticsRequest request)
throws YarnRemoteException {
GetDiagnosticsResponse resp =
recordFactory.newRecordInstance(GetDiagnosticsResponse.class);
resp.addDiagnostics("");
return resp;
}
@Override
public GetJobReportResponse getJobReport(GetJobReportRequest request)
throws YarnRemoteException {
GetJobReportResponse resp =
recordFactory.newRecordInstance(GetJobReportResponse.class);
JobReport jobReport =
recordFactory.newRecordInstance(JobReport.class);
jobReport.setJobId(request.getJobId());
jobReport.setJobState(jobState);
resp.setJobReport(jobReport);
return resp;
}
@Override
public GetTaskAttemptCompletionEventsResponse getTaskAttemptCompletionEvents(
GetTaskAttemptCompletionEventsRequest request)
throws YarnRemoteException {
GetTaskAttemptCompletionEventsResponse resp =
recordFactory.newRecordInstance(GetTaskAttemptCompletionEventsResponse.class);
resp.addAllCompletionEvents(new ArrayList<TaskAttemptCompletionEvent>());
return resp;
}
@Override
public GetTaskAttemptReportResponse getTaskAttemptReport(
GetTaskAttemptReportRequest request) throws YarnRemoteException {
//not invoked by anybody
throw new NotImplementedException();
}
@Override
public GetTaskReportResponse getTaskReport(GetTaskReportRequest request)
throws YarnRemoteException {
GetTaskReportResponse resp =
recordFactory.newRecordInstance(GetTaskReportResponse.class);
TaskReport report = recordFactory.newRecordInstance(TaskReport.class);
report.setTaskId(request.getTaskId());
report.setTaskState(TaskState.NEW);
Counters counters = recordFactory.newRecordInstance(Counters.class);
counters.addAllCounterGroups(new HashMap<String, CounterGroup>());
report.setCounters(counters);
report.addAllRunningAttempts(new ArrayList<TaskAttemptId>());
return resp;
}
@Override
public GetTaskReportsResponse getTaskReports(GetTaskReportsRequest request)
throws YarnRemoteException {
GetTaskReportsResponse resp =
recordFactory.newRecordInstance(GetTaskReportsResponse.class);
resp.addAllTaskReports(new ArrayList<TaskReport>());
return resp;
}
@Override
public KillJobResponse killJob(KillJobRequest request)
throws YarnRemoteException {
KillJobResponse resp =
recordFactory.newRecordInstance(KillJobResponse.class);
return resp;
}
@Override
public KillTaskResponse killTask(KillTaskRequest request)
throws YarnRemoteException {
KillTaskResponse resp =
recordFactory.newRecordInstance(KillTaskResponse.class);
return resp;
}
@Override
public KillTaskAttemptResponse killTaskAttempt(
KillTaskAttemptRequest request) throws YarnRemoteException {
KillTaskAttemptResponse resp =
recordFactory.newRecordInstance(KillTaskAttemptResponse.class);
return resp;
}
}
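
NotRunningJob is the placeholder MRClientProtocol that ClientServiceDelegate hands back when there is no AM to talk to (application NEW/SUBMITTED, FAILED, or KILLED): every method returns an empty but well-formed response. A small sketch, not part of the patch, of what a caller sees from the stub:

package org.apache.hadoop.mapred; // same package, since the constructor is package-private

import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersRequest;
import org.apache.hadoop.mapreduce.v2.api.records.Counters;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;

public class NotRunningJobSketch {
  public static void main(String[] args) throws Exception {
    // JobState.NEW mirrors the NOTSTARTEDJOB constant that
    // ClientServiceDelegate hands out while the app is NEW or SUBMITTED.
    MRClientProtocol stub = new NotRunningJob(JobState.NEW);
    GetCountersRequest request = RecordFactoryProvider.getRecordFactory(null)
        .newRecordInstance(GetCountersRequest.class);
    // The stub answers with an empty Counters record instead of throwing,
    // so Job.getCounters() keeps working while the AM is down or restarting.
    Counters empty = stub.getCounters(request).getCounters();
    System.out.println(empty);
  }
}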

View File

@ -61,6 +61,7 @@
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.mapreduce.v2.ClientConstants;
import org.apache.hadoop.mapreduce.v2.MRConstants;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@ -98,7 +99,7 @@ public class YARNRunner implements ClientProtocol {
private final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
private ResourceMgrDelegate resMgrDelegate;
private ClientServiceDelegate clientServiceDelegate;
private ClientCache clientCache;
private YarnConfiguration conf;
private final FileContext defaultFileContext;
@ -111,7 +112,7 @@ public YARNRunner(Configuration conf) {
this.conf = new YarnConfiguration(conf);
try {
this.resMgrDelegate = new ResourceMgrDelegate(this.conf);
this.clientServiceDelegate = new ClientServiceDelegate(this.conf,
this.clientCache = new ClientCache(this.conf,
resMgrDelegate);
this.defaultFileContext = FileContext.getFileContext(this.conf);
} catch (UnsupportedFileSystemException ufe) {
@ -248,7 +249,7 @@ public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
|| appMaster.getState() == ApplicationState.KILLED) {
throw RPCUtil.getRemoteException("failed to run job");
}
return clientServiceDelegate.getJobStatus(jobId);
return clientCache.getClient(jobId).getJobStatus(jobId);
}
private LocalResource createApplicationResource(FileContext fs, Path p)
@ -519,43 +520,43 @@ public long renewDelegationToken(Token<DelegationTokenIdentifier> arg0)
@Override
public Counters getJobCounters(JobID arg0) throws IOException,
InterruptedException {
return clientServiceDelegate.getJobCounters(arg0);
return clientCache.getClient(arg0).getJobCounters(arg0);
}
@Override
public String getJobHistoryDir() throws IOException, InterruptedException {
return clientServiceDelegate.getJobHistoryDir();
return JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
}
@Override
public JobStatus getJobStatus(JobID jobID) throws IOException,
InterruptedException {
JobStatus status = clientServiceDelegate.getJobStatus(jobID);
JobStatus status = clientCache.getClient(jobID).getJobStatus(jobID);
return status;
}
@Override
public TaskCompletionEvent[] getTaskCompletionEvents(JobID arg0, int arg1,
int arg2) throws IOException, InterruptedException {
return clientServiceDelegate.getTaskCompletionEvents(arg0, arg1, arg2);
return clientCache.getClient(arg0).getTaskCompletionEvents(arg0, arg1, arg2);
}
@Override
public String[] getTaskDiagnostics(TaskAttemptID arg0) throws IOException,
InterruptedException {
return clientServiceDelegate.getTaskDiagnostics(arg0);
return clientCache.getClient(arg0.getJobID()).getTaskDiagnostics(arg0);
}
@Override
public TaskReport[] getTaskReports(JobID jobID, TaskType taskType)
throws IOException, InterruptedException {
return clientServiceDelegate
return clientCache.getClient(jobID)
.getTaskReports(jobID, taskType);
}
@Override
public void killJob(JobID arg0) throws IOException, InterruptedException {
if (!clientServiceDelegate.killJob(arg0)) {
if (!clientCache.getClient(arg0).killJob(arg0)) {
resMgrDelegate.killApplication(TypeConverter.toYarn(arg0).getAppId());
}
}
@ -563,7 +564,7 @@ public void killJob(JobID arg0) throws IOException, InterruptedException {
@Override
public boolean killTask(TaskAttemptID arg0, boolean arg1) throws IOException,
InterruptedException {
return clientServiceDelegate.killTask(arg0, arg1);
return clientCache.getClient(arg0.getJobID()).killTask(arg0, arg1);
}
@Override

View File

@ -21,6 +21,7 @@
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import junit.framework.Assert;
@ -30,7 +31,10 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptResponse;
@ -60,6 +64,7 @@
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHConfig;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.NetUtils;
@ -86,7 +91,6 @@
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ApplicationState;
import org.apache.hadoop.yarn.api.records.ApplicationStatus;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory;
@ -111,6 +115,7 @@ public class TestClientRedirect {
private volatile boolean amContact = false;
private volatile boolean hsContact = false;
private volatile boolean amRunning = false;
private volatile boolean amRestarting = false;
@Test
public void testRedirect() throws Exception {
@ -138,17 +143,9 @@ public void testRedirect() throws Exception {
new org.apache.hadoop.mapred.JobID("201103121733", 1);
org.apache.hadoop.mapreduce.Counters counters = cluster.getJob(jobID)
.getCounters();
Iterator<org.apache.hadoop.mapreduce.CounterGroup> it = counters.iterator();
while (it.hasNext()) {
org.apache.hadoop.mapreduce.CounterGroup group = it.next();
LOG.info("Group " + group.getDisplayName());
Iterator<org.apache.hadoop.mapreduce.Counter> itc = group.iterator();
while (itc.hasNext()) {
LOG.info("Counter is " + itc.next().getDisplayName());
}
}
validateCounters(counters);
Assert.assertTrue(amContact);
LOG.info("Sleeping for 5 seconds before stop for" +
" the client socket to not get EOF immediately..");
Thread.sleep(5000);
@ -160,10 +157,51 @@ public void testRedirect() throws Exception {
LOG.info("Sleeping for 5 seconds after stop for" +
" the server to exit cleanly..");
Thread.sleep(5000);
amRestarting = true;
// Same client
//results are returned from the fake (not-started) job
counters = cluster.getJob(jobID).getCounters();
Assert.assertEquals(0, counters.countCounters());
Job job = cluster.getJob(jobID);
org.apache.hadoop.mapreduce.TaskID taskId =
new org.apache.hadoop.mapreduce.TaskID(jobID, TaskType.MAP, 0);
TaskAttemptID tId = new TaskAttemptID(taskId, 0);
//invoke all methods to check that no exception is thrown
job.killJob();
job.killTask(tId);
job.failTask(tId);
job.getTaskCompletionEvents(0, 100);
job.getStatus();
job.getTaskDiagnostics(tId);
job.getTaskReports(TaskType.MAP);
job.getTrackingURL();
amRestarting = false;
amService = new AMService();
amService.init(conf);
amService.start(conf);
amRunning = true;
amContact = false; //reset
counters = cluster.getJob(jobID).getCounters();
validateCounters(counters);
Assert.assertTrue(amContact);
amRunning = false;
// Same client
counters = cluster.getJob(jobID).getCounters();
it = counters.iterator();
validateCounters(counters);
Assert.assertTrue(hsContact);
rmService.stop();
historyService.stop();
}
private void validateCounters(org.apache.hadoop.mapreduce.Counters counters) {
Iterator<org.apache.hadoop.mapreduce.CounterGroup> it = counters.iterator();
while (it.hasNext()) {
org.apache.hadoop.mapreduce.CounterGroup group = it.next();
LOG.info("Group " + group.getDisplayName());
@ -172,11 +210,7 @@ public void testRedirect() throws Exception {
LOG.info("Counter is " + itc.next().getDisplayName());
}
}
Assert.assertTrue(hsContact);
rmService.stop();
historyService.stop();
Assert.assertEquals(1, counters.countCounters());
}
class RMService extends AbstractService implements ClientRMProtocol {
@ -226,6 +260,8 @@ public GetApplicationReportResponse getApplicationReport(
application.setApplicationId(applicationId);
if (amRunning) {
application.setState(ApplicationState.RUNNING);
} else if (amRestarting) {
application.setState(ApplicationState.SUBMITTED);
} else {
application.setState(ApplicationState.SUCCEEDED);
}

View File

@ -19,5 +19,5 @@
package org.apache.hadoop.yarn.api.records;
public enum ApplicationState {
NEW, SUBMITTED, RUNNING, RESTARTING, SUCCEEDED, FAILED, KILLED
NEW, SUBMITTED, RUNNING, SUCCEEDED, FAILED, KILLED
}

View File

@ -11,6 +11,7 @@
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationIdProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ContainerIdProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ContainerIdProtoOrBuilder;
import org.mortbay.log.Log;
@ -33,7 +34,40 @@ public class ContainerIdPBImpl extends ProtoBase<ContainerIdProto> implements Co
counterFormat.setMinimumIntegerDigits(6);
}
// TODO: Why thread local?
// ^ NumberFormat instances are not threadsafe
private static final ThreadLocal<NumberFormat> appIdFormat = new ThreadLocal<NumberFormat>() {
@Override
public NumberFormat initialValue() {
NumberFormat fmt = NumberFormat.getInstance();
fmt.setGroupingUsed(false);
fmt.setMinimumIntegerDigits(4);
return fmt;
}
};
// TODO: fail the app submission if attempts are more than 10 or something
private static final ThreadLocal<NumberFormat> appAttemptIdFormat = new ThreadLocal<NumberFormat>() {
@Override
public NumberFormat initialValue() {
NumberFormat fmt = NumberFormat.getInstance();
fmt.setGroupingUsed(false);
fmt.setMinimumIntegerDigits(2);
return fmt;
}
};
// TODO: Why thread local?
// ^ NumberFormat instances are not threadsafe
private static final ThreadLocal<NumberFormat> containerIdFormat = new ThreadLocal<NumberFormat>() {
@Override
public NumberFormat initialValue() {
NumberFormat fmt = NumberFormat.getInstance();
fmt.setGroupingUsed(false);
fmt.setMinimumIntegerDigits(6);
return fmt;
}
};
public ContainerIdPBImpl() {
builder = ContainerIdProto.newBuilder();
}
@ -162,18 +196,23 @@ public boolean equals(Object other) {
@Override
public int compareTo(ContainerId other) {
if (this.getAppId().compareTo(other.getAppId()) == 0) {
if (this.getAppAttemptId().compareTo(other.getAppAttemptId()) == 0) {
return this.getId() - other.getId();
} else {
return this.getAppId().compareTo(other.getAppId());
return this.getAppAttemptId().compareTo(other.getAppAttemptId());
}
}
@Override
public String toString() {
String id = (this.getAppId() != null) ? this.getAppId().getClusterTimestamp() + "_" +
idFormat.format(this.getAppId().getId()): "none";
return "containerid_" + id + "_" + counterFormat.format(getId());
StringBuilder sb = new StringBuilder();
ApplicationId appId = getAppId();
sb.append("container_").append(appId.getClusterTimestamp()).append("_");
sb.append(appIdFormat.get().format(appId.getId())).append("_");
sb.append(appAttemptIdFormat.get().format(getAppAttemptId().
getAttemptId())).append("_");
sb.append(containerIdFormat.get().format(getId()));
return sb.toString();
}
}
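
With the attempt id added, a container id string now carries four fields: cluster timestamp, zero-padded application id, attempt id, and container id. A sketch, with all numbers invented for illustration, of building one through the Records factory used elsewhere in this patch and printing the resulting format:

import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.util.Records;

public class ContainerIdFormatSketch {
  public static void main(String[] args) {
    ApplicationId appId = Records.newRecord(ApplicationId.class);
    appId.setClusterTimestamp(1314250000000L); // invented timestamp
    appId.setId(4);
    ApplicationAttemptId attemptId = Records.newRecord(ApplicationAttemptId.class);
    attemptId.setApplicationId(appId);
    attemptId.setAttemptId(1);
    ContainerId containerId = Records.newRecord(ContainerId.class);
    containerId.setAppId(appId);
    containerId.setAppAttemptId(attemptId);
    containerId.setId(1);
    // timestamp, 4-digit app id, 2-digit attempt, 6-digit container:
    // container_1314250000000_0004_01_000001
    System.out.println(containerId);
  }
}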

View File

@ -27,6 +27,7 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
@ -68,6 +69,7 @@ public Resource getResource() {
public void write(DataOutput out) throws IOException {
LOG.debug("Writing ContainerTokenIdentifier to RPC layer");
out.writeInt(this.containerId.getAppId().getId());
out.writeInt(this.containerId.getAppAttemptId().getAttemptId());
out.writeInt(this.containerId.getId());
// TODO: Cluster time-stamp?
out.writeUTF(this.nmHostName);
@ -78,7 +80,10 @@ public void write(DataOutput out) throws IOException {
public void readFields(DataInput in) throws IOException {
this.containerId = RecordFactoryProvider.getRecordFactory(null).newRecordInstance(ContainerId.class);
this.containerId.setAppId(RecordFactoryProvider.getRecordFactory(null).newRecordInstance(ApplicationId.class));
this.containerId.setAppAttemptId(RecordFactoryProvider.getRecordFactory(null).newRecordInstance(ApplicationAttemptId.class));
this.containerId.getAppId().setId(in.readInt());
this.containerId.getAppAttemptId().setApplicationId(this.containerId.getAppId());
this.containerId.getAppAttemptId().setAttemptId(in.readInt());
this.containerId.setId(in.readInt());
this.nmHostName = in.readUTF();
this.resource = RecordFactoryProvider.getRecordFactory(null).newRecordInstance(Resource.class);

View File

@ -134,11 +134,12 @@ public static ApplicationId convert(long clustertimestamp, CharSequence id) {
}
public static ContainerId newContainerId(RecordFactory recordFactory,
ApplicationId appId,
ApplicationId appId, ApplicationAttemptId appAttemptId,
int containerId) {
ContainerId id = recordFactory.newRecordInstance(ContainerId.class);
id.setAppId(appId);
id.setId(containerId);
id.setAppAttemptId(appAttemptId);
return id;
}

View File

@ -104,18 +104,7 @@ public NumberFormat initialValue() {
}
};
// TODO: Why thread local?
// ^ NumberFormat instances are not threadsafe
private static final ThreadLocal<NumberFormat> containerIdFormat =
new ThreadLocal<NumberFormat>() {
@Override
public NumberFormat initialValue() {
NumberFormat fmt = NumberFormat.getInstance();
fmt.setGroupingUsed(false);
fmt.setMinimumIntegerDigits(6);
return fmt;
}
};
public static String toString(ApplicationId appId) {
StringBuilder sb = new StringBuilder();
@ -142,12 +131,7 @@ private static ApplicationId toApplicationId(RecordFactory recordFactory,
}
public static String toString(ContainerId cId) {
StringBuilder sb = new StringBuilder();
ApplicationId appId = cId.getAppId();
sb.append("container_").append(appId.getClusterTimestamp()).append("_");
sb.append(appIdFormat.get().format(appId.getId())).append("_");
sb.append(containerIdFormat.get().format(cId.getId()));
return sb.toString();
return cId.toString();
}
public static ContainerId toContainerId(RecordFactory recordFactory,

View File

@ -28,6 +28,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainerRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -104,6 +105,9 @@ protected void startStatusUpdater() {
ContainerLaunchContext launchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
ContainerId cID = recordFactory.newRecordInstance(ContainerId.class);
cID.setAppId(recordFactory.newRecordInstance(ApplicationId.class));
ApplicationAttemptId atId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
atId.setApplicationId(cID.getAppId());
cID.setAppAttemptId(atId);
launchContext.setContainerId(cID);
launchContext.setUser("testing");
launchContext.setResource(recordFactory.newRecordInstance(Resource.class));

View File

@ -28,6 +28,7 @@
import org.apache.hadoop.NodeHealthCheckerService;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -93,6 +94,7 @@ public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerReques
}
ApplicationId applicationID = recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptID = recordFactory.newRecordInstance(ApplicationAttemptId.class);
ContainerId firstContainerID = recordFactory.newRecordInstance(ContainerId.class);
ContainerId secondContainerID = recordFactory.newRecordInstance(ContainerId.class);
@ -106,7 +108,9 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws
// Give a container to the NM.
applicationID.setId(heartBeatID);
appAttemptID.setApplicationId(applicationID);
firstContainerID.setAppId(applicationID);
firstContainerID.setAppAttemptId(appAttemptID);
firstContainerID.setId(heartBeatID);
ContainerLaunchContext launchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
launchContext.setContainerId(firstContainerID);
@ -130,7 +134,9 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws
// Give another container to the NM.
applicationID.setId(heartBeatID);
appAttemptID.setApplicationId(applicationID);
secondContainerID.setAppId(applicationID);
secondContainerID.setAppAttemptId(appAttemptID);
secondContainerID.setId(heartBeatID);
ContainerLaunchContext launchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
launchContext.setContainerId(secondContainerID);

View File

@ -34,6 +34,7 @@
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainerRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -46,9 +47,9 @@
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
@ -74,7 +75,14 @@ public void testContainerManagerInitialization() throws IOException {
boolean throwsException = false;
try {
GetContainerStatusRequest request = recordFactory.newRecordInstance(GetContainerStatusRequest.class);
request.setContainerId(recordFactory.newRecordInstance(ContainerId.class));
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = recordFactory.newRecordInstance(ContainerId.class);
cId.setAppId(appId);
cId.setAppAttemptId(appAttemptId);
request.setContainerId(cId);
containerManager.getContainerStatus(request);
} catch (YarnRemoteException e) {
throwsException = true;
@ -99,8 +107,12 @@ public void testContainerSetup() throws IOException, InterruptedException {
// ////// Construct the Container-id
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = recordFactory.newRecordInstance(ContainerId.class);
cId.setAppId(appId);
cId.setAppAttemptId(appAttemptId);
container.setContainerId(cId);
container.setUser(user);
@ -184,8 +196,12 @@ public void testContainerLaunchAndStop() throws IOException,
// ////// Construct the Container-id
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = recordFactory.newRecordInstance(ContainerId.class);
cId.setAppId(appId);
cId.setAppAttemptId(appAttemptId);
containerLaunchContext.setContainerId(cId);
containerLaunchContext.setUser(user);
@ -282,8 +298,12 @@ public void testLocalFilesCleanup() throws InterruptedException,
// ////// Construct the Container-id
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = recordFactory.newRecordInstance(ContainerId.class);
cId.setAppId(appId);
cId.setAppAttemptId(appAttemptId);
container.setContainerId(cId);
container.setUser(user);

View File

@ -21,6 +21,7 @@
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.LocalResource;
@ -47,6 +48,7 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;
import org.junit.Test;
import static org.junit.Assert.*;
@ -195,7 +197,7 @@ public boolean matches(Object o) {
Thread.sleep(500);
dispatcher.await();
String appStr = ConverterUtils.toString(appId);
String ctnrStr = ConverterUtils.toString(c.getContainerID());
String ctnrStr = c.getContainerID().toString();
verify(exec).startLocalizer(isA(Path.class), isA(InetSocketAddress.class),
eq("user0"), eq(appStr), eq(ctnrStr), isA(List.class));
@ -270,9 +272,13 @@ static LocalResource getMockResource(Random r) {
static Container getMockContainer(ApplicationId appId, int id) {
Container c = mock(Container.class);
ContainerId cId = mock(ContainerId.class);
when(cId.getAppId()).thenReturn(appId);
when(cId.getId()).thenReturn(id);
ApplicationAttemptId appAttemptId = Records.newRecord(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = Records.newRecord(ContainerId.class);
cId.setAppAttemptId(appAttemptId);
cId.setAppId(appId);
cId.setId(id);
when(c.getUser()).thenReturn("user0");
when(c.getContainerID()).thenReturn(cId);
Credentials creds = new Credentials();

View File

@ -37,6 +37,7 @@
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -110,8 +111,11 @@ public void testLocalFileDeletionAfterUpload() throws IOException {
application1, this.user, null,
ContainerLogsRetentionPolicy.ALL_CONTAINERS));
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(application1);
appAttemptId.setAttemptId(1);
ContainerId container11 =
BuilderUtils.newContainerId(recordFactory, application1, 1);
BuilderUtils.newContainerId(recordFactory, application1, appAttemptId, 1);
// Simulate log-file creation
writeContainerLogs(app1LogDir, container11);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
@ -188,14 +192,18 @@ public void testMultipleAppsLogAggregation() throws IOException {
application1, this.user, null,
ContainerLogsRetentionPolicy.ALL_CONTAINERS));
ApplicationAttemptId appAttemptId1 = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId1.setApplicationId(application1);
ContainerId container11 =
BuilderUtils.newContainerId(recordFactory, application1, 1);
BuilderUtils.newContainerId(recordFactory, application1, appAttemptId1, 1);
// Simulate log-file creation
writeContainerLogs(app1LogDir, container11);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container11, "0"));
ApplicationId application2 = BuilderUtils.newApplicationId(1234, 2);
ApplicationAttemptId appAttemptId2 = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId2.setApplicationId(application2);
File app2LogDir =
new File(localLogDir, ConverterUtils.toString(application2));
@@ -204,19 +212,22 @@ public void testMultipleAppsLogAggregation() throws IOException {
application2, this.user, null,
ContainerLogsRetentionPolicy.APPLICATION_MASTER_ONLY));
ContainerId container21 =
BuilderUtils.newContainerId(recordFactory, application2, 1);
BuilderUtils.newContainerId(recordFactory, application2, appAttemptId2, 1);
writeContainerLogs(app2LogDir, container21);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container21, "0"));
ContainerId container12 =
BuilderUtils.newContainerId(recordFactory, application1, 2);
BuilderUtils.newContainerId(recordFactory, application1, appAttemptId1, 2);
writeContainerLogs(app1LogDir, container12);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container12, "0"));
ApplicationId application3 = BuilderUtils.newApplicationId(1234, 3);
ApplicationAttemptId appAttemptId3 = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId3.setApplicationId(application3);
File app3LogDir =
new File(localLogDir, ConverterUtils.toString(application3));
@@ -226,25 +237,25 @@ public void testMultipleAppsLogAggregation() throws IOException {
ContainerLogsRetentionPolicy.AM_AND_FAILED_CONTAINERS_ONLY));
ContainerId container31 =
BuilderUtils.newContainerId(recordFactory, application3, 1);
BuilderUtils.newContainerId(recordFactory, application3, appAttemptId3, 1);
writeContainerLogs(app3LogDir, container31);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container31, "0"));
ContainerId container32 =
BuilderUtils.newContainerId(recordFactory, application3, 2);
BuilderUtils.newContainerId(recordFactory, application3, appAttemptId3, 2);
writeContainerLogs(app3LogDir, container32);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container32, "1")); // Failed container
ContainerId container22 =
BuilderUtils.newContainerId(recordFactory, application2, 2);
BuilderUtils.newContainerId(recordFactory, application2, appAttemptId2, 2);
writeContainerLogs(app2LogDir, container22);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container22, "0"));
ContainerId container33 =
BuilderUtils.newContainerId(recordFactory, application3, 3);
BuilderUtils.newContainerId(recordFactory, application3, appAttemptId3, 3);
writeContainerLogs(app3LogDir, container33);
logAggregationService.handle(new LogAggregatorContainerFinishedEvent(
container33, "0"));
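All of the BuilderUtils.newContainerId calls above gain an ApplicationAttemptId argument. BuilderUtils itself is not part of this diff; the following is a hedged sketch of what the four-argument helper presumably reduces to, inferred only from the setters used elsewhere in these hunks:

    public static ContainerId newContainerId(RecordFactory recordFactory,
        ApplicationId appId, ApplicationAttemptId appAttemptId, int containerId) {
      ContainerId id = recordFactory.newRecordInstance(ContainerId.class);
      id.setAppId(appId);                 // application the container belongs to
      id.setAppAttemptId(appAttemptId);   // attempt the container belongs to
      id.setId(containerId);              // container number within the attempt
      return id;
    }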

View File

@@ -36,6 +36,7 @@
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@@ -190,9 +191,13 @@ public void testContainerKillOnMemoryOverflow() throws IOException,
// ////// Construct the Container-id
ApplicationId appId =
recordFactory.newRecordInstance(ApplicationId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId = recordFactory.newRecordInstance(ContainerId.class);
cId.setAppId(appId);
cId.setId(0);
cId.setAppAttemptId(appAttemptId);
containerLaunchContext.setContainerId(cId);
containerLaunchContext.setUser(user);

View File

@@ -24,6 +24,7 @@
import java.io.Writer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@@ -87,10 +88,13 @@ public long getPmemAllocatedForContainers() {
when(app.getUser()).thenReturn(user);
when(app.getAppId()).thenReturn(appId);
nmContext.getApplications().put(appId, app);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId container1 =
BuilderUtils.newContainerId(recordFactory, appId, 0);
BuilderUtils.newContainerId(recordFactory, appId, appAttemptId, 0);
ContainerId container2 =
BuilderUtils.newContainerId(recordFactory, appId, 1);
BuilderUtils.newContainerId(recordFactory, appId, appAttemptId, 1);
NodeManagerMetrics metrics = mock(NodeManagerMetrics.class);
for (ContainerId containerId : new ContainerId[] { container1,
container2}) {

View File

@@ -106,9 +106,9 @@ RMAppEventType.KILL, new AppKilledTransition())
.addTransition(RMAppState.ACCEPTED, RMAppState.RUNNING,
RMAppEventType.ATTEMPT_REGISTERED)
.addTransition(RMAppState.ACCEPTED,
EnumSet.of(RMAppState.ACCEPTED, RMAppState.FAILED),
EnumSet.of(RMAppState.SUBMITTED, RMAppState.FAILED),
RMAppEventType.ATTEMPT_FAILED,
new AttemptFailedTransition(RMAppState.ACCEPTED))
new AttemptFailedTransition(RMAppState.SUBMITTED))
.addTransition(RMAppState.ACCEPTED, RMAppState.KILLED,
RMAppEventType.KILL, new AppKilledTransition())
@@ -116,23 +116,12 @@ RMAppEventType.KILL, new AppKilledTransition())
.addTransition(RMAppState.RUNNING, RMAppState.FINISHED,
RMAppEventType.ATTEMPT_FINISHED, FINAL_TRANSITION)
.addTransition(RMAppState.RUNNING,
EnumSet.of(RMAppState.RUNNING, RMAppState.FAILED),
EnumSet.of(RMAppState.SUBMITTED, RMAppState.FAILED),
RMAppEventType.ATTEMPT_FAILED,
new AttemptFailedTransition(RMAppState.RUNNING))
new AttemptFailedTransition(RMAppState.SUBMITTED))
.addTransition(RMAppState.RUNNING, RMAppState.KILLED,
RMAppEventType.KILL, new AppKilledTransition())
// Transitions from RESTARTING state
// TODO - no way to get to RESTARTING state right now
.addTransition(RMAppState.RESTARTING, RMAppState.RUNNING,
RMAppEventType.ATTEMPT_REGISTERED)
.addTransition(RMAppState.RESTARTING,
EnumSet.of(RMAppState.RESTARTING, RMAppState.FAILED),
RMAppEventType.ATTEMPT_FAILED,
new AttemptFailedTransition(RMAppState.RESTARTING))
.addTransition(RMAppState.RESTARTING, RMAppState.KILLED,
RMAppEventType.KILL, new AppKilledTransition())
// Transitions from FINISHED state
.addTransition(RMAppState.FINISHED, RMAppState.FINISHED,
RMAppEventType.KILL)
@@ -263,8 +252,6 @@ private ApplicationState createApplicationState(RMAppState rmAppState) {
case SUBMITTED:
case ACCEPTED:
return ApplicationState.SUBMITTED;
case RESTARTING:
return ApplicationState.RESTARTING;
case RUNNING:
return ApplicationState.RUNNING;
case FINISHED:
@@ -375,6 +362,21 @@ public void handle(RMAppEvent event) {
}
}
private void createNewAttempt() {
ApplicationAttemptId appAttemptId = Records
.newRecord(ApplicationAttemptId.class);
appAttemptId.setApplicationId(applicationId);
appAttemptId.setAttemptId(attempts.size() + 1);
RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId,
clientTokenStr, rmContext, scheduler, masterService,
submissionContext);
attempts.put(appAttemptId, attempt);
currentAttempt = attempt;
dispatcher.getEventHandler().handle(
new RMAppAttemptEvent(appAttemptId, RMAppAttemptEventType.START));
}
private static class RMAppTransition implements
SingleArcTransition<RMAppImpl, RMAppEvent> {
public void transition(RMAppImpl app, RMAppEvent event) {
@@ -384,19 +386,7 @@ public void transition(RMAppImpl app, RMAppEvent event) {
private static final class StartAppAttemptTransition extends RMAppTransition {
public void transition(RMAppImpl app, RMAppEvent event) {
ApplicationAttemptId appAttemptId = Records
.newRecord(ApplicationAttemptId.class);
appAttemptId.setApplicationId(app.applicationId);
appAttemptId.setAttemptId(app.attempts.size() + 1);
RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId,
app.clientTokenStr, app.rmContext, app.scheduler,
app.masterService, app.submissionContext);
app.attempts.put(appAttemptId, attempt);
app.currentAttempt = attempt;
app.dispatcher.getEventHandler().handle(
new RMAppAttemptEvent(appAttemptId, RMAppAttemptEventType.START));
app.createNewAttempt();
};
}
@@ -452,27 +442,17 @@ public AttemptFailedTransition(RMAppState initialState) {
public RMAppState transition(RMAppImpl app, RMAppEvent event) {
if (app.attempts.size() == app.maxRetries) {
app.diagnostics.append("Application " + app.getApplicationId()
+ " failed " + app.maxRetries
+ " times. Failing the application.");
String msg = "Application " + app.getApplicationId()
+ " failed " + app.maxRetries
+ " times. Failing the application.";
LOG.info(msg);
app.diagnostics.append(msg);
// Inform the node for app-finish
FINAL_TRANSITION.transition(app, event);
return RMAppState.FAILED;
}
ApplicationAttemptId appAttemptId = Records
.newRecord(ApplicationAttemptId.class);
appAttemptId.setApplicationId(app.applicationId);
appAttemptId.setAttemptId(app.attempts.size() + 1);
// Create a new attempt.
RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId,
app.clientTokenStr, app.rmContext, app.scheduler,
app.masterService, app.submissionContext);
app.attempts.put(appAttemptId, attempt);
app.currentAttempt = attempt;
app.dispatcher.getEventHandler().handle(
new RMAppAttemptEvent(appAttemptId, RMAppAttemptEventType.START));
app.createNewAttempt();
return initialState;
}
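Taken together, the RMAppImpl hunks drop the RESTARTING path and funnel both StartAppAttemptTransition and AttemptFailedTransition through a single createNewAttempt() helper: a failed attempt now returns the app to SUBMITTED, and a fresh attempt (attempt id attempts.size() + 1) is started unless maxRetries has been exhausted. A condensed sketch of the resulting failure handling, assembled from the hunks above rather than copied from the final source file:

    // inside AttemptFailedTransition.transition(app, event)
    if (app.attempts.size() == app.maxRetries) {
      String msg = "Application " + app.getApplicationId() + " failed "
          + app.maxRetries + " times. Failing the application.";
      LOG.info(msg);
      app.diagnostics.append(msg);
      FINAL_TRANSITION.transition(app, event);  // inform the nodes of app-finish
      return RMAppState.FAILED;
    }
    app.createNewAttempt();   // builds the next ApplicationAttemptId and fires START
    return initialState;      // SUBMITTED for both the ACCEPTED and RUNNING sources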

View File

@@ -1,5 +1,5 @@
package org.apache.hadoop.yarn.server.resourcemanager.rmapp;
public enum RMAppState {
NEW, SUBMITTED, ACCEPTED, RUNNING, RESTARTING, FINISHED, FAILED, KILLED
NEW, SUBMITTED, ACCEPTED, RUNNING, FINISHED, FAILED, KILLED
}

View File

@@ -18,40 +18,34 @@
package org.apache.hadoop.yarn.server.resourcemanager.rmapp;
import static org.mockito.Mockito.*;
import static org.mockito.Mockito.mock;
import java.io.IOException;
import java.lang.reflect.Method;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.MockApps;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ApplicationTokenSecretManager;
import org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMContextImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemStore;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AMLivelinessMonitor;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemStore;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -306,6 +300,9 @@ public void testAppAcceptedFailed() throws IOException {
for (int i=1; i<maxRetries; i++) {
RMAppEvent event = new RMAppEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_FAILED);
application.handle(event);
assertAppState(RMAppState.SUBMITTED, application);
event = new RMAppEvent(application.getApplicationId(), RMAppEventType.APP_ACCEPTED);
application.handle(event);
assertAppState(RMAppState.ACCEPTED, application);
}
@@ -342,10 +339,22 @@ public void testAppRunningFailed() throws IOException {
LOG.info("--- START: testAppRunningFailed ---");
RMApp application = testCreateAppRunning();
RMAppAttempt appAttempt = application.getCurrentAppAttempt();
int expectedAttemptId = 1;
Assert.assertEquals(expectedAttemptId, appAttempt.getAppAttemptId().getAttemptId());
// RUNNING => SUBMITTED/FAILED on RMAppEventType.ATTEMPT_FAILED
for (int i=1; i<maxRetries; i++) {
RMAppEvent event = new RMAppEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_FAILED);
application.handle(event);
assertAppState(RMAppState.SUBMITTED, application);
appAttempt = application.getCurrentAppAttempt();
Assert.assertEquals(++expectedAttemptId,
appAttempt.getAppAttemptId().getAttemptId());
event = new RMAppEvent(application.getApplicationId(), RMAppEventType.APP_ACCEPTED);
application.handle(event);
assertAppState(RMAppState.ACCEPTED, application);
event = new RMAppEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_REGISTERED);
application.handle(event);
assertAppState(RMAppState.RUNNING, application);
}
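For each retry, the updated test now walks the application through the full re-submission cycle and checks that the current attempt's id has advanced. In outline, per iteration (using the same names as the test above):

    application.handle(new RMAppEvent(application.getApplicationId(),
        RMAppEventType.ATTEMPT_FAILED));        // RUNNING -> SUBMITTED, new attempt created
    assertAppState(RMAppState.SUBMITTED, application);
    application.handle(new RMAppEvent(application.getApplicationId(),
        RMAppEventType.APP_ACCEPTED));          // SUBMITTED -> ACCEPTED
    assertAppState(RMAppState.ACCEPTED, application);
    application.handle(new RMAppEvent(application.getApplicationId(),
        RMAppEventType.ATTEMPT_REGISTERED));    // ACCEPTED -> RUNNING under the new attempt
    assertAppState(RMAppState.RUNNING, application);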

View File

@@ -52,6 +52,7 @@
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationMaster;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
@@ -284,8 +285,12 @@ public Void run() {
.newRecordInstance(GetContainerStatusRequest.class);
ContainerId containerID =
recordFactory.newRecordInstance(ContainerId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appID);
appAttemptId.setAttemptId(1);
containerID.setAppId(appID);
containerID.setId(1);
containerID.setAppAttemptId(appAttemptId);
request.setContainerId(containerID);
client.getContainerStatus(request);
} catch (YarnRemoteException e) {
@@ -331,8 +336,12 @@ public Void run() {
.newRecordInstance(GetContainerStatusRequest.class);
containerID =
recordFactory.newRecordInstance(ContainerId.class);
ApplicationAttemptId appAttemptId = recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appID);
appAttemptId.setAttemptId(1);
containerID.setAppId(appID);
containerID.setId(1);
containerID.setAppAttemptId(appAttemptId);
request.setContainerId(containerID);
try {
client.getContainerStatus(request);