MAPREDUCE-2655. Add audit logs to ResourceManager and NodeManager. Contributed by Thomas Graves.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1165949 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Arun Murthy 2011-09-07 01:24:21 +00:00
parent d51078eb7e
commit cdfabf5ae2
11 changed files with 191 additions and 30 deletions

View File

@ -245,6 +245,9 @@ Release 0.23.0 - Unreleased
MAPREDUCE-2774. Add startup message to ResourceManager & NodeManager on
startup. (Venu Gopala Rao via acmurthy)
MAPREDUCE-2655. Add audit logs to ResourceManager and NodeManager. (Thomas
Graves via acmurthy)
OPTIMIZATIONS
MAPREDUCE-2026. Make JobTracker.getJobCounters() and

View File

@ -65,6 +65,8 @@
import org.apache.hadoop.yarn.server.nodemanager.ContainerManagerEvent;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.nodemanager.NMConfig;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
@ -266,6 +268,10 @@ public StartContainerResponse startContainer(StartContainerRequest request)
ContainerId containerID = launchContext.getContainerId();
ApplicationId applicationID = containerID.getAppId();
if (context.getContainers().putIfAbsent(containerID, container) != null) {
NMAuditLogger.logFailure(launchContext.getUser(),
AuditConstants.START_CONTAINER, "ContainerManagerImpl",
"Container already running on this node!",
applicationID, containerID);
throw RPCUtil.getRemoteException("Container " + containerID
+ " already is running on this node!!");
}
@ -281,6 +287,11 @@ public StartContainerResponse startContainer(StartContainerRequest request)
// TODO: Validate the request
dispatcher.getEventHandler().handle(new ApplicationInitEvent(container));
NMAuditLogger.logSuccess(launchContext.getUser(),
AuditConstants.START_CONTAINER, "ContainerManageImpl",
applicationID, containerID);
StartContainerResponse response =
recordFactory.newRecordInstance(StartContainerResponse.class);
response.addAllServiceResponse(auxiluaryServices.getMeta());
@ -300,12 +311,23 @@ public StopContainerResponse stopContainer(StopContainerRequest request)
Container container = this.context.getContainers().get(containerID);
if (container == null) {
LOG.warn("Trying to stop unknown container " + containerID);
NMAuditLogger.logFailure(container.getUser(),
AuditConstants.STOP_CONTAINER, "ContainerManagerImpl",
"Trying to stop unknown container!",
containerID.getAppId(), containerID);
return response; // Return immediately.
}
dispatcher.getEventHandler().handle(
new ContainerKillEvent(containerID,
"Container killed by the ApplicationMaster."));
// user logged here not ideal since just getting user from container but
// request doesn't have anything and should be coming from user of AM so
// should be the same or should be rejected by auth before here.
NMAuditLogger.logSuccess(container.getUser(),
AuditConstants.STOP_CONTAINER, "ContainerManageImpl",
containerID.getAppId(), containerID);
// TODO: Move this code to appropriate place once kill_container is
// implemented.
nodeStatusUpdater.sendOutofBandHeartBeat();

View File

@ -42,6 +42,8 @@
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationContainerFinishedEvent;
@ -365,18 +367,28 @@ private void finished() {
case EXITED_WITH_SUCCESS:
metrics.endRunningContainer();
metrics.completedContainer();
NMAuditLogger.logSuccess(getUser(),
AuditConstants.FINISH_SUCCESS_CONTAINER, "ContainerImpl",
getContainerID().getAppId(), getContainerID());
break;
case EXITED_WITH_FAILURE:
metrics.endRunningContainer();
// fall through
case LOCALIZATION_FAILED:
metrics.failedContainer();
NMAuditLogger.logFailure(getUser(),
AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl",
"Container failed with state: " + getContainerState(),
getContainerID().getAppId(), getContainerID());
break;
case CONTAINER_CLEANEDUP_AFTER_KILL:
metrics.endRunningContainer();
// fall through
case NEW:
metrics.killedContainer();
NMAuditLogger.logSuccess(getUser(),
AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
getContainerID().getAppId(), getContainerID());
}
metrics.releaseContainer(getLaunchContext().getResource());

View File

@ -51,6 +51,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.api.protocolrecords.RefreshUserToGroupsMappingsRequest;
import org.apache.hadoop.yarn.server.resourcemanager.api.protocolrecords.RefreshUserToGroupsMappingsResponse;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.service.AbstractService;
public class AdminService extends AbstractService implements RMAdminProtocol {
@ -113,40 +114,54 @@ public void stop() {
super.stop();
}
private void checkAcls(String method) throws YarnRemoteException {
private UserGroupInformation checkAcls(String method) throws YarnRemoteException {
UserGroupInformation user;
try {
UserGroupInformation user = UserGroupInformation.getCurrentUser();
user = UserGroupInformation.getCurrentUser();
} catch (IOException ioe) {
LOG.warn("Couldn't get current user", ioe);
RMAuditLogger.logFailure("UNKNOWN", method,
adminAcl.toString(), "AdminService",
"Couldn't get current user");
throw RPCUtil.getRemoteException(ioe);
}
if (!adminAcl.isUserAllowed(user)) {
LOG.warn("User " + user.getShortUserName() + " doesn't have permission" +
" to call '" + method + "'");
RMAuditLogger.logFailure(user.getShortUserName(), method,
adminAcl.toString(), "AdminService",
AuditConstants.UNAUTHORIZED_USER);
throw RPCUtil.getRemoteException(
new AccessControlException("User " + user.getShortUserName() +
" doesn't have permission" +
" to call '" + method + "'")
);
}
LOG.info("RM Admin: " + method + " invoked by user " +
user.getShortUserName());
} catch (IOException ioe) {
LOG.warn("Couldn't get current user", ioe);
throw RPCUtil.getRemoteException(ioe);
}
return user;
}
@Override
public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request)
throws YarnRemoteException {
checkAcls("refreshQueues");
UserGroupInformation user = checkAcls("refreshQueues");
try {
scheduler.reinitialize(conf, null, null); // ContainerTokenSecretManager can't
// be 'refreshed'
RMAuditLogger.logSuccess(user.getShortUserName(), "refreshQueues",
"AdminService");
return recordFactory.newRecordInstance(RefreshQueuesResponse.class);
} catch (IOException ioe) {
LOG.info("Exception refreshing queues ", ioe);
RMAuditLogger.logFailure(user.getShortUserName(), "refreshQueues",
adminAcl.toString(), "AdminService",
"Exception refreshing queues");
throw RPCUtil.getRemoteException(ioe);
}
}
@ -154,12 +169,17 @@ public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request)
@Override
public RefreshNodesResponse refreshNodes(RefreshNodesRequest request)
throws YarnRemoteException {
checkAcls("refreshNodes");
UserGroupInformation user = checkAcls("refreshNodes");
try {
this.nodesListManager.refreshNodes();
RMAuditLogger.logSuccess(user.getShortUserName(), "refreshNodes",
"AdminService");
return recordFactory.newRecordInstance(RefreshNodesResponse.class);
} catch (IOException ioe) {
LOG.info("Exception refreshing nodes ", ioe);
RMAuditLogger.logFailure(user.getShortUserName(), "refreshNodes",
adminAcl.toString(), "AdminService",
"Exception refreshing nodes");
throw RPCUtil.getRemoteException(ioe);
}
}
@ -168,9 +188,11 @@ public RefreshNodesResponse refreshNodes(RefreshNodesRequest request)
public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfiguration(
RefreshSuperUserGroupsConfigurationRequest request)
throws YarnRemoteException {
checkAcls("refreshSuperUserGroupsConfiguration");
UserGroupInformation user = checkAcls("refreshSuperUserGroupsConfiguration");
ProxyUsers.refreshSuperUserGroupsConfiguration(new Configuration());
RMAuditLogger.logSuccess(user.getShortUserName(),
"refreshSuperUserGroupsConfiguration", "AdminService");
return recordFactory.newRecordInstance(
RefreshSuperUserGroupsConfigurationResponse.class);
@ -179,9 +201,11 @@ public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfigu
@Override
public RefreshUserToGroupsMappingsResponse refreshUserToGroupsMappings(
RefreshUserToGroupsMappingsRequest request) throws YarnRemoteException {
checkAcls("refreshUserToGroupsMappings");
UserGroupInformation user = checkAcls("refreshUserToGroupsMappings");
Groups.getUserToGroupsMappingService().refresh();
RMAuditLogger.logSuccess(user.getShortUserName(),
"refreshUserToGroupsMappings", "AdminService");
return recordFactory.newRecordInstance(
RefreshUserToGroupsMappingsResponse.class);
@ -190,12 +214,14 @@ public RefreshUserToGroupsMappingsResponse refreshUserToGroupsMappings(
@Override
public RefreshAdminAclsResponse refreshAdminAcls(
RefreshAdminAclsRequest request) throws YarnRemoteException {
checkAcls("refreshAdminAcls");
UserGroupInformation user = checkAcls("refreshAdminAcls");
Configuration conf = new Configuration();
adminAcl =
new AccessControlList(
conf.get(RMConfig.RM_ADMIN_ACL, RMConfig.DEFAULT_RM_ADMIN_ACL));
RMAuditLogger.logSuccess(user.getShortUserName(), "refreshAdminAcls",
"AdminService");
return recordFactory.newRecordInstance(RefreshAdminAclsResponse.class);
}

View File

@ -38,6 +38,7 @@
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.AMResponse;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
@ -49,6 +50,7 @@
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.ApplicationTokenSecretManager;
import org.apache.hadoop.yarn.security.SchedulerSecurityInfo;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AMLivelinessMonitor;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
@ -115,11 +117,16 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
ApplicationAttemptId applicationAttemptId = request
.getApplicationAttemptId();
ApplicationId appID = applicationAttemptId.getApplicationId();
AMResponse lastResponse = responseMap.get(applicationAttemptId);
if (lastResponse == null) {
String message = "Application doesn't exist in cache "
+ applicationAttemptId;
LOG.error(message);
RMAuditLogger.logFailure(this.rmContext.getRMApps().get(appID).getUser(),
AuditConstants.REGISTER_AM, message, "ApplicationMasterService",
"Error in registering application master", appID,
applicationAttemptId);
throw RPCUtil.getRemoteException(message);
}
@ -133,6 +140,10 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
new RMAppAttemptRegistrationEvent(applicationAttemptId, request
.getHost(), request.getRpcPort(), request.getTrackingUrl()));
RMAuditLogger.logSuccess(this.rmContext.getRMApps().get(appID).getUser(),
AuditConstants.REGISTER_AM, "ApplicationMasterService", appID,
applicationAttemptId);
// Pick up min/max resource from scheduler...
RegisterApplicationMasterResponse response = recordFactory
.newRecordInstance(RegisterApplicationMasterResponse.class);

View File

@ -70,6 +70,7 @@
import org.apache.hadoop.yarn.ipc.RPCUtil;
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.client.ClientRMSecurityInfo;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
@ -195,9 +196,11 @@ public SubmitApplicationResponse submitApplication(
SubmitApplicationRequest request) throws YarnRemoteException {
ApplicationSubmissionContext submissionContext = request
.getApplicationSubmissionContext();
ApplicationId applicationId = null;
String user = null;
try {
String user = UserGroupInformation.getCurrentUser().getShortUserName();
ApplicationId applicationId = submissionContext.getApplicationId();
user = UserGroupInformation.getCurrentUser().getShortUserName();
applicationId = submissionContext.getApplicationId();
if (rmContext.getRMApps().get(applicationId) != null) {
throw new IOException("Application with id " + applicationId
+ " is already present! Cannot add a duplicate!");
@ -207,8 +210,13 @@ public SubmitApplicationResponse submitApplication(
LOG.info("Application with id " + applicationId.getId() +
" submitted by user " + user + " with " + submissionContext);
RMAuditLogger.logSuccess(user, AuditConstants.SUBMIT_APP_REQUEST,
"ClientRMService", applicationId);
} catch (IOException ie) {
LOG.info("Exception in submitting application", ie);
RMAuditLogger.logFailure(user, AuditConstants.SUBMIT_APP_REQUEST,
ie.getMessage(), "ClientRMService",
"Exception in submitting application", applicationId);
throw RPCUtil.getRemoteException(ie);
}
@ -228,6 +236,9 @@ public FinishApplicationResponse finishApplication(
callerUGI = UserGroupInformation.getCurrentUser();
} catch (IOException ie) {
LOG.info("Error getting UGI ", ie);
RMAuditLogger.logFailure("UNKNOWN", AuditConstants.KILL_APP_REQUEST,
"UNKNOWN", "ClientRMService" , "Error getting UGI",
applicationId);
throw RPCUtil.getRemoteException(ie);
}
@ -235,6 +246,10 @@ public FinishApplicationResponse finishApplication(
// TODO: What if null
if (!checkAccess(callerUGI, application.getUser(),
ApplicationACL.MODIFY_APP)) {
RMAuditLogger.logFailure(callerUGI.getShortUserName(),
AuditConstants.KILL_APP_REQUEST,
"User doesn't have MODIFY_APP permissions", "ClientRMService",
AuditConstants.UNAUTHORIZED_USER, applicationId);
throw RPCUtil.getRemoteException(new AccessControlException("User "
+ callerUGI.getShortUserName() + " cannot perform operation "
+ ApplicationACL.MODIFY_APP.name() + " on " + applicationId));
@ -243,6 +258,8 @@ public FinishApplicationResponse finishApplication(
this.rmContext.getDispatcher().getEventHandler().handle(
new RMAppEvent(applicationId, RMAppEventType.KILL));
RMAuditLogger.logSuccess(callerUGI.getShortUserName(),
AuditConstants.KILL_APP_REQUEST, "ClientRMService" , applicationId);
FinishApplicationResponse response = recordFactory
.newRecordInstance(FinishApplicationResponse.class);
return response;

View File

@ -32,6 +32,8 @@
import org.apache.hadoop.yarn.security.ApplicationTokenIdentifier;
import org.apache.hadoop.yarn.security.ApplicationTokenSecretManager;
import org.apache.hadoop.yarn.security.client.ClientToAMSecretManager;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
@ -165,8 +167,39 @@ protected synchronized void addCompletedApp(ApplicationId appId) {
LOG.error("RMAppManager received completed appId of null, skipping");
} else {
completedApps.add(appId);
writeAuditLog(appId);
}
}
protected void writeAuditLog(ApplicationId appId) {
RMApp app = rmContext.getRMApps().get(appId);
String operation = "UNKONWN";
boolean success = false;
switch (app.getState()) {
case FAILED:
operation = AuditConstants.FINISH_FAILED_APP;
break;
case FINISHED:
operation = AuditConstants.FINISH_SUCCESS_APP;
success = true;
break;
case KILLED:
operation = AuditConstants.FINISH_KILLED_APP;
success = true;
break;
default:
}
if (success) {
RMAuditLogger.logSuccess(app.getUser(), operation,
"RMAppManager", app.getApplicationId());
} else {
StringBuilder diag = app.getDiagnostics();
String msg = diag == null ? null : diag.toString();
RMAuditLogger.logFailure(app.getUser(), operation, msg, "RMAppManager",
"App failed with state: " + app.getState(), appId);
}
}
};
/*
* check to see if hit the limit for max # completed apps kept

View File

@ -39,6 +39,8 @@
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
@ -183,6 +185,10 @@ synchronized public void containerCompleted(RMContainer rmContainer,
// Remove from the list of containers
liveContainers.remove(rmContainer.getContainerId());
RMAuditLogger.logSuccess(getUser(),
AuditConstants.RELEASE_CONTAINER, "SchedulerApp",
getApplicationId(), containerId);
// Update usage metrics
Resource containerResource = rmContainer.getContainer().getResource();
queue.getMetrics().releaseResources(getUser(), 1, containerResource);
@ -217,6 +223,9 @@ synchronized public RMContainer allocate(NodeType type, SchedulerNode node,
+ " container=" + container.getId() + " host="
+ container.getNodeId().getHost() + " type=" + type);
}
RMAuditLogger.logSuccess(getUser(),
AuditConstants.ALLOC_CONTAINER, "SchedulerApp",
getApplicationId(), container.getId());
// Add it to allContainers list.
newlyAllocatedContainers.add(rmContainer);

View File

@ -46,6 +46,8 @@
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.Store.RMState;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
@ -433,8 +435,15 @@ public Allocation allocate(ApplicationAttemptId applicationAttemptId,
// Release containers
for (ContainerId releasedContainerId : release) {
completedContainer(getRMContainer(releasedContainerId),
RMContainerEventType.RELEASED);
RMContainer rmContainer = getRMContainer(releasedContainerId);
if (rmContainer == null) {
RMAuditLogger.logFailure(application.getUser(),
AuditConstants.RELEASE_CONTAINER,
"Unauthorized access or invalid container", "CapacityScheduler",
"Trying to release container not owned by app or with invalid id",
application.getApplicationId(), releasedContainerId);
}
completedContainer(rmContainer, RMContainerEventType.RELEASED);
}
synchronized (application) {

View File

@ -55,6 +55,8 @@
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.Store.RMState;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
@ -225,8 +227,15 @@ public Allocation allocate(
// Release containers
for (ContainerId releasedContainer : release) {
containerCompleted(getRMContainer(releasedContainer),
RMContainerEventType.RELEASED);
RMContainer rmContainer = getRMContainer(releasedContainer);
if (rmContainer == null) {
RMAuditLogger.logFailure(application.getUser(),
AuditConstants.RELEASE_CONTAINER,
"Unauthorized access or invalid container", "FifoScheduler",
"Trying to release container not owned by app or with invalid id",
application.getApplicationId(), releasedContainer);
}
containerCompleted(rmContainer, RMContainerEventType.RELEASED);
}
if (!ask.isEmpty()) {
@ -642,6 +651,11 @@ private void containerLaunchedOnNode(Container container, SchedulerNode node) {
@Lock(FifoScheduler.class)
private synchronized void containerCompleted(RMContainer rmContainer,
RMContainerEventType event) {
if (rmContainer == null) {
LOG.info("Null container completed...");
return;
}
// Get the application for the finished container
Container container = rmContainer.getContainer();
ApplicationAttemptId applicationAttemptId = container.getId().getAppAttemptId();
@ -725,7 +739,7 @@ public synchronized SchedulerNodeReport getNodeReport(NodeId nodeId) {
private RMContainer getRMContainer(ContainerId containerId) {
SchedulerApp application =
getApplication(containerId.getAppAttemptId());
return application.getRMContainer(containerId);
return (application == null) ? null : application.getRMContainer(containerId);
}
}

View File

@ -51,6 +51,11 @@ public MockRMApp(int newid, long time, RMAppState newState, String userName) {
user = userName;
}
public MockRMApp(int newid, long time, RMAppState newState, String userName, String diag) {
this(newid, time, newState, userName);
this.diagnostics = new StringBuilder(diag);
}
@Override
public ApplicationId getApplicationId() {
return id;