YARN-9539.Improve cleanup process of app activities and make some conditions configurable. Contributed by Tao Yang.

This commit is contained in:
Weiwei Yang 2019-05-12 22:31:39 -07:00
parent ff27e8eabd
commit 1a47c2b7ae
5 changed files with 170 additions and 6 deletions

View File

@ -4005,6 +4005,41 @@ public static boolean areNodeLabelsEnabled(
public static final String DEFAULT_NM_NUMA_AWARENESS_NUMACTL_CMD =
"/usr/bin/numactl";
/**
* Settings for activities manager.
*/
public static final String RM_ACTIVITIES_MANAGER_PREFIX =
RM_PREFIX + "activities-manager.";
public static final String RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_PREFIX =
RM_ACTIVITIES_MANAGER_PREFIX + "scheduler-activities.";
public static final String RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX =
RM_ACTIVITIES_MANAGER_PREFIX + "app-activities.";
/** The cleanup interval for activities in milliseconds. **/
public static final String RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS =
RM_ACTIVITIES_MANAGER_PREFIX + "cleanup-interval-ms";
public static final long DEFAULT_RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS =
5000L;
/** Time to live for scheduler activities in milliseconds. **/
public static final String RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS =
RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_PREFIX + "ttl-ms";
public static final long
DEFAULT_RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS = 600000L;
/** Time to live for app activities in milliseconds. **/
public static final String RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS =
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX + "ttl-ms";
public static final long DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS =
600000L;
/** Max queue length for app activities. **/
public static final String
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH =
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX + "max-queue-length";
public static final int
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH = 1000;
public YarnConfiguration() {
super();
}

View File

@ -4187,4 +4187,28 @@
<name>yarn.nodemanager.csi-driver.names</name>
<value></value>
</property>
<property>
<description>The cleanup interval for activities in milliseconds.</description>
<name>yarn.resourcemanager.activities-manager.cleanup-interval-ms</name>
<value>5000</value>
</property>
<property>
<description>Time to live for scheduler activities in milliseconds.</description>
<name>yarn.resourcemanager.activities-manager.scheduler-activities.ttl-ms</name>
<value>600000</value>
</property>
<property>
<description>Time to live for app activities in milliseconds.</description>
<name>yarn.resourcemanager.activities-manager.app-activities.ttl-ms</name>
<value>600000</value>
</property>
<property>
<description>Max queue length for app activities.</description>
<name>yarn.resourcemanager.activities-manager.app-activities.max-queue-length</name>
<value>1000</value>
</property>
</configuration>

View File

@ -19,9 +19,11 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.service.AbstractService;
@ -72,7 +74,10 @@ public class ActivitiesManager extends AbstractService {
private boolean recordNextAvailableNode = false;
private List<NodeAllocation> lastAvailableNodeActivities = null;
private Thread cleanUpThread;
private int timeThreshold = 600 * 1000;
private long activitiesCleanupIntervalMs;
private long schedulerActivitiesTTL;
private long appActivitiesTTL;
private int appActivitiesMaxQueueLength;
private final RMContext rmContext;
private volatile boolean stopped;
private ThreadLocal<DiagnosticsCollectorManager> diagnosticCollectorManager;
@ -89,6 +94,28 @@ public ActivitiesManager(RMContext rmContext) {
() -> new DiagnosticsCollectorManager(
new GenericDiagnosticsCollector()));
this.rmContext = rmContext;
if (rmContext.getYarnConfiguration() != null) {
setupConfForCleanup(rmContext.getYarnConfiguration());
}
}
private void setupConfForCleanup(Configuration conf) {
activitiesCleanupIntervalMs = conf.getLong(
YarnConfiguration.RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS,
YarnConfiguration.
DEFAULT_RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS);
schedulerActivitiesTTL = conf.getLong(
YarnConfiguration.RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS,
YarnConfiguration.
DEFAULT_RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS);
appActivitiesTTL = conf.getLong(
YarnConfiguration.RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS,
YarnConfiguration.
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS);
appActivitiesMaxQueueLength = conf.getInt(YarnConfiguration.
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH,
YarnConfiguration.
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH);
}
public AppActivitiesInfo getAppActivitiesInfo(ApplicationId applicationId,
@ -152,12 +179,13 @@ public void run() {
while (!stopped && !Thread.currentThread().isInterrupted()) {
Iterator<Map.Entry<NodeId, List<NodeAllocation>>> ite =
completedNodeAllocations.entrySet().iterator();
long curTS = SystemClock.getInstance().getTime();
while (ite.hasNext()) {
Map.Entry<NodeId, List<NodeAllocation>> nodeAllocation = ite.next();
List<NodeAllocation> allocations = nodeAllocation.getValue();
long currTS = SystemClock.getInstance().getTime();
if (allocations.size() > 0 && allocations.get(0).getTimeStamp()
- currTS > timeThreshold) {
if (allocations.size() > 0
&& curTS - allocations.get(0).getTimeStamp()
> schedulerActivitiesTTL) {
ite.remove();
}
}
@ -171,11 +199,29 @@ public void run() {
if (rmApp == null || rmApp.getFinalApplicationStatus()
!= FinalApplicationStatus.UNDEFINED) {
iteApp.remove();
} else {
Iterator<AppAllocation> appActivitiesIt =
appAllocation.getValue().iterator();
while (appActivitiesIt.hasNext()) {
if (curTS - appActivitiesIt.next().getTime()
> appActivitiesTTL) {
appActivitiesIt.remove();
} else {
break;
}
}
if (appAllocation.getValue().isEmpty()) {
iteApp.remove();
LOG.debug("Removed all expired activities from cache for {}.",
rmApp.getApplicationId());
}
}
}
LOG.debug("Remaining apps in app activities cache: {}",
completedAppAllocations.keySet());
try {
Thread.sleep(5000);
Thread.sleep(activitiesCleanupIntervalMs);
} catch (InterruptedException e) {
LOG.info(getName() + " thread interrupted");
break;
@ -290,7 +336,7 @@ void finishAppAllocationRecording(ApplicationId applicationId,
appAllocations = curAppAllocations;
}
}
if (appAllocations.size() == 1000) {
if (appAllocations.size() == appActivitiesMaxQueueLength) {
appAllocations.poll();
}
appAllocations.add(appAllocation);

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -77,4 +78,9 @@ public AppActivitiesInfo(List<AppAllocation> appAllocations,
}
}
}
@VisibleForTesting
public List<AppAllocationInfo> getAllocations() {
return allocations;
}
}

View File

@ -30,10 +30,13 @@
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
@ -43,6 +46,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppActivitiesInfo;
import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
import org.apache.hadoop.yarn.util.SystemClock;
import org.junit.Assert;
@ -81,6 +85,8 @@ public class TestActivitiesManager {
@Before
public void setup() {
rmContext = Mockito.mock(RMContext.class);
Configuration conf = new Configuration();
Mockito.when(rmContext.getYarnConfiguration()).thenReturn(conf);
ResourceScheduler scheduler = Mockito.mock(ResourceScheduler.class);
Mockito.when(scheduler.getMinimumResourceCapability())
.thenReturn(Resources.none());
@ -95,6 +101,8 @@ public void setup() {
RMApp mockApp = Mockito.mock(RMApp.class);
Mockito.doReturn(appAttemptId.getApplicationId()).when(mockApp)
.getApplicationId();
Mockito.doReturn(FinalApplicationStatus.UNDEFINED).when(mockApp)
.getFinalApplicationStatus();
rmApps.put(appAttemptId.getApplicationId(), mockApp);
FiCaSchedulerApp app =
new FiCaSchedulerApp(appAttemptId, "user", mockQueue,
@ -245,6 +253,51 @@ public void testRecordingAppActivitiesInMultiThreads()
}
}
@Test (timeout = 30000)
public void testAppActivitiesTTL() throws Exception {
long cleanupIntervalMs = 100;
long appActivitiesTTL = 1000;
rmContext.getYarnConfiguration()
.setLong(YarnConfiguration.RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS,
cleanupIntervalMs);
rmContext.getYarnConfiguration()
.setLong(YarnConfiguration.RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS,
appActivitiesTTL);
ActivitiesManager newActivitiesManager = new ActivitiesManager(rmContext);
newActivitiesManager.serviceStart();
// start recording activities for first app and first node
SchedulerApplicationAttempt app = apps.get(0);
FiCaSchedulerNode node = (FiCaSchedulerNode) nodes.get(0);
newActivitiesManager
.turnOnAppActivitiesRecording(app.getApplicationId(), 3);
int numActivities = 10;
for (int i = 0; i < numActivities; i++) {
ActivitiesLogger.APP
.startAppAllocationRecording(newActivitiesManager, node,
SystemClock.getInstance().getTime(), app);
ActivitiesLogger.APP
.recordAppActivityWithoutAllocation(newActivitiesManager, node, app,
new SchedulerRequestKey(Priority.newInstance(0), 0, null),
ActivityDiagnosticConstant.FAIL_TO_ALLOCATE,
ActivityState.REJECTED);
ActivitiesLogger.APP
.finishAllocatedAppAllocationRecording(newActivitiesManager,
app.getApplicationId(), null, ActivityState.SKIPPED,
ActivityDiagnosticConstant.SKIPPED_ALL_PRIORITIES);
}
AppActivitiesInfo appActivitiesInfo = newActivitiesManager
.getAppActivitiesInfo(app.getApplicationId(), null, null);
Assert.assertEquals(numActivities,
appActivitiesInfo.getAllocations().size());
// sleep until all app activities expired
Thread.sleep(cleanupIntervalMs + appActivitiesTTL);
// there should be no remaining app activities
appActivitiesInfo = newActivitiesManager
.getAppActivitiesInfo(app.getApplicationId(), null, null);
Assert.assertEquals(0,
appActivitiesInfo.getAllocations().size());
}
/**
* Testing activities manager which can record all history information about
* node allocations.