YARN-3792. Test case failures in TestDistributedShell and some issue fixes related to ATSV2 (Naganarasimha G R via sjlee)
(cherry picked from commit 84f37f1c7eefec6d139cbf091c50d6c06f734323)
parent 92d90c3a24
commit 22e7ae5771
@@ -497,7 +497,7 @@ public boolean init(String[] args) throws ParseException {
     }
     if (cliParser.hasOption("flow_run_id")) {
       try {
-        flowRunId = Long.valueOf(cliParser.getOptionValue("flow_run_id"));
+        flowRunId = Long.parseLong(cliParser.getOptionValue("flow_run_id"));
       } catch (NumberFormatException e) {
         throw new IllegalArgumentException(
             "Flow run is not a valid long value", e);
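Note on the one-line change above: both methods reject non-numeric input with a NumberFormatException, but Long.parseLong returns a primitive long, while Long.valueOf returns a boxed Long that is immediately unboxed when assigned to the primitive flowRunId field. A minimal, self-contained sketch of the difference (class and variable names are illustrative, not from the patch):

public class FlowRunIdParsing {
  public static void main(String[] args) {
    String flowRunIdStr = "1453738400100";

    // Boxed: creates/looks up a java.lang.Long, then auto-unboxes on assignment.
    long viaValueOf = Long.valueOf(flowRunIdStr);

    // Primitive all the way: no boxing, same NumberFormatException on bad input.
    long viaParseLong = Long.parseLong(flowRunIdStr);

    System.out.println(viaValueOf == viaParseLong); // true

    try {
      Long.parseLong("not-a-number");
    } catch (NumberFormatException e) {
      // The caller in the patch wraps this in an IllegalArgumentException
      // with a message naming the offending option.
      System.out.println("rejected: " + e.getMessage());
    }
  }
}
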
@@ -60,6 +60,7 @@
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
 import org.apache.hadoop.yarn.api.records.ContainerState;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
@@ -129,7 +130,6 @@ protected void setupInternal(int numNodeManager) throws Exception {

   private void setupInternal(int numNodeManager, float timelineVersion)
       throws Exception {
-
     LOG.info("Starting up YARN cluster");

     conf = new YarnConfiguration();
@@ -140,7 +140,6 @@ private void setupInternal(int numNodeManager, float timelineVersion)
     boolean enableATSServer = true;
     // disable aux-service based timeline aggregators
     conf.set(YarnConfiguration.NM_AUX_SERVICES, "");
-
     conf.set(YarnConfiguration.NM_VMEM_PMEM_RATIO, "8");
     conf.set(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class.getName());
     conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
@@ -155,7 +154,9 @@ private void setupInternal(int numNodeManager, float timelineVersion)
     conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true);
     conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
         true);
-    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+    conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+        true);
+    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, false);

     // ATS version specific settings
     if (timelineVersion == 1.0f) {
@@ -180,6 +181,9 @@ private void setupInternal(int numNodeManager, float timelineVersion)
       conf.set(YarnConfiguration.NM_AUX_SERVICES, TIMELINE_AUX_SERVICE_NAME);
       conf.set(YarnConfiguration.NM_AUX_SERVICES + "." + TIMELINE_AUX_SERVICE_NAME
           + ".class", PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          false);
     } else {
       Assert.fail("Wrong timeline version number: " + timelineVersion);
     }
@@ -187,7 +191,7 @@ private void setupInternal(int numNodeManager, float timelineVersion)
     if (yarnCluster == null) {
       yarnCluster =
           new MiniYARNCluster(TestDistributedShell.class.getSimpleName(), 1,
-              numNodeManager, 1, 1, enableATSServer);
+              numNodeManager, 1, 1);
       yarnCluster.init(conf);

       yarnCluster.start();
@@ -390,13 +394,15 @@ public void run() {
       if (checkHostname(appReport.getHost()) && appReport.getRpcPort() == -1) {
         verified = true;
       }
-      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED) {
+      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED
+          && appReport.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) {
         break;
       }
     }
     Assert.assertTrue(errorMessage, verified);
     t.join();
-    LOG.info("Client run completed. Result=" + result);
+    LOG.info("Client run completed for testDSShell. Result=" + result);
     Assert.assertTrue(result.get());

     if (timelineVersionWatcher.getTimelineVersion() == 1.5f) {
@@ -477,9 +483,9 @@ private void checkTimelineV1(boolean haveDomain) throws Exception {
     }
   }

-  private void checkTimelineV2(
-      boolean haveDomain, ApplicationId appId, boolean defaultFlow)
-      throws Exception {
+  private void checkTimelineV2(boolean haveDomain, ApplicationId appId,
+      boolean defaultFlow) throws Exception {
+    LOG.info("Started checkTimelineV2 ");
     // For PoC check in /tmp/timeline_service_data YARN-3264
     String tmpRoot =
         FileSystemTimelineWriterImpl.DEFAULT_TIMELINE_SERVICE_STORAGE_DIR_ROOT
@@ -530,12 +536,29 @@ private void checkTimelineV2(
       verifyEntityTypeFileExists(basePath,
           TimelineEntityType.YARN_APPLICATION.toString(),
           appMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.CREATED_EVENT_TYPE, 1,
-          "Application created event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "Application finished event should be published atleast once");
+      Assert.assertEquals(
+          "Application created event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appEntityFile,
+              ApplicationMetricsConstants.CREATED_EVENT_TYPE));
+
+      // to avoid race condition of testcase, atleast check 4 times with sleep
+      // of 500ms
+      long numOfStringOccurences = 0;
+      for (int i = 0; i < 4; i++) {
+        numOfStringOccurences =
+            getNumOfStringOccurences(appEntityFile,
+                ApplicationMetricsConstants.FINISHED_EVENT_TYPE);
+        if (numOfStringOccurences > 0) {
+          break;
+        } else {
+          Thread.sleep(500l);
+        }
+      }
+      Assert.assertEquals(
+          "Application finished event should be published atleast once",
+          1,
+          numOfStringOccurences);

       // Verify RM posting AppAttempt life cycle Events are getting published
       String appAttemptMetricsTimestampFileName =
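For context on the block above: the FINISHED event can be written to the entity file slightly after the application report flips to FINISHED, so the test now re-reads the file up to four times with a 500 ms sleep before asserting. A standalone sketch of the same wait-then-assert pattern, assuming a hypothetical countSupplier in place of the test's getNumOfStringOccurences:

import java.util.concurrent.Callable;

public class BoundedPoll {
  /** Re-evaluates the supplier until it returns a positive count or attempts run out. */
  static long pollForOccurrence(Callable<Long> countSupplier, int attempts,
      long sleepMillis) throws Exception {
    long count = 0;
    for (int i = 0; i < attempts; i++) {
      count = countSupplier.call();
      if (count > 0) {
        break;                     // event observed, stop early
      }
      Thread.sleep(sleepMillis);   // give the writer time to flush
    }
    return count;
  }

  public static void main(String[] args) throws Exception {
    // Stand-in for getNumOfStringOccurences(appEntityFile, FINISHED_EVENT_TYPE).
    long found = pollForOccurrence(() -> 1L, 4, 500L);
    System.out.println("occurrences: " + found); // the caller asserts this equals 1
  }
}
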
@@ -546,12 +569,17 @@ private void checkTimelineV2(
       verifyEntityTypeFileExists(basePath,
           TimelineEntityType.YARN_APPLICATION_ATTEMPT.toString(),
           appAttemptMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE, 1,
-          "AppAttempt register event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "AppAttempt finished event should be published atleast once");
+      Assert.assertEquals(
+          "AppAttempt register event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE));
+
+      Assert.assertEquals(
+          "AppAttempt finished event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.FINISHED_EVENT_TYPE));
     } finally {
       FileUtils.deleteDirectory(tmpRootFolder.getParentFile());
     }
@@ -570,8 +598,7 @@ private File verifyEntityTypeFileExists(String basePath, String entityType,
     return entityFile;
   }

-  private void verifyStringExistsSpecifiedTimes(File entityFile,
-      String searchString, long expectedNumOfTimes, String errorMsg)
+  private long getNumOfStringOccurences(File entityFile, String searchString)
       throws IOException {
     BufferedReader reader = null;
     String strLine;
@@ -585,7 +612,7 @@ private void verifyStringExistsSpecifiedTimes(File entityFile,
     } finally {
       reader.close();
     }
-    Assert.assertEquals(errorMsg, expectedNumOfTimes, actualCount);
+    return actualCount;
   }

   /**
@@ -1261,4 +1288,3 @@ private int verifyContainerLog(int containerNum,
     return numOfWords;
   }
 }
-
@@ -30,7 +30,9 @@
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;

 import com.google.common.collect.ImmutableMap;
@@ -418,6 +418,14 @@ public void putObjects(String path, MultivaluedMap<String, String> params,
     // timelineServiceAddress could haven't be initialized yet
     // or stale (only for new timeline service)
     int retries = pollTimelineServiceAddress(this.maxServiceRetries);
+    if (timelineServiceAddress == null) {
+      String errMessage = "TimelineClient has reached to max retry times : "
+          + this.maxServiceRetries
+          + ", but failed to fetch timeline service address. Please verify"
+          + " Timeline Auxillary Service is configured in all the NMs";
+      LOG.error(errMessage);
+      throw new YarnException(errMessage);
+    }

     // timelineServiceAddress could be stale, add retry logic here.
     boolean needRetry = true;
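The added guard fails fast with a descriptive YarnException when the collector address is still unknown after the retry budget is exhausted, instead of failing later with a less obvious error. A minimal sketch of the same fail-fast idea, with a hypothetical resolveAddress supplier standing in for pollTimelineServiceAddress and a plain Exception standing in for YarnException:

import java.util.function.Supplier;

public class FailFastLookup {
  static String lookupOrFail(Supplier<String> resolveAddress, int maxRetries)
      throws Exception {
    String address = null;
    for (int i = 0; i <= maxRetries && address == null; i++) {
      address = resolveAddress.get(); // returns null while the address is unknown
    }
    if (address == null) {
      // Mirrors the patch: log-and-throw with a message that names the retry
      // budget and points at the likely misconfiguration.
      throw new Exception("Reached max retry times : " + maxRetries
          + ", but failed to fetch timeline service address."
          + " Please verify the timeline aux service is configured on all NMs");
    }
    return address;
  }

  public static void main(String[] args) throws Exception {
    System.out.println(lookupOrFail(() -> "localhost:8188", 30));
  }
}
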
@@ -130,11 +130,11 @@ public ApplicationImpl(Dispatcher dispatcher, String user, String flowId,
         context, -1);
     Configuration conf = context.getConf();
     if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) {
-      createAndStartTimelienClient(conf);
+      createAndStartTimelineClient(conf);
     }
   }

-  private void createAndStartTimelienClient(Configuration conf) {
+  private void createAndStartTimelineClient(Configuration conf) {
     // create and start timeline client
     this.timelineClient = TimelineClient.createTimelineClient(appId);
     timelineClient.init(conf);
@@ -96,10 +96,7 @@ public class ContainersMonitorImpl extends AbstractService implements

   // For posting entities in new timeline service in a non-blocking way
   // TODO replace with event loop in TimelineClient.
-  private static ExecutorService threadPool =
-      Executors.newCachedThreadPool(
-          new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
-              .build());
+  private static ExecutorService threadPool;

   @Private
   public static enum ContainerMetric {
@@ -225,6 +222,10 @@ protected void serviceInit(Configuration conf) throws Exception {
     if (publishContainerMetricsToTimelineService) {
       LOG.info("NodeManager has been configured to publish container " +
           "metrics to Timeline Service V2.");
+      threadPool =
+          Executors.newCachedThreadPool(
+              new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
+                  .build());
     } else {
       LOG.warn("NodeManager has not been configured to publish container " +
           "metrics to Timeline Service V2.");
@@ -280,6 +281,9 @@ protected void serviceStop() throws Exception {

   // TODO remove threadPool after adding non-blocking call in TimelineClient
   private static void shutdownAndAwaitTermination() {
+    if (threadPool == null) {
+      return;
+    }
     threadPool.shutdown();
     try {
       if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
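Taken together, the two hunks above move the cached thread pool from eager static initialization to lazy creation in serviceInit, so the pool only exists when container metrics publishing is enabled, and the serviceStop helper now tolerates the pool never having been created. A compact sketch of that lifecycle, assuming a simple boolean flag in place of the NodeManager configuration check (the ThreadFactoryBuilder naming is omitted):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class LazyPublisherPool {
  private static ExecutorService threadPool; // created only if publishing is on

  static void init(boolean publishContainerMetrics) {
    if (publishContainerMetrics) {
      // Cached pool, created lazily as in the patch.
      threadPool = Executors.newCachedThreadPool();
    }
  }

  static void shutdownAndAwaitTermination() throws InterruptedException {
    if (threadPool == null) {
      return; // publishing disabled, nothing to stop
    }
    threadPool.shutdown();
    if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
      threadPool.shutdownNow();
    }
  }

  public static void main(String[] args) throws InterruptedException {
    init(false);                    // publishing disabled
    shutdownAndAwaitTermination();  // safe: no pool was ever created
  }
}
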
@@ -689,7 +693,6 @@ public void run() {
           timelineClient.putEntities(entity);
         } catch (IOException|YarnException e) {
           LOG.error("putEntityNonBlocking get failed: " + e);
-          throw new RuntimeException(e.toString());
         }
       }
     };
@@ -56,7 +56,7 @@ public void postPut(ApplicationId appId, TimelineCollector collector) {
       if (parts.length != 2 || parts[1].isEmpty()) {
         continue;
       }
-      switch (parts[0]) {
+      switch (parts[0].toUpperCase()) {
       case TimelineUtils.FLOW_NAME_TAG_PREFIX:
         collector.getTimelineEntityContext().setFlowName(parts[1]);
         break;
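The switch now upper-cases the tag prefix before comparing it against the TimelineUtils constants, so application tags are recognized regardless of case. A small sketch of the same idea using a locally defined prefix whose value is assumed for illustration (the real constant lives in TimelineUtils.FLOW_NAME_TAG_PREFIX):

public class FlowTagParsing {
  // Assumed value for illustration only.
  static final String FLOW_NAME_TAG_PREFIX = "TIMELINE_FLOW_NAME_TAG";

  static String extractFlowName(String tag) {
    String[] parts = tag.split(":", 2);
    if (parts.length != 2 || parts[1].isEmpty()) {
      return null;
    }
    switch (parts[0].toUpperCase()) { // case-insensitive prefix match
    case FLOW_NAME_TAG_PREFIX:
      return parts[1];
    default:
      return null;
    }
  }

  public static void main(String[] args) {
    System.out.println(extractFlowName("timeline_flow_name_tag:DistributedShell"));
  }
}
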
@@ -48,21 +48,11 @@

 import com.google.common.annotations.VisibleForTesting;

-
-/**
- *
- * It is a singleton, and instances should be obtained via
- * {@link #getInstance()}.
- *
- */
 @Private
 @Unstable
 public class NodeTimelineCollectorManager extends TimelineCollectorManager {
   private static final Log LOG =
       LogFactory.getLog(NodeTimelineCollectorManager.class);
-  private static final NodeTimelineCollectorManager INSTANCE =
-      new NodeTimelineCollectorManager();
-

   // REST server for this collector manager
   private HttpServer2 timelineRestServer;
@@ -73,10 +63,6 @@ public class NodeTimelineCollectorManager extends TimelineCollectorManager {

   static final String COLLECTOR_MANAGER_ATTR_KEY = "collector.manager";

-  static NodeTimelineCollectorManager getInstance() {
-    return INSTANCE;
-  }
-
   @VisibleForTesting
   protected NodeTimelineCollectorManager() {
     super(NodeTimelineCollectorManager.class.getName());
@@ -56,8 +56,7 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
   private final NodeTimelineCollectorManager collectorManager;

   public PerNodeTimelineCollectorsAuxService() {
-    // use the same singleton
-    this(NodeTimelineCollectorManager.getInstance());
+    this(new NodeTimelineCollectorManager());
   }

   @VisibleForTesting PerNodeTimelineCollectorsAuxService(
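The last three hunks drop the NodeTimelineCollectorManager singleton: the aux service now constructs its own manager, and tests can still inject one through the existing @VisibleForTesting constructor. A bare-bones sketch of that constructor-injection shape (class bodies trimmed to stubs; the names follow the patch, the behavior shown is illustrative):

public class CollectorWiring {
  static class NodeTimelineCollectorManager {
    void start() { System.out.println("collector manager started"); }
  }

  static class PerNodeTimelineCollectorsAuxService {
    private final NodeTimelineCollectorManager collectorManager;

    PerNodeTimelineCollectorsAuxService() {
      this(new NodeTimelineCollectorManager()); // default wiring, no shared singleton
    }

    // @VisibleForTesting in the real class: tests pass a stub or spy instead.
    PerNodeTimelineCollectorsAuxService(NodeTimelineCollectorManager manager) {
      this.collectorManager = manager;
    }

    void serviceStart() { collectorManager.start(); }
  }

  public static void main(String[] args) {
    new PerNodeTimelineCollectorsAuxService().serviceStart();
  }
}
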
@@ -128,7 +128,7 @@ public boolean remove(ApplicationId appId) {
       postRemove(appId, collector);
       // stop the service to do clean up
       collector.stop();
-      LOG.info("the collector service for " + appId + " was removed");
+      LOG.info("The collector service for " + appId + " was removed");
     }
     return collector != null;
   }