YARN-11212. [Federation] Add getNodeToLabels REST APIs for Router. (#4614)
This commit is contained in:
parent
a5b12c8010
commit
e994635a95
@ -32,8 +32,7 @@
|
||||
public class NodeLabelsInfo {
|
||||
|
||||
@XmlElement(name = "nodeLabelInfo")
|
||||
private ArrayList<NodeLabelInfo> nodeLabelsInfo =
|
||||
new ArrayList<NodeLabelInfo>();
|
||||
private ArrayList<NodeLabelInfo> nodeLabelsInfo = new ArrayList<>();
|
||||
|
||||
public NodeLabelsInfo() {
|
||||
// JAXB needs this
|
||||
@ -44,25 +43,32 @@ public NodeLabelsInfo(ArrayList<NodeLabelInfo> nodeLabels) {
|
||||
}
|
||||
|
||||
public NodeLabelsInfo(List<NodeLabel> nodeLabels) {
|
||||
this.nodeLabelsInfo = new ArrayList<NodeLabelInfo>();
|
||||
this.nodeLabelsInfo = new ArrayList<>();
|
||||
for (NodeLabel label : nodeLabels) {
|
||||
this.nodeLabelsInfo.add(new NodeLabelInfo(label));
|
||||
}
|
||||
}
|
||||
|
||||
public NodeLabelsInfo(Set<String> nodeLabelsName) {
|
||||
this.nodeLabelsInfo = new ArrayList<NodeLabelInfo>();
|
||||
this.nodeLabelsInfo = new ArrayList<>();
|
||||
for (String labelName : nodeLabelsName) {
|
||||
this.nodeLabelsInfo.add(new NodeLabelInfo(labelName));
|
||||
}
|
||||
}
|
||||
|
||||
public NodeLabelsInfo(Collection<NodeLabel> nodeLabels) {
|
||||
this.nodeLabelsInfo = new ArrayList<>();
|
||||
nodeLabels.stream().forEach(nodeLabel -> {
|
||||
this.nodeLabelsInfo.add(new NodeLabelInfo(nodeLabel));
|
||||
});
|
||||
}
|
||||
|
||||
public ArrayList<NodeLabelInfo> getNodeLabelsInfo() {
|
||||
return nodeLabelsInfo;
|
||||
}
|
||||
|
||||
public Set<NodeLabel> getNodeLabels() {
|
||||
Set<NodeLabel> nodeLabels = new HashSet<NodeLabel>();
|
||||
Set<NodeLabel> nodeLabels = new HashSet<>();
|
||||
for (NodeLabelInfo label : nodeLabelsInfo) {
|
||||
nodeLabels.add(NodeLabel.newInstance(label.getName(),
|
||||
label.getExclusivity()));
|
||||
@ -71,7 +77,7 @@ public Set<NodeLabel> getNodeLabels() {
|
||||
}
|
||||
|
||||
public List<String> getNodeLabelsName() {
|
||||
ArrayList<String> nodeLabelsName = new ArrayList<String>();
|
||||
ArrayList<String> nodeLabelsName = new ArrayList<>();
|
||||
for (NodeLabelInfo label : nodeLabelsInfo) {
|
||||
nodeLabelsName.add(label.getName());
|
||||
}
|
||||
|
@ -35,7 +35,17 @@ public NodeToLabelsInfo() {
|
||||
// JAXB needs this
|
||||
}
|
||||
|
||||
public NodeToLabelsInfo(HashMap<String, NodeLabelsInfo> nodeToLabels) {
|
||||
if (nodeToLabels != null) {
|
||||
this.nodeToLabels.putAll(nodeToLabels);
|
||||
}
|
||||
}
|
||||
|
||||
public HashMap<String, NodeLabelsInfo> getNodeToLabels() {
|
||||
return nodeToLabels;
|
||||
}
|
||||
|
||||
public void setNodeToLabels(HashMap<String, NodeLabelsInfo> nodeToLabels) {
|
||||
this.nodeToLabels = nodeToLabels;
|
||||
}
|
||||
}
|
||||
|
@ -282,7 +282,7 @@ public Response createNewApplication(HttpServletRequest hsr)
|
||||
.entity(e.getLocalizedMessage()).build();
|
||||
}
|
||||
|
||||
List<SubClusterId> blacklist = new ArrayList<SubClusterId>();
|
||||
List<SubClusterId> blacklist = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < numSubmitRetries; ++i) {
|
||||
|
||||
@ -295,7 +295,7 @@ public Response createNewApplication(HttpServletRequest hsr)
|
||||
.entity(e.getLocalizedMessage()).build();
|
||||
}
|
||||
|
||||
LOG.debug("getNewApplication try #{} on SubCluster {}", i, subClusterId);
|
||||
LOG.debug("getNewApplication try #{} on SubCluster {}.", i, subClusterId);
|
||||
|
||||
DefaultRequestInterceptorREST interceptor =
|
||||
getOrCreateInterceptorForSubCluster(subClusterId,
|
||||
@ -304,7 +304,7 @@ public Response createNewApplication(HttpServletRequest hsr)
|
||||
try {
|
||||
response = interceptor.createNewApplication(hsr);
|
||||
} catch (Exception e) {
|
||||
LOG.warn("Unable to create a new ApplicationId in SubCluster {}",
|
||||
LOG.warn("Unable to create a new ApplicationId in SubCluster {}.",
|
||||
subClusterId.getId(), e);
|
||||
}
|
||||
|
||||
@ -424,7 +424,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
|
||||
.build();
|
||||
}
|
||||
|
||||
List<SubClusterId> blacklist = new ArrayList<SubClusterId>();
|
||||
List<SubClusterId> blacklist = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < numSubmitRetries; ++i) {
|
||||
|
||||
@ -441,7 +441,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
|
||||
.entity(e.getLocalizedMessage())
|
||||
.build();
|
||||
}
|
||||
LOG.info("submitApplication appId {} try #{} on SubCluster {}",
|
||||
LOG.info("submitApplication appId {} try #{} on SubCluster {}.",
|
||||
applicationId, i, subClusterId);
|
||||
|
||||
ApplicationHomeSubCluster appHomeSubCluster =
|
||||
@ -482,7 +482,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
|
||||
.build();
|
||||
}
|
||||
if (subClusterId == subClusterIdInStateStore) {
|
||||
LOG.info("Application {} already submitted on SubCluster {}",
|
||||
LOG.info("Application {} already submitted on SubCluster {}.",
|
||||
applicationId, subClusterId);
|
||||
} else {
|
||||
routerMetrics.incrAppsFailedSubmitted();
|
||||
@ -712,8 +712,7 @@ public AppsInfo call() {
|
||||
|
||||
if (rmApps == null) {
|
||||
routerMetrics.incrMultipleAppsFailedRetrieved();
|
||||
LOG.error("Subcluster {} failed to return appReport.",
|
||||
info.getSubClusterId());
|
||||
LOG.error("Subcluster {} failed to return appReport.", info.getSubClusterId());
|
||||
return null;
|
||||
}
|
||||
return rmApps;
|
||||
@ -873,8 +872,7 @@ private Map<SubClusterInfo, NodeInfo> getNode(
|
||||
subclusterId, subcluster.getRMWebServiceAddress());
|
||||
return interceptor.getNode(nodeId);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Subcluster {} failed to return nodeInfo.",
|
||||
subclusterId);
|
||||
LOG.error("Subcluster {} failed to return nodeInfo.", subclusterId, e);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
@ -953,58 +951,28 @@ private SubClusterInfo getNodeSubcluster(String nodeId)
|
||||
public NodesInfo getNodes(String states) {
|
||||
|
||||
NodesInfo nodes = new NodesInfo();
|
||||
|
||||
final Map<SubClusterId, SubClusterInfo> subClustersActive;
|
||||
try {
|
||||
subClustersActive = getActiveSubclusters();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Cannot get nodes: {}", e.getMessage());
|
||||
return new NodesInfo();
|
||||
}
|
||||
|
||||
// Send the requests in parallel
|
||||
CompletionService<NodesInfo> compSvc =
|
||||
new ExecutorCompletionService<NodesInfo>(this.threadpool);
|
||||
|
||||
for (final SubClusterInfo info : subClustersActive.values()) {
|
||||
compSvc.submit(new Callable<NodesInfo>() {
|
||||
@Override
|
||||
public NodesInfo call() {
|
||||
DefaultRequestInterceptorREST interceptor =
|
||||
getOrCreateInterceptorForSubCluster(
|
||||
info.getSubClusterId(), info.getRMWebServiceAddress());
|
||||
try {
|
||||
NodesInfo nodesInfo = interceptor.getNodes(states);
|
||||
return nodesInfo;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Subcluster {} failed to return nodesInfo.",
|
||||
info.getSubClusterId());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
|
||||
Class[] argsClasses = new Class[]{String.class};
|
||||
Object[] args = new Object[]{states};
|
||||
ClientMethod remoteMethod = new ClientMethod("getNodes", argsClasses, args);
|
||||
Map<SubClusterInfo, NodesInfo> nodesMap =
|
||||
invokeConcurrent(subClustersActive.values(), remoteMethod, NodesInfo.class);
|
||||
nodesMap.values().stream().forEach(nodesInfo -> {
|
||||
nodes.addAll(nodesInfo.getNodes());
|
||||
});
|
||||
}
|
||||
|
||||
// Collect all the responses in parallel
|
||||
|
||||
for (int i = 0; i < subClustersActive.size(); i++) {
|
||||
try {
|
||||
Future<NodesInfo> future = compSvc.take();
|
||||
NodesInfo nodesResponse = future.get();
|
||||
|
||||
if (nodesResponse != null) {
|
||||
nodes.addAll(nodesResponse.getNodes());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
LOG.warn("Failed to get nodes report ", e);
|
||||
}
|
||||
} catch (NotFoundException e) {
|
||||
LOG.error("Get all active sub cluster(s) error.", e);
|
||||
} catch (YarnException e) {
|
||||
LOG.error("getNodes error.", e);
|
||||
} catch (IOException e) {
|
||||
LOG.error("getNodes error with io error.", e);
|
||||
}
|
||||
|
||||
// Delete duplicate from all the node reports got from all the available
|
||||
// YARN RMs. Nodes can be moved from one subclusters to another. In this
|
||||
// operation they result LOST/RUNNING in the previous SubCluster and
|
||||
// NEW/RUNNING in the new one.
|
||||
|
||||
return RouterWebServiceUtil.deleteDuplicateNodesInfo(nodes.getNodes());
|
||||
}
|
||||
|
||||
@ -1172,7 +1140,22 @@ public ApplicationStatisticsInfo getAppStatistics(HttpServletRequest hsr,
|
||||
@Override
|
||||
public NodeToLabelsInfo getNodeToLabels(HttpServletRequest hsr)
|
||||
throws IOException {
|
||||
throw new NotImplementedException("Code is not implemented");
|
||||
try {
|
||||
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
|
||||
final HttpServletRequest hsrCopy = clone(hsr);
|
||||
Class[] argsClasses = new Class[]{HttpServletRequest.class};
|
||||
Object[] args = new Object[]{hsrCopy};
|
||||
ClientMethod remoteMethod = new ClientMethod("getNodeToLabels", argsClasses, args);
|
||||
Map<SubClusterInfo, NodeToLabelsInfo> nodeToLabelsInfoMap =
|
||||
invokeConcurrent(subClustersActive.values(), remoteMethod, NodeToLabelsInfo.class);
|
||||
return RouterWebServiceUtil.mergeNodeToLabels(nodeToLabelsInfoMap);
|
||||
} catch (NotFoundException e) {
|
||||
LOG.error("Get all active sub cluster(s) error.", e);
|
||||
throw new IOException("Get all active sub cluster(s) error.", e);
|
||||
} catch (YarnException e) {
|
||||
LOG.error("getNodeToLabels error.", e);
|
||||
throw new IOException("getNodeToLabels error.", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -1395,7 +1378,7 @@ public void shutdown() {
|
||||
}
|
||||
|
||||
private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> clusterIds,
|
||||
ClientMethod request, Class<R> clazz) {
|
||||
ClientMethod request, Class<R> clazz) throws YarnException {
|
||||
|
||||
Map<SubClusterInfo, R> results = new HashMap<>();
|
||||
|
||||
@ -1413,8 +1396,8 @@ private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> c
|
||||
R ret = clazz.cast(retObj);
|
||||
return ret;
|
||||
} catch (Exception e) {
|
||||
LOG.error("SubCluster {} failed to call {} method.", info.getSubClusterId(),
|
||||
request.getMethodName(), e);
|
||||
LOG.error("SubCluster %s failed to call %s method.",
|
||||
info.getSubClusterId(), request.getMethodName(), e);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
@ -1428,7 +1411,10 @@ private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> c
|
||||
results.put(clusterId, response);
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
LOG.warn("SubCluster {} failed to {} report.", clusterId, request.getMethodName(), e);
|
||||
String msg = String.format("SubCluster %s failed to %s report.",
|
||||
clusterId, request.getMethodName());
|
||||
LOG.warn(msg, e);
|
||||
throw new YarnRuntimeException(msg, e);
|
||||
}
|
||||
});
|
||||
return results;
|
||||
|
@ -30,6 +30,9 @@
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Collection;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
@ -43,13 +46,17 @@
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||
import org.apache.hadoop.yarn.api.records.NodeLabel;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.RMWebAppUtil;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppsInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.uam.UnmanagedApplicationManager;
|
||||
import org.apache.hadoop.yarn.webapp.BadRequestException;
|
||||
import org.apache.hadoop.yarn.webapp.ForbiddenException;
|
||||
@ -293,8 +300,8 @@ public static AppsInfo mergeAppsInfo(ArrayList<AppInfo> appsInfo,
|
||||
boolean returnPartialResult) {
|
||||
AppsInfo allApps = new AppsInfo();
|
||||
|
||||
Map<String, AppInfo> federationAM = new HashMap<String, AppInfo>();
|
||||
Map<String, AppInfo> federationUAMSum = new HashMap<String, AppInfo>();
|
||||
Map<String, AppInfo> federationAM = new HashMap<>();
|
||||
Map<String, AppInfo> federationUAMSum = new HashMap<>();
|
||||
for (AppInfo a : appsInfo) {
|
||||
// Check if this AppInfo is an AM
|
||||
if (a.getAMHostHttpAddress() != null) {
|
||||
@ -332,7 +339,7 @@ public static AppsInfo mergeAppsInfo(ArrayList<AppInfo> appsInfo,
|
||||
}
|
||||
}
|
||||
|
||||
allApps.addAll(new ArrayList<AppInfo>(federationAM.values()));
|
||||
allApps.addAll(new ArrayList<>(federationAM.values()));
|
||||
return allApps;
|
||||
}
|
||||
|
||||
@ -419,7 +426,7 @@ public static NodesInfo deleteDuplicateNodesInfo(ArrayList<NodeInfo> nodes) {
|
||||
nodesMap.put(node.getNodeId(), node);
|
||||
}
|
||||
}
|
||||
nodesInfo.addAll(new ArrayList<NodeInfo>(nodesMap.values()));
|
||||
nodesInfo.addAll(new ArrayList<>(nodesMap.values()));
|
||||
return nodesInfo;
|
||||
}
|
||||
|
||||
@ -509,4 +516,28 @@ protected static <T> String getMediaTypeFromHttpServletRequest(
|
||||
return header;
|
||||
}
|
||||
|
||||
public static NodeToLabelsInfo mergeNodeToLabels(
|
||||
Map<SubClusterInfo, NodeToLabelsInfo> nodeToLabelsInfoMap) {
|
||||
|
||||
HashMap<String, NodeLabelsInfo> nodeToLabels = new HashMap<>();
|
||||
Collection<NodeToLabelsInfo> nodeToLabelsInfos = nodeToLabelsInfoMap.values();
|
||||
|
||||
nodeToLabelsInfos.stream().forEach(nodeToLabelsInfo -> {
|
||||
for (Map.Entry<String, NodeLabelsInfo> item : nodeToLabelsInfo.getNodeToLabels().entrySet()) {
|
||||
String key = item.getKey();
|
||||
NodeLabelsInfo itemValue = item.getValue();
|
||||
NodeLabelsInfo nodeToLabelsValue = nodeToLabels.getOrDefault(item.getKey(), null);
|
||||
Set<NodeLabel> hashSet = new HashSet<>();
|
||||
if (itemValue != null) {
|
||||
hashSet.addAll(itemValue.getNodeLabels());
|
||||
}
|
||||
if (nodeToLabelsValue != null) {
|
||||
hashSet.addAll(nodeToLabelsValue.getNodeLabels());
|
||||
}
|
||||
nodeToLabels.put(key, new NodeLabelsInfo(hashSet));
|
||||
}
|
||||
});
|
||||
|
||||
return new NodeToLabelsInfo(nodeToLabels);
|
||||
}
|
||||
}
|
||||
|
@ -22,6 +22,8 @@
|
||||
import java.net.ConnectException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.HashMap;
|
||||
import java.util.Collections;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
@ -52,6 +54,8 @@
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceOptionInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.webapp.dao.ContainerInfo;
|
||||
import org.apache.hadoop.yarn.server.webapp.dao.ContainersInfo;
|
||||
import org.apache.hadoop.yarn.webapp.NotFoundException;
|
||||
@ -279,4 +283,18 @@ public ContainersInfo getContainers(HttpServletRequest req, HttpServletResponse
|
||||
|
||||
return containers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public NodeToLabelsInfo getNodeToLabels(HttpServletRequest hsr) throws IOException {
|
||||
if (!isRunning) {
|
||||
throw new RuntimeException("RM is stopped");
|
||||
}
|
||||
NodeLabelsInfo cpuNode = new NodeLabelsInfo(Collections.singleton("CPU"));
|
||||
NodeLabelsInfo gpuNode = new NodeLabelsInfo(Collections.singleton("GPU"));
|
||||
|
||||
HashMap<String, NodeLabelsInfo> nodeLabels = new HashMap<>();
|
||||
nodeLabels.put("node1", cpuNode);
|
||||
nodeLabels.put("node2", gpuNode);
|
||||
return new NodeToLabelsInfo(nodeLabels);
|
||||
}
|
||||
}
|
@ -21,6 +21,7 @@
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
|
||||
import javax.ws.rs.core.Response;
|
||||
|
||||
@ -49,6 +50,8 @@
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceOptionInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
|
||||
import org.apache.hadoop.yarn.server.webapp.dao.ContainersInfo;
|
||||
import org.apache.hadoop.yarn.util.MonotonicClock;
|
||||
import org.junit.Assert;
|
||||
@ -605,4 +608,22 @@ public void testGetContainersWrongFormat() {
|
||||
|
||||
Assert.assertTrue(response.getContainers().isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetNodeToLabels() throws IOException {
|
||||
NodeToLabelsInfo info = interceptor.getNodeToLabels(null);
|
||||
HashMap<String, NodeLabelsInfo> map = info.getNodeToLabels();
|
||||
Assert.assertNotNull(map);
|
||||
Assert.assertEquals(2, map.size());
|
||||
|
||||
NodeLabelsInfo node1Value = map.getOrDefault("node1", null);
|
||||
Assert.assertNotNull(node1Value);
|
||||
Assert.assertEquals(1, node1Value.getNodeLabelsName().size());
|
||||
Assert.assertEquals("CPU", node1Value.getNodeLabelsName().get(0));
|
||||
|
||||
NodeLabelsInfo node2Value = map.getOrDefault("node2", null);
|
||||
Assert.assertNotNull(node2Value);
|
||||
Assert.assertEquals(1, node2Value.getNodeLabelsName().size());
|
||||
Assert.assertEquals("GPU", node2Value.getNodeLabelsName().get(0));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user