YARN-11212. [Federation] Add getNodeToLabels REST APIs for Router. (#4614)

This commit is contained in:
slfan1989 2022-07-29 02:53:04 +08:00 committed by GitHub
parent a5b12c8010
commit e994635a95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 141 additions and 69 deletions

View File

@ -32,8 +32,7 @@
public class NodeLabelsInfo {
@XmlElement(name = "nodeLabelInfo")
private ArrayList<NodeLabelInfo> nodeLabelsInfo =
new ArrayList<NodeLabelInfo>();
private ArrayList<NodeLabelInfo> nodeLabelsInfo = new ArrayList<>();
public NodeLabelsInfo() {
// JAXB needs this
@ -44,25 +43,32 @@ public NodeLabelsInfo(ArrayList<NodeLabelInfo> nodeLabels) {
}
/**
 * Construct from a list of {@link NodeLabel} records, wrapping each
 * label in a {@link NodeLabelInfo}.
 *
 * @param nodeLabels the labels to convert; must not be null
 */
public NodeLabelsInfo(List<NodeLabel> nodeLabels) {
  // Single initialization; the duplicated dead-store assignment was removed.
  this.nodeLabelsInfo = new ArrayList<>();
  for (NodeLabel label : nodeLabels) {
    this.nodeLabelsInfo.add(new NodeLabelInfo(label));
  }
}
/**
 * Construct from a set of label names, wrapping each name in a
 * {@link NodeLabelInfo}.
 *
 * @param nodeLabelsName the label names to convert; must not be null
 */
public NodeLabelsInfo(Set<String> nodeLabelsName) {
  // Single initialization; the duplicated dead-store assignment was removed.
  this.nodeLabelsInfo = new ArrayList<>();
  for (String labelName : nodeLabelsName) {
    this.nodeLabelsInfo.add(new NodeLabelInfo(labelName));
  }
}
/**
 * Construct from a collection of {@link NodeLabel} records, wrapping each
 * label in a {@link NodeLabelInfo}.
 *
 * @param nodeLabels the labels to convert; must not be null
 */
public NodeLabelsInfo(Collection<NodeLabel> nodeLabels) {
  // Plain for-each instead of stream().forEach: avoids the side-effecting
  // stream anti-pattern and matches the sibling constructors; presized to
  // skip intermediate list growth.
  this.nodeLabelsInfo = new ArrayList<>(nodeLabels.size());
  for (NodeLabel nodeLabel : nodeLabels) {
    this.nodeLabelsInfo.add(new NodeLabelInfo(nodeLabel));
  }
}
/**
 * Get the node label entries backing this object.
 * Note: this returns the internal live list, not a defensive copy, so
 * callers can mutate this object's state through it.
 *
 * @return the backing {@code ArrayList} of {@link NodeLabelInfo}
 */
public ArrayList<NodeLabelInfo> getNodeLabelsInfo() {
return nodeLabelsInfo;
}
public Set<NodeLabel> getNodeLabels() {
Set<NodeLabel> nodeLabels = new HashSet<NodeLabel>();
Set<NodeLabel> nodeLabels = new HashSet<>();
for (NodeLabelInfo label : nodeLabelsInfo) {
nodeLabels.add(NodeLabel.newInstance(label.getName(),
label.getExclusivity()));
@ -71,7 +77,7 @@ public Set<NodeLabel> getNodeLabels() {
}
public List<String> getNodeLabelsName() {
ArrayList<String> nodeLabelsName = new ArrayList<String>();
ArrayList<String> nodeLabelsName = new ArrayList<>();
for (NodeLabelInfo label : nodeLabelsInfo) {
nodeLabelsName.add(label.getName());
}

View File

@ -35,7 +35,17 @@ public NodeToLabelsInfo() {
// JAXB needs this
}
/**
 * Construct a NodeToLabelsInfo pre-populated from an existing mapping.
 * A null argument is tolerated and leaves the mapping empty; entries are
 * copied into the internal map rather than the map being stored by
 * reference.
 *
 * @param nodeToLabels mapping from node id to its labels; may be null
 */
public NodeToLabelsInfo(HashMap<String, NodeLabelsInfo> nodeToLabels) {
if (nodeToLabels != null) {
this.nodeToLabels.putAll(nodeToLabels);
}
}
/**
 * Get the node-to-labels mapping.
 * Note: this returns the internal live map, not a copy.
 *
 * @return map from node id to its {@link NodeLabelsInfo}
 */
public HashMap<String, NodeLabelsInfo> getNodeToLabels() {
return nodeToLabels;
}
/**
 * Replace the node-to-labels mapping.
 * The given map is stored by reference, not copied.
 *
 * @param nodeToLabels new mapping from node id to its labels
 */
public void setNodeToLabels(HashMap<String, NodeLabelsInfo> nodeToLabels) {
this.nodeToLabels = nodeToLabels;
}
}

View File

@ -282,7 +282,7 @@ public Response createNewApplication(HttpServletRequest hsr)
.entity(e.getLocalizedMessage()).build();
}
List<SubClusterId> blacklist = new ArrayList<SubClusterId>();
List<SubClusterId> blacklist = new ArrayList<>();
for (int i = 0; i < numSubmitRetries; ++i) {
@ -295,7 +295,7 @@ public Response createNewApplication(HttpServletRequest hsr)
.entity(e.getLocalizedMessage()).build();
}
LOG.debug("getNewApplication try #{} on SubCluster {}", i, subClusterId);
LOG.debug("getNewApplication try #{} on SubCluster {}.", i, subClusterId);
DefaultRequestInterceptorREST interceptor =
getOrCreateInterceptorForSubCluster(subClusterId,
@ -304,7 +304,7 @@ public Response createNewApplication(HttpServletRequest hsr)
try {
response = interceptor.createNewApplication(hsr);
} catch (Exception e) {
LOG.warn("Unable to create a new ApplicationId in SubCluster {}",
LOG.warn("Unable to create a new ApplicationId in SubCluster {}.",
subClusterId.getId(), e);
}
@ -424,7 +424,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
.build();
}
List<SubClusterId> blacklist = new ArrayList<SubClusterId>();
List<SubClusterId> blacklist = new ArrayList<>();
for (int i = 0; i < numSubmitRetries; ++i) {
@ -441,7 +441,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
.entity(e.getLocalizedMessage())
.build();
}
LOG.info("submitApplication appId {} try #{} on SubCluster {}",
LOG.info("submitApplication appId {} try #{} on SubCluster {}.",
applicationId, i, subClusterId);
ApplicationHomeSubCluster appHomeSubCluster =
@ -482,7 +482,7 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp,
.build();
}
if (subClusterId == subClusterIdInStateStore) {
LOG.info("Application {} already submitted on SubCluster {}",
LOG.info("Application {} already submitted on SubCluster {}.",
applicationId, subClusterId);
} else {
routerMetrics.incrAppsFailedSubmitted();
@ -712,8 +712,7 @@ public AppsInfo call() {
if (rmApps == null) {
routerMetrics.incrMultipleAppsFailedRetrieved();
LOG.error("Subcluster {} failed to return appReport.",
info.getSubClusterId());
LOG.error("Subcluster {} failed to return appReport.", info.getSubClusterId());
return null;
}
return rmApps;
@ -873,8 +872,7 @@ private Map<SubClusterInfo, NodeInfo> getNode(
subclusterId, subcluster.getRMWebServiceAddress());
return interceptor.getNode(nodeId);
} catch (Exception e) {
LOG.error("Subcluster {} failed to return nodeInfo.",
subclusterId);
LOG.error("Subcluster {} failed to return nodeInfo.", subclusterId, e);
return null;
}
});
@ -953,58 +951,28 @@ private SubClusterInfo getNodeSubcluster(String nodeId)
public NodesInfo getNodes(String states) {
NodesInfo nodes = new NodesInfo();
final Map<SubClusterId, SubClusterInfo> subClustersActive;
try {
subClustersActive = getActiveSubclusters();
} catch (Exception e) {
LOG.error("Cannot get nodes: {}", e.getMessage());
return new NodesInfo();
}
// Send the requests in parallel
CompletionService<NodesInfo> compSvc =
new ExecutorCompletionService<NodesInfo>(this.threadpool);
for (final SubClusterInfo info : subClustersActive.values()) {
compSvc.submit(new Callable<NodesInfo>() {
@Override
public NodesInfo call() {
DefaultRequestInterceptorREST interceptor =
getOrCreateInterceptorForSubCluster(
info.getSubClusterId(), info.getRMWebServiceAddress());
try {
NodesInfo nodesInfo = interceptor.getNodes(states);
return nodesInfo;
} catch (Exception e) {
LOG.error("Subcluster {} failed to return nodesInfo.",
info.getSubClusterId());
return null;
}
}
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
Class[] argsClasses = new Class[]{String.class};
Object[] args = new Object[]{states};
ClientMethod remoteMethod = new ClientMethod("getNodes", argsClasses, args);
Map<SubClusterInfo, NodesInfo> nodesMap =
invokeConcurrent(subClustersActive.values(), remoteMethod, NodesInfo.class);
nodesMap.values().stream().forEach(nodesInfo -> {
nodes.addAll(nodesInfo.getNodes());
});
}
// Collect all the responses in parallel
for (int i = 0; i < subClustersActive.size(); i++) {
try {
Future<NodesInfo> future = compSvc.take();
NodesInfo nodesResponse = future.get();
if (nodesResponse != null) {
nodes.addAll(nodesResponse.getNodes());
}
} catch (Throwable e) {
LOG.warn("Failed to get nodes report ", e);
}
} catch (NotFoundException e) {
LOG.error("Get all active sub cluster(s) error.", e);
} catch (YarnException e) {
LOG.error("getNodes error.", e);
} catch (IOException e) {
LOG.error("getNodes error with io error.", e);
}
// Delete duplicate from all the node reports got from all the available
// YARN RMs. Nodes can be moved from one subclusters to another. In this
// operation they result LOST/RUNNING in the previous SubCluster and
// NEW/RUNNING in the new one.
return RouterWebServiceUtil.deleteDuplicateNodesInfo(nodes.getNodes());
}
@ -1172,7 +1140,22 @@ public ApplicationStatisticsInfo getAppStatistics(HttpServletRequest hsr,
@Override
public NodeToLabelsInfo getNodeToLabels(HttpServletRequest hsr)
    throws IOException {
  // The stale pre-refactor "throw new NotImplementedException(...)" line
  // was removed: a statement before the try block makes everything after it
  // unreachable, which javac rejects outright.
  try {
    // Fan the request out concurrently to every active SubCluster and
    // merge the per-cluster responses into one federation-wide mapping.
    Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
    // Clone the request so each concurrent invocation owns its copy.
    final HttpServletRequest hsrCopy = clone(hsr);
    Class[] argsClasses = new Class[]{HttpServletRequest.class};
    Object[] args = new Object[]{hsrCopy};
    ClientMethod remoteMethod = new ClientMethod("getNodeToLabels", argsClasses, args);
    Map<SubClusterInfo, NodeToLabelsInfo> nodeToLabelsInfoMap =
        invokeConcurrent(subClustersActive.values(), remoteMethod, NodeToLabelsInfo.class);
    return RouterWebServiceUtil.mergeNodeToLabels(nodeToLabelsInfoMap);
  } catch (NotFoundException e) {
    // No active SubClusters registered in the federation state store.
    LOG.error("Get all active sub cluster(s) error.", e);
    throw new IOException("Get all active sub cluster(s) error.", e);
  } catch (YarnException e) {
    LOG.error("getNodeToLabels error.", e);
    throw new IOException("getNodeToLabels error.", e);
  }
}
@Override
@ -1395,7 +1378,7 @@ public void shutdown() {
}
private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> clusterIds,
ClientMethod request, Class<R> clazz) {
ClientMethod request, Class<R> clazz) throws YarnException {
Map<SubClusterInfo, R> results = new HashMap<>();
@ -1413,8 +1396,8 @@ private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> c
R ret = clazz.cast(retObj);
return ret;
} catch (Exception e) {
LOG.error("SubCluster {} failed to call {} method.", info.getSubClusterId(),
request.getMethodName(), e);
LOG.error("SubCluster %s failed to call %s method.",
info.getSubClusterId(), request.getMethodName(), e);
return null;
}
});
@ -1428,7 +1411,10 @@ private <R> Map<SubClusterInfo, R> invokeConcurrent(Collection<SubClusterInfo> c
results.put(clusterId, response);
}
} catch (Throwable e) {
LOG.warn("SubCluster {} failed to {} report.", clusterId, request.getMethodName(), e);
String msg = String.format("SubCluster %s failed to %s report.",
clusterId, request.getMethodName());
LOG.warn(msg, e);
throw new YarnRuntimeException(msg, e);
}
});
return results;

View File

@ -30,6 +30,9 @@
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Collection;
import java.util.Set;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;
import javax.servlet.http.HttpServletRequest;
@ -43,13 +46,17 @@
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.api.records.NodeLabel;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.RMWebAppUtil;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
import org.apache.hadoop.yarn.server.uam.UnmanagedApplicationManager;
import org.apache.hadoop.yarn.webapp.BadRequestException;
import org.apache.hadoop.yarn.webapp.ForbiddenException;
@ -293,8 +300,8 @@ public static AppsInfo mergeAppsInfo(ArrayList<AppInfo> appsInfo,
boolean returnPartialResult) {
AppsInfo allApps = new AppsInfo();
Map<String, AppInfo> federationAM = new HashMap<String, AppInfo>();
Map<String, AppInfo> federationUAMSum = new HashMap<String, AppInfo>();
Map<String, AppInfo> federationAM = new HashMap<>();
Map<String, AppInfo> federationUAMSum = new HashMap<>();
for (AppInfo a : appsInfo) {
// Check if this AppInfo is an AM
if (a.getAMHostHttpAddress() != null) {
@ -332,7 +339,7 @@ public static AppsInfo mergeAppsInfo(ArrayList<AppInfo> appsInfo,
}
}
allApps.addAll(new ArrayList<AppInfo>(federationAM.values()));
allApps.addAll(new ArrayList<>(federationAM.values()));
return allApps;
}
@ -419,7 +426,7 @@ public static NodesInfo deleteDuplicateNodesInfo(ArrayList<NodeInfo> nodes) {
nodesMap.put(node.getNodeId(), node);
}
}
nodesInfo.addAll(new ArrayList<NodeInfo>(nodesMap.values()));
nodesInfo.addAll(new ArrayList<>(nodesMap.values()));
return nodesInfo;
}
@ -509,4 +516,28 @@ protected static <T> String getMediaTypeFromHttpServletRequest(
return header;
}
/**
 * Merge per-SubCluster node-to-labels responses into a single
 * federation-wide mapping. When the same node id appears in more than one
 * SubCluster response, the union of its label sets is kept.
 *
 * @param nodeToLabelsInfoMap responses keyed by the SubCluster that
 *     produced them; must not be null
 * @return a single merged {@link NodeToLabelsInfo}
 */
public static NodeToLabelsInfo mergeNodeToLabels(
    Map<SubClusterInfo, NodeToLabelsInfo> nodeToLabelsInfoMap) {
  HashMap<String, NodeLabelsInfo> nodeToLabels = new HashMap<>();
  // Plain nested loops instead of stream().forEach: the body is pure
  // side effect on the accumulator map.
  for (NodeToLabelsInfo subClusterResponse : nodeToLabelsInfoMap.values()) {
    for (Map.Entry<String, NodeLabelsInfo> entry
        : subClusterResponse.getNodeToLabels().entrySet()) {
      String node = entry.getKey();
      // Union the labels already accumulated for this node (if any) with
      // the labels reported by this SubCluster (if any).
      Set<NodeLabel> mergedLabels = new HashSet<>();
      NodeLabelsInfo incoming = entry.getValue();
      if (incoming != null) {
        mergedLabels.addAll(incoming.getNodeLabels());
      }
      // get(key) replaces the redundant getOrDefault(key, null).
      NodeLabelsInfo accumulated = nodeToLabels.get(node);
      if (accumulated != null) {
        mergedLabels.addAll(accumulated.getNodeLabels());
      }
      nodeToLabels.put(node, new NodeLabelsInfo(mergedLabels));
    }
  }
  return new NodeToLabelsInfo(nodeToLabels);
}
}

View File

@ -22,6 +22,8 @@
import java.net.ConnectException;
import java.util.HashSet;
import java.util.Set;
import java.util.HashMap;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicInteger;
import javax.servlet.http.HttpServletRequest;
@ -52,6 +54,8 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceOptionInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
import org.apache.hadoop.yarn.server.webapp.dao.ContainerInfo;
import org.apache.hadoop.yarn.server.webapp.dao.ContainersInfo;
import org.apache.hadoop.yarn.webapp.NotFoundException;
@ -279,4 +283,18 @@ public ContainersInfo getContainers(HttpServletRequest req, HttpServletResponse
return containers;
}
@Override
public NodeToLabelsInfo getNodeToLabels(HttpServletRequest hsr) throws IOException {
  if (!isRunning) {
    throw new RuntimeException("RM is stopped");
  }
  // Fixed mock fixture: two nodes, each carrying exactly one label.
  HashMap<String, NodeLabelsInfo> nodeLabels = new HashMap<>();
  nodeLabels.put("node1", new NodeLabelsInfo(Collections.singleton("CPU")));
  nodeLabels.put("node2", new NodeLabelsInfo(Collections.singleton("GPU")));
  return new NodeToLabelsInfo(nodeLabels);
}
}

View File

@ -21,6 +21,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import javax.ws.rs.core.Response;
@ -49,6 +50,8 @@
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodesInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceOptionInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelsInfo;
import org.apache.hadoop.yarn.server.webapp.dao.ContainersInfo;
import org.apache.hadoop.yarn.util.MonotonicClock;
import org.junit.Assert;
@ -605,4 +608,22 @@ public void testGetContainersWrongFormat() {
Assert.assertTrue(response.getContainers().isEmpty());
}
@Test
public void testGetNodeToLabels() throws IOException {
  // The mock interceptor reports two nodes: node1 -> CPU, node2 -> GPU.
  NodeToLabelsInfo nodeToLabelsInfo = interceptor.getNodeToLabels(null);
  HashMap<String, NodeLabelsInfo> nodeToLabels = nodeToLabelsInfo.getNodeToLabels();
  Assert.assertNotNull(nodeToLabels);
  Assert.assertEquals(2, nodeToLabels.size());

  NodeLabelsInfo node1Labels = nodeToLabels.get("node1");
  Assert.assertNotNull(node1Labels);
  List<String> node1LabelNames = node1Labels.getNodeLabelsName();
  Assert.assertEquals(1, node1LabelNames.size());
  Assert.assertEquals("CPU", node1LabelNames.get(0));

  NodeLabelsInfo node2Labels = nodeToLabels.get("node2");
  Assert.assertNotNull(node2Labels);
  List<String> node2LabelNames = node2Labels.getNodeLabelsName();
  Assert.assertEquals(1, node2LabelNames.size());
  Assert.assertEquals("GPU", node2LabelNames.get(0));
}
}