YARN-11222. [Federation] Add addToClusterNodeLabels, removeFromClusterNodeLabels REST APIs for Router. (#5328)

This commit is contained in:
slfan1989 2023-02-25 02:52:57 +08:00 committed by GitHub
parent 27a54955f9
commit 25ebd0b8b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 366 additions and 3 deletions

View File

@ -147,6 +147,10 @@ public final class RouterMetrics {
private MutableGaugeInt numRefreshSuperUserGroupsConfigurationFailedRetrieved;
@Metric("# of refreshUserToGroupsMappings failed to be retrieved")
private MutableGaugeInt numRefreshUserToGroupsMappingsFailedRetrieved;
@Metric("# of addToClusterNodeLabels failed to be retrieved")
private MutableGaugeInt numAddToClusterNodeLabelsFailedRetrieved;
@Metric("# of removeFromClusterNodeLabels failed to be retrieved")
private MutableGaugeInt numRemoveFromClusterNodeLabelsFailedRetrieved;
// Aggregate metrics are shared, and don't have to be looked up per call
@Metric("Total number of successful Submitted apps and latency(ms)")
@ -253,9 +257,12 @@ public final class RouterMetrics {
private MutableRate totalSucceededRefreshSuperUserGroupsConfigurationRetrieved;
@Metric("Total number of successful Retrieved RefreshUserToGroupsMappings and latency(ms)")
private MutableRate totalSucceededRefreshUserToGroupsMappingsRetrieved;
@Metric("Total number of successful Retrieved GetSchedulerInfo and latency(ms)")
private MutableRate totalSucceededGetSchedulerInfoRetrieved;
@Metric("Total number of successful Retrieved AddToClusterNodeLabels and latency(ms)")
private MutableRate totalSucceededAddToClusterNodeLabelsRetrieved;
@Metric("Total number of successful Retrieved RemoveFromClusterNodeLabels and latency(ms)")
private MutableRate totalSucceededRemoveFromClusterNodeLabelsRetrieved;
/**
* Provide quantile counters for all latencies.
@ -313,6 +320,8 @@ public final class RouterMetrics {
private MutableQuantiles getSchedulerInfoRetrievedLatency;
private MutableQuantiles refreshSuperUserGroupsConfLatency;
private MutableQuantiles refreshUserToGroupsMappingsLatency;
private MutableQuantiles addToClusterNodeLabelsLatency;
private MutableQuantiles removeFromClusterNodeLabelsLatency;
private static volatile RouterMetrics instance = null;
private static MetricsRegistry registry;
@ -504,6 +513,12 @@ private RouterMetrics() {
refreshUserToGroupsMappingsLatency = registry.newQuantiles("refreshUserToGroupsMappingsLatency",
"latency of refresh user to groups mappings timeouts", "ops", "latency", 10);
addToClusterNodeLabelsLatency = registry.newQuantiles("addToClusterNodeLabelsLatency",
"latency of add cluster nodelabels timeouts", "ops", "latency", 10);
removeFromClusterNodeLabelsLatency = registry.newQuantiles("removeFromClusterNodeLabelsLatency",
"latency of remove cluster nodelabels timeouts", "ops", "latency", 10);
}
public static RouterMetrics getMetrics() {
@ -780,6 +795,16 @@ public long getNumSucceededGetSchedulerInfoRetrieved() {
return totalSucceededGetSchedulerInfoRetrieved.lastStat().numSamples();
}
@VisibleForTesting
public long getNumSucceededAddToClusterNodeLabelsRetrieved() {
return totalSucceededAddToClusterNodeLabelsRetrieved.lastStat().numSamples();
}
@VisibleForTesting
public long getNumSucceededRemoveFromClusterNodeLabelsRetrieved() {
return totalSucceededRemoveFromClusterNodeLabelsRetrieved.lastStat().numSamples();
}
@VisibleForTesting
public long getNumSucceededRefreshSuperUserGroupsConfigurationRetrieved() {
return totalSucceededRefreshSuperUserGroupsConfigurationRetrieved.lastStat().numSamples();
@ -1040,6 +1065,16 @@ public double getLatencySucceededGetSchedulerInfoRetrieved() {
return totalSucceededGetSchedulerInfoRetrieved.lastStat().mean();
}
@VisibleForTesting
public double getLatencySucceededAddToClusterNodeLabelsRetrieved() {
return totalSucceededAddToClusterNodeLabelsRetrieved.lastStat().mean();
}
@VisibleForTesting
public double getLatencySucceededRemoveFromClusterNodeLabelsRetrieved() {
return totalSucceededRemoveFromClusterNodeLabelsRetrieved.lastStat().mean();
}
@VisibleForTesting
public double getLatencySucceededRefreshSuperUserGroupsConfigurationRetrieved() {
return totalSucceededRefreshSuperUserGroupsConfigurationRetrieved.lastStat().mean();
@ -1251,6 +1286,14 @@ public int getNumRefreshUserToGroupsMappingsFailedRetrieved() {
return numRefreshUserToGroupsMappingsFailedRetrieved.value();
}
public int getNumAddToClusterNodeLabelsFailedRetrieved() {
return numAddToClusterNodeLabelsFailedRetrieved.value();
}
public int getNumRemoveFromClusterNodeLabelsFailedRetrieved() {
return numRemoveFromClusterNodeLabelsFailedRetrieved.value();
}
public int getDelegationTokenFailedRetrieved() {
return numGetDelegationTokenFailedRetrieved.value();
}
@ -1534,6 +1577,16 @@ public void succeededGetSchedulerInfoRetrieved(long duration) {
getSchedulerInfoRetrievedLatency.add(duration);
}
public void succeededAddToClusterNodeLabelsRetrieved(long duration) {
totalSucceededAddToClusterNodeLabelsRetrieved.add(duration);
addToClusterNodeLabelsLatency.add(duration);
}
public void succeededRemoveFromClusterNodeLabelsRetrieved(long duration) {
totalSucceededRemoveFromClusterNodeLabelsRetrieved.add(duration);
removeFromClusterNodeLabelsLatency.add(duration);
}
public void succeededRefreshSuperUserGroupsConfRetrieved(long duration) {
totalSucceededRefreshSuperUserGroupsConfigurationRetrieved.add(duration);
refreshSuperUserGroupsConfLatency.add(duration);
@ -1728,6 +1781,14 @@ public void incrRefreshUserToGroupsMappingsFailedRetrieved() {
numRefreshUserToGroupsMappingsFailedRetrieved.incr();
}
public void incrAddToClusterNodeLabelsFailedRetrieved() {
numAddToClusterNodeLabelsFailedRetrieved.incr();
}
public void incrRemoveFromClusterNodeLabelsFailedRetrieved() {
numRemoveFromClusterNodeLabelsFailedRetrieved.incr();
}
public void incrGetDelegationTokenFailedRetrieved() {
numGetDelegationTokenFailedRetrieved.incr();
}

View File

@ -43,6 +43,7 @@
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
@ -72,6 +73,7 @@
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceManagerAdministrationProtocol;
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
import org.apache.hadoop.yarn.server.federation.policies.RouterPolicyFacade;
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException;
@ -1580,16 +1582,126 @@ public NodeLabelsInfo getClusterNodeLabels(HttpServletRequest hsr)
throw new RuntimeException("getClusterNodeLabels Failed.");
}
/**
* This method adds specific node labels for specific nodes, and it is
* reachable by using {@link RMWSConsts#ADD_NODE_LABELS}.
*
* @see ResourceManagerAdministrationProtocol#addToClusterNodeLabels
* @param newNodeLabels the node labels to add. It is a content param.
* @param hsr the servlet request
* @return Response containing the status code
* @throws Exception in case of bad request
*/
@Override
public Response addToClusterNodeLabels(NodeLabelsInfo newNodeLabels,
HttpServletRequest hsr) throws Exception {
throw new NotImplementedException("Code is not implemented");
if (newNodeLabels == null) {
routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved();
throw new IllegalArgumentException("Parameter error, the newNodeLabels is null.");
}
List<NodeLabelInfo> nodeLabelInfos = newNodeLabels.getNodeLabelsInfo();
if (CollectionUtils.isEmpty(nodeLabelInfos)) {
routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved();
throw new IllegalArgumentException("Parameter error, the nodeLabelsInfo is null or empty.");
}
try {
long startTime = clock.getTime();
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
final HttpServletRequest hsrCopy = clone(hsr);
Class[] argsClasses = new Class[]{NodeLabelsInfo.class, HttpServletRequest.class};
Object[] args = new Object[]{newNodeLabels, hsrCopy};
ClientMethod remoteMethod = new ClientMethod("addToClusterNodeLabels", argsClasses, args);
Map<SubClusterInfo, Response> responseInfoMap =
invokeConcurrent(subClustersActive.values(), remoteMethod, Response.class);
StringBuffer buffer = new StringBuffer();
// SubCluster-0:SUCCESS,SubCluster-1:SUCCESS
responseInfoMap.forEach((subClusterInfo, response) -> {
buildAppendMsg(subClusterInfo, buffer, response);
});
long stopTime = clock.getTime();
routerMetrics.succeededAddToClusterNodeLabelsRetrieved((stopTime - startTime));
return Response.status(Status.OK).entity(buffer.toString()).build();
} catch (NotFoundException e) {
routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved();
RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e);
} catch (YarnException e) {
routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved();
RouterServerUtil.logAndThrowIOException("addToClusterNodeLabels with yarn error.", e);
}
routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved();
throw new RuntimeException("addToClusterNodeLabels Failed.");
}
/**
* This method removes all the node labels for specific nodes, and it is
* reachable by using {@link RMWSConsts#REMOVE_NODE_LABELS}.
*
* @see ResourceManagerAdministrationProtocol#removeFromClusterNodeLabels
* @param oldNodeLabels the node labels to remove. It is a QueryParam.
* @param hsr the servlet request
* @return Response containing the status code
* @throws Exception in case of bad request
*/
@Override
public Response removeFromClusterNodeLabels(Set<String> oldNodeLabels,
HttpServletRequest hsr) throws Exception {
throw new NotImplementedException("Code is not implemented");
if (CollectionUtils.isEmpty(oldNodeLabels)) {
routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved();
throw new IllegalArgumentException("Parameter error, the oldNodeLabels is null or empty.");
}
try {
long startTime = clock.getTime();
Map<SubClusterId, SubClusterInfo> subClustersActive = getActiveSubclusters();
final HttpServletRequest hsrCopy = clone(hsr);
Class[] argsClasses = new Class[]{Set.class, HttpServletRequest.class};
Object[] args = new Object[]{oldNodeLabels, hsrCopy};
ClientMethod remoteMethod =
new ClientMethod("removeFromClusterNodeLabels", argsClasses, args);
Map<SubClusterInfo, Response> responseInfoMap =
invokeConcurrent(subClustersActive.values(), remoteMethod, Response.class);
StringBuffer buffer = new StringBuffer();
// SubCluster-0:SUCCESS,SubCluster-1:SUCCESS
responseInfoMap.forEach((subClusterInfo, response) -> {
buildAppendMsg(subClusterInfo, buffer, response);
});
long stopTime = clock.getTime();
routerMetrics.succeededRemoveFromClusterNodeLabelsRetrieved(stopTime - startTime);
return Response.status(Status.OK).entity(buffer.toString()).build();
} catch (NotFoundException e) {
routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved();
RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e);
} catch (YarnException e) {
routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved();
RouterServerUtil.logAndThrowIOException("removeFromClusterNodeLabels with yarn error.", e);
}
routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved();
throw new RuntimeException("removeFromClusterNodeLabels Failed.");
}
/**
* Bbulid Append information.
*
* @param subClusterInfo subCluster information.
* @param buffer StringBuffer.
* @param response response message.
*/
private void buildAppendMsg(SubClusterInfo subClusterInfo, StringBuffer buffer,
Response response) {
SubClusterId subClusterId = subClusterInfo.getSubClusterId();
String state = response != null &&
(response.getStatus() == Status.OK.getStatusCode()) ? "SUCCESS" : "FAILED";
buffer.append("SubCluster-")
.append(subClusterId.getId())
.append(":")
.append(state)
.append(",");
}
@Override

View File

@ -793,6 +793,11 @@ public void getBulkActivitiesRetrieved(long duration) {
LOG.info("Mocked: successful GetBulkActivities call with duration {}", duration);
metrics.succeededGetBulkActivitiesRetrieved(duration);
}
public void addToClusterNodeLabelsRetrieved(long duration) {
LOG.info("Mocked: successful AddToClusterNodeLabels call with duration {}", duration);
metrics.succeededAddToClusterNodeLabelsRetrieved(duration);
}
}
@Test
@ -1696,4 +1701,19 @@ public void testGetBulkActivitiesRetrievedFailed() {
Assert.assertEquals(totalBadBefore + 1,
metrics.getBulkActivitiesFailedRetrieved());
}
@Test
public void testAddToClusterNodeLabelsRetrieved() {
long totalGoodBefore = metrics.getNumSucceededAddToClusterNodeLabelsRetrieved();
goodSubCluster.addToClusterNodeLabelsRetrieved(150);
Assert.assertEquals(totalGoodBefore + 1,
metrics.getNumSucceededAddToClusterNodeLabelsRetrieved());
Assert.assertEquals(150,
metrics.getLatencySucceededAddToClusterNodeLabelsRetrieved(), ASSERT_DOUBLE_DELTA);
goodSubCluster.addToClusterNodeLabelsRetrieved(300);
Assert.assertEquals(totalGoodBefore + 2,
metrics.getNumSucceededAddToClusterNodeLabelsRetrieved());
Assert.assertEquals(225,
metrics.getLatencySucceededAddToClusterNodeLabelsRetrieved(), ASSERT_DOUBLE_DELTA);
}
}

View File

@ -1307,4 +1307,44 @@ public SchedulerTypeInfo getSchedulerInfo() {
throw new RuntimeException(e);
}
}
@Override
public Response addToClusterNodeLabels(NodeLabelsInfo newNodeLabels, HttpServletRequest hsr)
throws Exception {
List<NodeLabelInfo> nodeLabelInfoList = newNodeLabels.getNodeLabelsInfo();
NodeLabelInfo nodeLabelInfo = nodeLabelInfoList.get(0);
String nodeLabelName = nodeLabelInfo.getName();
// If nodeLabelName is ALL, we let all subclusters pass
if (StringUtils.equals("ALL", nodeLabelName)) {
return Response.status(Status.OK).build();
} else if (StringUtils.equals("A0", nodeLabelName)) {
SubClusterId subClusterId = getSubClusterId();
String id = subClusterId.getId();
if (StringUtils.contains("A0", id)) {
return Response.status(Status.OK).build();
} else {
return Response.status(Status.BAD_REQUEST).entity(null).build();
}
}
throw new YarnException("addToClusterNodeLabels Error");
}
@Override
public Response removeFromClusterNodeLabels(Set<String> oldNodeLabels, HttpServletRequest hsr)
throws Exception {
// If oldNodeLabels contains ALL, we let all subclusters pass
if (oldNodeLabels.contains("ALL")) {
return Response.status(Status.OK).build();
} else if (oldNodeLabels.contains("A0")) {
SubClusterId subClusterId = getSubClusterId();
String id = subClusterId.getId();
if (StringUtils.contains("A0", id)) {
return Response.status(Status.OK).build();
} else {
return Response.status(Status.BAD_REQUEST).entity(null).build();
}
}
throw new YarnException("removeFromClusterNodeLabels Error");
}
}

View File

@ -25,6 +25,7 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Collections;
import java.util.stream.Collectors;
@ -40,6 +41,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpConfig;
import org.apache.hadoop.util.Lists;
import org.apache.hadoop.util.Sets;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -1899,4 +1901,132 @@ public void testGetBulkActivitiesError() throws Exception {
"'groupBy' must not be empty.",
() -> interceptor.getBulkActivities(null, "", 1));
}
@Test
public void testAddToClusterNodeLabels1() throws Exception {
// In this test, we try to add ALL label, all subClusters will return success.
NodeLabelsInfo nodeLabelsInfo = new NodeLabelsInfo();
NodeLabelInfo nodeLabelInfo = new NodeLabelInfo("ALL", true);
nodeLabelsInfo.getNodeLabelsInfo().add(nodeLabelInfo);
Response response = interceptor.addToClusterNodeLabels(nodeLabelsInfo, null);
Assert.assertNotNull(response);
Object entityObj = response.getEntity();
Assert.assertNotNull(entityObj);
String entity = String.valueOf(entityObj);
String[] entities = StringUtils.split(entity, ",");
Assert.assertNotNull(entities);
Assert.assertEquals(4, entities.length);
// The order in which the cluster returns messages is uncertain,
// we confirm the result by contains
String expectedMsg =
"SubCluster-0:SUCCESS,SubCluster-1:SUCCESS,SubCluster-2:SUCCESS,SubCluster-3:SUCCESS";
Arrays.stream(entities).forEach(item -> {
Assert.assertTrue(expectedMsg.contains(item));
});
}
@Test
public void testAddToClusterNodeLabels2() throws Exception {
// In this test, we try to add A0 label,
// subCluster0 will return success, and other subClusters will return null
NodeLabelsInfo nodeLabelsInfo = new NodeLabelsInfo();
NodeLabelInfo nodeLabelInfo = new NodeLabelInfo("A0", true);
nodeLabelsInfo.getNodeLabelsInfo().add(nodeLabelInfo);
Response response = interceptor.addToClusterNodeLabels(nodeLabelsInfo, null);
Assert.assertNotNull(response);
Object entityObj = response.getEntity();
Assert.assertNotNull(entityObj);
String expectedValue = "SubCluster-0:SUCCESS,";
String entity = String.valueOf(entityObj);
Assert.assertTrue(entity.contains(expectedValue));
}
@Test
public void testAddToClusterNodeLabelsError() throws Exception {
// the newNodeLabels is null
LambdaTestUtils.intercept(IllegalArgumentException.class,
"Parameter error, the newNodeLabels is null.",
() -> interceptor.addToClusterNodeLabels(null, null));
// the nodeLabelsInfo is null
NodeLabelsInfo nodeLabelsInfo = new NodeLabelsInfo();
LambdaTestUtils.intercept(IllegalArgumentException.class,
"Parameter error, the nodeLabelsInfo is null or empty.",
() -> interceptor.addToClusterNodeLabels(nodeLabelsInfo, null));
// error nodeLabelsInfo
NodeLabelsInfo nodeLabelsInfo1 = new NodeLabelsInfo();
NodeLabelInfo nodeLabelInfo1 = new NodeLabelInfo("A", true);
nodeLabelsInfo1.getNodeLabelsInfo().add(nodeLabelInfo1);
LambdaTestUtils.intercept(YarnRuntimeException.class, "addToClusterNodeLabels Error",
() -> interceptor.addToClusterNodeLabels(nodeLabelsInfo1, null));
}
@Test
public void testRemoveFromClusterNodeLabels1() throws Exception {
Set<String> oldNodeLabels = Sets.newHashSet();
oldNodeLabels.add("ALL");
Response response = interceptor.removeFromClusterNodeLabels(oldNodeLabels, null);
Assert.assertNotNull(response);
Object entityObj = response.getEntity();
Assert.assertNotNull(entityObj);
String entity = String.valueOf(entityObj);
String[] entities = StringUtils.split(entity, ",");
Assert.assertNotNull(entities);
Assert.assertEquals(4, entities.length);
// The order in which the cluster returns messages is uncertain,
// we confirm the result by contains
String expectedMsg =
"SubCluster-0:SUCCESS,SubCluster-1:SUCCESS,SubCluster-2:SUCCESS,SubCluster-3:SUCCESS";
Arrays.stream(entities).forEach(item -> {
Assert.assertTrue(expectedMsg.contains(item));
});
}
@Test
public void testRemoveFromClusterNodeLabels2() throws Exception {
Set<String> oldNodeLabels = Sets.newHashSet();
oldNodeLabels.add("A0");
Response response = interceptor.removeFromClusterNodeLabels(oldNodeLabels, null);
Assert.assertNotNull(response);
Object entityObj = response.getEntity();
Assert.assertNotNull(entityObj);
String expectedValue = "SubCluster-0:SUCCESS,";
String entity = String.valueOf(entityObj);
Assert.assertTrue(entity.contains(expectedValue));
}
@Test
public void testRemoveFromClusterNodeLabelsError() throws Exception {
// the oldNodeLabels is null
LambdaTestUtils.intercept(IllegalArgumentException.class,
"Parameter error, the oldNodeLabels is null or empty.",
() -> interceptor.removeFromClusterNodeLabels(null, null));
// the oldNodeLabels is empty
Set<String> oldNodeLabels = Sets.newHashSet();
LambdaTestUtils.intercept(IllegalArgumentException.class,
"Parameter error, the oldNodeLabels is null or empty.",
() -> interceptor.removeFromClusterNodeLabels(oldNodeLabels, null));
// error oldNodeLabels
Set<String> oldNodeLabels1 = Sets.newHashSet();
oldNodeLabels1.add("A1");
LambdaTestUtils.intercept(YarnRuntimeException.class, "removeFromClusterNodeLabels Error",
() -> interceptor.removeFromClusterNodeLabels(oldNodeLabels1, null));
}
}