YARN-11037. Add configurable logic to split resource request to the least loaded SC. (#5515)

Parent: 655c3df050
Commit: ea87aa2f5b
@@ -69,4 +69,9 @@ public abstract class EnhancedHeadroom {
     sb.append(">");
     return sb.toString();
   }
+
+  public double getNormalizedPendingCount(long multiplier) {
+    int totalPendingCount = getTotalPendingCount();
+    return (double) totalPendingCount * multiplier;
+  }
 }
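Editor's note: to make the new helper concrete, here is a small standalone sketch (not part of the patch; all values are illustrative) of how the normalized pending count behaves once it is later divided by the subcluster's active cores:

public class NormalizedPendingCountDemo {
  // Mirrors EnhancedHeadroom#getNormalizedPendingCount(long) above.
  static double getNormalizedPendingCount(int totalPendingCount, long multiplier) {
    return (double) totalPendingCount * multiplier;
  }

  public static void main(String[] args) {
    int pending = 500;        // pending containers reported by the subcluster
    long multiplier = 50000L; // DEFAULT_LOAD_BASED_SC_SELECTOR_MULTIPLIER
    int activeCores = 10000;  // hypothetical total active vcores
    // The policy later divides by active cores: 500 * 50000 / 10000 = 2500.
    System.out.println(getNormalizedPendingCount(pending, multiplier) / activeCores);
  }
}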
@@ -4058,6 +4058,45 @@ public class YarnConfiguration extends Configuration {
   public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
       60000; // one minute
 
+  // Prefix for configs related to selecting an SC based on load
+  public static final String LOAD_BASED_SC_SELECTOR_PREFIX =
+      NM_PREFIX + "least-load-policy-selector.";
+
+  // Config to enable re-routing node requests based on SC load
+  public static final String LOAD_BASED_SC_SELECTOR_ENABLED =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "enabled";
+  public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_ENABLED = false;
+
+  // Pending container threshold for selecting an SC
+  public static final String LOAD_BASED_SC_SELECTOR_THRESHOLD =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "pending-container.threshold";
+  public static final int DEFAULT_LOAD_BASED_SC_SELECTOR_THRESHOLD = 10000;
+
+  // Whether to consider the total number of active cores in the subcluster for load
+  public static final String LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "use-active-core";
+  public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE = false;
+
+  // Multiplier to normalize pending containers to active cores
+  public static final String LOAD_BASED_SC_SELECTOR_MULTIPLIER =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "multiplier";
+  public static final int DEFAULT_LOAD_BASED_SC_SELECTOR_MULTIPLIER = 50000;
+
+  // Max count to maintain for the container allocation history
+  public static final String FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY =
+      FEDERATION_PREFIX + "amrmproxy.allocation.history.max.entry";
+  public static final int DEFAULT_FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY = 100;
+
+  // Whether to fail directly if the number of active subClusters is less than 1
+  public static final String LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "fail-on-error";
+  public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR = true;
+
+  // Blacklisted subClusters
+  public static final String FEDERATION_BLACKLIST_SUBCLUSTERS =
+      LOAD_BASED_SC_SELECTOR_PREFIX + "blacklist-subclusters";
+  public static final String DEFAULT_FEDERATION_BLACKLIST_SUBCLUSTERS = "";
+
   // AMRMProxy Register UAM Retry-Num
   public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
       FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
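Editor's note: a minimal sketch of how these new knobs could be set programmatically, assuming only the constants added above (the chosen values and subcluster names are illustrative, not recommendations):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class LoadBasedSelectorConfigDemo {
  public static void main(String[] args) {
    Configuration conf = new YarnConfiguration();
    // Enable load-based rerouting of node requests.
    conf.setBoolean(YarnConfiguration.LOAD_BASED_SC_SELECTOR_ENABLED, true);
    // Start rerouting once a subcluster exceeds 5000 pending containers.
    conf.setInt(YarnConfiguration.LOAD_BASED_SC_SELECTOR_THRESHOLD, 5000);
    // Normalize pending counts by active cores, using the default multiplier.
    conf.setBoolean(YarnConfiguration.LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE, true);
    conf.setLong(YarnConfiguration.LOAD_BASED_SC_SELECTOR_MULTIPLIER, 50000L);
    // Never forward requests to these (hypothetical) subclusters.
    conf.set(YarnConfiguration.FEDERATION_BLACKLIST_SUBCLUSTERS, "SC-3,SC-7");
  }
}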
@@ -5558,4 +5558,84 @@
     <value>0.0.0.0:8070</value>
   </property>
 
+  <property>
+    <description>
+      Whether to enable request rerouting according to the load of the subCluster.
+      If true, node requests will be rerouted to a less loaded subCluster.
+      The default is false.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.enabled</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <description>
+      SubCluster pending container threshold, used to determine the load
+      weight of a subCluster. The default value is 10000.
+      For an SC whose pending container count is bigger than threshold / 2,
+      threshold / pending is used as the weight.
+      For an SC whose pending container count is less than threshold / 2, the weight is capped at 2.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.pending-container.threshold</name>
+    <value>10000</value>
+  </property>
+
+  <property>
+    <description>
+      Whether to consider the active vcores when calculating the subCluster load.
+      The default value is false; in that case only the number of pending containers is considered.
+      If this item is set to true, it needs to be used together
+      with yarn.nodemanager.least-load-policy-selector.multiplier. The following formula
+      is then used when calculating the subCluster pending load:
+      pendingContainersCountNormalized = (totalPendingContainersCount * multiplier) / totalActiveCores.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.use-active-core</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <description>
+      Max number of entries to maintain in the container allocation history.
+    </description>
+    <name>yarn.federation.amrmproxy.allocation.history.max.entry</name>
+    <value>100</value>
+  </property>
+
+  <property>
+    <description>
+      Whether to fail directly if the number of active subClusters is less than 1.
+      The default is true.
+      If set to false, we will try to re-fetch the list of active subClusters.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.fail-on-error</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <description>
+      Requests will not be forwarded to the subClusters configured in the blacklist.
+      The default value is empty.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.blacklist-subclusters</name>
+    <value></value>
+  </property>
+
+  <property>
+    <description>
+      Max number of entries to maintain in the container allocation history.
+    </description>
+    <name>yarn.federation.amrmproxy.allocation.history.max.entry</name>
+    <value>100</value>
+  </property>
+
+  <property>
+    <description>
+      This configuration is used
+      when yarn.nodemanager.least-load-policy-selector.use-active-core is set to true.
+      Its purpose is to help normalize the pendingContainersCount.
+    </description>
+    <name>yarn.nodemanager.least-load-policy-selector.multiplier</name>
+    <value>50000</value>
+  </property>
+
 </configuration>
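Editor's note: the weight function in the pending-container.threshold description is easiest to see with numbers. A standalone sketch under the default threshold of 10000 (not patch code):

public class RerouteWeightDemo {
  // pending > threshold / 2  ->  weight = threshold / pending
  // pending <= threshold / 2 ->  weight capped at 2 (= threshold / (threshold / 2))
  static float rerouteWeight(int pending, int threshold) {
    if (pending <= threshold / 2) {
      return 2f;
    }
    return (float) threshold / pending;
  }

  public static void main(String[] args) {
    int threshold = 10000; // default pending-container.threshold
    System.out.println(rerouteWeight(20000, threshold)); // 0.5
    System.out.println(rerouteWeight(10000, threshold)); // 1.0
    System.out.println(rerouteWeight(4000, threshold));  // 2.0 (capped)
  }
}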
@@ -28,6 +28,7 @@ import java.util.Set;
 import java.util.TreeSet;
 
 import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ipc.RPC;
 import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
 import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
@@ -132,16 +133,23 @@ public class AMRMClientRelayer implements ApplicationMasterProtocol {
 
   private AMRMClientRelayerMetrics metrics;
 
+  private ContainerAllocationHistory allocationHistory;
+
   public AMRMClientRelayer(ApplicationMasterProtocol rmClient,
       ApplicationId appId, String rmId) {
     this.resetResponseId = -1;
     this.metrics = AMRMClientRelayerMetrics.getInstance();
     this.rmId = "";
     this.rmClient = rmClient;
     this.appId = appId;
     this.rmId = rmId;
   }
 
+  public AMRMClientRelayer(ApplicationMasterProtocol rmClient,
+      ApplicationId appId, String rmId, Configuration conf) {
+    this(rmClient, appId, rmId);
+    this.allocationHistory = new ContainerAllocationHistory(conf);
+  }
+
   public void setAMRegistrationRequest(
       RegisterApplicationMasterRequest registerRequest) {
     this.amRegistrationRequest = registerRequest;
@@ -444,6 +452,8 @@ public class AMRMClientRelayer implements ApplicationMasterProtocol {
           if (this.knownContainers.add(container.getId())) {
             this.metrics.addFulfilledQPS(this.rmId, AMRMClientRelayerMetrics
                 .getRequestType(container.getExecutionType()), 1);
+            long currentTime = System.currentTimeMillis();
+            long fulfillLatency = -1;
             if (container.getAllocationRequestId() != 0) {
               Integer count = this.pendingCountForMetrics
                   .get(container.getAllocationRequestId());
@@ -453,13 +463,14 @@ public class AMRMClientRelayer implements ApplicationMasterProtocol {
                 this.metrics.decrClientPending(this.rmId,
                     AMRMClientRelayerMetrics
                         .getRequestType(container.getExecutionType()), 1);
-                this.metrics.addFulfillLatency(this.rmId,
-                    AMRMClientRelayerMetrics
-                        .getRequestType(container.getExecutionType()),
-                    System.currentTimeMillis() - this.askTimeStamp
-                        .get(container.getAllocationRequestId()));
+                fulfillLatency = currentTime - this.askTimeStamp.get(
+                    container.getAllocationRequestId());
+                AMRMClientRelayerMetrics.RequestType requestType = AMRMClientRelayerMetrics
+                    .getRequestType(container.getExecutionType());
+                this.metrics.addFulfillLatency(this.rmId, requestType, fulfillLatency);
               }
             }
+            addAllocationHistoryEntry(container, currentTime, fulfillLatency);
           }
         }
       }
@@ -576,6 +587,38 @@ public class AMRMClientRelayer implements ApplicationMasterProtocol {
     this.ask.add(remoteRequest);
   }
 
+  public ContainerAllocationHistory getAllocationHistory() {
+    return this.allocationHistory;
+  }
+
+  private void addAllocationHistoryEntry(Container container, long fulfillTimeStamp,
+      long fulfillLatency) {
+    ResourceRequestSetKey key = ResourceRequestSetKey.extractMatchingKey(container,
+        this.remotePendingAsks.keySet());
+    if (key == null) {
+      LOG.info("allocation history ignoring {}, no matching request key found.", container);
+      return;
+    }
+    this.allocationHistory.addAllocationEntry(container, this.remotePendingAsks.get(key),
+        fulfillTimeStamp, fulfillLatency);
+  }
+
+  public void gatherReadOnlyPendingAsksInfo(Map<ResourceRequestSetKey,
+      ResourceRequestSet> pendingAsks, Map<ResourceRequestSetKey, Long> pendingTime) {
+    pendingAsks.clear();
+    pendingTime.clear();
+    synchronized (this) {
+      pendingAsks.putAll(this.remotePendingAsks);
+      for (ResourceRequestSetKey key : pendingAsks.keySet()) {
+        Long startTime = this.askTimeStamp.get(key.getAllocationRequestId());
+        if (startTime != null) {
+          long elapsedMs = System.currentTimeMillis() - startTime;
+          pendingTime.put(key, elapsedMs);
+        }
+      }
+    }
+  }
+
   @VisibleForTesting
   protected Map<ResourceRequestSetKey, ResourceRequestSet>
       getRemotePendingAsks() {
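Editor's note: a brief sketch of the intended calling pattern for the new gatherReadOnlyPendingAsksInfo: the caller owns both maps and the relayer repopulates them under its own lock (the probe class here is hypothetical, not part of the patch):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.yarn.server.AMRMClientRelayer;
import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSetKey;

public final class PendingAsksProbe {
  // Snapshot the relayer's pending asks and how long each has been pending.
  static void printPendingAsks(AMRMClientRelayer relayer) {
    Map<ResourceRequestSetKey, ResourceRequestSet> pendingAsks = new HashMap<>();
    Map<ResourceRequestSetKey, Long> pendingTime = new HashMap<>();
    relayer.gatherReadOnlyPendingAsksInfo(pendingAsks, pendingTime);
    for (Map.Entry<ResourceRequestSetKey, Long> e : pendingTime.entrySet()) {
      System.out.println(e.getKey() + " pending for " + e.getValue() + " ms");
    }
  }
}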
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server;
+
+import java.util.AbstractMap;
+import java.util.LinkedList;
+import java.util.Map.Entry;
+import java.util.Queue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Records the allocation history from YarnRM and provides aggregated insights.
+ */
+public class ContainerAllocationHistory {
+  private static final Logger LOG = LoggerFactory.getLogger(ContainerAllocationHistory.class);
+
+  private int maxEntryCount;
+
+  // Allocation timing history <AllocateTimeStamp, AllocateLatency>
+  private Queue<Entry<Long, Long>> relaxableG = new LinkedList<>();
+
+  public ContainerAllocationHistory(Configuration conf) {
+    this.maxEntryCount = conf.getInt(
+        YarnConfiguration.FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY,
+        YarnConfiguration.DEFAULT_FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY);
+  }
+
+  /**
+   * Record the allocation history for the container.
+   *
+   * @param container the container to add a record for
+   * @param requestSet the resource request ask set
+   * @param fulfillTimeStamp time at which the allocation happened
+   * @param fulfillLatency time elapsed between the ask and the allocation
+   */
+  public synchronized void addAllocationEntry(Container container,
+      ResourceRequestSet requestSet, long fulfillTimeStamp, long fulfillLatency) {
+    if (!requestSet.isANYRelaxable()) {
+      LOG.info("allocation history ignoring {}, relax locality is false", container);
+      return;
+    }
+    this.relaxableG.add(new AbstractMap.SimpleEntry<>(
+        fulfillTimeStamp, fulfillLatency));
+    if (this.relaxableG.size() > this.maxEntryCount) {
+      this.relaxableG.remove();
+    }
+  }
+}
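Editor's note: the history is a simple bounded FIFO. A standalone illustration of the eviction behavior in addAllocationEntry (values are made up):

import java.util.AbstractMap;
import java.util.LinkedList;
import java.util.Map.Entry;
import java.util.Queue;

public class BoundedHistoryDemo {
  public static void main(String[] args) {
    int maxEntryCount = 3; // cf. allocation.history.max.entry
    Queue<Entry<Long, Long>> history = new LinkedList<>();
    for (long ts = 1; ts <= 5; ts++) {
      history.add(new AbstractMap.SimpleEntry<>(ts, ts * 10)); // <timestamp, latency>
      if (history.size() > maxEntryCount) {
        history.remove(); // evict the oldest entry
      }
    }
    System.out.println(history); // [3=30, 4=40, 5=50]
  }
}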
@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.server.federation.policies.amrmproxy;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -31,6 +32,9 @@ import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;
 
+import org.apache.commons.collections.MapUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
 import org.apache.hadoop.yarn.api.records.EnhancedHeadroom;
 import org.apache.hadoop.yarn.api.records.Resource;
@@ -54,6 +58,19 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.classification.VisibleForTesting;
 import org.apache.hadoop.util.Preconditions;
 
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.LOAD_BASED_SC_SELECTOR_ENABLED;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_LOAD_BASED_SC_SELECTOR_ENABLED;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.LOAD_BASED_SC_SELECTOR_THRESHOLD;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_LOAD_BASED_SC_SELECTOR_THRESHOLD;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.LOAD_BASED_SC_SELECTOR_MULTIPLIER;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_LOAD_BASED_SC_SELECTOR_MULTIPLIER;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.FEDERATION_BLACKLIST_SUBCLUSTERS;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_FEDERATION_BLACKLIST_SUBCLUSTERS;
+
 /**
  * An implementation of the {@link FederationAMRMProxyPolicy} interface that
  * carefully multicasts the requests with the following behavior:
@@ -131,11 +148,44 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
   private Map<SubClusterId, Float> weights;
   private SubClusterResolver resolver;
 
+  private Configuration conf;
   private Map<SubClusterId, Resource> headroom;
   private Map<SubClusterId, EnhancedHeadroom> enhancedHeadroom;
   private float hrAlpha;
   private FederationStateStoreFacade federationFacade;
   private SubClusterId homeSubcluster;
+  private int printRRMax;
+  public static final String PRINT_RR_MAX =
+      "yarn.nodemanager.amrmproxy.address.splitmerge.printmaxrrcount";
+  public static final int DEFAULT_PRINT_RR_MAX = 1000;
+  private boolean failOnError = DEFAULT_LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR;
 
+  /**
+   * Print a list of resource requests into a one-line string.
+   *
+   * @param response the list of ResourceRequests to print
+   * @param max the maximum number of ResourceRequests to print
+   * @return the printed one-line string
+   */
+  public static String prettyPrintRequests(List<ResourceRequest> response, int max) {
+    StringBuilder builder = new StringBuilder();
+    for (ResourceRequest rr : response) {
+      builder.append("[id:").append(rr.getAllocationRequestId())
+          .append(" loc:")
+          .append(rr.getResourceName())
+          .append(" num:")
+          .append(rr.getNumContainers())
+          .append(" pri:")
+          .append(((rr.getPriority() != null) ? rr.getPriority().getPriority() : -1))
+          .append("], ");
+      if (max != -1) {
+        if (max-- <= 0) {
+          break;
+        }
+      }
+    }
+    return builder.toString();
+  }
+
   @Override
   public void reinitialize(
@@ -182,6 +232,7 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
     weights = newWeightsConverted;
     resolver = policyContext.getFederationSubclusterResolver();
 
+    // Data structures that only need to be initialized once
     if (headroom == null) {
       headroom = new ConcurrentHashMap<>();
       enhancedHeadroom = new ConcurrentHashMap<>();
@@ -191,6 +242,11 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
     this.federationFacade =
        policyContext.getFederationStateStoreFacade();
     this.homeSubcluster = policyContext.getHomeSubcluster();
+
+    this.conf = this.federationFacade.getConf();
+    this.printRRMax = this.conf.getInt(PRINT_RR_MAX, DEFAULT_PRINT_RR_MAX);
+    this.failOnError = this.conf.getBoolean(LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR,
+        DEFAULT_LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR);
   }
 
   @Override
@@ -217,10 +273,9 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
     // active subclusters. Create a new instance per call because this method
     // can be called concurrently.
     AllocationBookkeeper bookkeeper = new AllocationBookkeeper();
-    bookkeeper.reinitialize(getActiveSubclusters(), timedOutSubClusters);
+    bookkeeper.reinitialize(getActiveSubclusters(), timedOutSubClusters, conf);
 
-    List<ResourceRequest> nonLocalizedRequests =
-        new ArrayList<ResourceRequest>();
+    List<ResourceRequest> nonLocalizedRequests = new ArrayList<>();
 
     SubClusterId targetId = null;
     Set<SubClusterId> targetIds = null;
@@ -240,6 +295,17 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
         // Handle "node" requests
         try {
           targetId = resolver.getSubClusterForNode(rr.getResourceName());
+
+          // If needed, re-route node requests based on SC load
+          boolean loadBasedSCSelectorEnabled =
+              conf.getBoolean(LOAD_BASED_SC_SELECTOR_ENABLED, DEFAULT_LOAD_BASED_SC_SELECTOR_ENABLED);
+          if (loadBasedSCSelectorEnabled) {
+            int maxPendingThreshold = conf.getInt(LOAD_BASED_SC_SELECTOR_THRESHOLD,
+                DEFAULT_LOAD_BASED_SC_SELECTOR_THRESHOLD);
+            targetId = routeNodeRequestIfNeeded(targetId, maxPendingThreshold,
+                bookkeeper.getActiveAndEnabledSC());
+          }
           LOG.debug("Node request {}", rr.getResourceName());
         } catch (YarnException e) {
           // this might happen as we can't differentiate node from rack names
           // we log altogether later
@@ -285,7 +351,16 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
     // handle all non-localized requests (ANY)
     splitAnyRequests(nonLocalizedRequests, bookkeeper);
 
-    return bookkeeper.getAnswer();
+    // Take the split result, feed into the askBalancer
+    Map<SubClusterId, List<ResourceRequest>> answer = bookkeeper.getAnswer();
+    LOG.info("Before split {} RRs: {}", resourceRequests.size(),
+        prettyPrintRequests(resourceRequests, this.printRRMax));
+
+    for (Map.Entry<SubClusterId, List<ResourceRequest>> entry : bookkeeper.getAnswer().entrySet()) {
+      LOG.info("After split {} has {} RRs: {}", entry.getKey(), entry.getValue().size(),
+          prettyPrintRequests(entry.getValue(), this.printRRMax));
+    }
+    return answer;
   }
 
   /**
@@ -495,6 +570,120 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
     return headroomWeighting;
   }
 
+  /**
+   * When a certain subcluster is too loaded, reroute Node requests going there.
+   *
+   * @param targetId current subClusterId where the request is sent
+   * @param maxThreshold threshold for the pending container count
+   * @param activeAndEnabledSCs the set of active and enabled subclusters
+   * @return the chosen target subClusterId
+   */
+  protected SubClusterId routeNodeRequestIfNeeded(SubClusterId targetId,
+      int maxThreshold, Set<SubClusterId> activeAndEnabledSCs) {
+    // Keep targetId if it is active and enabled and under the threshold; otherwise reroute
+    if (activeAndEnabledSCs.contains(targetId)) {
+      int targetPendingCount = getSubClusterLoad(targetId);
+      if (targetPendingCount == -1 || targetPendingCount < maxThreshold) {
+        return targetId;
+      }
+    }
+    SubClusterId scId = chooseSubClusterIdForMaxLoadSC(targetId, maxThreshold, activeAndEnabledSCs);
+    return scId;
+  }
+
+  /**
+   * Check if the current target subcluster is over max load, and if it is,
+   * reroute it.
+   *
+   * @param targetId the original target subcluster id
+   * @param maxThreshold the max load threshold above which to reroute
+   * @param activeAndEnabledSCs the set of active and enabled subclusters
+   * @return targetId if it is within maxThreshold, otherwise a new id
+   */
+  private SubClusterId chooseSubClusterIdForMaxLoadSC(SubClusterId targetId,
+      int maxThreshold, Set<SubClusterId> activeAndEnabledSCs) {
+    ArrayList<Float> weight = new ArrayList<>();
+    ArrayList<SubClusterId> scIds = new ArrayList<>();
+    int targetLoad = getSubClusterLoad(targetId);
+    if (targetLoad == -1 || !activeAndEnabledSCs.contains(targetId)) {
+      // Probably an SC that is not active and enabled. Force a reroute
+      targetLoad = Integer.MAX_VALUE;
+    }
+
+    /*
+     * Prepare the weight for a random draw among all known SCs.
+     *
+     * For an SC with pending bigger than maxThreshold / 2, use maxThreshold /
+     * pending as the weight. We multiply by maxThreshold so that the weight
+     * won't be too small in value.
+     *
+     * For an SC with pending less than maxThreshold / 2, we cap the weight at 2
+     * = (maxThreshold / (maxThreshold / 2)) so that an SC with small pending
+     * will not get a huge weight and thus get swamped.
+     */
+    for (SubClusterId sc : activeAndEnabledSCs) {
+      int scLoad = getSubClusterLoad(sc);
+      if (scLoad > targetLoad) {
+        // Some other SC is more loaded, so the target is not the most loaded; no reroute needed
+        return targetId;
+      }
+      if (scLoad <= maxThreshold / 2) {
+        weight.add(2f);
+      } else {
+        weight.add((float) maxThreshold / scLoad);
+      }
+      scIds.add(sc);
+    }
+    if (weight.size() == 0) {
+      return targetId;
+    }
+    return scIds.get(FederationPolicyUtils.getWeightedRandom(weight));
+  }
+
+  /**
+   * Get the load of a subCluster.
+   *
+   * @param subClusterId the subClusterId
+   * @return the number of pending containers for the subCluster
+   */
+  private int getSubClusterLoad(SubClusterId subClusterId) {
+    EnhancedHeadroom headroomData = this.enhancedHeadroom.get(subClusterId);
+    if (headroomData == null) {
+      return -1;
+    }
+
+    // Use new data from enhanced headroom
+    boolean useActiveCoreEnabled = conf.getBoolean(LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE,
+        DEFAULT_LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE);
+
+    // If we consider the number of vcores in the subCluster
+    if (useActiveCoreEnabled) {
+
+      // If the vcore count of the subCluster is less than or equal to 0,
+      // containers cannot be scheduled to this subCluster, so we return a
+      // very large number, indicating that the subCluster is unavailable.
+      if (headroomData.getTotalActiveCores() <= 0) {
+        return Integer.MAX_VALUE;
+      }
+
+      // Multiply by a constant factor to ensure the numerator > denominator.
+      // We normalize the pending count as PendingCount * multiplier / TotalActiveCores.
+      long multiplier = conf.getLong(LOAD_BASED_SC_SELECTOR_MULTIPLIER,
+          DEFAULT_LOAD_BASED_SC_SELECTOR_MULTIPLIER);
+      double value =
+          headroomData.getNormalizedPendingCount(multiplier) / headroomData.getTotalActiveCores();
+      if (value > Integer.MAX_VALUE) {
+        return Integer.MAX_VALUE;
+      } else {
+        return (int) value;
+      }
+    } else {
+      // If the number of vcores in the subCluster is not considered,
+      // we directly return the number of pending containers in the subCluster.
+      return headroomData.getTotalPendingCount();
+    }
+  }
+
   /**
    * This helper class is used to book-keep the requests made to each
    * subcluster, and maintain useful statistics to split ANY requests.
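Editor's note: the reroute draw in chooseSubClusterIdForMaxLoadSC is a standard weighted random selection. A self-contained sketch (the weightedRandom helper is a stand-in for FederationPolicyUtils.getWeightedRandom, and the weights below correspond to pendings of 40000, 40000, 20000, 10000 and 0 against a threshold of 10000):

import java.util.Arrays;
import java.util.List;
import java.util.Random;

public class WeightedRerouteDemo {
  private static final Random RAND = new Random();

  // Draw an index with probability proportional to its weight.
  static int weightedRandom(List<Float> weights) {
    float total = 0f;
    for (float w : weights) {
      total += w;
    }
    float sample = RAND.nextFloat() * total;
    for (int i = 0; i < weights.size(); i++) {
      sample -= weights.get(i);
      if (sample <= 0f) {
        return i;
      }
    }
    return weights.size() - 1;
  }

  public static void main(String[] args) {
    // Weights 0.25, 0.25, 0.5, 1, 2 sum to 4, so the expected selection
    // probabilities are 1/16, 1/16, 1/8, 1/4 and 1/2.
    List<Float> weights = Arrays.asList(0.25f, 0.25f, 0.5f, 1f, 2f);
    int[] counts = new int[weights.size()];
    for (int i = 0; i < 100000; i++) {
      counts[weightedRandom(weights)]++;
    }
    System.out.println(Arrays.toString(counts));
  }
}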
@@ -523,8 +712,9 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
 
     private void reinitialize(
         Map<SubClusterId, SubClusterInfo> activeSubclusters,
-        Set<SubClusterId> timedOutSubClusters) throws YarnException {
-      if (activeSubclusters == null) {
+        Set<SubClusterId> timedOutSubClusters, Configuration pConf) throws YarnException {
+
+      if (MapUtils.isEmpty(activeSubclusters)) {
        throw new YarnRuntimeException("null activeSubclusters received");
       }
 
@@ -548,10 +738,28 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
         }
       }
 
+      // subCluster blacklisting from configuration
+      String blacklistedSubClusters = pConf.get(FEDERATION_BLACKLIST_SUBCLUSTERS,
+          DEFAULT_FEDERATION_BLACKLIST_SUBCLUSTERS);
+      if (blacklistedSubClusters != null) {
+        Collection<String> tempList = StringUtils.getStringCollection(blacklistedSubClusters);
+        for (String item : tempList) {
+          activeAndEnabledSC.remove(SubClusterId.newInstance(item.trim()));
+        }
+      }
+
       if (activeAndEnabledSC.size() < 1) {
-        throw new NoActiveSubclustersException(
-            "None of the subclusters enabled in this policy (weight>0) are "
-                + "currently active we cannot forward the ResourceRequest(s)");
+        String errorMsg = "None of the subClusters enabled in this policy (weight > 0) are "
+            + "currently active; we cannot forward the ResourceRequest(s)";
+        if (failOnError) {
+          throw new NoActiveSubclustersException(errorMsg);
+        } else {
+          LOG.error(errorMsg + ", continuing by enabling all active subClusters.");
+          activeAndEnabledSC.addAll(activeSubclusters.keySet());
+          for (SubClusterId sc : activeSubclusters.keySet()) {
+            policyWeights.put(sc, 1.0f);
+          }
+        }
       }
 
       Set<SubClusterId> tmpSCSet = new HashSet<>(activeAndEnabledSC);
@@ -559,7 +767,7 @@ public class LocalityMulticastAMRMProxyPolicy extends AbstractAMRMProxyPolicy {
 
       if (tmpSCSet.size() < 1) {
         LOG.warn("All active and enabled subclusters have expired last "
-            + "heartbeat time. Ignore the expiry check for this request");
+            + "heartbeat time. Ignore the expiry check for this request.");
       } else {
         activeAndEnabledSC = tmpSCSet;
       }
@@ -36,6 +36,8 @@ public class ResourceRequestSet {
 
   private ResourceRequestSetKey key;
   private int numContainers;
+  // Whether the ANY RR is relaxable
+  private boolean relaxable;
   // ResourceName -> RR
   private Map<String, ResourceRequest> asks;
 
@@ -49,6 +51,7 @@ public class ResourceRequestSet {
     this.key = key;
     // leave it zero for now, as if it is a cancel
     this.numContainers = 0;
+    this.relaxable = true;
     this.asks = new HashMap<>();
   }
 
@@ -61,6 +64,7 @@ public class ResourceRequestSet {
     this.key = other.key;
     this.numContainers = other.numContainers;
     this.asks = new HashMap<>();
+    this.relaxable = other.relaxable;
     // The assumption is that the RR objects should not be modified without
     // making a copy
     this.asks.putAll(other.asks);
@@ -86,6 +90,7 @@ public class ResourceRequestSet {
       // For G requestSet, update the numContainers only for ANY RR
       if (ask.getResourceName().equals(ResourceRequest.ANY)) {
         this.numContainers = ask.getNumContainers();
+        this.relaxable = ask.getRelaxLocality();
       }
     } else {
       // The assumption we made about O asks is that all RR in a requestSet has
@@ -182,6 +187,15 @@ public class ResourceRequestSet {
     }
   }
 
+  /**
+   * Whether the request set is relaxable at the ANY level.
+   *
+   * @return whether the request set is relaxable at the ANY level
+   */
+  public boolean isANYRelaxable() {
+    return this.relaxable;
+  }
+
   @Override
   public String toString() {
     StringBuilder builder = new StringBuilder();
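Editor's note: relax locality is carried on the ANY-level ResourceRequest itself. A small sketch of an ask that isANYRelaxable() would report as false (such request sets are skipped by the allocation history added above):

import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;

public class RelaxLocalityDemo {
  public static void main(String[] args) {
    // ANY-level ask with relaxLocality explicitly set to false.
    ResourceRequest anyAsk = ResourceRequest.newInstance(
        Priority.newInstance(1), ResourceRequest.ANY,
        Resource.newInstance(1024, 1), 5, /* relaxLocality */ false);
    System.out.println(anyAsk.getRelaxLocality()); // false
  }
}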
@@ -18,11 +18,16 @@
 
 package org.apache.hadoop.yarn.server.scheduler;
 
+import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.ExecutionType;
 import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
 
 /**
  * The scheduler key for a group of {@link ResourceRequest}.
@@ -32,6 +37,9 @@ import org.apache.hadoop.yarn.exceptions.YarnException;
  */
 public class ResourceRequestSetKey extends SchedulerRequestKey {
 
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ResourceRequestSetKey.class);
+
   // More ResourceRequest key fields on top of SchedulerRequestKey
   private final Resource resource;
   private final ExecutionType execType;
@@ -123,6 +131,46 @@ public class ResourceRequestSetKey extends SchedulerRequestKey {
     return this.execType.compareTo(otherKey.execType);
   }
 
+  /**
+   * Extract the corresponding ResourceRequestSetKey for an allocated container
+   * from a given set. Return null if not found.
+   *
+   * @param container the allocated container
+   * @param keys the set of candidate keys to search
+   * @return the matching ResourceRequestSetKey, or null if none is found
+   */
+  public static ResourceRequestSetKey extractMatchingKey(Container container,
+      Set<ResourceRequestSetKey> keys) {
+    ResourceRequestSetKey resourceRequestSetKey = new ResourceRequestSetKey(
+        container.getAllocationRequestId(), container.getPriority(),
+        container.getResource(), container.getExecutionType());
+    if (keys.contains(resourceRequestSetKey)) {
+      return resourceRequestSetKey;
+    }
+
+    if (container.getAllocationRequestId() > 0) {
+      // If there is no exact match, look for the one with the same (non-zero)
+      // allocationRequestId
+      for (ResourceRequestSetKey candidate : keys) {
+        if (candidate.getAllocationRequestId() == container.getAllocationRequestId()) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Using possible match for {} : {}", resourceRequestSetKey, candidate);
+          }
+          return candidate;
+        }
+      }
+    }
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("No match found for container {}.", container.getId());
+      for (ResourceRequestSetKey candidate : keys) {
+        LOG.debug("candidate set keys: {}.", candidate.toString());
+      }
+    }
+
+    return null;
+  }
+
   @Override
   public String toString() {
     return "[id:" + getAllocationRequestId() + " p:"
@@ -140,7 +140,7 @@ public class UnmanagedApplicationManager {
     this.userUgi = null;
     // Relayer's rmClient will be set after the RM connection is created
     this.rmProxyRelayer =
-        new AMRMClientRelayer(null, this.applicationId, rmName);
+        new AMRMClientRelayer(null, this.applicationId, rmName, this.conf);
     this.heartbeatHandler = createAMHeartbeatRequestHandler(this.conf,
         this.applicationId, this.rmProxyRelayer);
 
@@ -155,7 +155,7 @@ public class TestAMRMClientRelayer {
     this.conf = new Configuration();
 
     this.mockAMS = new MockApplicationMasterService();
-    this.relayer = new AMRMClientRelayer(this.mockAMS, null, "TEST");
+    this.relayer = new AMRMClientRelayer(this.mockAMS, null, "TEST", conf);
     this.relayer.registerApplicationMaster(
         RegisterApplicationMasterRequest.newInstance("", 0, ""));
 
@@ -32,6 +32,7 @@ import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
+import org.apache.hadoop.yarn.api.records.EnhancedHeadroom;
 import org.apache.hadoop.yarn.api.records.NMToken;
 import org.apache.hadoop.yarn.api.records.NodeReport;
 import org.apache.hadoop.yarn.api.records.Resource;
@@ -820,4 +821,99 @@ public class TestLocalityMulticastAMRMProxyPolicy
       return getHomeSubCluster();
     }
   }
+
+  /**
+   * Test the rerouting behavior when some subclusters are loaded. Make sure
+   * that the AMRMProxy rerouting decisions attempt to redirect requests
+   * to the least loaded subcluster when load thresholds are exceeded.
+   */
+  @Test
+  public void testLoadBasedSubClusterReroute() throws YarnException {
+    int pendingThreshold = 1000;
+
+    LocalityMulticastAMRMProxyPolicy policy = (LocalityMulticastAMRMProxyPolicy) getPolicy();
+    initializePolicy();
+
+    SubClusterId sc0 = SubClusterId.newInstance("0");
+    SubClusterId sc1 = SubClusterId.newInstance("1");
+    SubClusterId sc2 = SubClusterId.newInstance("2");
+    SubClusterId sc3 = SubClusterId.newInstance("3");
+    SubClusterId sc4 = SubClusterId.newInstance("4");
+
+    Set<SubClusterId> scList = new HashSet<>();
+    scList.add(sc0);
+    scList.add(sc1);
+    scList.add(sc2);
+    scList.add(sc3);
+    scList.add(sc4);
+
+    // This cluster is the most overloaded - 4 times the threshold.
+    policy.notifyOfResponse(sc0,
+        getAllocateResponseWithEnhancedHeadroom(4 * pendingThreshold, 0));
+
+    // This cluster is equally the most overloaded - 4 times the threshold.
+    policy.notifyOfResponse(sc1,
+        getAllocateResponseWithEnhancedHeadroom(4 * pendingThreshold, 0));
+
+    // This cluster is 2 times the threshold, but not the most loaded.
+    policy.notifyOfResponse(sc2,
+        getAllocateResponseWithEnhancedHeadroom(2 * pendingThreshold, 0));
+
+    // This cluster is at the threshold, but not the most loaded.
+    policy.notifyOfResponse(sc3,
+        getAllocateResponseWithEnhancedHeadroom(pendingThreshold, 0));
+
+    // This cluster has zero pending.
+    policy.notifyOfResponse(sc4, getAllocateResponseWithEnhancedHeadroom(0, 0));
+
+    // sc2, sc3 and sc4 should just return the original subcluster.
+    Assert.assertEquals(
+        policy.routeNodeRequestIfNeeded(sc2, pendingThreshold, scList), sc2);
+    Assert.assertEquals(
+        policy.routeNodeRequestIfNeeded(sc3, pendingThreshold, scList), sc3);
+    Assert.assertEquals(
+        policy.routeNodeRequestIfNeeded(sc4, pendingThreshold, scList), sc4);
+
+    // sc0 and sc1 must select from sc0/sc1/sc2/sc3/sc4 according to weights
+    // 1/4, 1/4, 1/2, 1, 2. Let's run tons of random samples, and verify that
+    // the proportions approximately hold.
+    Map<SubClusterId, Integer> counts = new HashMap<>();
+    counts.put(sc0, 0);
+    counts.put(sc1, 0);
+    counts.put(sc2, 0);
+    counts.put(sc3, 0);
+    counts.put(sc4, 0);
+
+    int n = 100000;
+    for (int i = 0; i < n; i++) {
+      SubClusterId selectedId = policy.routeNodeRequestIfNeeded(sc0, pendingThreshold, scList);
+      counts.put(selectedId, counts.get(selectedId) + 1);
+
+      selectedId = policy.routeNodeRequestIfNeeded(sc1, pendingThreshold, scList);
+      counts.put(selectedId, counts.get(selectedId) + 1);
+
+      // Also try a new SCId that's not active and enabled. It should be rerouted
+      // to sc0-4 with the same distribution as above.
+      selectedId = policy.routeNodeRequestIfNeeded(SubClusterId.newInstance("10"),
+          pendingThreshold, scList);
+      counts.put(selectedId, counts.get(selectedId) + 1);
+    }
+
+    // The probabilities should be 1/16, 1/16, 1/8, 1/4, 1/2.
+    Assert.assertEquals((double) counts.get(sc0) / n / 3, 1 / 16.0, 0.01);
+    Assert.assertEquals((double) counts.get(sc1) / n / 3, 1 / 16.0, 0.01);
+    Assert.assertEquals((double) counts.get(sc2) / n / 3, 1 / 8.0, 0.01);
+    Assert.assertEquals((double) counts.get(sc3) / n / 3, 1 / 4.0, 0.01);
+    Assert.assertEquals((double) counts.get(sc4) / n / 3, 1 / 2.0, 0.01);
+
+    // Everything should be routed to these five active and enabled SCs
+    Assert.assertEquals(5, counts.size());
+  }
+
+  private AllocateResponse getAllocateResponseWithEnhancedHeadroom(int pending, int activeCores) {
+    return AllocateResponse.newInstance(0, null, null,
+        Collections.emptyList(), Resource.newInstance(0, 0), null, 10, null,
+        Collections.emptyList(), null, null, null,
+        EnhancedHeadroom.newInstance(pending, activeCores));
+  }
 }
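Editor's note: the asserted proportions follow directly from the draw weights. With pendings of 4x, 4x, 2x, 1x and 0 against threshold x, the weights are 1/4, 1/4, 1/2, 1 and the capped 2, which sum to 4; normalizing gives 1/16, 1/16, 1/8, 1/4 and 1/2. Each loop iteration performs three draws, hence the division by n and then by 3 in the assertions.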
@@ -140,12 +140,12 @@ public class TestAMRMClientRelayerMetrics {
     this.mockAMS = new MockApplicationMasterService();
 
     this.homeRelayer = new AMRMClientRelayer(this.mockAMS,
-        ApplicationId.newInstance(0, 0), this.homeID);
+        ApplicationId.newInstance(0, 0), this.homeID, conf);
     this.homeRelayer.registerApplicationMaster(
         RegisterApplicationMasterRequest.newInstance("", 0, ""));
 
     this.uamRelayer = new AMRMClientRelayer(this.mockAMS,
-        ApplicationId.newInstance(0, 0), this.uamID);
+        ApplicationId.newInstance(0, 0), this.uamID, conf);
     this.uamRelayer.registerApplicationMaster(
         RegisterApplicationMasterRequest.newInstance("", 0, ""));
 
@@ -321,7 +321,7 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
         SubClusterId.newInstance(YarnConfiguration.getClusterId(conf));
     this.homeRMRelayer = new AMRMClientRelayer(createHomeRMProxy(appContext,
         ApplicationMasterProtocol.class, appOwner), appId,
-        this.homeSubClusterId.toString());
+        this.homeSubClusterId.toString(), conf);
 
     this.homeHeartbeatHandler =
         createHomeHeartbeatHandler(conf, appId, this.homeRMRelayer);
 