HDFS-14403. Cost-based extension to the RPC Fair Call Queue. Contributed by Christopher Gregorian.

This commit is contained in:
Christopher Gregorian 2019-05-24 17:09:52 -07:00 committed by Erik Krogen
parent d023f1f864
commit 129576f628
10 changed files with 639 additions and 192 deletions

View File

@ -106,6 +106,7 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
public static final String IPC_CALLQUEUE_IMPL_KEY = "callqueue.impl";
public static final String IPC_SCHEDULER_IMPL_KEY = "scheduler.impl";
public static final String IPC_IDENTITY_PROVIDER_KEY = "identity-provider.impl";
public static final String IPC_COST_PROVIDER_KEY = "cost-provider.impl";
public static final String IPC_BACKOFF_ENABLE = "backoff.enable";
public static final boolean IPC_BACKOFF_ENABLE_DEFAULT = false;

View File

@ -198,7 +198,6 @@ void addResponseTime(String name, Schedulable e, ProcessingDetails details) {
}
// This should be only called once per call and cached in the call object
// each getPriorityLevel call will increment the counter for the caller
int getPriorityLevel(Schedulable e) {
return scheduler.getPriorityLevel(e);
}

View File

@ -0,0 +1,46 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ipc;
import org.apache.hadoop.conf.Configuration;
/**
* Used by {@link DecayRpcScheduler} to get the cost of users' operations. This
* is configurable using
* {@link org.apache.hadoop.fs.CommonConfigurationKeys#IPC_COST_PROVIDER_KEY}.
*/
public interface CostProvider {
/**
* Initialize this provider using the given configuration, examining only
* ones which fall within the provided namespace.
*
* @param namespace The namespace to use when looking up configurations.
* @param conf The configuration
*/
void init(String namespace, Configuration conf);
/**
* Get cost from {@link ProcessingDetails} which will be used in scheduler.
*
* @param details Process details
* @return The cost of the call
*/
long getCost(ProcessingDetails details);
}

View File

@ -58,8 +58,8 @@
import static org.apache.hadoop.ipc.ProcessingDetails.Timing;
/**
* The decay RPC scheduler counts incoming requests in a map, then
* decays the counts at a fixed time interval. The scheduler is optimized
* The decay RPC scheduler tracks the cost of incoming requests in a map, then
* decays the costs at a fixed time interval. The scheduler is optimized
* for large periods (on the order of seconds), as it offloads work to the
* decay sweep.
*/
@ -77,7 +77,7 @@ public class DecayRpcScheduler implements RpcScheduler,
"faircallqueue.decay-scheduler.period-ms";
/**
* Decay factor controls how much each count is suppressed by on each sweep.
* Decay factor controls how much each cost is suppressed by on each sweep.
* Valid numbers are > 0 and < 1. Decay factor works in tandem with
* period
* to control how long the scheduler remembers an identity.
@ -135,15 +135,15 @@ public class DecayRpcScheduler implements RpcScheduler,
private static final ObjectWriter WRITER = new ObjectMapper().writer();
// Track the decayed and raw (no decay) number of calls for each schedulable
// identity from all previous decay windows: idx 0 for decayed call count and
// idx 1 for the raw call count
private final ConcurrentHashMap<Object, List<AtomicLong>> callCounts =
// identity from all previous decay windows: idx 0 for decayed call cost and
// idx 1 for the raw call cost
private final ConcurrentHashMap<Object, List<AtomicLong>> callCosts =
new ConcurrentHashMap<Object, List<AtomicLong>>();
// Should be the sum of all AtomicLongs in decayed callCounts
private final AtomicLong totalDecayedCallCount = new AtomicLong();
// The sum of all AtomicLongs in raw callCounts
private final AtomicLong totalRawCallCount = new AtomicLong();
// Should be the sum of all AtomicLongs in decayed callCosts
private final AtomicLong totalDecayedCallCost = new AtomicLong();
// The sum of all AtomicLongs in raw callCosts
private final AtomicLong totalRawCallCost = new AtomicLong();
// Track total call count and response time in current decay window
@ -161,7 +161,7 @@ public class DecayRpcScheduler implements RpcScheduler,
// Tune the behavior of the scheduler
private final long decayPeriodMillis; // How long between each tick
private final double decayFactor; // nextCount = currentCount * decayFactor
private final double decayFactor; // nextCost = currentCost * decayFactor
private final int numLevels;
private final double[] thresholds;
private final IdentityProvider identityProvider;
@ -171,9 +171,10 @@ public class DecayRpcScheduler implements RpcScheduler,
private final int topUsersCount; // e.g., report top 10 users' metrics
private static final double PRECISION = 0.0001;
private MetricsProxy metricsProxy;
private final CostProvider costProvider;
/**
* This TimerTask will call decayCurrentCounts until
* This TimerTask will call decayCurrentCosts until
* the scheduler has been garbage collected.
*/
public static class DecayTask extends TimerTask {
@ -189,7 +190,7 @@ public DecayTask(DecayRpcScheduler scheduler, Timer timer) {
public void run() {
DecayRpcScheduler sched = schedulerRef.get();
if (sched != null) {
sched.decayCurrentCounts();
sched.decayCurrentCosts();
} else {
// Our scheduler was garbage collected since it is no longer in use,
// so we should terminate the timer as well
@ -216,6 +217,7 @@ public DecayRpcScheduler(int numLevels, String ns, Configuration conf) {
this.decayFactor = parseDecayFactor(ns, conf);
this.decayPeriodMillis = parseDecayPeriodMillis(ns, conf);
this.identityProvider = this.parseIdentityProvider(ns, conf);
this.costProvider = this.parseCostProvider(ns, conf);
this.thresholds = parseThresholds(ns, conf, numLevels);
this.backOffByResponseTimeEnabled = parseBackOffByResponseTimeEnabled(ns,
conf);
@ -243,6 +245,24 @@ public DecayRpcScheduler(int numLevels, String ns, Configuration conf) {
recomputeScheduleCache();
}
private CostProvider parseCostProvider(String ns, Configuration conf) {
List<CostProvider> providers = conf.getInstances(
ns + "." + CommonConfigurationKeys.IPC_COST_PROVIDER_KEY,
CostProvider.class);
if (providers.size() < 1) {
LOG.info("CostProvider not specified, defaulting to DefaultCostProvider");
return new DefaultCostProvider();
} else if (providers.size() > 1) {
LOG.warn("Found multiple CostProviders; using: {}",
providers.get(0).getClass());
}
CostProvider provider = providers.get(0); // use the first
provider.init(ns, conf);
return provider;
}
// Load configs
private IdentityProvider parseIdentityProvider(String ns,
Configuration conf) {
@ -389,69 +409,69 @@ private static Boolean parseBackOffByResponseTimeEnabled(String ns,
}
/**
* Decay the stored counts for each user and clean as necessary.
* Decay the stored costs for each user and clean as necessary.
* This method should be called periodically in order to keep
* counts current.
* costs current.
*/
private void decayCurrentCounts() {
LOG.debug("Start to decay current counts.");
private void decayCurrentCosts() {
LOG.debug("Start to decay current costs.");
try {
long totalDecayedCount = 0;
long totalRawCount = 0;
long totalDecayedCost = 0;
long totalRawCost = 0;
Iterator<Map.Entry<Object, List<AtomicLong>>> it =
callCounts.entrySet().iterator();
callCosts.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<Object, List<AtomicLong>> entry = it.next();
AtomicLong decayedCount = entry.getValue().get(0);
AtomicLong rawCount = entry.getValue().get(1);
AtomicLong decayedCost = entry.getValue().get(0);
AtomicLong rawCost = entry.getValue().get(1);
// Compute the next value by reducing it by the decayFactor
totalRawCount += rawCount.get();
long currentValue = decayedCount.get();
totalRawCost += rawCost.get();
long currentValue = decayedCost.get();
long nextValue = (long) (currentValue * decayFactor);
totalDecayedCount += nextValue;
decayedCount.set(nextValue);
totalDecayedCost += nextValue;
decayedCost.set(nextValue);
LOG.debug("Decaying counts for the user: {}, " +
"its decayedCount: {}, rawCount: {}", entry.getKey(),
nextValue, rawCount.get());
LOG.debug(
"Decaying costs for the user: {}, its decayedCost: {}, rawCost: {}",
entry.getKey(), nextValue, rawCost.get());
if (nextValue == 0) {
LOG.debug("The decayed count for the user {} is zero " +
LOG.debug("The decayed cost for the user {} is zero " +
"and being cleaned.", entry.getKey());
// We will clean up unused keys here. An interesting optimization
// might be to have an upper bound on keyspace in callCounts and only
// might be to have an upper bound on keyspace in callCosts and only
// clean once we pass it.
it.remove();
}
}
// Update the total so that we remain in sync
totalDecayedCallCount.set(totalDecayedCount);
totalRawCallCount.set(totalRawCount);
totalDecayedCallCost.set(totalDecayedCost);
totalRawCallCost.set(totalRawCost);
LOG.debug("After decaying the stored counts, totalDecayedCount: {}, " +
"totalRawCallCount: {}.", totalDecayedCount, totalRawCount);
LOG.debug("After decaying the stored costs, totalDecayedCost: {}, " +
"totalRawCallCost: {}.", totalDecayedCost, totalRawCost);
// Now refresh the cache of scheduling decisions
recomputeScheduleCache();
// Update average response time with decay
updateAverageResponseTime(true);
} catch (Exception ex) {
LOG.error("decayCurrentCounts exception: " +
LOG.error("decayCurrentCosts exception: " +
ExceptionUtils.getStackTrace(ex));
throw ex;
}
}
/**
* Update the scheduleCache to match current conditions in callCounts.
* Update the scheduleCache to match current conditions in callCosts.
*/
private void recomputeScheduleCache() {
Map<Object, Integer> nextCache = new HashMap<Object, Integer>();
for (Map.Entry<Object, List<AtomicLong>> entry : callCounts.entrySet()) {
for (Map.Entry<Object, List<AtomicLong>> entry : callCosts.entrySet()) {
Object id = entry.getKey();
AtomicLong value = entry.getValue().get(0);
@ -466,51 +486,52 @@ private void recomputeScheduleCache() {
}
/**
* Get the number of occurrences and increment atomically.
* @param identity the identity of the user to increment
* @return the value before incrementation
* Adjust the stored cost for a given identity.
*
* @param identity the identity of the user whose cost should be adjusted
* @param costDelta the cost to add for the given identity
*/
private long getAndIncrementCallCounts(Object identity)
throws InterruptedException {
// We will increment the count, or create it if no such count exists
List<AtomicLong> count = this.callCounts.get(identity);
if (count == null) {
// Create the counts since no such count exists.
// idx 0 for decayed call count
// idx 1 for the raw call count
count = new ArrayList<AtomicLong>(2);
count.add(new AtomicLong(0));
count.add(new AtomicLong(0));
private void addCost(Object identity, long costDelta) {
// We will increment the cost, or create it if no such cost exists
List<AtomicLong> cost = this.callCosts.get(identity);
if (cost == null) {
// Create the costs since no such cost exists.
// idx 0 for decayed call cost
// idx 1 for the raw call cost
cost = new ArrayList<AtomicLong>(2);
cost.add(new AtomicLong(0));
cost.add(new AtomicLong(0));
// Put it in, or get the AtomicInteger that was put in by another thread
List<AtomicLong> otherCount = callCounts.putIfAbsent(identity, count);
if (otherCount != null) {
count = otherCount;
List<AtomicLong> otherCost = callCosts.putIfAbsent(identity, cost);
if (otherCost != null) {
cost = otherCost;
}
}
// Update the total
totalDecayedCallCount.getAndIncrement();
totalRawCallCount.getAndIncrement();
totalDecayedCallCost.getAndAdd(costDelta);
totalRawCallCost.getAndAdd(costDelta);
// At this point value is guaranteed to be not null. It may however have
// been clobbered from callCounts. Nonetheless, we return what
// been clobbered from callCosts. Nonetheless, we return what
// we have.
count.get(1).getAndIncrement();
return count.get(0).getAndIncrement();
cost.get(1).getAndAdd(costDelta);
cost.get(0).getAndAdd(costDelta);
}
/**
* Given the number of occurrences, compute a scheduling decision.
* @param occurrences how many occurrences
* Given the cost for an identity, compute a scheduling decision.
*
* @param cost the cost for an identity
* @return scheduling decision from 0 to numLevels - 1
*/
private int computePriorityLevel(long occurrences) {
long totalCallSnapshot = totalDecayedCallCount.get();
private int computePriorityLevel(long cost) {
long totalCallSnapshot = totalDecayedCallCost.get();
double proportion = 0;
if (totalCallSnapshot > 0) {
proportion = (double) occurrences / totalCallSnapshot;
proportion = (double) cost / totalCallSnapshot;
}
// Start with low priority levels, since they will be most common
@ -531,9 +552,6 @@ private int computePriorityLevel(long occurrences) {
* @return integer scheduling decision from 0 to numLevels - 1
*/
private int cachedOrComputedPriorityLevel(Object identity) {
try {
long occurrences = this.getAndIncrementCallCounts(identity);
// Try the cache
Map<Object, Integer> scheduleCache = scheduleCacheRef.get();
if (scheduleCache != null) {
@ -546,16 +564,11 @@ private int cachedOrComputedPriorityLevel(Object identity) {
}
// Cache was no good, compute it
int priority = computePriorityLevel(occurrences);
LOG.debug("compute priority for " + identity + " priority " + priority);
List<AtomicLong> costList = callCosts.get(identity);
long currentCost = costList == null ? 0 : costList.get(0).get();
int priority = computePriorityLevel(currentCost);
LOG.debug("compute priority for {} priority {}", identity, priority);
return priority;
} catch (InterruptedException ie) {
LOG.warn("Caught InterruptedException, returning low priority level");
LOG.debug("Fallback priority for: {} with priority: {}", identity,
numLevels - 1);
return numLevels - 1;
}
}
/**
@ -605,6 +618,10 @@ public boolean shouldBackOff(Schedulable obj) {
@Override
public void addResponseTime(String callName, Schedulable schedulable,
ProcessingDetails details) {
String user = identityProvider.makeIdentity(schedulable);
long processingCost = costProvider.getCost(details);
addCost(user, processingCost);
int priorityLevel = schedulable.getPriorityLevel();
long queueTime = details.get(Timing.QUEUE, TimeUnit.MILLISECONDS);
long processingTime = details.get(Timing.PROCESSING, TimeUnit.MILLISECONDS);
@ -652,22 +669,30 @@ void updateAverageResponseTime(boolean enableDecay) {
// For testing
@VisibleForTesting
public double getDecayFactor() { return decayFactor; }
double getDecayFactor() {
return decayFactor;
}
@VisibleForTesting
public long getDecayPeriodMillis() { return decayPeriodMillis; }
long getDecayPeriodMillis() {
return decayPeriodMillis;
}
@VisibleForTesting
public double[] getThresholds() { return thresholds; }
double[] getThresholds() {
return thresholds;
}
@VisibleForTesting
public void forceDecay() { decayCurrentCounts(); }
void forceDecay() {
decayCurrentCosts();
}
@VisibleForTesting
public Map<Object, Long> getCallCountSnapshot() {
Map<Object, Long> getCallCostSnapshot() {
HashMap<Object, Long> snapshot = new HashMap<Object, Long>();
for (Map.Entry<Object, List<AtomicLong>> entry : callCounts.entrySet()) {
for (Map.Entry<Object, List<AtomicLong>> entry : callCosts.entrySet()) {
snapshot.put(entry.getKey(), entry.getValue().get(0).get());
}
@ -675,8 +700,8 @@ public Map<Object, Long> getCallCountSnapshot() {
}
@VisibleForTesting
public long getTotalCallSnapshot() {
return totalDecayedCallCount.get();
long getTotalCallSnapshot() {
return totalDecayedCallCost.get();
}
/**
@ -809,15 +834,15 @@ public void getMetrics(MetricsCollector collector, boolean all) {
}
public int getUniqueIdentityCount() {
return callCounts.size();
return callCosts.size();
}
public long getTotalCallVolume() {
return totalDecayedCallCount.get();
return totalDecayedCallCost.get();
}
public long getTotalRawCallVolume() {
return totalRawCallCount.get();
return totalRawCallCost.get();
}
public long[] getResponseTimeCountInLastWindow() {
@ -910,17 +935,17 @@ private void addTopNCallerSummary(MetricsRecordBuilder rb) {
}
}
// Get the top N callers' raw call count and scheduler decision
// Get the top N callers' raw call cost and scheduler decision
private TopN getTopCallers(int n) {
TopN topNCallers = new TopN(n);
Iterator<Map.Entry<Object, List<AtomicLong>>> it =
callCounts.entrySet().iterator();
callCosts.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<Object, List<AtomicLong>> entry = it.next();
String caller = entry.getKey().toString();
Long count = entry.getValue().get(1).get();
if (count > 0) {
topNCallers.offer(new NameValuePair(caller, count));
Long cost = entry.getValue().get(1).get();
if (cost > 0) {
topNCallers.offer(new NameValuePair(caller, cost));
}
}
return topNCallers;
@ -941,25 +966,25 @@ public String getSchedulingDecisionSummary() {
public String getCallVolumeSummary() {
try {
return WRITER.writeValueAsString(getDecayedCallCounts());
return WRITER.writeValueAsString(getDecayedCallCosts());
} catch (Exception e) {
return "Error: " + e.getMessage();
}
}
private Map<Object, Long> getDecayedCallCounts() {
Map<Object, Long> decayedCallCounts = new HashMap<>(callCounts.size());
private Map<Object, Long> getDecayedCallCosts() {
Map<Object, Long> decayedCallCosts = new HashMap<>(callCosts.size());
Iterator<Map.Entry<Object, List<AtomicLong>>> it =
callCounts.entrySet().iterator();
callCosts.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<Object, List<AtomicLong>> entry = it.next();
Object user = entry.getKey();
Long decayedCount = entry.getValue().get(0).get();
if (decayedCount > 0) {
decayedCallCounts.put(user, decayedCount);
Long decayedCost = entry.getValue().get(0).get();
if (decayedCost > 0) {
decayedCallCosts.put(user, decayedCost);
}
}
return decayedCallCounts;
return decayedCallCosts;
}
@Override

View File

@ -0,0 +1,43 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ipc;
import org.apache.hadoop.conf.Configuration;
/**
* Ignores process details and returns a constant value for each call.
*/
public class DefaultCostProvider implements CostProvider {
@Override
public void init(String namespace, Configuration conf) {
// No-op
}
/**
* Returns 1, regardless of the processing details.
*
* @param details Process details (ignored)
* @return 1
*/
@Override
public long getCost(ProcessingDetails details) {
return 1;
}
}

View File

@ -0,0 +1,110 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ipc;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import static org.apache.hadoop.ipc.ProcessingDetails.Timing;
/**
* A {@link CostProvider} that calculates the cost for an operation
* as a weighted sum of its processing time values (see
* {@link ProcessingDetails}). This can be used by specifying the
* {@link org.apache.hadoop.fs.CommonConfigurationKeys#IPC_COST_PROVIDER_KEY}
* configuration key.
*
* <p/>This allows for configuration of how heavily each of the operations
* within {@link ProcessingDetails} is weighted. By default,
* {@link ProcessingDetails.Timing#LOCKFREE},
* {@link ProcessingDetails.Timing#RESPONSE}, and
* {@link ProcessingDetails.Timing#HANDLER} times have a weight of
* {@value #DEFAULT_LOCKFREE_WEIGHT},
* {@link ProcessingDetails.Timing#LOCKSHARED} has a weight of
* {@value #DEFAULT_LOCKSHARED_WEIGHT},
* {@link ProcessingDetails.Timing#LOCKEXCLUSIVE} has a weight of
* {@value #DEFAULT_LOCKEXCLUSIVE_WEIGHT}, and others are ignored.
* These values can all be configured using the {@link #WEIGHT_CONFIG_PREFIX}
* key, prefixed with the IPC namespace, and suffixed with the name of the
* timing measurement from {@link ProcessingDetails} (all lowercase).
* For example, to set the lock exclusive weight to be 1000, set:
* <pre>
* ipc.8020.cost-provider.impl=org.apache.hadoop.ipc.WeightedTimeCostProvider
* ipc.8020.weighted-cost.lockexclusive=1000
* </pre>
*/
public class WeightedTimeCostProvider implements CostProvider {
/**
* The prefix used in configuration values specifying the weight to use when
* determining the cost of an operation. See the class Javadoc for more info.
*/
public static final String WEIGHT_CONFIG_PREFIX = ".weighted-cost.";
static final int DEFAULT_LOCKFREE_WEIGHT = 1;
static final int DEFAULT_LOCKSHARED_WEIGHT = 10;
static final int DEFAULT_LOCKEXCLUSIVE_WEIGHT = 100;
private long[] weights;
@Override
public void init(String namespace, Configuration conf) {
weights = new long[Timing.values().length];
for (Timing timing : ProcessingDetails.Timing.values()) {
final int defaultValue;
switch (timing) {
case LOCKFREE:
case RESPONSE:
case HANDLER:
defaultValue = DEFAULT_LOCKFREE_WEIGHT;
break;
case LOCKSHARED:
defaultValue = DEFAULT_LOCKSHARED_WEIGHT;
break;
case LOCKEXCLUSIVE:
defaultValue = DEFAULT_LOCKEXCLUSIVE_WEIGHT;
break;
default:
// by default don't bill for queueing or lock wait time
defaultValue = 0;
}
String key = namespace + WEIGHT_CONFIG_PREFIX
+ timing.name().toLowerCase(Locale.ENGLISH);
weights[timing.ordinal()] = conf.getInt(key, defaultValue);
}
}
/**
* Calculates a weighted sum of the times stored on the provided processing
* details to be used as the cost in {@link DecayRpcScheduler}.
*
* @param details Processing details
* @return The weighted sum of the times. The returned unit is the same
* as the default unit used by the provided processing details.
*/
@Override
public long getCost(ProcessingDetails details) {
assert weights != null : "Cost provider must be initialized before use";
long cost = 0;
// weights was initialized to the same length as Timing.values()
for (int i = 0; i < Timing.values().length; i++) {
cost += details.get(Timing.values()[i]) * weights[i];
}
return cost;
}
}

View File

@ -91,6 +91,21 @@ This is configurable via the **identity provider**, which defaults to the **User
provider simply uses the username of the client submitting the request. However, a custom identity provider can be used
to performing throttling based on other groupings, or using an external identity provider.
### Cost-based Fair Call Queue
Though the fair call queue itself does a good job of mitigating the impact from users who submit a very high _number_
of requests, it does not take account into how expensive each request is to process. Thus, when considering the
HDFS NameNode, a user who submits 1000 "getFileInfo" requests would be prioritized the same as a user who submits 1000
"listStatus" requests on some very large directory, or a user who submits 1000 "mkdir" requests, which are more
expensive as they require an exclusive lock on the namesystem. To account for the _cost_ of an operation when
considering the prioritization of user requests, there is a "cost-based" extension to the Fair Call Queue which uses
the aggregate processing time of a user's operations to determine how that user should be prioritized. By default,
queue time (time spent waiting to be processed) and lock wait time (time spent waiting to acquire a lock) is not
considered in the cost, time spent processing without a lock is neutrally (1x) weighted, time spent processing with a
shared lock is weighted 10x higher, and time spent processing with an exclusive lock is weighted 100x higher.
This attempts to prioritize users based on the actual load they place on the server. To enable this feature, set the
`costprovder.impl` configuration to `org.apache.hadoop.ipc.WeightedTimeCostProvider` as described below.
Configuration
-------------
@ -115,12 +130,16 @@ omitted.
| scheduler.priority.levels | RpcScheduler, CallQueue | How many priority levels to use within the scheduler and call queue. | 4 |
| faircallqueue.multiplexer.weights | WeightedRoundRobinMultiplexer | How much weight to give to each priority queue. This should be a comma-separated list of length equal to the number of priority levels. | Weights descend by a factor of 2 (e.g., for 4 levels: `8,4,2,1`) |
| identity-provider.impl | DecayRpcScheduler | The identity provider mapping user requests to their identity. | org.apache.hadoop.ipc.UserIdentityProvider |
| cost-provider.impl | DecayRpcScheduler | The cost provider mapping user requests to their cost. To enable determination of cost based on processing time, use `org.apache.hadoop.ipc.WeightedTimeCostProvider`. | org.apache.hadoop.ipc.DefaultCostProvider |
| decay-scheduler.period-ms | DecayRpcScheduler | How frequently the decay factor should be applied to the operation counts of users. Higher values have less overhead, but respond less quickly to changes in client behavior. | 5000 |
| decay-scheduler.decay-factor | DecayRpcScheduler | When decaying the operation counts of users, the multiplicative decay factor to apply. Higher values will weight older operations more strongly, essentially giving the scheduler a longer memory, and penalizing heavy clients for a longer period of time. | 0.5 |
| decay-scheduler.thresholds | DecayRpcScheduler | The client load threshold, as an integer percentage, for each priority queue. Clients producing less load, as a percent of total operations, than specified at position _i_ will be given priority _i_. This should be a comma-separated list of length equal to the number of priority levels minus 1 (the last is implicitly 100). | Thresholds ascend by a factor of 2 (e.g., for 4 levels: `13,25,50`) |
| decay-scheduler.backoff.responsetime.enable | DecayRpcScheduler | Whether or not to enable the backoff by response time feature. | false |
| decay-scheduler.backoff.responsetime.thresholds | DecayRpcScheduler | The response time thresholds, as time durations, for each priority queue. If the average response time for a queue is above this threshold, backoff will occur in lower priority queues. This should be a comma-separated list of length equal to the number of priority levels. | Threshold increases by 10s per level (e.g., for 4 levels: `10s,20s,30s,40s`) |
| decay-scheduler.metrics.top.user.count | DecayRpcScheduler | The number of top (i.e., heaviest) users to emit metric information about. | 10 |
| weighted-cost.lockshared | WeightedTimeCostProvider | The weight multiplier to apply to the time spent in the processing phase which holds a shared (read) lock. | 10 |
| weighted-cost.lockexclusive | WeightedTimeCostProvider | The weight multiplier to apply to the time spent in the processing phase which holds an exclusive (write) lock. | 100 |
| weighted-cost.{handler,lockfree,response} | WeightedTimeCostProvider | The weight multiplier to apply to the time spent in the processing phases which do not involve holding a lock. See `org.apache.hadoop.ipc.ProcessingDetails.Timing` for more details on each phase. | 1 |
### Example Configuration

View File

@ -26,6 +26,7 @@
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.conf.Configuration;
@ -36,6 +37,7 @@
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.lang.management.ManagementFactory;
import java.util.concurrent.TimeUnit;
public class TestDecayRpcScheduler {
private Schedulable mockCall(String id) {
@ -131,67 +133,69 @@ public void testAccumulate() {
conf.set("ns." + DecayRpcScheduler.IPC_FCQ_DECAYSCHEDULER_PERIOD_KEY, "99999999"); // Never flush
scheduler = new DecayRpcScheduler(1, "ns", conf);
assertEquals(0, scheduler.getCallCountSnapshot().size()); // empty first
assertEquals(0, scheduler.getCallCostSnapshot().size()); // empty first
scheduler.getPriorityLevel(mockCall("A"));
assertEquals(1, scheduler.getCallCountSnapshot().get("A").longValue());
assertEquals(1, scheduler.getCallCountSnapshot().get("A").longValue());
getPriorityIncrementCallCount("A");
assertEquals(1, scheduler.getCallCostSnapshot().get("A").longValue());
assertEquals(1, scheduler.getCallCostSnapshot().get("A").longValue());
scheduler.getPriorityLevel(mockCall("A"));
scheduler.getPriorityLevel(mockCall("B"));
scheduler.getPriorityLevel(mockCall("A"));
getPriorityIncrementCallCount("A");
getPriorityIncrementCallCount("B");
getPriorityIncrementCallCount("A");
assertEquals(3, scheduler.getCallCountSnapshot().get("A").longValue());
assertEquals(1, scheduler.getCallCountSnapshot().get("B").longValue());
assertEquals(3, scheduler.getCallCostSnapshot().get("A").longValue());
assertEquals(1, scheduler.getCallCostSnapshot().get("B").longValue());
}
@Test
@SuppressWarnings("deprecation")
public void testDecay() throws Exception {
Configuration conf = new Configuration();
conf.set("ns." + DecayRpcScheduler.IPC_FCQ_DECAYSCHEDULER_PERIOD_KEY, "999999999"); // Never
conf.set("ns." + DecayRpcScheduler.IPC_FCQ_DECAYSCHEDULER_FACTOR_KEY, "0.5");
conf.setLong("ns." // Never decay
+ DecayRpcScheduler.IPC_SCHEDULER_DECAYSCHEDULER_PERIOD_KEY, 999999999);
conf.setDouble("ns."
+ DecayRpcScheduler.IPC_SCHEDULER_DECAYSCHEDULER_FACTOR_KEY, 0.5);
scheduler = new DecayRpcScheduler(1, "ns", conf);
assertEquals(0, scheduler.getTotalCallSnapshot());
for (int i = 0; i < 4; i++) {
scheduler.getPriorityLevel(mockCall("A"));
getPriorityIncrementCallCount("A");
}
sleep(1000);
for (int i = 0; i < 8; i++) {
scheduler.getPriorityLevel(mockCall("B"));
getPriorityIncrementCallCount("B");
}
assertEquals(12, scheduler.getTotalCallSnapshot());
assertEquals(4, scheduler.getCallCountSnapshot().get("A").longValue());
assertEquals(8, scheduler.getCallCountSnapshot().get("B").longValue());
assertEquals(4, scheduler.getCallCostSnapshot().get("A").longValue());
assertEquals(8, scheduler.getCallCostSnapshot().get("B").longValue());
scheduler.forceDecay();
assertEquals(6, scheduler.getTotalCallSnapshot());
assertEquals(2, scheduler.getCallCountSnapshot().get("A").longValue());
assertEquals(4, scheduler.getCallCountSnapshot().get("B").longValue());
assertEquals(2, scheduler.getCallCostSnapshot().get("A").longValue());
assertEquals(4, scheduler.getCallCostSnapshot().get("B").longValue());
scheduler.forceDecay();
assertEquals(3, scheduler.getTotalCallSnapshot());
assertEquals(1, scheduler.getCallCountSnapshot().get("A").longValue());
assertEquals(2, scheduler.getCallCountSnapshot().get("B").longValue());
assertEquals(1, scheduler.getCallCostSnapshot().get("A").longValue());
assertEquals(2, scheduler.getCallCostSnapshot().get("B").longValue());
scheduler.forceDecay();
assertEquals(1, scheduler.getTotalCallSnapshot());
assertEquals(null, scheduler.getCallCountSnapshot().get("A"));
assertEquals(1, scheduler.getCallCountSnapshot().get("B").longValue());
assertEquals(null, scheduler.getCallCostSnapshot().get("A"));
assertEquals(1, scheduler.getCallCostSnapshot().get("B").longValue());
scheduler.forceDecay();
assertEquals(0, scheduler.getTotalCallSnapshot());
assertEquals(null, scheduler.getCallCountSnapshot().get("A"));
assertEquals(null, scheduler.getCallCountSnapshot().get("B"));
assertEquals(null, scheduler.getCallCostSnapshot().get("A"));
assertEquals(null, scheduler.getCallCostSnapshot().get("B"));
}
@Test
@ -205,16 +209,16 @@ public void testPriority() throws Exception {
.IPC_FCQ_DECAYSCHEDULER_THRESHOLDS_KEY, "25, 50, 75");
scheduler = new DecayRpcScheduler(4, namespace, conf);
assertEquals(0, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(2, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("B")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("B")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("C")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("C")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(2, scheduler.getPriorityLevel(mockCall("A")));
assertEquals(0, getPriorityIncrementCallCount("A")); // 0 out of 0 calls
assertEquals(3, getPriorityIncrementCallCount("A")); // 1 out of 1 calls
assertEquals(0, getPriorityIncrementCallCount("B")); // 0 out of 2 calls
assertEquals(1, getPriorityIncrementCallCount("B")); // 1 out of 3 calls
assertEquals(0, getPriorityIncrementCallCount("C")); // 0 out of 4 calls
assertEquals(0, getPriorityIncrementCallCount("C")); // 1 out of 5 calls
assertEquals(1, getPriorityIncrementCallCount("A")); // 2 out of 6 calls
assertEquals(1, getPriorityIncrementCallCount("A")); // 3 out of 7 calls
assertEquals(2, getPriorityIncrementCallCount("A")); // 4 out of 8 calls
assertEquals(2, getPriorityIncrementCallCount("A")); // 5 out of 9 calls
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
ObjectName mxbeanName = new ObjectName(
@ -243,7 +247,7 @@ public void testPeriodic() throws InterruptedException {
assertEquals(0, scheduler.getTotalCallSnapshot());
for (int i = 0; i < 64; i++) {
scheduler.getPriorityLevel(mockCall("A"));
getPriorityIncrementCallCount("A");
}
// It should eventually decay to zero
@ -272,6 +276,108 @@ public void testNPEatInitialization() throws InterruptedException {
//set systout back
System.setOut(output);
}
}
@Test
public void testUsingWeightedTimeCostProvider() {
scheduler = getSchedulerWithWeightedTimeCostProvider(3);
// 3 details in increasing order of cost. Although medium has a longer
// duration, the shared lock is weighted less than the exclusive lock
ProcessingDetails callDetailsLow =
new ProcessingDetails(TimeUnit.MILLISECONDS);
callDetailsLow.set(ProcessingDetails.Timing.LOCKFREE, 1);
ProcessingDetails callDetailsMedium =
new ProcessingDetails(TimeUnit.MILLISECONDS);
callDetailsMedium.set(ProcessingDetails.Timing.LOCKSHARED, 500);
ProcessingDetails callDetailsHigh =
new ProcessingDetails(TimeUnit.MILLISECONDS);
callDetailsHigh.set(ProcessingDetails.Timing.LOCKEXCLUSIVE, 100);
for (int i = 0; i < 10; i++) {
scheduler.addResponseTime("ignored", mockCall("LOW"), callDetailsLow);
}
scheduler.addResponseTime("ignored", mockCall("MED"), callDetailsMedium);
scheduler.addResponseTime("ignored", mockCall("HIGH"), callDetailsHigh);
assertEquals(0, scheduler.getPriorityLevel(mockCall("LOW")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("MED")));
assertEquals(2, scheduler.getPriorityLevel(mockCall("HIGH")));
assertEquals(3, scheduler.getUniqueIdentityCount());
long totalCallInitial = scheduler.getTotalRawCallVolume();
assertEquals(totalCallInitial, scheduler.getTotalCallVolume());
scheduler.forceDecay();
// Relative priorities should stay the same after a single decay
assertEquals(0, scheduler.getPriorityLevel(mockCall("LOW")));
assertEquals(1, scheduler.getPriorityLevel(mockCall("MED")));
assertEquals(2, scheduler.getPriorityLevel(mockCall("HIGH")));
assertEquals(3, scheduler.getUniqueIdentityCount());
assertEquals(totalCallInitial, scheduler.getTotalRawCallVolume());
assertTrue(scheduler.getTotalCallVolume() < totalCallInitial);
for (int i = 0; i < 100; i++) {
scheduler.forceDecay();
}
// After enough decay cycles, all callers should be high priority again
assertEquals(0, scheduler.getPriorityLevel(mockCall("LOW")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("MED")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("HIGH")));
}
@Test
public void testUsingWeightedTimeCostProviderWithZeroCostCalls() {
scheduler = getSchedulerWithWeightedTimeCostProvider(2);
ProcessingDetails emptyDetails =
new ProcessingDetails(TimeUnit.MILLISECONDS);
for (int i = 0; i < 1000; i++) {
scheduler.addResponseTime("ignored", mockCall("MANY"), emptyDetails);
}
scheduler.addResponseTime("ignored", mockCall("FEW"), emptyDetails);
// Since the calls are all "free", they should have the same priority
assertEquals(0, scheduler.getPriorityLevel(mockCall("MANY")));
assertEquals(0, scheduler.getPriorityLevel(mockCall("FEW")));
}
@Test
public void testUsingWeightedTimeCostProviderNoRequests() {
scheduler = getSchedulerWithWeightedTimeCostProvider(2);
assertEquals(0, scheduler.getPriorityLevel(mockCall("A")));
}
/**
* Get a scheduler that uses {@link WeightedTimeCostProvider} and has
* normal decaying disabled.
*/
private static DecayRpcScheduler getSchedulerWithWeightedTimeCostProvider(
int priorityLevels) {
Configuration conf = new Configuration();
conf.setClass("ns." + CommonConfigurationKeys.IPC_COST_PROVIDER_KEY,
WeightedTimeCostProvider.class, CostProvider.class);
conf.setLong("ns."
+ DecayRpcScheduler.IPC_SCHEDULER_DECAYSCHEDULER_PERIOD_KEY, 999999);
return new DecayRpcScheduler(priorityLevels, "ns", conf);
}
/**
* Get the priority and increment the call count, assuming that
* {@link DefaultCostProvider} is in use.
*/
private int getPriorityIncrementCallCount(String callId) {
Schedulable mockCall = mockCall(callId);
int priority = scheduler.getPriorityLevel(mockCall);
// The DefaultCostProvider uses a cost of 1 for all calls, ignoring
// the processing details, so an empty one is fine
ProcessingDetails emptyProcessingDetails =
new ProcessingDetails(TimeUnit.MILLISECONDS);
scheduler.addResponseTime("ignored", mockCall, emptyProcessingDetails);
return priority;
}
}

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.ipc;
import com.google.common.base.Supplier;
import com.google.protobuf.ServiceException;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.conf.Configuration;
@ -1195,15 +1194,6 @@ public void testClientBackOffByResponseTime() throws Exception {
Exception lastException = null;
proxy = getClient(addr, conf);
MetricsRecordBuilder rb1 =
getMetrics("DecayRpcSchedulerMetrics2." + ns);
final long beginDecayedCallVolume = MetricsAsserts.getLongCounter(
"DecayedCallVolume", rb1);
final long beginRawCallVolume = MetricsAsserts.getLongCounter(
"CallVolume", rb1);
final int beginUniqueCaller = MetricsAsserts.getIntCounter("UniqueCallers",
rb1);
try {
// start a sleep RPC call that sleeps 3s.
for (int i = 0; i < numClients; i++) {
@ -1231,11 +1221,40 @@ public Void call() throws ServiceException, InterruptedException {
} else {
lastException = unwrapExeption;
}
}
} finally {
executorService.shutdown();
stop(server, proxy);
}
if (lastException != null) {
LOG.error("Last received non-RetriableException:", lastException);
}
assertTrue("RetriableException not received", succeeded);
}
/** Test that the metrics for DecayRpcScheduler are updated. */
@Test (timeout=30000)
public void testDecayRpcSchedulerMetrics() throws Exception {
final String ns = CommonConfigurationKeys.IPC_NAMESPACE + ".0";
Server server = setupDecayRpcSchedulerandTestServer(ns + ".");
MetricsRecordBuilder rb1 =
getMetrics("DecayRpcSchedulerMetrics2." + ns);
final long beginDecayedCallVolume = MetricsAsserts.getLongCounter(
"DecayedCallVolume", rb1);
final long beginRawCallVolume = MetricsAsserts.getLongCounter(
"CallVolume", rb1);
final int beginUniqueCaller = MetricsAsserts.getIntCounter("UniqueCallers",
rb1);
TestRpcService proxy = getClient(addr, conf);
try {
for (int i = 0; i < 2; i++) {
proxy.sleep(null, newSleepRequest(100));
}
// Lets Metric system update latest metrics
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
GenericTestUtils.waitFor(() -> {
MetricsRecordBuilder rb2 =
getMetrics("DecayRpcSchedulerMetrics2." + ns);
long decayedCallVolume1 = MetricsAsserts.getLongCounter(
@ -1253,28 +1272,21 @@ public Boolean get() {
double avgRespTimePriority1 = MetricsAsserts.getDoubleGauge(
"Priority.1.AvgResponseTime", rb2);
LOG.info("DecayedCallVolume: " + decayedCallVolume1);
LOG.info("CallVolume: " + rawCallVolume1);
LOG.info("UniqueCaller: " + uniqueCaller1);
LOG.info("Priority.0.CompletedCallVolume: " + callVolumePriority0);
LOG.info("Priority.1.CompletedCallVolume: " + callVolumePriority1);
LOG.info("Priority.0.AvgResponseTime: " + avgRespTimePriority0);
LOG.info("Priority.1.AvgResponseTime: " + avgRespTimePriority1);
LOG.info("DecayedCallVolume: {}", decayedCallVolume1);
LOG.info("CallVolume: {}", rawCallVolume1);
LOG.info("UniqueCaller: {}", uniqueCaller1);
LOG.info("Priority.0.CompletedCallVolume: {}", callVolumePriority0);
LOG.info("Priority.1.CompletedCallVolume: {}", callVolumePriority1);
LOG.info("Priority.0.AvgResponseTime: {}", avgRespTimePriority0);
LOG.info("Priority.1.AvgResponseTime: {}", avgRespTimePriority1);
return decayedCallVolume1 > beginDecayedCallVolume &&
rawCallVolume1 > beginRawCallVolume &&
uniqueCaller1 > beginUniqueCaller;
}
}, 30, 60000);
}
} finally {
executorService.shutdown();
stop(server, proxy);
}
if (lastException != null) {
LOG.error("Last received non-RetriableException:", lastException);
}
assertTrue("RetriableException not received", succeeded);
}
private Server setupDecayRpcSchedulerandTestServer(String ns)

View File

@ -0,0 +1,86 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ipc;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.ProcessingDetails.Timing;
import org.junit.Before;
import org.junit.Test;
import static org.apache.hadoop.ipc.WeightedTimeCostProvider.DEFAULT_LOCKEXCLUSIVE_WEIGHT;
import static org.apache.hadoop.ipc.WeightedTimeCostProvider.DEFAULT_LOCKFREE_WEIGHT;
import static org.apache.hadoop.ipc.WeightedTimeCostProvider.DEFAULT_LOCKSHARED_WEIGHT;
import static org.junit.Assert.assertEquals;
/** Tests for {@link WeightedTimeCostProvider}. */
public class TestWeightedTimeCostProvider {
private static final int QUEUE_TIME = 3;
private static final int LOCKFREE_TIME = 5;
private static final int LOCKSHARED_TIME = 7;
private static final int LOCKEXCLUSIVE_TIME = 11;
private WeightedTimeCostProvider costProvider;
private ProcessingDetails processingDetails;
@Before
public void setup() {
costProvider = new WeightedTimeCostProvider();
processingDetails = new ProcessingDetails(TimeUnit.MILLISECONDS);
processingDetails.set(Timing.QUEUE, QUEUE_TIME);
processingDetails.set(Timing.LOCKFREE, LOCKFREE_TIME);
processingDetails.set(Timing.LOCKSHARED, LOCKSHARED_TIME);
processingDetails.set(Timing.LOCKEXCLUSIVE, LOCKEXCLUSIVE_TIME);
}
@Test(expected = AssertionError.class)
public void testGetCostBeforeInit() {
costProvider.getCost(null);
}
@Test
public void testGetCostDefaultWeights() {
costProvider.init("foo", new Configuration());
long actualCost = costProvider.getCost(processingDetails);
long expectedCost = DEFAULT_LOCKFREE_WEIGHT * LOCKFREE_TIME
+ DEFAULT_LOCKSHARED_WEIGHT * LOCKSHARED_TIME
+ DEFAULT_LOCKEXCLUSIVE_WEIGHT * LOCKEXCLUSIVE_TIME;
assertEquals(expectedCost, actualCost);
}
@Test
public void testGetCostConfiguredWeights() {
Configuration conf = new Configuration();
int queueWeight = 1000;
int lockfreeWeight = 10000;
int locksharedWeight = 100000;
conf.setInt("foo.weighted-cost.queue", queueWeight);
conf.setInt("foo.weighted-cost.lockfree", lockfreeWeight);
conf.setInt("foo.weighted-cost.lockshared", locksharedWeight);
conf.setInt("bar.weighted-cost.lockexclusive", 0); // should not apply
costProvider.init("foo", conf);
long actualCost = costProvider.getCost(processingDetails);
long expectedCost = queueWeight * QUEUE_TIME
+ lockfreeWeight * LOCKFREE_TIME
+ locksharedWeight * LOCKSHARED_TIME
+ DEFAULT_LOCKEXCLUSIVE_WEIGHT * LOCKEXCLUSIVE_TIME;
assertEquals(expectedCost, actualCost);
}
}