From 2b66d9ec5bdaec7e6b278926fbb6f222c4e3afaa Mon Sep 17 00:00:00 2001 From: Jian He Date: Tue, 20 Sep 2016 15:03:07 +0800 Subject: [PATCH 1/9] YARN-3140. Improve locks in AbstractCSQueue/LeafQueue/ParentQueue. Contributed by Wangda Tan --- .../dev-support/findbugs-exclude.xml | 10 + .../scheduler/capacity/AbstractCSQueue.java | 368 ++-- .../scheduler/capacity/LeafQueue.java | 1857 +++++++++-------- .../scheduler/capacity/ParentQueue.java | 851 ++++---- .../scheduler/capacity/PlanQueue.java | 128 +- .../scheduler/capacity/ReservationQueue.java | 67 +- .../capacity/TestContainerResizing.java | 4 +- 7 files changed, 1817 insertions(+), 1468 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index a5c0f71446..01b1da7f1c 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -564,4 +564,14 @@              + + + + +     + + +     + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java index 1d8f9290cc..096f5ea7e9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java @@ -24,6 +24,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; @@ -60,25 +61,25 @@ import com.google.common.collect.Sets; public abstract class AbstractCSQueue implements CSQueue { private static final Log LOG = LogFactory.getLog(AbstractCSQueue.class); - CSQueue parent; + volatile CSQueue parent; final String queueName; volatile int numContainers; final Resource minimumAllocation; volatile Resource maximumAllocation; - QueueState state; + volatile QueueState state; final CSQueueMetrics metrics; protected final PrivilegedEntity queueEntity; final ResourceCalculator resourceCalculator; Set accessibleLabels; - RMNodeLabelsManager labelManager; + final RMNodeLabelsManager labelManager; String defaultLabelExpression; Map acls = new HashMap(); volatile boolean reservationsContinueLooking; - private boolean preemptionDisabled; + private volatile boolean preemptionDisabled; // Track resource usage-by-label like used-resource/pending-resource, etc. volatile ResourceUsage queueUsage; @@ -94,6 +95,9 @@ public abstract class AbstractCSQueue implements CSQueue { protected ActivitiesManager activitiesManager; + protected ReentrantReadWriteLock.ReadLock readLock; + protected ReentrantReadWriteLock.WriteLock writeLock; + public AbstractCSQueue(CapacitySchedulerContext cs, String queueName, CSQueue parent, CSQueue old) throws IOException { this.labelManager = cs.getRMContext().getNodeLabelManager(); @@ -116,7 +120,11 @@ public abstract class AbstractCSQueue implements CSQueue { queueEntity = new PrivilegedEntity(EntityType.QUEUE, getQueuePath()); // initialize QueueCapacities - queueCapacities = new QueueCapacities(parent == null); + queueCapacities = new QueueCapacities(parent == null); + + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + readLock = lock.readLock(); + writeLock = lock.writeLock(); } protected void setupConfigurableCapacities() { @@ -128,12 +136,12 @@ public abstract class AbstractCSQueue implements CSQueue { } @Override - public synchronized float getCapacity() { + public float getCapacity() { return queueCapacities.getCapacity(); } @Override - public synchronized float getAbsoluteCapacity() { + public float getAbsoluteCapacity() { return queueCapacities.getAbsoluteCapacity(); } @@ -167,7 +175,7 @@ public abstract class AbstractCSQueue implements CSQueue { } @Override - public synchronized QueueState getState() { + public QueueState getState() { return state; } @@ -187,13 +195,13 @@ public abstract class AbstractCSQueue implements CSQueue { } @Override - public synchronized CSQueue getParent() { + public CSQueue getParent() { return parent; } @Override - public synchronized void setParent(CSQueue newParentQueue) { - this.parent = (ParentQueue)newParentQueue; + public void setParent(CSQueue newParentQueue) { + this.parent = newParentQueue; } public Set getAccessibleNodeLabels() { @@ -221,18 +229,22 @@ public abstract class AbstractCSQueue implements CSQueue { * Set maximum capacity - used only for testing. * @param maximumCapacity new max capacity */ - synchronized void setMaxCapacity(float maximumCapacity) { - // Sanity check - CSQueueUtils.checkMaxCapacity(getQueueName(), - queueCapacities.getCapacity(), maximumCapacity); - float absMaxCapacity = - CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent); - CSQueueUtils.checkAbsoluteCapacity(getQueueName(), - queueCapacities.getAbsoluteCapacity(), - absMaxCapacity); - - queueCapacities.setMaximumCapacity(maximumCapacity); - queueCapacities.setAbsoluteMaximumCapacity(absMaxCapacity); + void setMaxCapacity(float maximumCapacity) { + try { + writeLock.lock(); + // Sanity check + CSQueueUtils.checkMaxCapacity(getQueueName(), + queueCapacities.getCapacity(), maximumCapacity); + float absMaxCapacity = CSQueueUtils.computeAbsoluteMaximumCapacity( + maximumCapacity, parent); + CSQueueUtils.checkAbsoluteCapacity(getQueueName(), + queueCapacities.getAbsoluteCapacity(), absMaxCapacity); + + queueCapacities.setMaximumCapacity(maximumCapacity); + queueCapacities.setAbsoluteMaximumCapacity(absMaxCapacity); + } finally { + writeLock.unlock(); + } } @Override @@ -240,70 +252,82 @@ public abstract class AbstractCSQueue implements CSQueue { return defaultLabelExpression; } - synchronized void setupQueueConfigs(Resource clusterResource) + void setupQueueConfigs(Resource clusterResource) throws IOException { - // get labels - this.accessibleLabels = - csContext.getConfiguration().getAccessibleNodeLabels(getQueuePath()); - this.defaultLabelExpression = csContext.getConfiguration() - .getDefaultNodeLabelExpression(getQueuePath()); + try { + writeLock.lock(); + // get labels + this.accessibleLabels = + csContext.getConfiguration().getAccessibleNodeLabels(getQueuePath()); + this.defaultLabelExpression = + csContext.getConfiguration().getDefaultNodeLabelExpression( + getQueuePath()); - // inherit from parent if labels not set - if (this.accessibleLabels == null && parent != null) { - this.accessibleLabels = parent.getAccessibleNodeLabels(); - } - - // inherit from parent if labels not set - if (this.defaultLabelExpression == null && parent != null - && this.accessibleLabels.containsAll(parent.getAccessibleNodeLabels())) { - this.defaultLabelExpression = parent.getDefaultNodeLabelExpression(); - } + // inherit from parent if labels not set + if (this.accessibleLabels == null && parent != null) { + this.accessibleLabels = parent.getAccessibleNodeLabels(); + } - // After we setup labels, we can setup capacities - setupConfigurableCapacities(); - - this.maximumAllocation = - csContext.getConfiguration().getMaximumAllocationPerQueue( - getQueuePath()); - - authorizer = YarnAuthorizationProvider.getInstance(csContext.getConf()); - - this.state = csContext.getConfiguration().getState(getQueuePath()); - this.acls = csContext.getConfiguration().getAcls(getQueuePath()); + // inherit from parent if labels not set + if (this.defaultLabelExpression == null && parent != null + && this.accessibleLabels.containsAll( + parent.getAccessibleNodeLabels())) { + this.defaultLabelExpression = parent.getDefaultNodeLabelExpression(); + } - // Update metrics - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); - - // Check if labels of this queue is a subset of parent queue, only do this - // when we not root - if (parent != null && parent.getParent() != null) { - if (parent.getAccessibleNodeLabels() != null - && !parent.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { - // if parent isn't "*", child shouldn't be "*" too - if (this.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { - throw new IOException("Parent's accessible queue is not ANY(*), " - + "but child's accessible queue is *"); - } else { - Set diff = - Sets.difference(this.getAccessibleNodeLabels(), - parent.getAccessibleNodeLabels()); - if (!diff.isEmpty()) { - throw new IOException("Some labels of child queue is not a subset " - + "of parent queue, these labels=[" - + StringUtils.join(diff, ",") + "]"); + // After we setup labels, we can setup capacities + setupConfigurableCapacities(); + + this.maximumAllocation = + csContext.getConfiguration().getMaximumAllocationPerQueue( + getQueuePath()); + + authorizer = YarnAuthorizationProvider.getInstance(csContext.getConf()); + + this.state = csContext.getConfiguration().getState(getQueuePath()); + this.acls = csContext.getConfiguration().getAcls(getQueuePath()); + + // Update metrics + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); + + // Check if labels of this queue is a subset of parent queue, only do this + // when we not root + if (parent != null && parent.getParent() != null) { + if (parent.getAccessibleNodeLabels() != null && !parent + .getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { + // if parent isn't "*", child shouldn't be "*" too + if (this.getAccessibleNodeLabels().contains( + RMNodeLabelsManager.ANY)) { + throw new IOException("Parent's accessible queue is not ANY(*), " + + "but child's accessible queue is *"); + } else{ + Set diff = Sets.difference(this.getAccessibleNodeLabels(), + parent.getAccessibleNodeLabels()); + if (!diff.isEmpty()) { + throw new IOException( + "Some labels of child queue is not a subset " + + "of parent queue, these labels=[" + StringUtils + .join(diff, ",") + "]"); + } } } } + + this.reservationsContinueLooking = + csContext.getConfiguration().getReservationContinueLook(); + + this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this); + } finally { + writeLock.unlock(); } - - this.reservationsContinueLooking = csContext.getConfiguration() - .getReservationContinueLook(); - - this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this); } - + protected QueueInfo getQueueInfo() { + // Deliberately doesn't use lock here, because this method will be invoked + // from schedulerApplicationAttempt, to avoid deadlock, sacrifice + // consistency here. + // TODO, improve this QueueInfo queueInfo = recordFactory.newRecordInstance(QueueInfo.class); queueInfo.setQueueName(queueName); queueInfo.setAccessibleNodeLabels(accessibleLabels); @@ -318,8 +342,12 @@ public abstract class AbstractCSQueue implements CSQueue { } public QueueStatistics getQueueStatistics() { - QueueStatistics stats = - recordFactory.newRecordInstance(QueueStatistics.class); + // Deliberately doesn't use lock here, because this method will be invoked + // from schedulerApplicationAttempt, to avoid deadlock, sacrifice + // consistency here. + // TODO, improve this + QueueStatistics stats = recordFactory.newRecordInstance( + QueueStatistics.class); stats.setNumAppsSubmitted(getMetrics().getAppsSubmitted()); stats.setNumAppsRunning(getMetrics().getAppsRunning()); stats.setNumAppsPending(getMetrics().getAppsPending()); @@ -351,26 +379,36 @@ public abstract class AbstractCSQueue implements CSQueue { return minimumAllocation; } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - queueUsage.incUsed(nodePartition, resource); + try { + writeLock.lock(); + queueUsage.incUsed(nodePartition, resource); - if (!changeContainerResource) { - ++numContainers; + if (!changeContainerResource) { + ++numContainers; + } + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, nodePartition); + } finally { + writeLock.unlock(); } - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, nodePartition); } - protected synchronized void releaseResource(Resource clusterResource, + protected void releaseResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - queueUsage.decUsed(nodePartition, resource); + try { + writeLock.lock(); + queueUsage.decUsed(nodePartition, resource); - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, nodePartition); + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, nodePartition); - if (!changeContainerResource) { - --numContainers; + if (!changeContainerResource) { + --numContainers; + } + } finally { + writeLock.unlock(); } } @@ -381,7 +419,13 @@ public abstract class AbstractCSQueue implements CSQueue { @Private public Map getACLs() { - return acls; + try { + readLock.lock(); + return acls; + } finally { + readLock.unlock(); + } + } @Private @@ -464,86 +508,88 @@ public abstract class AbstractCSQueue implements CSQueue { minimumAllocation); } - synchronized boolean canAssignToThisQueue(Resource clusterResource, + boolean canAssignToThisQueue(Resource clusterResource, String nodePartition, ResourceLimits currentResourceLimits, Resource resourceCouldBeUnreserved, SchedulingMode schedulingMode) { - // Get current limited resource: - // - When doing RESPECT_PARTITION_EXCLUSIVITY allocation, we will respect - // queues' max capacity. - // - When doing IGNORE_PARTITION_EXCLUSIVITY allocation, we will not respect - // queue's max capacity, queue's max capacity on the partition will be - // considered to be 100%. Which is a queue can use all resource in the - // partition. - // Doing this because: for non-exclusive allocation, we make sure there's - // idle resource on the partition, to avoid wastage, such resource will be - // leveraged as much as we can, and preemption policy will reclaim it back - // when partitoned-resource-request comes back. - Resource currentLimitResource = - getCurrentLimitResource(nodePartition, clusterResource, - currentResourceLimits, schedulingMode); + try { + readLock.lock(); + // Get current limited resource: + // - When doing RESPECT_PARTITION_EXCLUSIVITY allocation, we will respect + // queues' max capacity. + // - When doing IGNORE_PARTITION_EXCLUSIVITY allocation, we will not respect + // queue's max capacity, queue's max capacity on the partition will be + // considered to be 100%. Which is a queue can use all resource in the + // partition. + // Doing this because: for non-exclusive allocation, we make sure there's + // idle resource on the partition, to avoid wastage, such resource will be + // leveraged as much as we can, and preemption policy will reclaim it back + // when partitoned-resource-request comes back. + Resource currentLimitResource = getCurrentLimitResource(nodePartition, + clusterResource, currentResourceLimits, schedulingMode); - Resource nowTotalUsed = queueUsage.getUsed(nodePartition); + Resource nowTotalUsed = queueUsage.getUsed(nodePartition); - // Set headroom for currentResourceLimits: - // When queue is a parent queue: Headroom = limit - used + killable - // When queue is a leaf queue: Headroom = limit - used (leaf queue cannot preempt itself) - Resource usedExceptKillable = nowTotalUsed; - if (null != getChildQueues() && !getChildQueues().isEmpty()) { - usedExceptKillable = Resources.subtract(nowTotalUsed, - getTotalKillableResource(nodePartition)); - } - currentResourceLimits.setHeadroom( - Resources.subtract(currentLimitResource, usedExceptKillable)); + // Set headroom for currentResourceLimits: + // When queue is a parent queue: Headroom = limit - used + killable + // When queue is a leaf queue: Headroom = limit - used (leaf queue cannot preempt itself) + Resource usedExceptKillable = nowTotalUsed; + if (null != getChildQueues() && !getChildQueues().isEmpty()) { + usedExceptKillable = Resources.subtract(nowTotalUsed, + getTotalKillableResource(nodePartition)); + } + currentResourceLimits.setHeadroom( + Resources.subtract(currentLimitResource, usedExceptKillable)); - if (Resources.greaterThanOrEqual(resourceCalculator, clusterResource, - usedExceptKillable, currentLimitResource)) { + if (Resources.greaterThanOrEqual(resourceCalculator, clusterResource, + usedExceptKillable, currentLimitResource)) { - // if reservation continous looking enabled, check to see if could we - // potentially use this node instead of a reserved node if the application - // has reserved containers. - // TODO, now only consider reservation cases when the node has no label - if (this.reservationsContinueLooking - && nodePartition.equals(RMNodeLabelsManager.NO_LABEL) - && Resources.greaterThan(resourceCalculator, clusterResource, - resourceCouldBeUnreserved, Resources.none())) { - // resource-without-reserved = used - reserved - Resource newTotalWithoutReservedResource = - Resources.subtract(usedExceptKillable, resourceCouldBeUnreserved); + // if reservation continous looking enabled, check to see if could we + // potentially use this node instead of a reserved node if the application + // has reserved containers. + // TODO, now only consider reservation cases when the node has no label + if (this.reservationsContinueLooking && nodePartition.equals( + RMNodeLabelsManager.NO_LABEL) && Resources.greaterThan( + resourceCalculator, clusterResource, resourceCouldBeUnreserved, + Resources.none())) { + // resource-without-reserved = used - reserved + Resource newTotalWithoutReservedResource = Resources.subtract( + usedExceptKillable, resourceCouldBeUnreserved); - // when total-used-without-reserved-resource < currentLimit, we still - // have chance to allocate on this node by unreserving some containers - if (Resources.lessThan(resourceCalculator, clusterResource, - newTotalWithoutReservedResource, currentLimitResource)) { - if (LOG.isDebugEnabled()) { - LOG.debug("try to use reserved: " + getQueueName() - + " usedResources: " + queueUsage.getUsed() - + ", clusterResources: " + clusterResource - + ", reservedResources: " + resourceCouldBeUnreserved - + ", capacity-without-reserved: " - + newTotalWithoutReservedResource + ", maxLimitCapacity: " - + currentLimitResource); + // when total-used-without-reserved-resource < currentLimit, we still + // have chance to allocate on this node by unreserving some containers + if (Resources.lessThan(resourceCalculator, clusterResource, + newTotalWithoutReservedResource, currentLimitResource)) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "try to use reserved: " + getQueueName() + " usedResources: " + + queueUsage.getUsed() + ", clusterResources: " + + clusterResource + ", reservedResources: " + + resourceCouldBeUnreserved + + ", capacity-without-reserved: " + + newTotalWithoutReservedResource + ", maxLimitCapacity: " + + currentLimitResource); + } + return true; } - return true; } + if (LOG.isDebugEnabled()) { + LOG.debug(getQueueName() + "Check assign to queue, nodePartition=" + + nodePartition + " usedResources: " + queueUsage + .getUsed(nodePartition) + " clusterResources: " + clusterResource + + " currentUsedCapacity " + Resources + .divide(resourceCalculator, clusterResource, + queueUsage.getUsed(nodePartition), labelManager + .getResourceByLabel(nodePartition, clusterResource)) + + " max-capacity: " + queueCapacities + .getAbsoluteMaximumCapacity(nodePartition) + ")"); + } + return false; } - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() - + "Check assign to queue, nodePartition=" - + nodePartition - + " usedResources: " - + queueUsage.getUsed(nodePartition) - + " clusterResources: " - + clusterResource - + " currentUsedCapacity " - + Resources.divide(resourceCalculator, clusterResource, - queueUsage.getUsed(nodePartition), - labelManager.getResourceByLabel(nodePartition, clusterResource)) - + " max-capacity: " - + queueCapacities.getAbsoluteMaximumCapacity(nodePartition) + ")"); - } - return false; + return true; + } finally { + readLock.unlock(); } - return true; + } @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 922d7115d9..6129772a5b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; import java.io.IOException; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; @@ -85,11 +86,11 @@ public class LeafQueue extends AbstractCSQueue { private static final Log LOG = LogFactory.getLog(LeafQueue.class); private float absoluteUsedCapacity = 0.0f; - private int userLimit; - private float userLimitFactor; + private volatile int userLimit; + private volatile float userLimitFactor; protected int maxApplications; - protected int maxApplicationsPerUser; + protected volatile int maxApplicationsPerUser; private float maxAMResourcePerQueuePercent; @@ -97,15 +98,15 @@ public class LeafQueue extends AbstractCSQueue { private volatile boolean rackLocalityFullReset; Map applicationAttemptMap = - new HashMap(); + new ConcurrentHashMap<>(); private Priority defaultAppPriorityPerQueue; - private OrderingPolicy pendingOrderingPolicy = null; + private final OrderingPolicy pendingOrderingPolicy; private volatile float minimumAllocationFactor; - private Map users = new HashMap(); + private Map users = new ConcurrentHashMap<>(); private final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); @@ -122,7 +123,7 @@ public class LeafQueue extends AbstractCSQueue { private volatile ResourceLimits cachedResourceLimitsForHeadroom = null; - private OrderingPolicy orderingPolicy = null; + private volatile OrderingPolicy orderingPolicy = null; // Summation of consumed ratios for all users in queue private float totalUserConsumedRatio = 0; @@ -131,7 +132,7 @@ public class LeafQueue extends AbstractCSQueue { // record all ignore partition exclusivityRMContainer, this will be used to do // preemption, key is the partition of the RMContainer allocated on private Map> ignorePartitionExclusivityRMContainers = - new HashMap<>(); + new ConcurrentHashMap<>(); @SuppressWarnings({ "unchecked", "rawtypes" }) public LeafQueue(CapacitySchedulerContext cs, @@ -154,125 +155,125 @@ public class LeafQueue extends AbstractCSQueue { setupQueueConfigs(cs.getClusterResource()); } - protected synchronized void setupQueueConfigs(Resource clusterResource) + protected void setupQueueConfigs(Resource clusterResource) throws IOException { - super.setupQueueConfigs(clusterResource); - - this.lastClusterResource = clusterResource; - - this.cachedResourceLimitsForHeadroom = new ResourceLimits(clusterResource); - - // Initialize headroom info, also used for calculating application - // master resource limits. Since this happens during queue initialization - // and all queues may not be realized yet, we'll use (optimistic) - // absoluteMaxCapacity (it will be replaced with the more accurate - // absoluteMaxAvailCapacity during headroom/userlimit/allocation events) - setQueueResourceLimitsInfo(clusterResource); + try { + writeLock.lock(); + super.setupQueueConfigs(clusterResource); - CapacitySchedulerConfiguration conf = csContext.getConfiguration(); + this.lastClusterResource = clusterResource; - setOrderingPolicy(conf.getOrderingPolicy(getQueuePath())); + this.cachedResourceLimitsForHeadroom = new ResourceLimits( + clusterResource); - userLimit = conf.getUserLimit(getQueuePath()); - userLimitFactor = conf.getUserLimitFactor(getQueuePath()); + // Initialize headroom info, also used for calculating application + // master resource limits. Since this happens during queue initialization + // and all queues may not be realized yet, we'll use (optimistic) + // absoluteMaxCapacity (it will be replaced with the more accurate + // absoluteMaxAvailCapacity during headroom/userlimit/allocation events) + setQueueResourceLimitsInfo(clusterResource); - maxApplications = conf.getMaximumApplicationsPerQueue(getQueuePath()); - if (maxApplications < 0) { - int maxSystemApps = conf.getMaximumSystemApplications(); - maxApplications = - (int) (maxSystemApps * queueCapacities.getAbsoluteCapacity()); - } - maxApplicationsPerUser = Math.min(maxApplications, - (int)(maxApplications * (userLimit / 100.0f) * userLimitFactor)); - - maxAMResourcePerQueuePercent = - conf.getMaximumApplicationMasterResourcePerQueuePercent(getQueuePath()); + CapacitySchedulerConfiguration conf = csContext.getConfiguration(); - if (!SchedulerUtils.checkQueueLabelExpression( - this.accessibleLabels, this.defaultLabelExpression, null)) { - throw new IOException("Invalid default label expression of " - + " queue=" - + getQueueName() - + " doesn't have permission to access all labels " - + "in default label expression. labelExpression of resource request=" - + (this.defaultLabelExpression == null ? "" - : this.defaultLabelExpression) - + ". Queue labels=" - + (getAccessibleNodeLabels() == null ? "" : StringUtils.join( - getAccessibleNodeLabels().iterator(), ','))); - } - - nodeLocalityDelay = conf.getNodeLocalityDelay(); - rackLocalityFullReset = conf.getRackLocalityFullReset(); + setOrderingPolicy( + conf.getOrderingPolicy(getQueuePath())); - // re-init this since max allocation could have changed - this.minimumAllocationFactor = - Resources.ratio(resourceCalculator, - Resources.subtract(maximumAllocation, minimumAllocation), - maximumAllocation); + userLimit = conf.getUserLimit(getQueuePath()); + userLimitFactor = conf.getUserLimitFactor(getQueuePath()); - StringBuilder aclsString = new StringBuilder(); - for (Map.Entry e : acls.entrySet()) { - aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); - } - - StringBuilder labelStrBuilder = new StringBuilder(); - if (accessibleLabels != null) { - for (String s : accessibleLabels) { - labelStrBuilder.append(s); - labelStrBuilder.append(","); + maxApplications = conf.getMaximumApplicationsPerQueue(getQueuePath()); + if (maxApplications < 0) { + int maxSystemApps = conf.getMaximumSystemApplications(); + maxApplications = + (int) (maxSystemApps * queueCapacities.getAbsoluteCapacity()); } + maxApplicationsPerUser = Math.min(maxApplications, + (int) (maxApplications * (userLimit / 100.0f) * userLimitFactor)); + + maxAMResourcePerQueuePercent = + conf.getMaximumApplicationMasterResourcePerQueuePercent( + getQueuePath()); + + if (!SchedulerUtils.checkQueueLabelExpression(this.accessibleLabels, + this.defaultLabelExpression, null)) { + throw new IOException( + "Invalid default label expression of " + " queue=" + getQueueName() + + " doesn't have permission to access all labels " + + "in default label expression. labelExpression of resource request=" + + (this.defaultLabelExpression == null ? + "" : + this.defaultLabelExpression) + ". Queue labels=" + ( + getAccessibleNodeLabels() == null ? + "" : + StringUtils + .join(getAccessibleNodeLabels().iterator(), ','))); + } + + nodeLocalityDelay = conf.getNodeLocalityDelay(); + rackLocalityFullReset = conf.getRackLocalityFullReset(); + + // re-init this since max allocation could have changed + this.minimumAllocationFactor = Resources.ratio(resourceCalculator, + Resources.subtract(maximumAllocation, minimumAllocation), + maximumAllocation); + + StringBuilder aclsString = new StringBuilder(); + for (Map.Entry e : acls.entrySet()) { + aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); + } + + StringBuilder labelStrBuilder = new StringBuilder(); + if (accessibleLabels != null) { + for (String s : accessibleLabels) { + labelStrBuilder.append(s); + labelStrBuilder.append(","); + } + } + + defaultAppPriorityPerQueue = Priority.newInstance( + conf.getDefaultApplicationPriorityConfPerQueue(getQueuePath())); + + LOG.info( + "Initializing " + queueName + "\n" + "capacity = " + queueCapacities + .getCapacity() + " [= (float) configuredCapacity / 100 ]" + "\n" + + "absoluteCapacity = " + queueCapacities.getAbsoluteCapacity() + + " [= parentAbsoluteCapacity * capacity ]" + "\n" + + "maxCapacity = " + queueCapacities.getMaximumCapacity() + + " [= configuredMaxCapacity ]" + "\n" + "absoluteMaxCapacity = " + + queueCapacities.getAbsoluteMaximumCapacity() + + " [= 1.0 maximumCapacity undefined, " + + "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + + "\n" + "userLimit = " + userLimit + " [= configuredUserLimit ]" + + "\n" + "userLimitFactor = " + userLimitFactor + + " [= configuredUserLimitFactor ]" + "\n" + "maxApplications = " + + maxApplications + + " [= configuredMaximumSystemApplicationsPerQueue or" + + " (int)(configuredMaximumSystemApplications * absoluteCapacity)]" + + "\n" + "maxApplicationsPerUser = " + maxApplicationsPerUser + + " [= (int)(maxApplications * (userLimit / 100.0f) * " + + "userLimitFactor) ]" + "\n" + "usedCapacity = " + + queueCapacities.getUsedCapacity() + " [= usedResourcesMemory / " + + "(clusterResourceMemory * absoluteCapacity)]" + "\n" + + "absoluteUsedCapacity = " + absoluteUsedCapacity + + " [= usedResourcesMemory / clusterResourceMemory]" + "\n" + + "maxAMResourcePerQueuePercent = " + maxAMResourcePerQueuePercent + + " [= configuredMaximumAMResourcePercent ]" + "\n" + + "minimumAllocationFactor = " + minimumAllocationFactor + + " [= (float)(maximumAllocationMemory - minimumAllocationMemory) / " + + "maximumAllocationMemory ]" + "\n" + "maximumAllocation = " + + maximumAllocation + " [= configuredMaxAllocation ]" + "\n" + + "numContainers = " + numContainers + + " [= currentNumContainers ]" + "\n" + "state = " + state + + " [= configuredState ]" + "\n" + "acls = " + aclsString + + " [= configuredAcls ]" + "\n" + "nodeLocalityDelay = " + + nodeLocalityDelay + "\n" + "labels=" + labelStrBuilder + .toString() + "\n" + "reservationsContinueLooking = " + + reservationsContinueLooking + "\n" + "preemptionDisabled = " + + getPreemptionDisabled() + "\n" + "defaultAppPriorityPerQueue = " + + defaultAppPriorityPerQueue); + } finally { + writeLock.unlock(); } - - defaultAppPriorityPerQueue = Priority.newInstance(conf - .getDefaultApplicationPriorityConfPerQueue(getQueuePath())); - - LOG.info("Initializing " + queueName + "\n" + - "capacity = " + queueCapacities.getCapacity() + - " [= (float) configuredCapacity / 100 ]" + "\n" + - "absoluteCapacity = " + queueCapacities.getAbsoluteCapacity() + - " [= parentAbsoluteCapacity * capacity ]" + "\n" + - "maxCapacity = " + queueCapacities.getMaximumCapacity() + - " [= configuredMaxCapacity ]" + "\n" + - "absoluteMaxCapacity = " + queueCapacities.getAbsoluteMaximumCapacity() + - " [= 1.0 maximumCapacity undefined, " + - "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + - "\n" + - "userLimit = " + userLimit + - " [= configuredUserLimit ]" + "\n" + - "userLimitFactor = " + userLimitFactor + - " [= configuredUserLimitFactor ]" + "\n" + - "maxApplications = " + maxApplications + - " [= configuredMaximumSystemApplicationsPerQueue or" + - " (int)(configuredMaximumSystemApplications * absoluteCapacity)]" + - "\n" + - "maxApplicationsPerUser = " + maxApplicationsPerUser + - " [= (int)(maxApplications * (userLimit / 100.0f) * " + - "userLimitFactor) ]" + "\n" + - "usedCapacity = " + queueCapacities.getUsedCapacity() + - " [= usedResourcesMemory / " + - "(clusterResourceMemory * absoluteCapacity)]" + "\n" + - "absoluteUsedCapacity = " + absoluteUsedCapacity + - " [= usedResourcesMemory / clusterResourceMemory]" + "\n" + - "maxAMResourcePerQueuePercent = " + maxAMResourcePerQueuePercent + - " [= configuredMaximumAMResourcePercent ]" + "\n" + - "minimumAllocationFactor = " + minimumAllocationFactor + - " [= (float)(maximumAllocationMemory - minimumAllocationMemory) / " + - "maximumAllocationMemory ]" + "\n" + - "maximumAllocation = " + maximumAllocation + - " [= configuredMaxAllocation ]" + "\n" + - "numContainers = " + numContainers + - " [= currentNumContainers ]" + "\n" + - "state = " + state + - " [= configuredState ]" + "\n" + - "acls = " + aclsString + - " [= configuredAcls ]" + "\n" + - "nodeLocalityDelay = " + nodeLocalityDelay + "\n" + - "labels=" + labelStrBuilder.toString() + "\n" + - "reservationsContinueLooking = " + - reservationsContinueLooking + "\n" + - "preemptionDisabled = " + getPreemptionDisabled() + "\n" + - "defaultAppPriorityPerQueue = " + defaultAppPriorityPerQueue); } @Override @@ -300,7 +301,7 @@ public class LeafQueue extends AbstractCSQueue { return maxApplications; } - public synchronized int getMaxApplicationsPerUser() { + public int getMaxApplicationsPerUser() { return maxApplicationsPerUser; } @@ -318,7 +319,8 @@ public class LeafQueue extends AbstractCSQueue { * Set user limit - used only for testing. * @param userLimit new user limit */ - synchronized void setUserLimit(int userLimit) { + @VisibleForTesting + void setUserLimit(int userLimit) { this.userLimit = userLimit; } @@ -326,50 +328,74 @@ public class LeafQueue extends AbstractCSQueue { * Set user limit factor - used only for testing. * @param userLimitFactor new user limit factor */ - synchronized void setUserLimitFactor(float userLimitFactor) { + @VisibleForTesting + void setUserLimitFactor(float userLimitFactor) { this.userLimitFactor = userLimitFactor; } @Override - public synchronized int getNumApplications() { - return getNumPendingApplications() + getNumActiveApplications(); + public int getNumApplications() { + try { + readLock.lock(); + return getNumPendingApplications() + getNumActiveApplications(); + } finally { + readLock.unlock(); + } } - public synchronized int getNumPendingApplications() { - return pendingOrderingPolicy.getNumSchedulableEntities(); + public int getNumPendingApplications() { + try { + readLock.lock(); + return pendingOrderingPolicy.getNumSchedulableEntities(); + } finally { + readLock.unlock(); + } } - public synchronized int getNumActiveApplications() { - return orderingPolicy.getNumSchedulableEntities(); + public int getNumActiveApplications() { + try { + readLock.lock(); + return orderingPolicy.getNumSchedulableEntities(); + } finally { + readLock.unlock(); + } } @Private - public synchronized int getNumApplications(String user) { - return getUser(user).getTotalApplications(); + public int getNumPendingApplications(String user) { + try { + readLock.lock(); + User u = getUser(user); + if (null == u) { + return 0; + } + return u.getPendingApplications(); + } finally { + readLock.unlock(); + } } @Private - public synchronized int getNumPendingApplications(String user) { - return getUser(user).getPendingApplications(); + public int getNumActiveApplications(String user) { + try { + readLock.lock(); + User u = getUser(user); + if (null == u) { + return 0; + } + return u.getActiveApplications(); + } finally { + readLock.unlock(); + } } @Private - public synchronized int getNumActiveApplications(String user) { - return getUser(user).getActiveApplications(); - } - - @Override - public synchronized QueueState getState() { - return state; - } - - @Private - public synchronized int getUserLimit() { + public int getUserLimit() { return userLimit; } @Private - public synchronized float getUserLimitFactor() { + public float getUserLimitFactor() { return userLimitFactor; } @@ -381,112 +407,145 @@ public class LeafQueue extends AbstractCSQueue { } @Override - public synchronized List + public List getQueueUserAclInfo(UserGroupInformation user) { - QueueUserACLInfo userAclInfo = - recordFactory.newRecordInstance(QueueUserACLInfo.class); - List operations = new ArrayList(); - for (QueueACL operation : QueueACL.values()) { - if (hasAccess(operation, user)) { - operations.add(operation); + try { + readLock.lock(); + QueueUserACLInfo userAclInfo = recordFactory.newRecordInstance( + QueueUserACLInfo.class); + List operations = new ArrayList<>(); + for (QueueACL operation : QueueACL.values()) { + if (hasAccess(operation, user)) { + operations.add(operation); + } } + + userAclInfo.setQueueName(getQueueName()); + userAclInfo.setUserAcls(operations); + return Collections.singletonList(userAclInfo); + } finally { + readLock.unlock(); } - userAclInfo.setQueueName(getQueueName()); - userAclInfo.setUserAcls(operations); - return Collections.singletonList(userAclInfo); } public String toString() { - return queueName + ": " + - "capacity=" + queueCapacities.getCapacity() + ", " + - "absoluteCapacity=" + queueCapacities.getAbsoluteCapacity() + ", " + - "usedResources=" + queueUsage.getUsed() + ", " + - "usedCapacity=" + getUsedCapacity() + ", " + - "absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + ", " + - "numApps=" + getNumApplications() + ", " + - "numContainers=" + getNumContainers(); - } - - @VisibleForTesting - public synchronized void setNodeLabelManager(RMNodeLabelsManager mgr) { - this.labelManager = mgr; + try { + readLock.lock(); + return queueName + ": " + "capacity=" + queueCapacities.getCapacity() + + ", " + "absoluteCapacity=" + queueCapacities.getAbsoluteCapacity() + + ", " + "usedResources=" + queueUsage.getUsed() + ", " + + "usedCapacity=" + getUsedCapacity() + ", " + "absoluteUsedCapacity=" + + getAbsoluteUsedCapacity() + ", " + "numApps=" + getNumApplications() + + ", " + "numContainers=" + getNumContainers(); + } finally { + readLock.unlock(); + } + } @VisibleForTesting - public synchronized User getUser(String userName) { - User user = users.get(userName); - if (user == null) { - user = new User(); - users.put(userName, user); + public User getUser(String userName) { + return users.get(userName); + } + + // Get and add user if absent + private User getUserAndAddIfAbsent(String userName) { + try { + writeLock.lock(); + User u = users.get(userName); + if (null == u) { + u = new User(); + users.put(userName, u); + } + return u; + } finally { + writeLock.unlock(); } - return user; } /** * @return an ArrayList of UserInfo objects who are active in this queue */ - public synchronized ArrayList getUsers() { - ArrayList usersToReturn = new ArrayList(); - for (Map.Entry entry : users.entrySet()) { - User user = entry.getValue(); - usersToReturn.add(new UserInfo(entry.getKey(), Resources.clone(user - .getAllUsed()), user.getActiveApplications(), user - .getPendingApplications(), Resources.clone(user - .getConsumedAMResources()), Resources.clone(user - .getUserResourceLimit()), user.getResourceUsage())); + public ArrayList getUsers() { + try { + readLock.lock(); + ArrayList usersToReturn = new ArrayList(); + for (Map.Entry entry : users.entrySet()) { + User user = entry.getValue(); + usersToReturn.add( + new UserInfo(entry.getKey(), Resources.clone(user.getAllUsed()), + user.getActiveApplications(), user.getPendingApplications(), + Resources.clone(user.getConsumedAMResources()), + Resources.clone(user.getUserResourceLimit()), + user.getResourceUsage())); + } + return usersToReturn; + } finally { + readLock.unlock(); } - return usersToReturn; } @Override - public synchronized void reinitialize( + public void reinitialize( CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof LeafQueue) || - !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() + - " from " + newlyParsedQueue.getQueuePath()); + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof LeafQueue) || !newlyParsedQueue + .getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + + newlyParsedQueue.getQueuePath()); + } + + LeafQueue newlyParsedLeafQueue = (LeafQueue) newlyParsedQueue; + + // don't allow the maximum allocation to be decreased in size + // since we have already told running AM's the size + Resource oldMax = getMaximumAllocation(); + Resource newMax = newlyParsedLeafQueue.getMaximumAllocation(); + if (newMax.getMemorySize() < oldMax.getMemorySize() + || newMax.getVirtualCores() < oldMax.getVirtualCores()) { + throw new IOException("Trying to reinitialize " + getQueuePath() + + " the maximum allocation size can not be decreased!" + + " Current setting: " + oldMax + ", trying to set it to: " + + newMax); + } + + setupQueueConfigs(clusterResource); + + // queue metrics are updated, more resource may be available + // activate the pending applications if possible + activateApplications(); + } finally { + writeLock.unlock(); } - - LeafQueue newlyParsedLeafQueue = (LeafQueue)newlyParsedQueue; - - // don't allow the maximum allocation to be decreased in size - // since we have already told running AM's the size - Resource oldMax = getMaximumAllocation(); - Resource newMax = newlyParsedLeafQueue.getMaximumAllocation(); - if (newMax.getMemorySize() < oldMax.getMemorySize() - || newMax.getVirtualCores() < oldMax.getVirtualCores()) { - throw new IOException( - "Trying to reinitialize " - + getQueuePath() - + " the maximum allocation size can not be decreased!" - + " Current setting: " + oldMax - + ", trying to set it to: " + newMax); - } - - setupQueueConfigs(clusterResource); - - // queue metrics are updated, more resource may be available - // activate the pending applications if possible - activateApplications(); } @Override public void submitApplicationAttempt(FiCaSchedulerApp application, String userName) { // Careful! Locking order is important! - synchronized (this) { - User user = getUser(userName); + try { + writeLock.lock(); + + // TODO, should use getUser, use this method just to avoid UT failure + // which is caused by wrong invoking order, will fix UT separately + User user = getUserAndAddIfAbsent(userName); + // Add the attempt to our data-structures addApplicationAttempt(application, user); + } finally { + writeLock.unlock(); } // We don't want to update metrics for move app if (application.isPending()) { metrics.submitAppAttempt(userName); } + getParent().submitApplicationAttempt(application, userName); } @@ -494,37 +553,38 @@ public class LeafQueue extends AbstractCSQueue { public void submitApplication(ApplicationId applicationId, String userName, String queue) throws AccessControlException { // Careful! Locking order is important! - - User user = null; - synchronized (this) { - + try { + writeLock.lock(); // Check if the queue is accepting jobs if (getState() != QueueState.RUNNING) { - String msg = "Queue " + getQueuePath() + - " is STOPPED. Cannot accept submission of application: " + applicationId; + String msg = "Queue " + getQueuePath() + + " is STOPPED. Cannot accept submission of application: " + + applicationId; LOG.info(msg); throw new AccessControlException(msg); } // Check submission limits for queues if (getNumApplications() >= getMaxApplications()) { - String msg = "Queue " + getQueuePath() + - " already has " + getNumApplications() + " applications," + - " cannot accept submission of application: " + applicationId; + String msg = + "Queue " + getQueuePath() + " already has " + getNumApplications() + + " applications," + + " cannot accept submission of application: " + applicationId; LOG.info(msg); throw new AccessControlException(msg); } // Check submission limits for the user on this queue - user = getUser(userName); + User user = getUserAndAddIfAbsent(userName); if (user.getTotalApplications() >= getMaxApplicationsPerUser()) { - String msg = "Queue " + getQueuePath() + - " already has " + user.getTotalApplications() + - " applications from user " + userName + - " cannot accept submission of application: " + applicationId; + String msg = "Queue " + getQueuePath() + " already has " + user + .getTotalApplications() + " applications from user " + userName + + " cannot accept submission of application: " + applicationId; LOG.info(msg); throw new AccessControlException(msg); } + } finally { + writeLock.unlock(); } // Inform the parent queue @@ -546,214 +606,237 @@ public class LeafQueue extends AbstractCSQueue { return queueUsage.getAMLimit(nodePartition); } - public synchronized Resource calculateAndGetAMResourceLimit() { + @VisibleForTesting + public Resource calculateAndGetAMResourceLimit() { return calculateAndGetAMResourceLimitPerPartition( RMNodeLabelsManager.NO_LABEL); } @VisibleForTesting - public synchronized Resource getUserAMResourceLimit() { + public Resource getUserAMResourceLimit() { return getUserAMResourceLimitPerPartition(RMNodeLabelsManager.NO_LABEL); } - public synchronized Resource getUserAMResourceLimitPerPartition( + public Resource getUserAMResourceLimitPerPartition( String nodePartition) { - /* - * The user am resource limit is based on the same approach as the user - * limit (as it should represent a subset of that). This means that it uses - * the absolute queue capacity (per partition) instead of the max and is - * modified by the userlimit and the userlimit factor as is the userlimit - */ - float effectiveUserLimit = Math.max(userLimit / 100.0f, - 1.0f / Math.max(getActiveUsersManager().getNumActiveUsers(), 1)); + try { + readLock.lock(); + /* + * The user am resource limit is based on the same approach as the user + * limit (as it should represent a subset of that). This means that it uses + * the absolute queue capacity (per partition) instead of the max and is + * modified by the userlimit and the userlimit factor as is the userlimit + */ + float effectiveUserLimit = Math.max(userLimit / 100.0f, + 1.0f / Math.max(getActiveUsersManager().getNumActiveUsers(), 1)); - Resource queuePartitionResource = Resources.multiplyAndNormalizeUp( - resourceCalculator, - labelManager.getResourceByLabel(nodePartition, lastClusterResource), - queueCapacities.getAbsoluteCapacity(nodePartition), minimumAllocation); + Resource queuePartitionResource = Resources.multiplyAndNormalizeUp( + resourceCalculator, + labelManager.getResourceByLabel(nodePartition, lastClusterResource), + queueCapacities.getAbsoluteCapacity(nodePartition), + minimumAllocation); - Resource userAMLimit = Resources.multiplyAndNormalizeUp(resourceCalculator, - queuePartitionResource, - queueCapacities.getMaxAMResourcePercentage(nodePartition) - * effectiveUserLimit * userLimitFactor, minimumAllocation); - return Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, - userAMLimit, getAMResourceLimitPerPartition(nodePartition)) - ? userAMLimit - : getAMResourceLimitPerPartition(nodePartition); - } - - public synchronized Resource calculateAndGetAMResourceLimitPerPartition( - String nodePartition) { - /* - * For non-labeled partition, get the max value from resources currently - * available to the queue and the absolute resources guaranteed for the - * partition in the queue. For labeled partition, consider only the absolute - * resources guaranteed. Multiply this value (based on labeled/ - * non-labeled), * with per-partition am-resource-percent to get the max am - * resource limit for this queue and partition. - */ - Resource queuePartitionResource = Resources.multiplyAndNormalizeUp( - resourceCalculator, - labelManager.getResourceByLabel(nodePartition, lastClusterResource), - queueCapacities.getAbsoluteCapacity(nodePartition), minimumAllocation); - - Resource queueCurrentLimit = Resources.none(); - // For non-labeled partition, we need to consider the current queue - // usage limit. - if (nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - synchronized (queueResourceLimitsInfo) { - queueCurrentLimit = queueResourceLimitsInfo.getQueueCurrentLimit(); - } + Resource userAMLimit = Resources.multiplyAndNormalizeUp( + resourceCalculator, queuePartitionResource, + queueCapacities.getMaxAMResourcePercentage(nodePartition) + * effectiveUserLimit * userLimitFactor, minimumAllocation); + return Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, + userAMLimit, getAMResourceLimitPerPartition(nodePartition)) ? + userAMLimit : + getAMResourceLimitPerPartition(nodePartition); + } finally { + readLock.unlock(); } - float amResourcePercent = queueCapacities - .getMaxAMResourcePercentage(nodePartition); - - // Current usable resource for this queue and partition is the max of - // queueCurrentLimit and queuePartitionResource. - Resource queuePartitionUsableResource = Resources.max(resourceCalculator, - lastClusterResource, queueCurrentLimit, queuePartitionResource); - - Resource amResouceLimit = Resources.multiplyAndNormalizeUp( - resourceCalculator, queuePartitionUsableResource, amResourcePercent, - minimumAllocation); - - metrics.setAMResouceLimit(amResouceLimit); - queueUsage.setAMLimit(nodePartition, amResouceLimit); - return amResouceLimit; } - private synchronized void activateApplications() { - // limit of allowed resource usage for application masters - Map userAmPartitionLimit = - new HashMap(); + public Resource calculateAndGetAMResourceLimitPerPartition( + String nodePartition) { + try { + writeLock.lock(); + /* + * For non-labeled partition, get the max value from resources currently + * available to the queue and the absolute resources guaranteed for the + * partition in the queue. For labeled partition, consider only the absolute + * resources guaranteed. Multiply this value (based on labeled/ + * non-labeled), * with per-partition am-resource-percent to get the max am + * resource limit for this queue and partition. + */ + Resource queuePartitionResource = Resources.multiplyAndNormalizeUp( + resourceCalculator, + labelManager.getResourceByLabel(nodePartition, lastClusterResource), + queueCapacities.getAbsoluteCapacity(nodePartition), + minimumAllocation); - // AM Resource Limit for accessible labels can be pre-calculated. - // This will help in updating AMResourceLimit for all labels when queue - // is initialized for the first time (when no applications are present). - for (String nodePartition : getNodeLabelsForQueue()) { - calculateAndGetAMResourceLimitPerPartition(nodePartition); - } - - for (Iterator fsApp = - getPendingAppsOrderingPolicy().getAssignmentIterator(); - fsApp.hasNext();) { - FiCaSchedulerApp application = fsApp.next(); - ApplicationId applicationId = application.getApplicationId(); - - // Get the am-node-partition associated with each application - // and calculate max-am resource limit for this partition. - String partitionName = application.getAppAMNodePartitionName(); - - Resource amLimit = getAMResourceLimitPerPartition(partitionName); - // Verify whether we already calculated am-limit for this label. - if (amLimit == null) { - amLimit = calculateAndGetAMResourceLimitPerPartition(partitionName); - } - // Check am resource limit. - Resource amIfStarted = Resources.add( - application.getAMResource(partitionName), - queueUsage.getAMUsed(partitionName)); - - if (LOG.isDebugEnabled()) { - LOG.debug("application "+application.getId() +" AMResource " - + application.getAMResource(partitionName) - + " maxAMResourcePerQueuePercent " + maxAMResourcePerQueuePercent - + " amLimit " + amLimit + " lastClusterResource " - + lastClusterResource + " amIfStarted " + amIfStarted - + " AM node-partition name " + partitionName); - } - - if (!Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, - amIfStarted, amLimit)) { - if (getNumActiveApplications() < 1 - || (Resources.lessThanOrEqual(resourceCalculator, - lastClusterResource, queueUsage.getAMUsed(partitionName), - Resources.none()))) { - LOG.warn("maximum-am-resource-percent is insufficient to start a" - + " single application in queue, it is likely set too low." - + " skipping enforcement to allow at least one application" - + " to start"); - } else { - application.updateAMContainerDiagnostics(AMState.INACTIVATED, - CSAMContainerLaunchDiagnosticsConstants.QUEUE_AM_RESOURCE_LIMIT_EXCEED); - LOG.info("Not activating application " + applicationId - + " as amIfStarted: " + amIfStarted + " exceeds amLimit: " - + amLimit); - continue; + Resource queueCurrentLimit = Resources.none(); + // For non-labeled partition, we need to consider the current queue + // usage limit. + if (nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { + synchronized (queueResourceLimitsInfo){ + queueCurrentLimit = queueResourceLimitsInfo.getQueueCurrentLimit(); } } - // Check user am resource limit - User user = getUser(application.getUser()); - Resource userAMLimit = userAmPartitionLimit.get(partitionName); + float amResourcePercent = queueCapacities.getMaxAMResourcePercentage( + nodePartition); - // Verify whether we already calculated user-am-limit for this label. - if (userAMLimit == null) { - userAMLimit = getUserAMResourceLimitPerPartition(partitionName); - userAmPartitionLimit.put(partitionName, userAMLimit); + // Current usable resource for this queue and partition is the max of + // queueCurrentLimit and queuePartitionResource. + Resource queuePartitionUsableResource = Resources.max(resourceCalculator, + lastClusterResource, queueCurrentLimit, queuePartitionResource); + + Resource amResouceLimit = Resources.multiplyAndNormalizeUp( + resourceCalculator, queuePartitionUsableResource, amResourcePercent, + minimumAllocation); + + metrics.setAMResouceLimit(amResouceLimit); + queueUsage.setAMLimit(nodePartition, amResouceLimit); + return amResouceLimit; + } finally { + writeLock.unlock(); + } + } + + private void activateApplications() { + try { + writeLock.lock(); + // limit of allowed resource usage for application masters + Map userAmPartitionLimit = + new HashMap(); + + // AM Resource Limit for accessible labels can be pre-calculated. + // This will help in updating AMResourceLimit for all labels when queue + // is initialized for the first time (when no applications are present). + for (String nodePartition : getNodeLabelsForQueue()) { + calculateAndGetAMResourceLimitPerPartition(nodePartition); } - Resource userAmIfStarted = Resources.add( - application.getAMResource(partitionName), - user.getConsumedAMResources(partitionName)); + for (Iterator fsApp = + getPendingAppsOrderingPolicy().getAssignmentIterator(); + fsApp.hasNext(); ) { + FiCaSchedulerApp application = fsApp.next(); + ApplicationId applicationId = application.getApplicationId(); - if (!Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, - userAmIfStarted, userAMLimit)) { - if (getNumActiveApplications() < 1 - || (Resources.lessThanOrEqual(resourceCalculator, - lastClusterResource, queueUsage.getAMUsed(partitionName), - Resources.none()))) { - LOG.warn("maximum-am-resource-percent is insufficient to start a" - + " single application in queue for user, it is likely set too" - + " low. skipping enforcement to allow at least one application" - + " to start"); - } else { - application.updateAMContainerDiagnostics(AMState.INACTIVATED, - CSAMContainerLaunchDiagnosticsConstants.USER_AM_RESOURCE_LIMIT_EXCEED); - LOG.info("Not activating application " + applicationId - + " for user: " + user + " as userAmIfStarted: " - + userAmIfStarted + " exceeds userAmLimit: " + userAMLimit); - continue; + // Get the am-node-partition associated with each application + // and calculate max-am resource limit for this partition. + String partitionName = application.getAppAMNodePartitionName(); + + Resource amLimit = getAMResourceLimitPerPartition(partitionName); + // Verify whether we already calculated am-limit for this label. + if (amLimit == null) { + amLimit = calculateAndGetAMResourceLimitPerPartition(partitionName); } - } - user.activateApplication(); - orderingPolicy.addSchedulableEntity(application); - application.updateAMContainerDiagnostics(AMState.ACTIVATED, null); + // Check am resource limit. + Resource amIfStarted = Resources.add( + application.getAMResource(partitionName), + queueUsage.getAMUsed(partitionName)); - queueUsage.incAMUsed(partitionName, - application.getAMResource(partitionName)); - user.getResourceUsage().incAMUsed(partitionName, - application.getAMResource(partitionName)); - user.getResourceUsage().setAMLimit(partitionName, userAMLimit); - metrics.incAMUsed(application.getUser(), - application.getAMResource(partitionName)); - metrics.setAMResouceLimitForUser(application.getUser(), userAMLimit); - fsApp.remove(); - LOG.info("Application " + applicationId + " from user: " - + application.getUser() + " activated in queue: " + getQueueName()); + if (LOG.isDebugEnabled()) { + LOG.debug("application " + application.getId() + " AMResource " + + application.getAMResource(partitionName) + + " maxAMResourcePerQueuePercent " + maxAMResourcePerQueuePercent + + " amLimit " + amLimit + " lastClusterResource " + + lastClusterResource + " amIfStarted " + amIfStarted + + " AM node-partition name " + partitionName); + } + + if (!Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, + amIfStarted, amLimit)) { + if (getNumActiveApplications() < 1 || (Resources.lessThanOrEqual( + resourceCalculator, lastClusterResource, + queueUsage.getAMUsed(partitionName), Resources.none()))) { + LOG.warn("maximum-am-resource-percent is insufficient to start a" + + " single application in queue, it is likely set too low." + + " skipping enforcement to allow at least one application" + + " to start"); + } else{ + application.updateAMContainerDiagnostics(AMState.INACTIVATED, + CSAMContainerLaunchDiagnosticsConstants.QUEUE_AM_RESOURCE_LIMIT_EXCEED); + LOG.info("Not activating application " + applicationId + + " as amIfStarted: " + amIfStarted + " exceeds amLimit: " + + amLimit); + continue; + } + } + + // Check user am resource limit + User user = getUser(application.getUser()); + Resource userAMLimit = userAmPartitionLimit.get(partitionName); + + // Verify whether we already calculated user-am-limit for this label. + if (userAMLimit == null) { + userAMLimit = getUserAMResourceLimitPerPartition(partitionName); + userAmPartitionLimit.put(partitionName, userAMLimit); + } + + Resource userAmIfStarted = Resources.add( + application.getAMResource(partitionName), + user.getConsumedAMResources(partitionName)); + + if (!Resources.lessThanOrEqual(resourceCalculator, lastClusterResource, + userAmIfStarted, userAMLimit)) { + if (getNumActiveApplications() < 1 || (Resources.lessThanOrEqual( + resourceCalculator, lastClusterResource, + queueUsage.getAMUsed(partitionName), Resources.none()))) { + LOG.warn("maximum-am-resource-percent is insufficient to start a" + + " single application in queue for user, it is likely set too" + + " low. skipping enforcement to allow at least one application" + + " to start"); + } else{ + application.updateAMContainerDiagnostics(AMState.INACTIVATED, + CSAMContainerLaunchDiagnosticsConstants.USER_AM_RESOURCE_LIMIT_EXCEED); + LOG.info( + "Not activating application " + applicationId + " for user: " + + user + " as userAmIfStarted: " + userAmIfStarted + + " exceeds userAmLimit: " + userAMLimit); + continue; + } + } + user.activateApplication(); + orderingPolicy.addSchedulableEntity(application); + application.updateAMContainerDiagnostics(AMState.ACTIVATED, null); + + queueUsage.incAMUsed(partitionName, + application.getAMResource(partitionName)); + user.getResourceUsage().incAMUsed(partitionName, + application.getAMResource(partitionName)); + user.getResourceUsage().setAMLimit(partitionName, userAMLimit); + metrics.incAMUsed(application.getUser(), + application.getAMResource(partitionName)); + metrics.setAMResouceLimitForUser(application.getUser(), userAMLimit); + fsApp.remove(); + LOG.info("Application " + applicationId + " from user: " + application + .getUser() + " activated in queue: " + getQueueName()); + } + } finally { + writeLock.unlock(); } } - private synchronized void addApplicationAttempt(FiCaSchedulerApp application, + private void addApplicationAttempt(FiCaSchedulerApp application, User user) { - // Accept - user.submitApplication(); - getPendingAppsOrderingPolicy().addSchedulableEntity(application); - applicationAttemptMap.put(application.getApplicationAttemptId(), application); + try { + writeLock.lock(); + // Accept + user.submitApplication(); + getPendingAppsOrderingPolicy().addSchedulableEntity(application); + applicationAttemptMap.put(application.getApplicationAttemptId(), + application); - // Activate applications - activateApplications(); - - LOG.info("Application added -" + - " appId: " + application.getApplicationId() + - " user: " + application.getUser() + "," + - " leaf-queue: " + getQueueName() + - " #user-pending-applications: " + user.getPendingApplications() + - " #user-active-applications: " + user.getActiveApplications() + - " #queue-pending-applications: " + getNumPendingApplications() + - " #queue-active-applications: " + getNumActiveApplications() - ); + // Activate applications + activateApplications(); + + LOG.info( + "Application added -" + " appId: " + application.getApplicationId() + + " user: " + application.getUser() + "," + " leaf-queue: " + + getQueueName() + " #user-pending-applications: " + user + .getPendingApplications() + " #user-active-applications: " + user + .getActiveApplications() + " #queue-pending-applications: " + + getNumPendingApplications() + " #queue-active-applications: " + + getNumActiveApplications()); + } finally { + writeLock.unlock(); + } } @Override @@ -767,49 +850,54 @@ public class LeafQueue extends AbstractCSQueue { @Override public void finishApplicationAttempt(FiCaSchedulerApp application, String queue) { // Careful! Locking order is important! - synchronized (this) { - removeApplicationAttempt(application, getUser(application.getUser())); - } + removeApplicationAttempt(application, application.getUser()); getParent().finishApplicationAttempt(application, queue); } - public synchronized void removeApplicationAttempt( - FiCaSchedulerApp application, User user) { - String partitionName = application.getAppAMNodePartitionName(); - boolean wasActive = - orderingPolicy.removeSchedulableEntity(application); - if (!wasActive) { - pendingOrderingPolicy.removeSchedulableEntity(application); - } else { - queueUsage.decAMUsed(partitionName, - application.getAMResource(partitionName)); - user.getResourceUsage().decAMUsed(partitionName, - application.getAMResource(partitionName)); - metrics.decAMUsed(application.getUser(), - application.getAMResource(partitionName)); + private void removeApplicationAttempt( + FiCaSchedulerApp application, String userName) { + try { + writeLock.lock(); + + // TODO, should use getUser, use this method just to avoid UT failure + // which is caused by wrong invoking order, will fix UT separately + User user = getUserAndAddIfAbsent(userName); + + String partitionName = application.getAppAMNodePartitionName(); + boolean wasActive = orderingPolicy.removeSchedulableEntity(application); + if (!wasActive) { + pendingOrderingPolicy.removeSchedulableEntity(application); + } else{ + queueUsage.decAMUsed(partitionName, + application.getAMResource(partitionName)); + user.getResourceUsage().decAMUsed(partitionName, + application.getAMResource(partitionName)); + metrics.decAMUsed(application.getUser(), + application.getAMResource(partitionName)); + } + applicationAttemptMap.remove(application.getApplicationAttemptId()); + + user.finishApplication(wasActive); + if (user.getTotalApplications() == 0) { + users.remove(application.getUser()); + } + + // Check if we can activate more applications + activateApplications(); + + LOG.info( + "Application removed -" + " appId: " + application.getApplicationId() + + " user: " + application.getUser() + " queue: " + getQueueName() + + " #user-pending-applications: " + user.getPendingApplications() + + " #user-active-applications: " + user.getActiveApplications() + + " #queue-pending-applications: " + getNumPendingApplications() + + " #queue-active-applications: " + getNumActiveApplications()); + } finally { + writeLock.unlock(); } - applicationAttemptMap.remove(application.getApplicationAttemptId()); - - user.finishApplication(wasActive); - if (user.getTotalApplications() == 0) { - users.remove(application.getUser()); - } - - // Check if we can activate more applications - activateApplications(); - - LOG.info("Application removed -" + - " appId: " + application.getApplicationId() + - " user: " + application.getUser() + - " queue: " + getQueueName() + - " #user-pending-applications: " + user.getPendingApplications() + - " #user-active-applications: " + user.getActiveApplications() + - " #queue-pending-applications: " + getNumPendingApplications() + - " #queue-active-applications: " + getNumActiveApplications() - ); } - private synchronized FiCaSchedulerApp getApplication( + private FiCaSchedulerApp getApplication( ApplicationAttemptId applicationAttemptId) { return applicationAttemptMap.get(applicationAttemptId); } @@ -871,170 +959,171 @@ public class LeafQueue extends AbstractCSQueue { } @Override - public synchronized CSAssignment assignContainers(Resource clusterResource, + public CSAssignment assignContainers(Resource clusterResource, FiCaSchedulerNode node, ResourceLimits currentResourceLimits, SchedulingMode schedulingMode) { - updateCurrentResourceLimits(currentResourceLimits, clusterResource); + try { + writeLock.lock(); + updateCurrentResourceLimits(currentResourceLimits, clusterResource); - if (LOG.isDebugEnabled()) { - LOG.debug("assignContainers: node=" + node.getNodeName() - + " #applications=" + orderingPolicy.getNumSchedulableEntities()); - } + if (LOG.isDebugEnabled()) { + LOG.debug( + "assignContainers: node=" + node.getNodeName() + " #applications=" + + orderingPolicy.getNumSchedulableEntities()); + } - setPreemptionAllowed(currentResourceLimits, node.getPartition()); + setPreemptionAllowed(currentResourceLimits, node.getPartition()); - // Check for reserved resources - RMContainer reservedContainer = node.getReservedContainer(); - if (reservedContainer != null) { - FiCaSchedulerApp application = - getApplication(reservedContainer.getApplicationAttemptId()); + // Check for reserved resources + RMContainer reservedContainer = node.getReservedContainer(); + if (reservedContainer != null) { + FiCaSchedulerApp application = getApplication( + reservedContainer.getApplicationAttemptId()); - ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, - node.getNodeID(), SystemClock.getInstance().getTime(), application); + ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, + node.getNodeID(), SystemClock.getInstance().getTime(), application); - synchronized (application) { - CSAssignment assignment = - application.assignContainers(clusterResource, node, - currentResourceLimits, schedulingMode, reservedContainer); + CSAssignment assignment = application.assignContainers(clusterResource, + node, currentResourceLimits, schedulingMode, reservedContainer); handleExcessReservedContainer(clusterResource, assignment, node, application); killToPreemptContainers(clusterResource, node, assignment); return assignment; } - } - // if our queue cannot access this node, just return - if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY - && !accessibleToPartition(node.getPartition())) { - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParent().getQueueName(), getQueueName(), ActivityState.REJECTED, - ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node + // if our queue cannot access this node, just return + if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY + && !accessibleToPartition(node.getPartition())) { + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParent().getQueueName(), getQueueName(), ActivityState.REJECTED, + ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node + .getPartition()); + return CSAssignment.NULL_ASSIGNMENT; + } + + // Check if this queue need more resource, simply skip allocation if this + // queue doesn't need more resources. + if (!hasPendingResourceRequest(node.getPartition(), clusterResource, + schedulingMode)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it doesn't need more resource, schedulingMode=" + + schedulingMode.name() + " node-partition=" + node .getPartition()); - return CSAssignment.NULL_ASSIGNMENT; - } + } + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); + return CSAssignment.NULL_ASSIGNMENT; + } - // Check if this queue need more resource, simply skip allocation if this - // queue doesn't need more resources. - if (!hasPendingResourceRequest(node.getPartition(), clusterResource, - schedulingMode)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-partition=" + node.getPartition()); + for (Iterator assignmentIterator = + orderingPolicy.getAssignmentIterator(); + assignmentIterator.hasNext(); ) { + FiCaSchedulerApp application = assignmentIterator.next(); + + ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, + node.getNodeID(), SystemClock.getInstance().getTime(), application); + + // Check queue max-capacity limit + if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), + currentResourceLimits, application.getCurrentReservation(), + schedulingMode)) { + ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( + activitiesManager, node, application, application.getPriority(), + ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.EMPTY); + return CSAssignment.NULL_ASSIGNMENT; + } + + Resource userLimit = computeUserLimitAndSetHeadroom(application, + clusterResource, node.getPartition(), schedulingMode); + + // Check user limit + if (!canAssignToUser(clusterResource, application.getUser(), userLimit, + application, node.getPartition(), currentResourceLimits)) { + application.updateAMContainerDiagnostics(AMState.ACTIVATED, + "User capacity has reached its maximum limit."); + ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( + activitiesManager, node, application, application.getPriority(), + ActivityDiagnosticConstant.USER_CAPACITY_MAXIMUM_LIMIT); + continue; + } + + // Try to schedule + CSAssignment assignment = application.assignContainers(clusterResource, + node, currentResourceLimits, schedulingMode, null); + + if (LOG.isDebugEnabled()) { + LOG.debug("post-assignContainers for application " + application + .getApplicationId()); + application.showRequests(); + } + + // Did we schedule or reserve a container? + Resource assigned = assignment.getResource(); + + handleExcessReservedContainer(clusterResource, assignment, node, + application); + killToPreemptContainers(clusterResource, node, assignment); + + if (Resources.greaterThan(resourceCalculator, clusterResource, assigned, + Resources.none())) { + // Get reserved or allocated container from application + RMContainer reservedOrAllocatedRMContainer = + application.getRMContainer(assignment.getAssignmentInformation() + .getFirstAllocatedOrReservedContainerId()); + + // Book-keeping + // Note: Update headroom to account for current allocation too... + allocateResource(clusterResource, application, assigned, + node.getPartition(), reservedOrAllocatedRMContainer, + assignment.isIncreasedAllocation()); + + // Update reserved metrics + Resource reservedRes = + assignment.getAssignmentInformation().getReserved(); + if (reservedRes != null && !reservedRes.equals(Resources.none())) { + incReservedResource(node.getPartition(), reservedRes); + } + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParent().getQueueName(), getQueueName(), + ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); + + // Done + return assignment; + } else if (assignment.getSkippedType() + == CSAssignment.SkippedType.OTHER) { + ActivitiesLogger.APP.finishSkippedAppAllocationRecording( + activitiesManager, application.getApplicationId(), + ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY); + application.updateNodeInfoForAMDiagnostics(node); + } else if (assignment.getSkippedType() + == CSAssignment.SkippedType.QUEUE_LIMIT) { + return assignment; + } else{ + // If we don't allocate anything, and it is not skipped by application, + // we will return to respect FIFO of applications + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.RESPECT_FIFO); + ActivitiesLogger.APP.finishSkippedAppAllocationRecording( + activitiesManager, application.getApplicationId(), + ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY); + return CSAssignment.NULL_ASSIGNMENT; + } } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); + ActivityDiagnosticConstant.EMPTY); + return CSAssignment.NULL_ASSIGNMENT; + } finally { + writeLock.unlock(); } - - for (Iterator assignmentIterator = - orderingPolicy.getAssignmentIterator(); assignmentIterator.hasNext();) { - FiCaSchedulerApp application = assignmentIterator.next(); - - ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager, - node.getNodeID(), SystemClock.getInstance().getTime(), application); - - // Check queue max-capacity limit - if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), - currentResourceLimits, application.getCurrentReservation(), - schedulingMode)) { - ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( - activitiesManager, node, - application, application.getPriority(), - ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.EMPTY); - return CSAssignment.NULL_ASSIGNMENT; - } - - Resource userLimit = - computeUserLimitAndSetHeadroom(application, clusterResource, - node.getPartition(), schedulingMode); - - // Check user limit - if (!canAssignToUser(clusterResource, application.getUser(), userLimit, - application, node.getPartition(), currentResourceLimits)) { - application.updateAMContainerDiagnostics(AMState.ACTIVATED, - "User capacity has reached its maximum limit."); - ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue( - activitiesManager, node, - application, application.getPriority(), - ActivityDiagnosticConstant.USER_CAPACITY_MAXIMUM_LIMIT); - continue; - } - - // Try to schedule - CSAssignment assignment = - application.assignContainers(clusterResource, node, - currentResourceLimits, schedulingMode, null); - - if (LOG.isDebugEnabled()) { - LOG.debug("post-assignContainers for application " - + application.getApplicationId()); - application.showRequests(); - } - - // Did we schedule or reserve a container? - Resource assigned = assignment.getResource(); - - handleExcessReservedContainer(clusterResource, assignment, node, - application); - killToPreemptContainers(clusterResource, node, assignment); - - if (Resources.greaterThan(resourceCalculator, clusterResource, assigned, - Resources.none())) { - // Get reserved or allocated container from application - RMContainer reservedOrAllocatedRMContainer = - application.getRMContainer(assignment.getAssignmentInformation() - .getFirstAllocatedOrReservedContainerId()); - - // Book-keeping - // Note: Update headroom to account for current allocation too... - allocateResource(clusterResource, application, assigned, - node.getPartition(), reservedOrAllocatedRMContainer, - assignment.isIncreasedAllocation()); - - // Update reserved metrics - Resource reservedRes = assignment.getAssignmentInformation() - .getReserved(); - if (reservedRes != null && !reservedRes.equals(Resources.none())) { - incReservedResource(node.getPartition(), reservedRes); - } - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParent().getQueueName(), getQueueName(), ActivityState.ACCEPTED, - ActivityDiagnosticConstant.EMPTY); - - // Done - return assignment; - } else if (assignment.getSkippedType() - == CSAssignment.SkippedType.OTHER) { - ActivitiesLogger.APP.finishSkippedAppAllocationRecording( - activitiesManager, application.getApplicationId(), - ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY); - application.updateNodeInfoForAMDiagnostics(node); - } else if(assignment.getSkippedType() - == CSAssignment.SkippedType.QUEUE_LIMIT) { - return assignment; - } else { - // If we don't allocate anything, and it is not skipped by application, - // we will return to respect FIFO of applications - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.RESPECT_FIFO); - ActivitiesLogger.APP.finishSkippedAppAllocationRecording( - activitiesManager, application.getApplicationId(), - ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY); - return CSAssignment.NULL_ASSIGNMENT; - } - } - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.EMPTY); - - return CSAssignment.NULL_ASSIGNMENT; } protected Resource getHeadroom(User user, Resource queueCurrentLimit, @@ -1109,7 +1198,8 @@ public class LeafQueue extends AbstractCSQueue { } } - @Lock({LeafQueue.class, FiCaSchedulerApp.class}) + // It doesn't necessarily to hold application's lock here. + @Lock({LeafQueue.class}) Resource computeUserLimitAndSetHeadroom(FiCaSchedulerApp application, Resource clusterResource, String nodePartition, SchedulingMode schedulingMode) { @@ -1281,51 +1371,53 @@ public class LeafQueue extends AbstractCSQueue { } @Private - protected synchronized boolean canAssignToUser(Resource clusterResource, + protected boolean canAssignToUser(Resource clusterResource, String userName, Resource limit, FiCaSchedulerApp application, String nodePartition, ResourceLimits currentResourceLimits) { - User user = getUser(userName); + try { + readLock.lock(); + User user = getUser(userName); - currentResourceLimits.setAmountNeededUnreserve(Resources.none()); + currentResourceLimits.setAmountNeededUnreserve(Resources.none()); - // Note: We aren't considering the current request since there is a fixed - // overhead of the AM, but it's a > check, not a >= check, so... - if (Resources - .greaterThan(resourceCalculator, clusterResource, - user.getUsed(nodePartition), - limit)) { - // if enabled, check to see if could we potentially use this node instead - // of a reserved node if the application has reserved containers - if (this.reservationsContinueLooking && - nodePartition.equals(CommonNodeLabelsManager.NO_LABEL)) { - if (Resources.lessThanOrEqual( - resourceCalculator, - clusterResource, - Resources.subtract(user.getUsed(), - application.getCurrentReservation()), limit)) { + // Note: We aren't considering the current request since there is a fixed + // overhead of the AM, but it's a > check, not a >= check, so... + if (Resources.greaterThan(resourceCalculator, clusterResource, + user.getUsed(nodePartition), limit)) { + // if enabled, check to see if could we potentially use this node instead + // of a reserved node if the application has reserved containers + if (this.reservationsContinueLooking && nodePartition.equals( + CommonNodeLabelsManager.NO_LABEL)) { + if (Resources.lessThanOrEqual(resourceCalculator, clusterResource, + Resources.subtract(user.getUsed(), + application.getCurrentReservation()), limit)) { - if (LOG.isDebugEnabled()) { - LOG.debug("User " + userName + " in queue " + getQueueName() - + " will exceed limit based on reservations - " + " consumed: " - + user.getUsed() + " reserved: " - + application.getCurrentReservation() + " limit: " + limit); + if (LOG.isDebugEnabled()) { + LOG.debug("User " + userName + " in queue " + getQueueName() + + " will exceed limit based on reservations - " + + " consumed: " + user.getUsed() + " reserved: " + application + .getCurrentReservation() + " limit: " + limit); + } + Resource amountNeededToUnreserve = Resources.subtract( + user.getUsed(nodePartition), limit); + // we can only acquire a new container if we unreserve first to + // respect user-limit + currentResourceLimits.setAmountNeededUnreserve( + amountNeededToUnreserve); + return true; } - Resource amountNeededToUnreserve = - Resources.subtract(user.getUsed(nodePartition), limit); - // we can only acquire a new container if we unreserve first to - // respect user-limit - currentResourceLimits.setAmountNeededUnreserve(amountNeededToUnreserve); - return true; } + if (LOG.isDebugEnabled()) { + LOG.debug("User " + userName + " in queue " + getQueueName() + + " will exceed limit - " + " consumed: " + user + .getUsed(nodePartition) + " limit: " + limit); + } + return false; } - if (LOG.isDebugEnabled()) { - LOG.debug("User " + userName + " in queue " + getQueueName() - + " will exceed limit - " + " consumed: " - + user.getUsed(nodePartition) + " limit: " + limit); - } - return false; + return true; + } finally { + readLock.unlock(); } - return true; } @Override @@ -1333,15 +1425,15 @@ public class LeafQueue extends AbstractCSQueue { FiCaSchedulerApp app, FiCaSchedulerNode node, RMContainer rmContainer) { boolean removed = false; Priority priority = null; - - synchronized (this) { + + try { + writeLock.lock(); if (rmContainer.getContainer() != null) { priority = rmContainer.getContainer().getPriority(); } if (null != priority) { - removed = app.unreserve( - rmContainer.getAllocatedSchedulerKey(), node, + removed = app.unreserve(rmContainer.getAllocatedSchedulerKey(), node, rmContainer); } @@ -1352,8 +1444,10 @@ public class LeafQueue extends AbstractCSQueue { releaseResource(clusterResource, app, rmContainer.getReservedResource(), node.getPartition(), rmContainer, true); } + } finally { + writeLock.unlock(); } - + if (removed) { getParent().unreserveIncreasedContainer(clusterResource, app, node, rmContainer); @@ -1380,42 +1474,52 @@ public class LeafQueue extends AbstractCSQueue { } } - private synchronized float calculateUserUsageRatio(Resource clusterResource, + private float calculateUserUsageRatio(Resource clusterResource, String nodePartition) { - Resource resourceByLabel = - labelManager.getResourceByLabel(nodePartition, clusterResource); - float consumed = 0; - User user; - for (Map.Entry entry : users.entrySet()) { - user = entry.getValue(); - consumed += user.resetAndUpdateUsageRatio(resourceCalculator, - resourceByLabel, nodePartition); - } - return consumed; - } - - private synchronized void recalculateQueueUsageRatio(Resource clusterResource, - String nodePartition) { - ResourceUsage queueResourceUsage = this.getQueueResourceUsage(); - - if (nodePartition == null) { - for (String partition : Sets.union(queueCapacities.getNodePartitionsSet(), - queueResourceUsage.getNodePartitionsSet())) { - qUsageRatios.setUsageRatio(partition, - calculateUserUsageRatio(clusterResource, partition)); + try { + writeLock.lock(); + Resource resourceByLabel = labelManager.getResourceByLabel(nodePartition, + clusterResource); + float consumed = 0; + User user; + for (Map.Entry entry : users.entrySet()) { + user = entry.getValue(); + consumed += user.resetAndUpdateUsageRatio(resourceCalculator, + resourceByLabel, nodePartition); } - } else { - qUsageRatios.setUsageRatio(nodePartition, - calculateUserUsageRatio(clusterResource, nodePartition)); + return consumed; + } finally { + writeLock.unlock(); } } - private synchronized void updateQueueUsageRatio(String nodePartition, + private void recalculateQueueUsageRatio(Resource clusterResource, + String nodePartition) { + try { + writeLock.lock(); + ResourceUsage queueResourceUsage = this.getQueueResourceUsage(); + + if (nodePartition == null) { + for (String partition : Sets.union( + queueCapacities.getNodePartitionsSet(), + queueResourceUsage.getNodePartitionsSet())) { + qUsageRatios.setUsageRatio(partition, + calculateUserUsageRatio(clusterResource, partition)); + } + } else{ + qUsageRatios.setUsageRatio(nodePartition, + calculateUserUsageRatio(clusterResource, nodePartition)); + } + } finally { + writeLock.unlock(); + } + } + + private void updateQueueUsageRatio(String nodePartition, float delta) { qUsageRatios.incUsageRatio(nodePartition, delta); } - @Override public void completedContainer(Resource clusterResource, FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer, @@ -1438,21 +1542,20 @@ public class LeafQueue extends AbstractCSQueue { boolean removed = false; // Careful! Locking order is important! - synchronized (this) { - + try { + writeLock.lock(); Container container = rmContainer.getContainer(); // Inform the application & the node // Note: It's safe to assume that all state changes to RMContainer - // happen under scheduler's lock... + // happen under scheduler's lock... // So, this is, in effect, a transaction across application & node if (rmContainer.getState() == RMContainerState.RESERVED) { removed = application.unreserve(rmContainer.getReservedSchedulerKey(), node, rmContainer); - } else { - removed = - application.containerCompleted(rmContainer, containerStatus, - event, node.getPartition()); + } else{ + removed = application.containerCompleted(rmContainer, containerStatus, + event, node.getPartition()); node.releaseContainer(container); } @@ -1462,12 +1565,15 @@ public class LeafQueue extends AbstractCSQueue { // Inform the ordering policy orderingPolicy.containerReleased(application, rmContainer); - + releaseResource(clusterResource, application, container.getResource(), node.getPartition(), rmContainer, false); } + } finally { + writeLock.unlock(); } + if (removed) { // Inform the parent queue _outside_ of the leaf-queue lock getParent().completedContainer(clusterResource, application, node, @@ -1480,91 +1586,104 @@ public class LeafQueue extends AbstractCSQueue { new KillableContainer(rmContainer, node.getPartition(), queueName)); } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, SchedulerApplicationAttempt application, Resource resource, String nodePartition, RMContainer rmContainer, boolean isIncreasedAllocation) { - super.allocateResource(clusterResource, resource, nodePartition, - isIncreasedAllocation); - Resource resourceByLabel = labelManager.getResourceByLabel(nodePartition, - clusterResource); - - // handle ignore exclusivity container - if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( - RMNodeLabelsManager.NO_LABEL) - && !nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - TreeSet rmContainers = null; - if (null == (rmContainers = - ignorePartitionExclusivityRMContainers.get(nodePartition))) { - rmContainers = new TreeSet<>(); - ignorePartitionExclusivityRMContainers.put(nodePartition, rmContainers); + try { + writeLock.lock(); + super.allocateResource(clusterResource, resource, nodePartition, + isIncreasedAllocation); + Resource resourceByLabel = labelManager.getResourceByLabel(nodePartition, + clusterResource); + + // handle ignore exclusivity container + if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( + RMNodeLabelsManager.NO_LABEL) && !nodePartition.equals( + RMNodeLabelsManager.NO_LABEL)) { + TreeSet rmContainers = null; + if (null == (rmContainers = ignorePartitionExclusivityRMContainers.get( + nodePartition))) { + rmContainers = new TreeSet<>(); + ignorePartitionExclusivityRMContainers.put(nodePartition, + rmContainers); + } + rmContainers.add(rmContainer); } - rmContainers.add(rmContainer); - } - // Update user metrics - String userName = application.getUser(); - User user = getUser(userName); - user.assignContainer(resource, nodePartition); + // Update user metrics + String userName = application.getUser(); - // Update usage ratios - updateQueueUsageRatio(nodePartition, - user.updateUsageRatio(resourceCalculator, resourceByLabel, - nodePartition)); + // TODO, should use getUser, use this method just to avoid UT failure + // which is caused by wrong invoking order, will fix UT separately + User user = getUserAndAddIfAbsent(userName); - // Note this is a bit unconventional since it gets the object and modifies - // it here, rather then using set routine - Resources.subtractFrom(application.getHeadroom(), resource); // headroom - metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); - - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() + - " user=" + userName + - " used=" + queueUsage.getUsed() + " numContainers=" + numContainers + - " headroom = " + application.getHeadroom() + - " user-resources=" + user.getUsed() - ); + user.assignContainer(resource, nodePartition); + + // Update usage ratios + updateQueueUsageRatio(nodePartition, + user.updateUsageRatio(resourceCalculator, resourceByLabel, + nodePartition)); + + // Note this is a bit unconventional since it gets the object and modifies + // it here, rather then using set routine + Resources.subtractFrom(application.getHeadroom(), resource); // headroom + metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); + + if (LOG.isDebugEnabled()) { + LOG.debug(getQueueName() + " user=" + userName + " used=" + queueUsage + .getUsed() + " numContainers=" + numContainers + " headroom = " + + application.getHeadroom() + " user-resources=" + user.getUsed()); + } + } finally { + writeLock.unlock(); } } - synchronized void releaseResource(Resource clusterResource, + void releaseResource(Resource clusterResource, FiCaSchedulerApp application, Resource resource, String nodePartition, RMContainer rmContainer, boolean isChangeResource) { - super.releaseResource(clusterResource, resource, nodePartition, - isChangeResource); - Resource resourceByLabel = labelManager.getResourceByLabel(nodePartition, - clusterResource); - - // handle ignore exclusivity container - if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( - RMNodeLabelsManager.NO_LABEL) - && !nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { - if (ignorePartitionExclusivityRMContainers.containsKey(nodePartition)) { - Set rmContainers = - ignorePartitionExclusivityRMContainers.get(nodePartition); - rmContainers.remove(rmContainer); - if (rmContainers.isEmpty()) { - ignorePartitionExclusivityRMContainers.remove(nodePartition); + try { + writeLock.lock(); + super.releaseResource(clusterResource, resource, nodePartition, + isChangeResource); + Resource resourceByLabel = labelManager.getResourceByLabel(nodePartition, + clusterResource); + + // handle ignore exclusivity container + if (null != rmContainer && rmContainer.getNodeLabelExpression().equals( + RMNodeLabelsManager.NO_LABEL) && !nodePartition.equals( + RMNodeLabelsManager.NO_LABEL)) { + if (ignorePartitionExclusivityRMContainers.containsKey(nodePartition)) { + Set rmContainers = + ignorePartitionExclusivityRMContainers.get(nodePartition); + rmContainers.remove(rmContainer); + if (rmContainers.isEmpty()) { + ignorePartitionExclusivityRMContainers.remove(nodePartition); + } } } - } - // Update user metrics - String userName = application.getUser(); - User user = getUser(userName); - user.releaseContainer(resource, nodePartition); + // Update user metrics + String userName = application.getUser(); + User user = getUserAndAddIfAbsent(userName); + user.releaseContainer(resource, nodePartition); - // Update usage ratios - updateQueueUsageRatio(nodePartition, - user.updateUsageRatio(resourceCalculator, resourceByLabel, - nodePartition)); + // Update usage ratios + updateQueueUsageRatio(nodePartition, + user.updateUsageRatio(resourceCalculator, resourceByLabel, + nodePartition)); - metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); + metrics.setAvailableResourcesToUser(userName, application.getHeadroom()); - if (LOG.isDebugEnabled()) { - LOG.debug(getQueueName() + - " used=" + queueUsage.getUsed() + " numContainers=" + numContainers + - " user=" + userName + " user-resources=" + user.getUsed()); + if (LOG.isDebugEnabled()) { + LOG.debug( + getQueueName() + " used=" + queueUsage.getUsed() + " numContainers=" + + numContainers + " user=" + userName + " user-resources=" + + user.getUsed()); + } + } finally { + writeLock.unlock(); } } @@ -1589,35 +1708,38 @@ public class LeafQueue extends AbstractCSQueue { } @Override - public synchronized void updateClusterResource(Resource clusterResource, + public void updateClusterResource(Resource clusterResource, ResourceLimits currentResourceLimits) { - updateCurrentResourceLimits(currentResourceLimits, clusterResource); - lastClusterResource = clusterResource; - - // Update headroom info based on new cluster resource value - // absoluteMaxCapacity now, will be replaced with absoluteMaxAvailCapacity - // during allocation - setQueueResourceLimitsInfo(clusterResource); + try { + writeLock.lock(); + updateCurrentResourceLimits(currentResourceLimits, clusterResource); + lastClusterResource = clusterResource; - // Update user consumedRatios - recalculateQueueUsageRatio(clusterResource, null); + // Update headroom info based on new cluster resource value + // absoluteMaxCapacity now, will be replaced with absoluteMaxAvailCapacity + // during allocation + setQueueResourceLimitsInfo(clusterResource); - // Update metrics - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); + // Update user consumedRatios + recalculateQueueUsageRatio(clusterResource, null); - // queue metrics are updated, more resource may be available - // activate the pending applications if possible - activateApplications(); + // Update metrics + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); - // Update application properties - for (FiCaSchedulerApp application : - orderingPolicy.getSchedulableEntities()) { - synchronized (application) { + // queue metrics are updated, more resource may be available + // activate the pending applications if possible + activateApplications(); + + // Update application properties + for (FiCaSchedulerApp application : orderingPolicy + .getSchedulableEntities()) { computeUserLimitAndSetHeadroom(application, clusterResource, RMNodeLabelsManager.NO_LABEL, SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); } + } finally { + writeLock.unlock(); } } @@ -1714,30 +1836,47 @@ public class LeafQueue extends AbstractCSQueue { public static class User { ResourceUsage userResourceUsage = new ResourceUsage(); volatile Resource userResourceLimit = Resource.newInstance(0, 0); - int pendingApplications = 0; - int activeApplications = 0; + volatile int pendingApplications = 0; + volatile int activeApplications = 0; private UsageRatios userUsageRatios = new UsageRatios(); + private WriteLock writeLock; + + User() { + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + // Nobody uses read-lock now, will add it when necessary + writeLock = lock.writeLock(); + } public ResourceUsage getResourceUsage() { return userResourceUsage; } - public synchronized float resetAndUpdateUsageRatio( + public float resetAndUpdateUsageRatio( ResourceCalculator resourceCalculator, Resource resource, String nodePartition) { - userUsageRatios.setUsageRatio(nodePartition, 0); - return updateUsageRatio(resourceCalculator, resource, nodePartition); + try { + writeLock.lock(); + userUsageRatios.setUsageRatio(nodePartition, 0); + return updateUsageRatio(resourceCalculator, resource, nodePartition); + } finally { + writeLock.unlock(); + } } - public synchronized float updateUsageRatio( + public float updateUsageRatio( ResourceCalculator resourceCalculator, Resource resource, String nodePartition) { - float delta; - float newRatio = - Resources.ratio(resourceCalculator, getUsed(nodePartition), resource); - delta = newRatio - userUsageRatios.getUsageRatio(nodePartition); - userUsageRatios.setUsageRatio(nodePartition, newRatio); - return delta; + try { + writeLock.lock(); + float delta; + float newRatio = Resources.ratio(resourceCalculator, + getUsed(nodePartition), resource); + delta = newRatio - userUsageRatios.getUsageRatio(nodePartition); + userUsageRatios.setUsageRatio(nodePartition, newRatio); + return delta; + } finally { + writeLock.unlock(); + } } public Resource getUsed() { @@ -1772,21 +1911,35 @@ public class LeafQueue extends AbstractCSQueue { return getPendingApplications() + getActiveApplications(); } - public synchronized void submitApplication() { - ++pendingApplications; + public void submitApplication() { + try { + writeLock.lock(); + ++pendingApplications; + } finally { + writeLock.unlock(); + } } - public synchronized void activateApplication() { - --pendingApplications; - ++activeApplications; + public void activateApplication() { + try { + writeLock.lock(); + --pendingApplications; + ++activeApplications; + } finally { + writeLock.unlock(); + } } - public synchronized void finishApplication(boolean wasActive) { - if (wasActive) { - --activeApplications; - } - else { - --pendingApplications; + public void finishApplication(boolean wasActive) { + try { + writeLock.lock(); + if (wasActive) { + --activeApplications; + } else{ + --pendingApplications; + } + } finally { + writeLock.unlock(); } } @@ -1813,13 +1966,19 @@ public class LeafQueue extends AbstractCSQueue { if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { return; } - // Careful! Locking order is important! - synchronized (this) { - FiCaSchedulerNode node = - scheduler.getNode(rmContainer.getContainer().getNodeId()); - allocateResource(clusterResource, attempt, rmContainer.getContainer() - .getResource(), node.getPartition(), rmContainer, false); + + // Careful! Locking order is important! + try { + writeLock.lock(); + FiCaSchedulerNode node = scheduler.getNode( + rmContainer.getContainer().getNodeId()); + allocateResource(clusterResource, attempt, + rmContainer.getContainer().getResource(), node.getPartition(), + rmContainer, false); + } finally { + writeLock.unlock(); } + getParent().recoverContainer(clusterResource, attempt, rmContainer); } @@ -1843,44 +2002,56 @@ public class LeafQueue extends AbstractCSQueue { // Total pending for the queue = // sum(for each user(min((user's headroom), sum(user's pending requests)))) // NOTE: Used for calculating pedning resources in the preemption monitor. - public synchronized Resource getTotalPendingResourcesConsideringUserLimit( + public Resource getTotalPendingResourcesConsideringUserLimit( Resource resources, String partition) { - Map userNameToHeadroom = new HashMap(); - Resource pendingConsideringUserLimit = Resource.newInstance(0, 0); - for (FiCaSchedulerApp app : getApplications()) { - String userName = app.getUser(); - if (!userNameToHeadroom.containsKey(userName)) { - User user = getUser(userName); - Resource headroom = Resources.subtract( - computeUserLimit(app, resources, user, partition, - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), - user.getUsed(partition)); - // Make sure headroom is not negative. - headroom = Resources.componentwiseMax(headroom, Resources.none()); - userNameToHeadroom.put(userName, headroom); + try { + readLock.lock(); + Map userNameToHeadroom = + new HashMap<>(); + Resource pendingConsideringUserLimit = Resource.newInstance(0, 0); + for (FiCaSchedulerApp app : getApplications()) { + String userName = app.getUser(); + if (!userNameToHeadroom.containsKey(userName)) { + User user = getUser(userName); + Resource headroom = Resources.subtract( + computeUserLimit(app, resources, user, partition, + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY), + user.getUsed(partition)); + // Make sure headroom is not negative. + headroom = Resources.componentwiseMax(headroom, Resources.none()); + userNameToHeadroom.put(userName, headroom); + } + Resource minpendingConsideringUserLimit = Resources.componentwiseMin( + userNameToHeadroom.get(userName), + app.getAppAttemptResourceUsage().getPending(partition)); + Resources.addTo(pendingConsideringUserLimit, + minpendingConsideringUserLimit); + Resources.subtractFrom(userNameToHeadroom.get(userName), + minpendingConsideringUserLimit); } - Resource minpendingConsideringUserLimit = - Resources.componentwiseMin(userNameToHeadroom.get(userName), - app.getAppAttemptResourceUsage().getPending(partition)); - Resources.addTo(pendingConsideringUserLimit, - minpendingConsideringUserLimit); - Resources.subtractFrom( - userNameToHeadroom.get(userName), minpendingConsideringUserLimit); + return pendingConsideringUserLimit; + } finally { + readLock.unlock(); } - return pendingConsideringUserLimit; + } @Override - public synchronized void collectSchedulerApplications( + public void collectSchedulerApplications( Collection apps) { - for (FiCaSchedulerApp pendingApp : pendingOrderingPolicy - .getSchedulableEntities()) { - apps.add(pendingApp.getApplicationAttemptId()); - } - for (FiCaSchedulerApp app : - orderingPolicy.getSchedulableEntities()) { - apps.add(app.getApplicationAttemptId()); + try { + readLock.lock(); + for (FiCaSchedulerApp pendingApp : pendingOrderingPolicy + .getSchedulableEntities()) { + apps.add(pendingApp.getApplicationAttemptId()); + } + for (FiCaSchedulerApp app : orderingPolicy.getSchedulableEntities()) { + apps.add(app.getApplicationAttemptId()); + } + } finally { + readLock.unlock(); } + } @Override @@ -1921,13 +2092,24 @@ public class LeafQueue extends AbstractCSQueue { /** * return all ignored partition exclusivity RMContainers in the LeafQueue, this - * will be used by preemption policy, and use of return - * ignorePartitionExclusivityRMContainer should protected by LeafQueue - * synchronized lock + * will be used by preemption policy. */ - public synchronized Map> + public Map> getIgnoreExclusivityRMContainers() { - return ignorePartitionExclusivityRMContainers; + Map> clonedMap = new HashMap<>(); + try { + readLock.lock(); + + for (Map.Entry> entry : ignorePartitionExclusivityRMContainers + .entrySet()) { + clonedMap.put(entry.getKey(), new TreeSet<>(entry.getValue())); + } + + return clonedMap; + + } finally { + readLock.unlock(); + } } public void setCapacity(float capacity) { @@ -1942,18 +2124,23 @@ public class LeafQueue extends AbstractCSQueue { this.maxApplications = maxApplications; } - public synchronized OrderingPolicy + public OrderingPolicy getOrderingPolicy() { return orderingPolicy; } - public synchronized void setOrderingPolicy( + void setOrderingPolicy( OrderingPolicy orderingPolicy) { - if (null != this.orderingPolicy) { - orderingPolicy.addAllSchedulableEntities(this.orderingPolicy - .getSchedulableEntities()); + try { + writeLock.lock(); + if (null != this.orderingPolicy) { + orderingPolicy.addAllSchedulableEntities( + this.orderingPolicy.getSchedulableEntities()); + } + this.orderingPolicy = orderingPolicy; + } finally { + writeLock.unlock(); } - this.orderingPolicy = orderingPolicy; } @Override @@ -1981,25 +2168,26 @@ public class LeafQueue extends AbstractCSQueue { boolean resourceDecreased = false; Resource resourceBeforeDecrease; // Grab queue lock to avoid race condition when getting container resource - synchronized (this) { + + try { + writeLock.lock(); // Make sure the decrease request is valid in terms of current resource // and target resource. This must be done under the leaf queue lock. // Throws exception if the check fails. RMServerUtils.checkSchedContainerChangeRequest(decreaseRequest, false); // Save resource before decrease for debug log - resourceBeforeDecrease = - Resources.clone(rmContainer.getAllocatedResource()); + resourceBeforeDecrease = Resources.clone( + rmContainer.getAllocatedResource()); // Do we have increase request for the same container? If so, remove it - boolean hasIncreaseRequest = - app.removeIncreaseRequest(decreaseRequest.getNodeId(), - decreaseRequest.getRMContainer().getAllocatedSchedulerKey(), - decreaseRequest.getContainerId()); + boolean hasIncreaseRequest = app.removeIncreaseRequest( + decreaseRequest.getNodeId(), + decreaseRequest.getRMContainer().getAllocatedSchedulerKey(), + decreaseRequest.getContainerId()); if (hasIncreaseRequest) { if (LOG.isDebugEnabled()) { LOG.debug("While processing decrease requests, found an increase" - + " request for the same container " - + decreaseRequest.getContainerId() - + ", removed the increase request"); + + " request for the same container " + decreaseRequest + .getContainerId() + ", removed the increase request"); } } // Delta capacity is negative when it's a decrease request @@ -2013,19 +2201,20 @@ public class LeafQueue extends AbstractCSQueue { + " container:" + decreaseRequest.getContainerId() + " ignore this decrease request."); } - } else { + } else{ // Release the delta resource releaseResource(clusterResource, app, absDelta, decreaseRequest.getNodePartition(), - decreaseRequest.getRMContainer(), - true); + decreaseRequest.getRMContainer(), true); // Notify application app.decreaseContainer(decreaseRequest); // Notify node - decreaseRequest.getSchedulerNode() - .decreaseContainer(decreaseRequest.getContainerId(), absDelta); + decreaseRequest.getSchedulerNode().decreaseContainer( + decreaseRequest.getContainerId(), absDelta); resourceDecreased = true; } + } finally { + writeLock.unlock(); } if (resourceDecreased) { @@ -2038,7 +2227,7 @@ public class LeafQueue extends AbstractCSQueue { } } - public synchronized OrderingPolicy + public OrderingPolicy getPendingAppsOrderingPolicy() { return pendingOrderingPolicy; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java index 3e9785fc92..ffb68928ec 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java @@ -107,68 +107,77 @@ public class ParentQueue extends AbstractCSQueue { ", fullname=" + getQueuePath()); } - synchronized void setupQueueConfigs(Resource clusterResource) + void setupQueueConfigs(Resource clusterResource) throws IOException { - super.setupQueueConfigs(clusterResource); - StringBuilder aclsString = new StringBuilder(); - for (Map.Entry e : acls.entrySet()) { - aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); - } - - StringBuilder labelStrBuilder = new StringBuilder(); - if (accessibleLabels != null) { - for (String s : accessibleLabels) { - labelStrBuilder.append(s); - labelStrBuilder.append(","); + try { + writeLock.lock(); + super.setupQueueConfigs(clusterResource); + StringBuilder aclsString = new StringBuilder(); + for (Map.Entry e : acls.entrySet()) { + aclsString.append(e.getKey() + ":" + e.getValue().getAclString()); } - } - LOG.info(queueName + - ", capacity=" + this.queueCapacities.getCapacity() + - ", absoluteCapacity=" + this.queueCapacities.getAbsoluteCapacity() + - ", maxCapacity=" + this.queueCapacities.getMaximumCapacity() + - ", absoluteMaxCapacity=" + this.queueCapacities.getAbsoluteMaximumCapacity() + - ", state=" + state + - ", acls=" + aclsString + - ", labels=" + labelStrBuilder.toString() + "\n" + - ", reservationsContinueLooking=" + reservationsContinueLooking); + StringBuilder labelStrBuilder = new StringBuilder(); + if (accessibleLabels != null) { + for (String s : accessibleLabels) { + labelStrBuilder.append(s); + labelStrBuilder.append(","); + } + } + + LOG.info(queueName + ", capacity=" + this.queueCapacities.getCapacity() + + ", absoluteCapacity=" + this.queueCapacities.getAbsoluteCapacity() + + ", maxCapacity=" + this.queueCapacities.getMaximumCapacity() + + ", absoluteMaxCapacity=" + this.queueCapacities + .getAbsoluteMaximumCapacity() + ", state=" + state + ", acls=" + + aclsString + ", labels=" + labelStrBuilder.toString() + "\n" + + ", reservationsContinueLooking=" + reservationsContinueLooking); + } finally { + writeLock.unlock(); + } } private static float PRECISION = 0.0005f; // 0.05% precision - synchronized void setChildQueues(Collection childQueues) { - // Validate - float childCapacities = 0; - for (CSQueue queue : childQueues) { - childCapacities += queue.getCapacity(); - } - float delta = Math.abs(1.0f - childCapacities); // crude way to check - // allow capacities being set to 0, and enforce child 0 if parent is 0 - if (((queueCapacities.getCapacity() > 0) && (delta > PRECISION)) || - ((queueCapacities.getCapacity() == 0) && (childCapacities > 0))) { - throw new IllegalArgumentException("Illegal" + - " capacity of " + childCapacities + - " for children of queue " + queueName); - } - // check label capacities - for (String nodeLabel : queueCapacities.getExistingNodeLabels()) { - float capacityByLabel = queueCapacities.getCapacity(nodeLabel); - // check children's labels - float sum = 0; + + void setChildQueues(Collection childQueues) { + try { + writeLock.lock(); + // Validate + float childCapacities = 0; for (CSQueue queue : childQueues) { - sum += queue.getQueueCapacities().getCapacity(nodeLabel); + childCapacities += queue.getCapacity(); } - if ((capacityByLabel > 0 && Math.abs(1.0f - sum) > PRECISION) - || (capacityByLabel == 0) && (sum > 0)) { - throw new IllegalArgumentException("Illegal" + " capacity of " - + sum + " for children of queue " + queueName - + " for label=" + nodeLabel); + float delta = Math.abs(1.0f - childCapacities); // crude way to check + // allow capacities being set to 0, and enforce child 0 if parent is 0 + if (((queueCapacities.getCapacity() > 0) && (delta > PRECISION)) || ( + (queueCapacities.getCapacity() == 0) && (childCapacities > 0))) { + throw new IllegalArgumentException( + "Illegal" + " capacity of " + childCapacities + + " for children of queue " + queueName); } - } - - this.childQueues.clear(); - this.childQueues.addAll(childQueues); - if (LOG.isDebugEnabled()) { - LOG.debug("setChildQueues: " + getChildQueuesToPrint()); + // check label capacities + for (String nodeLabel : queueCapacities.getExistingNodeLabels()) { + float capacityByLabel = queueCapacities.getCapacity(nodeLabel); + // check children's labels + float sum = 0; + for (CSQueue queue : childQueues) { + sum += queue.getQueueCapacities().getCapacity(nodeLabel); + } + if ((capacityByLabel > 0 && Math.abs(1.0f - sum) > PRECISION) + || (capacityByLabel == 0) && (sum > 0)) { + throw new IllegalArgumentException( + "Illegal" + " capacity of " + sum + " for children of queue " + + queueName + " for label=" + nodeLabel); + } + } + + this.childQueues.clear(); + this.childQueues.addAll(childQueues); + if (LOG.isDebugEnabled()) { + LOG.debug("setChildQueues: " + getChildQueuesToPrint()); + } + } finally { + writeLock.unlock(); } } @@ -179,53 +188,70 @@ public class ParentQueue extends AbstractCSQueue { } @Override - public synchronized QueueInfo getQueueInfo( + public QueueInfo getQueueInfo( boolean includeChildQueues, boolean recursive) { - QueueInfo queueInfo = getQueueInfo(); + try { + readLock.lock(); + QueueInfo queueInfo = getQueueInfo(); - List childQueuesInfo = new ArrayList(); - if (includeChildQueues) { - for (CSQueue child : childQueues) { - // Get queue information recursively? - childQueuesInfo.add( - child.getQueueInfo(recursive, recursive)); + List childQueuesInfo = new ArrayList<>(); + if (includeChildQueues) { + for (CSQueue child : childQueues) { + // Get queue information recursively? + childQueuesInfo.add(child.getQueueInfo(recursive, recursive)); + } } + queueInfo.setChildQueues(childQueuesInfo); + + return queueInfo; + } finally { + readLock.unlock(); } - queueInfo.setChildQueues(childQueuesInfo); - - return queueInfo; + } - private synchronized QueueUserACLInfo getUserAclInfo( + private QueueUserACLInfo getUserAclInfo( UserGroupInformation user) { - QueueUserACLInfo userAclInfo = - recordFactory.newRecordInstance(QueueUserACLInfo.class); - List operations = new ArrayList(); - for (QueueACL operation : QueueACL.values()) { - if (hasAccess(operation, user)) { - operations.add(operation); - } + try { + readLock.lock(); + QueueUserACLInfo userAclInfo = recordFactory.newRecordInstance( + QueueUserACLInfo.class); + List operations = new ArrayList(); + for (QueueACL operation : QueueACL.values()) { + if (hasAccess(operation, user)) { + operations.add(operation); + } + } + + userAclInfo.setQueueName(getQueueName()); + userAclInfo.setUserAcls(operations); + return userAclInfo; + } finally { + readLock.unlock(); } - userAclInfo.setQueueName(getQueueName()); - userAclInfo.setUserAcls(operations); - return userAclInfo; } @Override - public synchronized List getQueueUserAclInfo( + public List getQueueUserAclInfo( UserGroupInformation user) { - List userAcls = new ArrayList(); - - // Add parent queue acls - userAcls.add(getUserAclInfo(user)); - - // Add children queue acls - for (CSQueue child : childQueues) { - userAcls.addAll(child.getQueueUserAclInfo(user)); + try { + readLock.lock(); + List userAcls = new ArrayList<>(); + + // Add parent queue acls + userAcls.add(getUserAclInfo(user)); + + // Add children queue acls + for (CSQueue child : childQueues) { + userAcls.addAll(child.getQueueUserAclInfo(user)); + } + + return userAcls; + } finally { + readLock.unlock(); } - - return userAcls; + } public String toString() { @@ -240,52 +266,59 @@ public class ParentQueue extends AbstractCSQueue { } @Override - public synchronized void reinitialize(CSQueue newlyParsedQueue, + public void reinitialize(CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof ParentQueue) || - !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() + - " from " + newlyParsedQueue.getQueuePath()); - } - - ParentQueue newlyParsedParentQueue = (ParentQueue)newlyParsedQueue; - - // Set new configs - setupQueueConfigs(clusterResource); - - // Re-configure existing child queues and add new ones - // The CS has already checked to ensure all existing child queues are present! - Map currentChildQueues = getQueues(childQueues); - Map newChildQueues = - getQueues(newlyParsedParentQueue.childQueues); - for (Map.Entry e : newChildQueues.entrySet()) { - String newChildQueueName = e.getKey(); - CSQueue newChildQueue = e.getValue(); - - CSQueue childQueue = currentChildQueues.get(newChildQueueName); - - // Check if the child-queue already exists - if (childQueue != null) { - // Re-init existing child queues - childQueue.reinitialize(newChildQueue, clusterResource); - LOG.info(getQueueName() + ": re-configured queue: " + childQueue); - } else { - // New child queue, do not re-init - - // Set parent to 'this' - newChildQueue.setParent(this); - - // Save in list of current child queues - currentChildQueues.put(newChildQueueName, newChildQueue); - - LOG.info(getQueueName() + ": added new child queue: " + newChildQueue); + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof ParentQueue) || !newlyParsedQueue + .getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + + newlyParsedQueue.getQueuePath()); } - } - // Re-sort all queues - childQueues.clear(); - childQueues.addAll(currentChildQueues.values()); + ParentQueue newlyParsedParentQueue = (ParentQueue) newlyParsedQueue; + + // Set new configs + setupQueueConfigs(clusterResource); + + // Re-configure existing child queues and add new ones + // The CS has already checked to ensure all existing child queues are present! + Map currentChildQueues = getQueues(childQueues); + Map newChildQueues = getQueues( + newlyParsedParentQueue.childQueues); + for (Map.Entry e : newChildQueues.entrySet()) { + String newChildQueueName = e.getKey(); + CSQueue newChildQueue = e.getValue(); + + CSQueue childQueue = currentChildQueues.get(newChildQueueName); + + // Check if the child-queue already exists + if (childQueue != null) { + // Re-init existing child queues + childQueue.reinitialize(newChildQueue, clusterResource); + LOG.info(getQueueName() + ": re-configured queue: " + childQueue); + } else{ + // New child queue, do not re-init + + // Set parent to 'this' + newChildQueue.setParent(this); + + // Save in list of current child queues + currentChildQueues.put(newChildQueueName, newChildQueue); + + LOG.info( + getQueueName() + ": added new child queue: " + newChildQueue); + } + } + + // Re-sort all queues + childQueues.clear(); + childQueues.addAll(currentChildQueues.values()); + } finally { + writeLock.unlock(); + } } Map getQueues(Set queues) { @@ -299,21 +332,24 @@ public class ParentQueue extends AbstractCSQueue { @Override public void submitApplication(ApplicationId applicationId, String user, String queue) throws AccessControlException { - - synchronized (this) { + + try { + writeLock.lock(); // Sanity check if (queue.equals(queueName)) { - throw new AccessControlException("Cannot submit application " + - "to non-leaf queue: " + queueName); + throw new AccessControlException( + "Cannot submit application " + "to non-leaf queue: " + queueName); } - + if (state != QueueState.RUNNING) { - throw new AccessControlException("Queue " + getQueuePath() + - " is STOPPED. Cannot accept submission of application: " + - applicationId); + throw new AccessControlException("Queue " + getQueuePath() + + " is STOPPED. Cannot accept submission of application: " + + applicationId); } addApplication(applicationId, user); + } finally { + writeLock.unlock(); } // Inform the parent queue @@ -342,24 +378,26 @@ public class ParentQueue extends AbstractCSQueue { // finish attempt logic. } - private synchronized void addApplication(ApplicationId applicationId, + private void addApplication(ApplicationId applicationId, String user) { - ++numApplications; + try { + writeLock.lock(); + ++numApplications; - LOG.info("Application added -" + - " appId: " + applicationId + - " user: " + user + - " leaf-queue of parent: " + getQueueName() + - " #applications: " + getNumApplications()); + LOG.info( + "Application added -" + " appId: " + applicationId + " user: " + user + + " leaf-queue of parent: " + getQueueName() + " #applications: " + + getNumApplications()); + } finally { + writeLock.unlock(); + } } @Override public void finishApplication(ApplicationId application, String user) { - - synchronized (this) { - removeApplication(application, user); - } + + removeApplication(application, user); // Inform the parent queue if (parent != null) { @@ -367,16 +405,18 @@ public class ParentQueue extends AbstractCSQueue { } } - private synchronized void removeApplication(ApplicationId applicationId, + private void removeApplication(ApplicationId applicationId, String user) { - - --numApplications; + try { + writeLock.lock(); + --numApplications; - LOG.info("Application removed -" + - " appId: " + applicationId + - " user: " + user + - " leaf-queue of parent: " + getQueueName() + - " #applications: " + getNumApplications()); + LOG.info("Application removed -" + " appId: " + applicationId + " user: " + + user + " leaf-queue of parent: " + getQueueName() + + " #applications: " + getNumApplications()); + } finally { + writeLock.unlock(); + } } private String getParentName() { @@ -384,183 +424,181 @@ public class ParentQueue extends AbstractCSQueue { } @Override - public synchronized CSAssignment assignContainers(Resource clusterResource, + public CSAssignment assignContainers(Resource clusterResource, FiCaSchedulerNode node, ResourceLimits resourceLimits, SchedulingMode schedulingMode) { - // if our queue cannot access this node, just return - if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY - && !accessibleToPartition(node.getPartition())) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it is not able to access partition=" + node - .getPartition()); - } - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.REJECTED, - ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node + try { + writeLock.lock(); + // if our queue cannot access this node, just return + if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY + && !accessibleToPartition(node.getPartition())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it is not able to access partition=" + node .getPartition()); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); - } - - return CSAssignment.NULL_ASSIGNMENT; - } - - // Check if this queue need more resource, simply skip allocation if this - // queue doesn't need more resources. - if (!super.hasPendingResourceRequest(node.getPartition(), - clusterResource, schedulingMode)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-partition=" + node.getPartition()); - } - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); - } - - return CSAssignment.NULL_ASSIGNMENT; - } - - CSAssignment assignment = - new CSAssignment(Resources.createResource(0, 0), NodeType.NODE_LOCAL); - - while (canAssign(clusterResource, node)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Trying to assign containers to child-queue of " - + getQueueName()); - } - - // Are we over maximum-capacity for this queue? - // This will also consider parent's limits and also continuous reservation - // looking - if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), - resourceLimits, Resources.createResource( - getMetrics().getReservedMB(), getMetrics() - .getReservedVirtualCores()), schedulingMode)) { + } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); + getParentName(), getQueueName(), ActivityState.REJECTED, + ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + node + .getPartition()); if (rootQueue) { ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, node); } - break; + return CSAssignment.NULL_ASSIGNMENT; } - // Schedule - CSAssignment assignedToChild = - assignContainersToChildQueues(clusterResource, node, resourceLimits, - schedulingMode); - assignment.setType(assignedToChild.getType()); - - // Done if no child-queue assigned anything - if (Resources.greaterThan( - resourceCalculator, clusterResource, - assignedToChild.getResource(), Resources.none())) { - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.ACCEPTED, - ActivityDiagnosticConstant.EMPTY); - - if (node.getReservedContainer() == null) { - if (rootQueue) { - ActivitiesLogger.NODE.finishAllocatedNodeAllocation( - activitiesManager, node, - assignedToChild.getAssignmentInformation() - .getFirstAllocatedOrReservedContainerId(), - AllocationState.ALLOCATED); - } - } else { - if (rootQueue) { - ActivitiesLogger.NODE.finishAllocatedNodeAllocation( - activitiesManager, node, - assignedToChild.getAssignmentInformation() - .getFirstAllocatedOrReservedContainerId(), - AllocationState.RESERVED); - } + // Check if this queue need more resource, simply skip allocation if this + // queue doesn't need more resources. + if (!super.hasPendingResourceRequest(node.getPartition(), clusterResource, + schedulingMode)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it doesn't need more resource, schedulingMode=" + + schedulingMode.name() + " node-partition=" + node + .getPartition()); } - // Track resource utilization for the parent-queue - allocateResource(clusterResource, assignedToChild.getResource(), - node.getPartition(), assignedToChild.isIncreasedAllocation()); - - // Track resource utilization in this pass of the scheduler - Resources - .addTo(assignment.getResource(), assignedToChild.getResource()); - Resources.addTo(assignment.getAssignmentInformation().getAllocated(), - assignedToChild.getAssignmentInformation().getAllocated()); - Resources.addTo(assignment.getAssignmentInformation().getReserved(), - assignedToChild.getAssignmentInformation().getReserved()); - assignment.getAssignmentInformation().incrAllocations( - assignedToChild.getAssignmentInformation().getNumAllocations()); - assignment.getAssignmentInformation().incrReservations( - assignedToChild.getAssignmentInformation().getNumReservations()); - assignment - .getAssignmentInformation() - .getAllocationDetails() - .addAll( - assignedToChild.getAssignmentInformation().getAllocationDetails()); - assignment - .getAssignmentInformation() - .getReservationDetails() - .addAll( + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE); + if (rootQueue) { + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); + } + + return CSAssignment.NULL_ASSIGNMENT; + } + + CSAssignment assignment = new CSAssignment(Resources.createResource(0, 0), + NodeType.NODE_LOCAL); + + while (canAssign(clusterResource, node)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to assign containers to child-queue of " + + getQueueName()); + } + + // Are we over maximum-capacity for this queue? + // This will also consider parent's limits and also continuous reservation + // looking + if (!super.canAssignToThisQueue(clusterResource, node.getPartition(), + resourceLimits, Resources + .createResource(getMetrics().getReservedMB(), + getMetrics().getReservedVirtualCores()), schedulingMode)) { + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT); + if (rootQueue) { + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); + } + + break; + } + + // Schedule + CSAssignment assignedToChild = assignContainersToChildQueues( + clusterResource, node, resourceLimits, schedulingMode); + assignment.setType(assignedToChild.getType()); + + // Done if no child-queue assigned anything + if (Resources.greaterThan(resourceCalculator, clusterResource, + assignedToChild.getResource(), Resources.none())) { + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.ACCEPTED, + ActivityDiagnosticConstant.EMPTY); + + if (node.getReservedContainer() == null) { + if (rootQueue) { + ActivitiesLogger.NODE.finishAllocatedNodeAllocation( + activitiesManager, node, + assignedToChild.getAssignmentInformation() + .getFirstAllocatedOrReservedContainerId(), + AllocationState.ALLOCATED); + } + } else{ + if (rootQueue) { + ActivitiesLogger.NODE.finishAllocatedNodeAllocation( + activitiesManager, node, + assignedToChild.getAssignmentInformation() + .getFirstAllocatedOrReservedContainerId(), + AllocationState.RESERVED); + } + } + + // Track resource utilization for the parent-queue + allocateResource(clusterResource, assignedToChild.getResource(), + node.getPartition(), assignedToChild.isIncreasedAllocation()); + + // Track resource utilization in this pass of the scheduler + Resources.addTo(assignment.getResource(), + assignedToChild.getResource()); + Resources.addTo(assignment.getAssignmentInformation().getAllocated(), + assignedToChild.getAssignmentInformation().getAllocated()); + Resources.addTo(assignment.getAssignmentInformation().getReserved(), + assignedToChild.getAssignmentInformation().getReserved()); + assignment.getAssignmentInformation().incrAllocations( + assignedToChild.getAssignmentInformation().getNumAllocations()); + assignment.getAssignmentInformation().incrReservations( + assignedToChild.getAssignmentInformation().getNumReservations()); + assignment.getAssignmentInformation().getAllocationDetails().addAll( + assignedToChild.getAssignmentInformation() + .getAllocationDetails()); + assignment.getAssignmentInformation().getReservationDetails().addAll( assignedToChild.getAssignmentInformation() .getReservationDetails()); - assignment.setIncreasedAllocation(assignedToChild - .isIncreasedAllocation()); - - LOG.info("assignedContainer" + - " queue=" + getQueueName() + - " usedCapacity=" + getUsedCapacity() + - " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + - " used=" + queueUsage.getUsed() + - " cluster=" + clusterResource); + assignment.setIncreasedAllocation( + assignedToChild.isIncreasedAllocation()); - } else { - assignment.setSkippedType(assignedToChild.getSkippedType()); + LOG.info("assignedContainer" + " queue=" + getQueueName() + + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed() + + " cluster=" + clusterResource); - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - getParentName(), getQueueName(), ActivityState.SKIPPED, - ActivityDiagnosticConstant.EMPTY); - if (rootQueue) { - ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, - node); - } + } else{ + assignment.setSkippedType(assignedToChild.getSkippedType()); - break; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("ParentQ=" + getQueueName() - + " assignedSoFarInThisIteration=" + assignment.getResource() - + " usedCapacity=" + getUsedCapacity() - + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity()); - } - - // Do not assign more than one container if this isn't the root queue - // or if we've already assigned an off-switch container - if (!rootQueue || assignment.getType() == NodeType.OFF_SWITCH) { - if (LOG.isDebugEnabled()) { - if (rootQueue && assignment.getType() == NodeType.OFF_SWITCH) { - LOG.debug("Not assigning more than one off-switch container," + - " assignments so far: " + assignment); + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + getParentName(), getQueueName(), ActivityState.SKIPPED, + ActivityDiagnosticConstant.EMPTY); + if (rootQueue) { + ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager, + node); } + + break; + } + + if (LOG.isDebugEnabled()) { + LOG.debug( + "ParentQ=" + getQueueName() + " assignedSoFarInThisIteration=" + + assignment.getResource() + " usedCapacity=" + + getUsedCapacity() + " absoluteUsedCapacity=" + + getAbsoluteUsedCapacity()); + } + + // Do not assign more than one container if this isn't the root queue + // or if we've already assigned an off-switch container + if (!rootQueue || assignment.getType() == NodeType.OFF_SWITCH) { + if (LOG.isDebugEnabled()) { + if (rootQueue && assignment.getType() == NodeType.OFF_SWITCH) { + LOG.debug("Not assigning more than one off-switch container," + + " assignments so far: " + assignment); + } + } + break; } - break; } - } - - return assignment; + + return assignment; + } finally { + writeLock.unlock(); + } } private boolean canAssign(Resource clusterResource, FiCaSchedulerNode node) { @@ -628,7 +666,7 @@ public class ParentQueue extends AbstractCSQueue { return childrenList.iterator(); } - private synchronized CSAssignment assignContainersToChildQueues( + private CSAssignment assignContainersToChildQueues( Resource cluster, FiCaSchedulerNode node, ResourceLimits limits, SchedulingMode schedulingMode) { CSAssignment assignment = CSAssignment.NULL_ASSIGNMENT; @@ -717,39 +755,45 @@ public class ParentQueue extends AbstractCSQueue { } } - private synchronized void internalReleaseResource(Resource clusterResource, + private void internalReleaseResource(Resource clusterResource, FiCaSchedulerNode node, Resource releasedResource, boolean changeResource, CSQueue completedChildQueue, boolean sortQueues) { - super.releaseResource(clusterResource, - releasedResource, node.getPartition(), - changeResource); + try { + writeLock.lock(); + super.releaseResource(clusterResource, releasedResource, + node.getPartition(), changeResource); - if (LOG.isDebugEnabled()) { - LOG.debug("completedContainer " + this + ", cluster=" + clusterResource); - } + if (LOG.isDebugEnabled()) { + LOG.debug( + "completedContainer " + this + ", cluster=" + clusterResource); + } - // Note that this is using an iterator on the childQueues so this can't - // be called if already within an iterator for the childQueues. Like - // from assignContainersToChildQueues. - if (sortQueues) { - // reinsert the updated queue - for (Iterator iter = childQueues.iterator(); iter.hasNext();) { - CSQueue csqueue = iter.next(); - if (csqueue.equals(completedChildQueue)) { - iter.remove(); - if (LOG.isDebugEnabled()) { - LOG.debug("Re-sorting completed queue: " + csqueue); + // Note that this is using an iterator on the childQueues so this can't + // be called if already within an iterator for the childQueues. Like + // from assignContainersToChildQueues. + if (sortQueues) { + // reinsert the updated queue + for (Iterator iter = childQueues.iterator(); + iter.hasNext(); ) { + CSQueue csqueue = iter.next(); + if (csqueue.equals(completedChildQueue)) { + iter.remove(); + if (LOG.isDebugEnabled()) { + LOG.debug("Re-sorting completed queue: " + csqueue); + } + childQueues.add(csqueue); + break; } - childQueues.add(csqueue); - break; } } - } - // If we skipped sort queue this time, we need to resort queues to make - // sure we allocate from least usage (or order defined by queue policy) - // queues. - needToResortQueuesAtNextAllocation = !sortQueues; + // If we skipped sort queue this time, we need to resort queues to make + // sure we allocate from least usage (or order defined by queue policy) + // queues. + needToResortQueuesAtNextAllocation = !sortQueues; + } finally { + writeLock.unlock(); + } } @Override @@ -806,24 +850,35 @@ public class ParentQueue extends AbstractCSQueue { } @Override - public synchronized void updateClusterResource(Resource clusterResource, + public void updateClusterResource(Resource clusterResource, ResourceLimits resourceLimits) { - // Update all children - for (CSQueue childQueue : childQueues) { - // Get ResourceLimits of child queue before assign containers - ResourceLimits childLimits = getResourceLimitsOfChild(childQueue, - clusterResource, resourceLimits.getLimit(), - RMNodeLabelsManager.NO_LABEL); - childQueue.updateClusterResource(clusterResource, childLimits); + try { + writeLock.lock(); + // Update all children + for (CSQueue childQueue : childQueues) { + // Get ResourceLimits of child queue before assign containers + ResourceLimits childLimits = getResourceLimitsOfChild(childQueue, + clusterResource, resourceLimits.getLimit(), + RMNodeLabelsManager.NO_LABEL); + childQueue.updateClusterResource(clusterResource, childLimits); + } + + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); + } finally { + writeLock.unlock(); } - - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); } @Override - public synchronized List getChildQueues() { - return new ArrayList(childQueues); + public List getChildQueues() { + try { + readLock.lock(); + return new ArrayList(childQueues); + } finally { + readLock.unlock(); + } + } @Override @@ -832,13 +887,18 @@ public class ParentQueue extends AbstractCSQueue { if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { return; } - // Careful! Locking order is important! - synchronized (this) { - FiCaSchedulerNode node = - scheduler.getNode(rmContainer.getContainer().getNodeId()); + + // Careful! Locking order is important! + try { + writeLock.lock(); + FiCaSchedulerNode node = scheduler.getNode( + rmContainer.getContainer().getNodeId()); allocateResource(clusterResource, rmContainer.getContainer().getResource(), node.getPartition(), false); + } finally { + writeLock.unlock(); } + if (parent != null) { parent.recoverContainer(clusterResource, attempt, rmContainer); } @@ -851,11 +911,17 @@ public class ParentQueue extends AbstractCSQueue { } @Override - public synchronized void collectSchedulerApplications( + public void collectSchedulerApplications( Collection apps) { - for (CSQueue queue : childQueues) { - queue.collectSchedulerApplications(apps); + try { + readLock.lock(); + for (CSQueue queue : childQueues) { + queue.collectSchedulerApplications(apps); + } + } finally { + readLock.unlock(); } + } @Override @@ -897,44 +963,49 @@ public class ParentQueue extends AbstractCSQueue { } } - public synchronized int getNumApplications() { + public int getNumApplications() { return numApplications; } - synchronized void allocateResource(Resource clusterResource, + void allocateResource(Resource clusterResource, Resource resource, String nodePartition, boolean changeContainerResource) { - super.allocateResource(clusterResource, resource, nodePartition, - changeContainerResource); + try { + writeLock.lock(); + super.allocateResource(clusterResource, resource, nodePartition, + changeContainerResource); - /** - * check if we need to kill (killable) containers if maximum resource violated. - * Doing this because we will deduct killable resource when going from root. - * For example: - *
-     *      Root
-     *      /   \
-     *     a     b
-     *   /  \
-     *  a1  a2
-     * 
- * - * a: max=10G, used=10G, killable=2G - * a1: used=8G, killable=2G - * a2: used=2G, pending=2G, killable=0G - * - * When we get queue-a to allocate resource, even if queue-a - * reaches its max resource, we deduct its used by killable, so we can allocate - * at most 2G resources. ResourceLimits passed down to a2 has headroom set to 2G. - * - * If scheduler finds a 2G available resource in existing cluster, and assigns it - * to a2, now a2's used= 2G + 2G = 4G, and a's used = 8G + 4G = 12G > 10G - * - * When this happens, we have to preempt killable container (on same or different - * nodes) of parent queue to avoid violating parent's max resource. - */ - if (getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition) - < getQueueCapacities().getAbsoluteUsedCapacity(nodePartition)) { - killContainersToEnforceMaxQueueCapacity(nodePartition, clusterResource); + /** + * check if we need to kill (killable) containers if maximum resource violated. + * Doing this because we will deduct killable resource when going from root. + * For example: + *
+       *      Root
+       *      /   \
+       *     a     b
+       *   /  \
+       *  a1  a2
+       * 
+ * + * a: max=10G, used=10G, killable=2G + * a1: used=8G, killable=2G + * a2: used=2G, pending=2G, killable=0G + * + * When we get queue-a to allocate resource, even if queue-a + * reaches its max resource, we deduct its used by killable, so we can allocate + * at most 2G resources. ResourceLimits passed down to a2 has headroom set to 2G. + * + * If scheduler finds a 2G available resource in existing cluster, and assigns it + * to a2, now a2's used= 2G + 2G = 4G, and a's used = 8G + 4G = 12G > 10G + * + * When this happens, we have to preempt killable container (on same or different + * nodes) of parent queue to avoid violating parent's max resource. + */ + if (getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition) + < getQueueCapacities().getAbsoluteUsedCapacity(nodePartition)) { + killContainersToEnforceMaxQueueCapacity(nodePartition, clusterResource); + } + } finally { + writeLock.unlock(); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PlanQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PlanQueue.java index 7b53ad5e49..a391f25fba 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PlanQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PlanQueue.java @@ -79,76 +79,98 @@ public class PlanQueue extends ParentQueue { } @Override - public synchronized void reinitialize(CSQueue newlyParsedQueue, + public void reinitialize(CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof PlanQueue) - || !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() - + " from " + newlyParsedQueue.getQueuePath()); - } + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof PlanQueue) || !newlyParsedQueue + .getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + + newlyParsedQueue.getQueuePath()); + } - PlanQueue newlyParsedParentQueue = (PlanQueue) newlyParsedQueue; + PlanQueue newlyParsedParentQueue = (PlanQueue) newlyParsedQueue; - if (newlyParsedParentQueue.getChildQueues().size() > 0) { - throw new IOException( - "Reservable Queue should not have sub-queues in the" - + "configuration"); - } + if (newlyParsedParentQueue.getChildQueues().size() > 0) { + throw new IOException( + "Reservable Queue should not have sub-queues in the" + + "configuration"); + } - // Set new configs - setupQueueConfigs(clusterResource); + // Set new configs + setupQueueConfigs(clusterResource); - updateQuotas(newlyParsedParentQueue.userLimit, - newlyParsedParentQueue.userLimitFactor, - newlyParsedParentQueue.maxAppsForReservation, - newlyParsedParentQueue.maxAppsPerUserForReservation); + updateQuotas(newlyParsedParentQueue.userLimit, + newlyParsedParentQueue.userLimitFactor, + newlyParsedParentQueue.maxAppsForReservation, + newlyParsedParentQueue.maxAppsPerUserForReservation); - // run reinitialize on each existing queue, to trigger absolute cap - // recomputations - for (CSQueue res : this.getChildQueues()) { - res.reinitialize(res, clusterResource); - } - showReservationsAsQueues = newlyParsedParentQueue.showReservationsAsQueues; - } - - synchronized void addChildQueue(CSQueue newQueue) - throws SchedulerDynamicEditException { - if (newQueue.getCapacity() > 0) { - throw new SchedulerDynamicEditException("Queue " + newQueue - + " being added has non zero capacity."); - } - boolean added = this.childQueues.add(newQueue); - if (LOG.isDebugEnabled()) { - LOG.debug("updateChildQueues (action: add queue): " + added + " " - + getChildQueuesToPrint()); + // run reinitialize on each existing queue, to trigger absolute cap + // recomputations + for (CSQueue res : this.getChildQueues()) { + res.reinitialize(res, clusterResource); + } + showReservationsAsQueues = + newlyParsedParentQueue.showReservationsAsQueues; + } finally { + writeLock.unlock(); } } - synchronized void removeChildQueue(CSQueue remQueue) + void addChildQueue(CSQueue newQueue) throws SchedulerDynamicEditException { - if (remQueue.getCapacity() > 0) { - throw new SchedulerDynamicEditException("Queue " + remQueue - + " being removed has non zero capacity."); + try { + writeLock.lock(); + if (newQueue.getCapacity() > 0) { + throw new SchedulerDynamicEditException( + "Queue " + newQueue + " being added has non zero capacity."); + } + boolean added = this.childQueues.add(newQueue); + if (LOG.isDebugEnabled()) { + LOG.debug("updateChildQueues (action: add queue): " + added + " " + + getChildQueuesToPrint()); + } + } finally { + writeLock.unlock(); } - Iterator qiter = childQueues.iterator(); - while (qiter.hasNext()) { - CSQueue cs = qiter.next(); - if (cs.equals(remQueue)) { - qiter.remove(); - if (LOG.isDebugEnabled()) { - LOG.debug("Removed child queue: {}", cs.getQueueName()); + } + + void removeChildQueue(CSQueue remQueue) + throws SchedulerDynamicEditException { + try { + writeLock.lock(); + if (remQueue.getCapacity() > 0) { + throw new SchedulerDynamicEditException( + "Queue " + remQueue + " being removed has non zero capacity."); + } + Iterator qiter = childQueues.iterator(); + while (qiter.hasNext()) { + CSQueue cs = qiter.next(); + if (cs.equals(remQueue)) { + qiter.remove(); + if (LOG.isDebugEnabled()) { + LOG.debug("Removed child queue: {}", cs.getQueueName()); + } } } + } finally { + writeLock.unlock(); } } - protected synchronized float sumOfChildCapacities() { - float ret = 0; - for (CSQueue l : childQueues) { - ret += l.getCapacity(); + protected float sumOfChildCapacities() { + try { + writeLock.lock(); + float ret = 0; + for (CSQueue l : childQueues) { + ret += l.getCapacity(); + } + return ret; + } finally { + writeLock.unlock(); } - return ret; } private void updateQuotas(int userLimit, float userLimitFactor, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ReservationQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ReservationQueue.java index 976cf8cf74..faeb37e8f8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ReservationQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ReservationQueue.java @@ -51,22 +51,28 @@ public class ReservationQueue extends LeafQueue { } @Override - public synchronized void reinitialize(CSQueue newlyParsedQueue, + public void reinitialize(CSQueue newlyParsedQueue, Resource clusterResource) throws IOException { - // Sanity check - if (!(newlyParsedQueue instanceof ReservationQueue) - || !newlyParsedQueue.getQueuePath().equals(getQueuePath())) { - throw new IOException("Trying to reinitialize " + getQueuePath() - + " from " + newlyParsedQueue.getQueuePath()); - } - super.reinitialize(newlyParsedQueue, clusterResource); - CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, - minimumAllocation, this, labelManager, null); + try { + writeLock.lock(); + // Sanity check + if (!(newlyParsedQueue instanceof ReservationQueue) || !newlyParsedQueue + .getQueuePath().equals(getQueuePath())) { + throw new IOException( + "Trying to reinitialize " + getQueuePath() + " from " + + newlyParsedQueue.getQueuePath()); + } + super.reinitialize(newlyParsedQueue, clusterResource); + CSQueueUtils.updateQueueStatistics(resourceCalculator, clusterResource, + minimumAllocation, this, labelManager, null); - updateQuotas(parent.getUserLimitForReservation(), - parent.getUserLimitFactor(), - parent.getMaxApplicationsForReservations(), - parent.getMaxApplicationsPerUserForReservation()); + updateQuotas(parent.getUserLimitForReservation(), + parent.getUserLimitFactor(), + parent.getMaxApplicationsForReservations(), + parent.getMaxApplicationsPerUserForReservation()); + } finally { + writeLock.unlock(); + } } /** @@ -77,21 +83,26 @@ public class ReservationQueue extends LeafQueue { * maxCapacity, etc..) * @throws SchedulerDynamicEditException */ - public synchronized void setEntitlement(QueueEntitlement entitlement) + public void setEntitlement(QueueEntitlement entitlement) throws SchedulerDynamicEditException { - float capacity = entitlement.getCapacity(); - if (capacity < 0 || capacity > 1.0f) { - throw new SchedulerDynamicEditException( - "Capacity demand is not in the [0,1] range: " + capacity); - } - setCapacity(capacity); - setAbsoluteCapacity(getParent().getAbsoluteCapacity() * getCapacity()); - // note: we currently set maxCapacity to capacity - // this might be revised later - setMaxCapacity(entitlement.getMaxCapacity()); - if (LOG.isDebugEnabled()) { - LOG.debug("successfully changed to " + capacity + " for queue " - + this.getQueueName()); + try { + writeLock.lock(); + float capacity = entitlement.getCapacity(); + if (capacity < 0 || capacity > 1.0f) { + throw new SchedulerDynamicEditException( + "Capacity demand is not in the [0,1] range: " + capacity); + } + setCapacity(capacity); + setAbsoluteCapacity(getParent().getAbsoluteCapacity() * getCapacity()); + // note: we currently set maxCapacity to capacity + // this might be revised later + setMaxCapacity(entitlement.getMaxCapacity()); + if (LOG.isDebugEnabled()) { + LOG.debug("successfully changed to " + capacity + " for queue " + this + .getQueueName()); + } + } finally { + writeLock.unlock(); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java index 6fba22a928..26146301d4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerResizing.java @@ -828,8 +828,8 @@ public class TestContainerResizing { app.getAppAttemptResourceUsage().getPending().getMemorySize()); // Queue/user/application's usage will be updated checkUsedResource(rm1, "default", 0 * GB, null); - Assert.assertEquals(0 * GB, ((LeafQueue) cs.getQueue("default")) - .getUser("user").getUsed().getMemorySize()); + // User will be removed + Assert.assertNull(((LeafQueue) cs.getQueue("default")).getUser("user")); Assert.assertEquals(0 * GB, app.getAppAttemptResourceUsage().getReserved().getMemorySize()); Assert.assertEquals(0 * GB, From 734d54c1a8950446e68098f62d8964e02ecc2890 Mon Sep 17 00:00:00 2001 From: Kai Zheng Date: Wed, 21 Sep 2016 21:34:48 +0800 Subject: [PATCH 2/9] HDFS-10861. Refactor StripeReaders and use ECChunk version decode API. Contributed by Sammi Chen --- .../hadoop/io/ElasticByteBufferPool.java | 2 +- .../apache/hadoop/io/erasurecode/ECChunk.java | 22 + .../io/erasurecode/rawcoder/CoderUtil.java | 3 + .../apache/hadoop/hdfs/DFSInputStream.java | 20 +- .../hadoop/hdfs/DFSStripedInputStream.java | 654 +++--------------- .../hadoop/hdfs/PositionStripeReader.java | 104 +++ .../hadoop/hdfs/StatefulStripeReader.java | 95 +++ .../org/apache/hadoop/hdfs/StripeReader.java | 463 +++++++++++++ .../hadoop/hdfs/util/StripedBlockUtil.java | 158 ++--- .../hdfs/util/TestStripedBlockUtil.java | 1 - 10 files changed, 844 insertions(+), 678 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PositionStripeReader.java create mode 100644 hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StatefulStripeReader.java create mode 100644 hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripeReader.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ElasticByteBufferPool.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ElasticByteBufferPool.java index c35d60885d..023f37f9b4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ElasticByteBufferPool.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ElasticByteBufferPool.java @@ -85,7 +85,7 @@ public final class ElasticByteBufferPool implements ByteBufferPool { private final TreeMap getBufferTree(boolean direct) { return direct ? directBuffers : buffers; } - + @Override public synchronized ByteBuffer getBuffer(boolean direct, int length) { TreeMap tree = getBufferTree(direct); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/ECChunk.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/ECChunk.java index cd7c6be1b3..536715b6a5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/ECChunk.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/ECChunk.java @@ -29,6 +29,9 @@ public class ECChunk { private ByteBuffer chunkBuffer; + // TODO: should be in a more general flags + private boolean allZero = false; + /** * Wrapping a ByteBuffer * @param buffer buffer to be wrapped by the chunk @@ -37,6 +40,13 @@ public class ECChunk { this.chunkBuffer = buffer; } + public ECChunk(ByteBuffer buffer, int offset, int len) { + ByteBuffer tmp = buffer.duplicate(); + tmp.position(offset); + tmp.limit(offset + len); + this.chunkBuffer = tmp.slice(); + } + /** * Wrapping a bytes array * @param buffer buffer to be wrapped by the chunk @@ -45,6 +55,18 @@ public class ECChunk { this.chunkBuffer = ByteBuffer.wrap(buffer); } + public ECChunk(byte[] buffer, int offset, int len) { + this.chunkBuffer = ByteBuffer.wrap(buffer, offset, len); + } + + public boolean isAllZero() { + return allZero; + } + + public void setAllZero(boolean allZero) { + this.allZero = allZero; + } + /** * Convert to ByteBuffer * @return ByteBuffer diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/CoderUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/CoderUtil.java index b22d44fcea..ef346392a1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/CoderUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/CoderUtil.java @@ -115,6 +115,9 @@ final class CoderUtil { buffers[i] = null; } else { buffers[i] = chunk.getBuffer(); + if (chunk.isAllZero()) { + CoderUtil.resetBuffer(buffers[i], buffers[i].remaining()); + } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 31fa89757f..dbffc64c5c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -240,7 +240,7 @@ public class DFSInputStream extends FSInputStream Iterator oldIter = locatedBlocks.getLocatedBlocks().iterator(); Iterator newIter = newInfo.getLocatedBlocks().iterator(); while (oldIter.hasNext() && newIter.hasNext()) { - if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) { + if (!oldIter.next().getBlock().equals(newIter.next().getBlock())) { throw new IOException("Blocklist for " + src + " has changed!"); } } @@ -677,8 +677,8 @@ public class DFSInputStream extends FSInputStream if (oneByteBuf == null) { oneByteBuf = new byte[1]; } - int ret = read( oneByteBuf, 0, 1 ); - return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff); + int ret = read(oneByteBuf, 0, 1); + return (ret <= 0) ? -1 : (oneByteBuf[0] & 0xff); } /* This is a used by regular read() and handles ChecksumExceptions. @@ -702,7 +702,7 @@ public class DFSInputStream extends FSInputStream // retry as many times as seekToNewSource allows. try { return reader.readFromBlock(blockReader, len); - } catch ( ChecksumException ce ) { + } catch (ChecksumException ce) { DFSClient.LOG.warn("Found Checksum error for " + getCurrentBlock() + " from " + currentNode + " at " + ce.getPos()); @@ -710,7 +710,7 @@ public class DFSInputStream extends FSInputStream retryCurrentNode = false; // we want to remember which block replicas we have tried corruptedBlocks.addCorruptedBlock(getCurrentBlock(), currentNode); - } catch ( IOException e ) { + } catch (IOException e) { if (!retryCurrentNode) { DFSClient.LOG.warn("Exception while reading from " + getCurrentBlock() + " of " + src + " from " @@ -779,7 +779,9 @@ public class DFSInputStream extends FSInputStream DFSClient.LOG.warn("DFS Read", e); } blockEnd = -1; - if (currentNode != null) { addToDeadNodes(currentNode); } + if (currentNode != null) { + addToDeadNodes(currentNode); + } if (--retries == 0) { throw e; } @@ -1397,10 +1399,10 @@ public class DFSInputStream extends FSInputStream @Override public long skip(long n) throws IOException { - if ( n > 0 ) { + if (n > 0) { long curPos = getPos(); long fileLen = getFileLength(); - if( n+curPos > fileLen ) { + if (n+curPos > fileLen) { n = fileLen - curPos; } seek(curPos+n); @@ -1550,7 +1552,7 @@ public class DFSInputStream extends FSInputStream * Get statistics about the reads which this DFSInputStream has done. */ public ReadStatistics getReadStatistics() { - return new ReadStatistics(readStatistics); + return readStatistics; } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java index ccaf6a78db..922f74eaec 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java @@ -17,24 +17,21 @@ */ package org.apache.hadoop.hdfs; -import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.fs.ChecksumException; import org.apache.hadoop.fs.ReadOption; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; -import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock; import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException; import org.apache.hadoop.hdfs.DFSUtilClient.CorruptedBlocks; +import org.apache.hadoop.hdfs.StripeReader.BlockReaderInfo; +import org.apache.hadoop.hdfs.StripeReader.ReaderRetryPolicy; import org.apache.hadoop.hdfs.util.StripedBlockUtil; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.StripeRange; import org.apache.hadoop.io.ByteBufferPool; -import static org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe; -import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk; -import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunkReadResult; - import org.apache.hadoop.io.ElasticByteBufferPool; import org.apache.hadoop.io.erasurecode.CodecUtil; import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; @@ -44,7 +41,6 @@ import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder; import java.io.EOFException; import java.io.IOException; -import java.io.InterruptedIOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; @@ -53,111 +49,32 @@ import java.util.EnumSet; import java.util.List; import java.util.Set; import java.util.Collection; -import java.util.Map; -import java.util.HashMap; -import java.util.concurrent.CompletionService; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.Callable; -import java.util.concurrent.Future; +import java.util.concurrent.ThreadPoolExecutor; /** - * DFSStripedInputStream reads from striped block groups + * DFSStripedInputStream reads from striped block groups. */ @InterfaceAudience.Private public class DFSStripedInputStream extends DFSInputStream { - private static class ReaderRetryPolicy { - private int fetchEncryptionKeyTimes = 1; - private int fetchTokenTimes = 1; - - void refetchEncryptionKey() { - fetchEncryptionKeyTimes--; - } - - void refetchToken() { - fetchTokenTimes--; - } - - boolean shouldRefetchEncryptionKey() { - return fetchEncryptionKeyTimes > 0; - } - - boolean shouldRefetchToken() { - return fetchTokenTimes > 0; - } - } - - /** Used to indicate the buffered data's range in the block group */ - private static class StripeRange { - /** start offset in the block group (inclusive) */ - final long offsetInBlock; - /** length of the stripe range */ - final long length; - - StripeRange(long offsetInBlock, long length) { - Preconditions.checkArgument(offsetInBlock >= 0 && length >= 0); - this.offsetInBlock = offsetInBlock; - this.length = length; - } - - boolean include(long pos) { - return pos >= offsetInBlock && pos < offsetInBlock + length; - } - } - - private static class BlockReaderInfo { - final BlockReader reader; - final DatanodeInfo datanode; - /** - * when initializing block readers, their starting offsets are set to the same - * number: the smallest internal block offsets among all the readers. This is - * because it is possible that for some internal blocks we have to read - * "backwards" for decoding purpose. We thus use this offset array to track - * offsets for all the block readers so that we can skip data if necessary. - */ - long blockReaderOffset; - /** - * We use this field to indicate whether we should use this reader. In case - * we hit any issue with this reader, we set this field to true and avoid - * using it for the next stripe. - */ - boolean shouldSkip = false; - - BlockReaderInfo(BlockReader reader, DatanodeInfo dn, long offset) { - this.reader = reader; - this.datanode = dn; - this.blockReaderOffset = offset; - } - - void setOffset(long offset) { - this.blockReaderOffset = offset; - } - - void skip() { - this.shouldSkip = true; - } - } - private static final ByteBufferPool BUFFER_POOL = new ElasticByteBufferPool(); - private final BlockReaderInfo[] blockReaders; private final int cellSize; private final short dataBlkNum; private final short parityBlkNum; private final int groupSize; - /** the buffer for a complete stripe */ + /** the buffer for a complete stripe. */ private ByteBuffer curStripeBuf; private ByteBuffer parityBuf; private final ErasureCodingPolicy ecPolicy; private final RawErasureDecoder decoder; /** - * indicate the start/end offset of the current buffered stripe in the - * block group + * Indicate the start/end offset of the current buffered stripe in the + * block group. */ private StripeRange curStripeRange; - private final CompletionService readingService; /** * When warning the user of a lost block in striping mode, we remember the @@ -167,8 +84,8 @@ public class DFSStripedInputStream extends DFSInputStream { * * To minimize the overhead, we only store the datanodeUuid in this set */ - private final Set warnedNodes = Collections.newSetFromMap( - new ConcurrentHashMap()); + private final Set warnedNodes = + Collections.newSetFromMap(new ConcurrentHashMap<>()); DFSStripedInputStream(DFSClient dfsClient, String src, boolean verifyChecksum, ErasureCodingPolicy ecPolicy, @@ -183,8 +100,6 @@ public class DFSStripedInputStream extends DFSInputStream { groupSize = dataBlkNum + parityBlkNum; blockReaders = new BlockReaderInfo[groupSize]; curStripeRange = new StripeRange(0, 0); - readingService = - new ExecutorCompletionService<>(dfsClient.getStripedReadsThreadPool()); ErasureCoderOptions coderOptions = new ErasureCoderOptions( dataBlkNum, parityBlkNum); decoder = CodecUtil.createRawDecoder(dfsClient.getConfiguration(), @@ -198,7 +113,7 @@ public class DFSStripedInputStream extends DFSInputStream { return decoder.preferDirectBuffer(); } - private void resetCurStripeBuffer() { + void resetCurStripeBuffer() { if (curStripeBuf == null) { curStripeBuf = BUFFER_POOL.getBuffer(useDirectBuffer(), cellSize * dataBlkNum); @@ -207,7 +122,7 @@ public class DFSStripedInputStream extends DFSInputStream { curStripeRange = new StripeRange(0, 0); } - private ByteBuffer getParityBuffer() { + protected ByteBuffer getParityBuffer() { if (parityBuf == null) { parityBuf = BUFFER_POOL.getBuffer(useDirectBuffer(), cellSize * parityBlkNum); @@ -216,6 +131,29 @@ public class DFSStripedInputStream extends DFSInputStream { return parityBuf; } + protected ByteBuffer getCurStripeBuf() { + return curStripeBuf; + } + + protected String getSrc() { + return src; + } + + protected DFSClient getDFSClient() { + return dfsClient; + } + + protected LocatedBlocks getLocatedBlocks() { + return locatedBlocks; + } + + protected ByteBufferPool getBufferPool() { + return BUFFER_POOL; + } + + protected ThreadPoolExecutor getStripedReadsThreadPool(){ + return dfsClient.getStripedReadsThreadPool(); + } /** * When seeking into a new block group, create blockReader for each internal * block in the group. @@ -268,7 +206,7 @@ public class DFSStripedInputStream extends DFSInputStream { blockEnd = -1; } - private void closeReader(BlockReaderInfo readerInfo) { + protected void closeReader(BlockReaderInfo readerInfo) { if (readerInfo != null) { if (readerInfo.reader != null) { try { @@ -288,6 +226,59 @@ public class DFSStripedInputStream extends DFSInputStream { return pos - currentLocatedBlock.getStartOffset(); } + boolean createBlockReader(LocatedBlock block, long offsetInBlock, + LocatedBlock[] targetBlocks, BlockReaderInfo[] readerInfos, + int chunkIndex) throws IOException { + BlockReader reader = null; + final ReaderRetryPolicy retry = new ReaderRetryPolicy(); + DFSInputStream.DNAddrPair dnInfo = + new DFSInputStream.DNAddrPair(null, null, null); + + while (true) { + try { + // the cached block location might have been re-fetched, so always + // get it from cache. + block = refreshLocatedBlock(block); + targetBlocks[chunkIndex] = block; + + // internal block has one location, just rule out the deadNodes + dnInfo = getBestNodeDNAddrPair(block, null); + if (dnInfo == null) { + break; + } + reader = getBlockReader(block, offsetInBlock, + block.getBlockSize() - offsetInBlock, + dnInfo.addr, dnInfo.storageType, dnInfo.info); + } catch (IOException e) { + if (e instanceof InvalidEncryptionKeyException && + retry.shouldRefetchEncryptionKey()) { + DFSClient.LOG.info("Will fetch a new encryption key and retry, " + + "encryption key was invalid when connecting to " + dnInfo.addr + + " : " + e); + dfsClient.clearDataEncryptionKey(); + retry.refetchEncryptionKey(); + } else if (retry.shouldRefetchToken() && + tokenRefetchNeeded(e, dnInfo.addr)) { + fetchBlockAt(block.getStartOffset()); + retry.refetchToken(); + } else { + //TODO: handles connection issues + DFSClient.LOG.warn("Failed to connect to " + dnInfo.addr + " for " + + "block" + block.getBlock(), e); + // re-fetch the block in case the block has been moved + fetchBlockAt(block.getStartOffset()); + addToDeadNodes(dnInfo.info); + } + } + if (reader != null) { + readerInfos[chunkIndex] = + new BlockReaderInfo(reader, dnInfo.info, offsetInBlock); + return true; + } + } + return false; + } + /** * Read a new stripe covering the current position, and store the data in the * {@link #curStripeBuf}. @@ -303,20 +294,20 @@ public class DFSStripedInputStream extends DFSInputStream { final int stripeBufOffset = (int) (offsetInBlockGroup % stripeLen); final int stripeLimit = (int) Math.min(currentLocatedBlock.getBlockSize() - (stripeIndex * stripeLen), stripeLen); - StripeRange stripeRange = new StripeRange(offsetInBlockGroup, - stripeLimit - stripeBufOffset); + StripeRange stripeRange = + new StripeRange(offsetInBlockGroup, stripeLimit - stripeBufOffset); LocatedStripedBlock blockGroup = (LocatedStripedBlock) currentLocatedBlock; AlignedStripe[] stripes = StripedBlockUtil.divideOneStripe(ecPolicy, cellSize, blockGroup, offsetInBlockGroup, - offsetInBlockGroup + stripeRange.length - 1, curStripeBuf); + offsetInBlockGroup + stripeRange.getLength() - 1, curStripeBuf); final LocatedBlock[] blks = StripedBlockUtil.parseStripedBlockGroup( blockGroup, cellSize, dataBlkNum, parityBlkNum); // read the whole stripe for (AlignedStripe stripe : stripes) { // Parse group to get chosen DN location - StripeReader sreader = new StatefulStripeReader(readingService, stripe, - blks, blockReaders, corruptedBlocks); + StripeReader sreader = new StatefulStripeReader(stripe, ecPolicy, blks, + blockReaders, corruptedBlocks, decoder, this); sreader.readStripe(); } curStripeBuf.position(stripeBufOffset); @@ -324,69 +315,8 @@ public class DFSStripedInputStream extends DFSInputStream { curStripeRange = stripeRange; } - private Callable readCells(final BlockReader reader, - final DatanodeInfo datanode, final long currentReaderOffset, - final long targetReaderOffset, final ByteBufferStrategy[] strategies, - final ExtendedBlock currentBlock, - final CorruptedBlocks corruptedBlocks) { - return new Callable() { - @Override - public Void call() throws Exception { - // reader can be null if getBlockReaderWithRetry failed or - // the reader hit exception before - if (reader == null) { - throw new IOException("The BlockReader is null. " + - "The BlockReader creation failed or the reader hit exception."); - } - Preconditions.checkState(currentReaderOffset <= targetReaderOffset); - if (currentReaderOffset < targetReaderOffset) { - long skipped = reader.skip(targetReaderOffset - currentReaderOffset); - Preconditions.checkState( - skipped == targetReaderOffset - currentReaderOffset); - } - int result = 0; - for (ByteBufferStrategy strategy : strategies) { - result += readToBuffer(reader, datanode, strategy, currentBlock, - corruptedBlocks); - } - return null; - } - }; - } - - private int readToBuffer(BlockReader blockReader, - DatanodeInfo currentNode, ByteBufferStrategy strategy, - ExtendedBlock currentBlock, - CorruptedBlocks corruptedBlocks) - throws IOException { - final int targetLength = strategy.getTargetLength(); - int length = 0; - try { - while (length < targetLength) { - int ret = strategy.readFromBlock(blockReader); - if (ret < 0) { - throw new IOException("Unexpected EOS from the reader"); - } - length += ret; - } - return length; - } catch (ChecksumException ce) { - DFSClient.LOG.warn("Found Checksum error for " - + currentBlock + " from " + currentNode - + " at " + ce.getPos()); - // we want to remember which block replicas we have tried - corruptedBlocks.addCorruptedBlock(currentBlock, currentNode); - throw ce; - } catch (IOException e) { - DFSClient.LOG.warn("Exception while reading from " - + currentBlock + " of " + src + " from " - + currentNode, e); - throw e; - } - } - /** - * Seek to a new arbitrary location + * Seek to a new arbitrary location. */ @Override public synchronized void seek(long targetPos) throws IOException { @@ -469,7 +399,7 @@ public class DFSStripedInputStream extends DFSInputStream { } /** - * Copy the data from {@link #curStripeBuf} into the given buffer + * Copy the data from {@link #curStripeBuf} into the given buffer. * @param strategy the ReaderStrategy containing the given buffer * @param length target length * @return number of bytes copied @@ -530,17 +460,19 @@ public class DFSStripedInputStream extends DFSInputStream { AlignedStripe[] stripes = StripedBlockUtil.divideByteRangeIntoStripes( ecPolicy, cellSize, blockGroup, start, end, buf); - CompletionService readService = new ExecutorCompletionService<>( - dfsClient.getStripedReadsThreadPool()); final LocatedBlock[] blks = StripedBlockUtil.parseStripedBlockGroup( blockGroup, cellSize, dataBlkNum, parityBlkNum); final BlockReaderInfo[] preaderInfos = new BlockReaderInfo[groupSize]; try { for (AlignedStripe stripe : stripes) { // Parse group to get chosen DN location - StripeReader preader = new PositionStripeReader(readService, stripe, - blks, preaderInfos, corruptedBlocks); - preader.readStripe(); + StripeReader preader = new PositionStripeReader(stripe, ecPolicy, blks, + preaderInfos, corruptedBlocks, decoder, this); + try { + preader.readStripe(); + } finally { + preader.close(); + } } buf.position(buf.position() + (int)(end - start + 1)); } finally { @@ -570,376 +502,6 @@ public class DFSStripedInputStream extends DFSInputStream { } } - /** - * The reader for reading a complete {@link AlignedStripe}. Note that an - * {@link AlignedStripe} may cross multiple stripes with cellSize width. - */ - private abstract class StripeReader { - final Map, Integer> futures = new HashMap<>(); - final AlignedStripe alignedStripe; - final CompletionService service; - final LocatedBlock[] targetBlocks; - final CorruptedBlocks corruptedBlocks; - final BlockReaderInfo[] readerInfos; - - StripeReader(CompletionService service, AlignedStripe alignedStripe, - LocatedBlock[] targetBlocks, BlockReaderInfo[] readerInfos, - CorruptedBlocks corruptedBlocks) { - this.service = service; - this.alignedStripe = alignedStripe; - this.targetBlocks = targetBlocks; - this.readerInfos = readerInfos; - this.corruptedBlocks = corruptedBlocks; - } - - /** prepare all the data chunks */ - abstract void prepareDecodeInputs(); - - /** prepare the parity chunk and block reader if necessary */ - abstract boolean prepareParityChunk(int index); - - abstract void decode(); - - void updateState4SuccessRead(StripingChunkReadResult result) { - Preconditions.checkArgument( - result.state == StripingChunkReadResult.SUCCESSFUL); - readerInfos[result.index].setOffset(alignedStripe.getOffsetInBlock() - + alignedStripe.getSpanInBlock()); - } - - private void checkMissingBlocks() throws IOException { - if (alignedStripe.missingChunksNum > parityBlkNum) { - clearFutures(futures.keySet()); - throw new IOException(alignedStripe.missingChunksNum - + " missing blocks, the stripe is: " + alignedStripe - + "; locatedBlocks is: " + locatedBlocks); - } - } - - /** - * We need decoding. Thus go through all the data chunks and make sure we - * submit read requests for all of them. - */ - private void readDataForDecoding() throws IOException { - prepareDecodeInputs(); - for (int i = 0; i < dataBlkNum; i++) { - Preconditions.checkNotNull(alignedStripe.chunks[i]); - if (alignedStripe.chunks[i].state == StripingChunk.REQUESTED) { - if (!readChunk(targetBlocks[i], i)) { - alignedStripe.missingChunksNum++; - } - } - } - checkMissingBlocks(); - } - - void readParityChunks(int num) throws IOException { - for (int i = dataBlkNum, j = 0; i < dataBlkNum + parityBlkNum && j < num; - i++) { - if (alignedStripe.chunks[i] == null) { - if (prepareParityChunk(i) && readChunk(targetBlocks[i], i)) { - j++; - } else { - alignedStripe.missingChunksNum++; - } - } - } - checkMissingBlocks(); - } - - boolean createBlockReader(LocatedBlock block, int chunkIndex) - throws IOException { - BlockReader reader = null; - final ReaderRetryPolicy retry = new ReaderRetryPolicy(); - DNAddrPair dnInfo = new DNAddrPair(null, null, null); - - while(true) { - try { - // the cached block location might have been re-fetched, so always - // get it from cache. - block = refreshLocatedBlock(block); - targetBlocks[chunkIndex] = block; - - // internal block has one location, just rule out the deadNodes - dnInfo = getBestNodeDNAddrPair(block, null); - if (dnInfo == null) { - break; - } - reader = getBlockReader(block, alignedStripe.getOffsetInBlock(), - block.getBlockSize() - alignedStripe.getOffsetInBlock(), - dnInfo.addr, dnInfo.storageType, dnInfo.info); - } catch (IOException e) { - if (e instanceof InvalidEncryptionKeyException && - retry.shouldRefetchEncryptionKey()) { - DFSClient.LOG.info("Will fetch a new encryption key and retry, " - + "encryption key was invalid when connecting to " + dnInfo.addr - + " : " + e); - dfsClient.clearDataEncryptionKey(); - retry.refetchEncryptionKey(); - } else if (retry.shouldRefetchToken() && - tokenRefetchNeeded(e, dnInfo.addr)) { - fetchBlockAt(block.getStartOffset()); - retry.refetchToken(); - } else { - //TODO: handles connection issues - DFSClient.LOG.warn("Failed to connect to " + dnInfo.addr + " for " + - "block" + block.getBlock(), e); - // re-fetch the block in case the block has been moved - fetchBlockAt(block.getStartOffset()); - addToDeadNodes(dnInfo.info); - } - } - if (reader != null) { - readerInfos[chunkIndex] = new BlockReaderInfo(reader, dnInfo.info, - alignedStripe.getOffsetInBlock()); - return true; - } - } - return false; - } - - private ByteBufferStrategy[] getReadStrategies(StripingChunk chunk) { - if (chunk.useByteBuffer()) { - ByteBufferStrategy strategy = new ByteBufferStrategy( - chunk.getByteBuffer(), readStatistics, dfsClient); - return new ByteBufferStrategy[]{strategy}; - } else { - ByteBufferStrategy[] strategies = - new ByteBufferStrategy[chunk.getChunkBuffer().getSlices().size()]; - for (int i = 0; i < strategies.length; i++) { - ByteBuffer buffer = chunk.getChunkBuffer().getSlice(i); - strategies[i] = - new ByteBufferStrategy(buffer, readStatistics, dfsClient); - } - return strategies; - } - } - - boolean readChunk(final LocatedBlock block, int chunkIndex) - throws IOException { - final StripingChunk chunk = alignedStripe.chunks[chunkIndex]; - if (block == null) { - chunk.state = StripingChunk.MISSING; - return false; - } - if (readerInfos[chunkIndex] == null) { - if (!createBlockReader(block, chunkIndex)) { - chunk.state = StripingChunk.MISSING; - return false; - } - } else if (readerInfos[chunkIndex].shouldSkip) { - chunk.state = StripingChunk.MISSING; - return false; - } - - chunk.state = StripingChunk.PENDING; - Callable readCallable = readCells(readerInfos[chunkIndex].reader, - readerInfos[chunkIndex].datanode, - readerInfos[chunkIndex].blockReaderOffset, - alignedStripe.getOffsetInBlock(), getReadStrategies(chunk), - block.getBlock(), corruptedBlocks); - - Future request = service.submit(readCallable); - futures.put(request, chunkIndex); - return true; - } - - /** read the whole stripe. do decoding if necessary */ - void readStripe() throws IOException { - for (int i = 0; i < dataBlkNum; i++) { - if (alignedStripe.chunks[i] != null && - alignedStripe.chunks[i].state != StripingChunk.ALLZERO) { - if (!readChunk(targetBlocks[i], i)) { - alignedStripe.missingChunksNum++; - } - } - } - // There are missing block locations at this stage. Thus we need to read - // the full stripe and one more parity block. - if (alignedStripe.missingChunksNum > 0) { - checkMissingBlocks(); - readDataForDecoding(); - // read parity chunks - readParityChunks(alignedStripe.missingChunksNum); - } - // TODO: for a full stripe we can start reading (dataBlkNum + 1) chunks - - // Input buffers for potential decode operation, which remains null until - // first read failure - while (!futures.isEmpty()) { - try { - StripingChunkReadResult r = StripedBlockUtil - .getNextCompletedStripedRead(service, futures, 0); - if (DFSClient.LOG.isDebugEnabled()) { - DFSClient.LOG.debug("Read task returned: " + r + ", for stripe " - + alignedStripe); - } - StripingChunk returnedChunk = alignedStripe.chunks[r.index]; - Preconditions.checkNotNull(returnedChunk); - Preconditions.checkState(returnedChunk.state == StripingChunk.PENDING); - - if (r.state == StripingChunkReadResult.SUCCESSFUL) { - returnedChunk.state = StripingChunk.FETCHED; - alignedStripe.fetchedChunksNum++; - updateState4SuccessRead(r); - if (alignedStripe.fetchedChunksNum == dataBlkNum) { - clearFutures(futures.keySet()); - break; - } - } else { - returnedChunk.state = StripingChunk.MISSING; - // close the corresponding reader - closeReader(readerInfos[r.index]); - - final int missing = alignedStripe.missingChunksNum; - alignedStripe.missingChunksNum++; - checkMissingBlocks(); - - readDataForDecoding(); - readParityChunks(alignedStripe.missingChunksNum - missing); - } - } catch (InterruptedException ie) { - String err = "Read request interrupted"; - DFSClient.LOG.error(err); - clearFutures(futures.keySet()); - // Don't decode if read interrupted - throw new InterruptedIOException(err); - } - } - - if (alignedStripe.missingChunksNum > 0) { - decode(); - } - } - } - - class PositionStripeReader extends StripeReader { - private ByteBuffer[] decodeInputs = null; - - PositionStripeReader(CompletionService service, - AlignedStripe alignedStripe, LocatedBlock[] targetBlocks, - BlockReaderInfo[] readerInfos, CorruptedBlocks corruptedBlocks) { - super(service, alignedStripe, targetBlocks, readerInfos, - corruptedBlocks); - } - - @Override - void prepareDecodeInputs() { - if (decodeInputs == null) { - decodeInputs = StripedBlockUtil.initDecodeInputs(alignedStripe, - dataBlkNum, parityBlkNum); - } - } - - @Override - boolean prepareParityChunk(int index) { - Preconditions.checkState(index >= dataBlkNum && - alignedStripe.chunks[index] == null); - alignedStripe.chunks[index] = new StripingChunk(decodeInputs[index]); - return true; - } - - @Override - void decode() { - StripedBlockUtil.finalizeDecodeInputs(decodeInputs, alignedStripe); - StripedBlockUtil.decodeAndFillBuffer(decodeInputs, alignedStripe, - dataBlkNum, parityBlkNum, decoder); - } - } - - class StatefulStripeReader extends StripeReader { - ByteBuffer[] decodeInputs; - - StatefulStripeReader(CompletionService service, - AlignedStripe alignedStripe, LocatedBlock[] targetBlocks, - BlockReaderInfo[] readerInfos, CorruptedBlocks corruptedBlocks) { - super(service, alignedStripe, targetBlocks, readerInfos, - corruptedBlocks); - } - - @Override - void prepareDecodeInputs() { - if (decodeInputs == null) { - decodeInputs = new ByteBuffer[dataBlkNum + parityBlkNum]; - final ByteBuffer cur; - synchronized (DFSStripedInputStream.this) { - cur = curStripeBuf.duplicate(); - } - StripedBlockUtil.VerticalRange range = alignedStripe.range; - for (int i = 0; i < dataBlkNum; i++) { - cur.limit(cur.capacity()); - int pos = (int) (range.offsetInBlock % cellSize + cellSize * i); - cur.position(pos); - cur.limit((int) (pos + range.spanInBlock)); - decodeInputs[i] = cur.slice(); - if (alignedStripe.chunks[i] == null) { - alignedStripe.chunks[i] = new StripingChunk(decodeInputs[i]); - } - } - } - } - - @Override - boolean prepareParityChunk(int index) { - Preconditions.checkState(index >= dataBlkNum - && alignedStripe.chunks[index] == null); - if (blockReaders[index] != null && blockReaders[index].shouldSkip) { - alignedStripe.chunks[index] = new StripingChunk(StripingChunk.MISSING); - // we have failed the block reader before - return false; - } - final int parityIndex = index - dataBlkNum; - ByteBuffer buf = getParityBuffer().duplicate(); - buf.position(cellSize * parityIndex); - buf.limit(cellSize * parityIndex + (int) alignedStripe.range.spanInBlock); - decodeInputs[index] = buf.slice(); - alignedStripe.chunks[index] = new StripingChunk(decodeInputs[index]); - return true; - } - - @Override - void decode() { - final int span = (int) alignedStripe.getSpanInBlock(); - for (int i = 0; i < alignedStripe.chunks.length; i++) { - if (alignedStripe.chunks[i] != null && - alignedStripe.chunks[i].state == StripingChunk.ALLZERO) { - for (int j = 0; j < span; j++) { - decodeInputs[i].put((byte) 0); - } - decodeInputs[i].flip(); - } else if (alignedStripe.chunks[i] != null && - alignedStripe.chunks[i].state == StripingChunk.FETCHED) { - decodeInputs[i].position(0); - decodeInputs[i].limit(span); - } - } - int[] decodeIndices = new int[parityBlkNum]; - int pos = 0; - for (int i = 0; i < alignedStripe.chunks.length; i++) { - if (alignedStripe.chunks[i] != null && - alignedStripe.chunks[i].state == StripingChunk.MISSING) { - if (i < dataBlkNum) { - decodeIndices[pos++] = i; - } else { - decodeInputs[i] = null; - } - } - } - decodeIndices = Arrays.copyOf(decodeIndices, pos); - - final int decodeChunkNum = decodeIndices.length; - ByteBuffer[] outputs = new ByteBuffer[decodeChunkNum]; - for (int i = 0; i < decodeChunkNum; i++) { - outputs[i] = decodeInputs[decodeIndices[i]]; - outputs[i].position(0); - outputs[i].limit((int) alignedStripe.range.spanInBlock); - decodeInputs[decodeIndices[i]] = null; - } - - decoder.decode(decodeInputs, decodeIndices, outputs); - } - } - /** * May need online read recovery, zero-copy read doesn't make * sense, so don't support it. @@ -957,12 +519,4 @@ public class DFSStripedInputStream extends DFSInputStream { throw new UnsupportedOperationException( "Not support enhanced byte buffer access."); } - - /** A variation to {@link DFSInputStream#cancelAll} */ - private void clearFutures(Collection> futures) { - for (Future future : futures) { - future.cancel(false); - } - futures.clear(); - } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PositionStripeReader.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PositionStripeReader.java new file mode 100644 index 0000000000..5818291ff5 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PositionStripeReader.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import com.google.common.base.Preconditions; +import org.apache.commons.configuration.SystemConfiguration; +import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.util.StripedBlockUtil; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe; +import org.apache.hadoop.io.erasurecode.ECChunk; +import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder; +import org.apache.hadoop.hdfs.DFSUtilClient.CorruptedBlocks; + +import java.nio.ByteBuffer; + +/** + * The reader for reading a complete {@link StripedBlockUtil.AlignedStripe} + * which may cross multiple stripes with cellSize width. + */ +class PositionStripeReader extends StripeReader { + private ByteBuffer codingBuffer; + + PositionStripeReader(AlignedStripe alignedStripe, + ErasureCodingPolicy ecPolicy, LocatedBlock[] targetBlocks, + BlockReaderInfo[] readerInfos, CorruptedBlocks corruptedBlocks, + RawErasureDecoder decoder, DFSStripedInputStream dfsStripedInputStream) { + super(alignedStripe, ecPolicy, targetBlocks, readerInfos, + corruptedBlocks, decoder, dfsStripedInputStream); + } + + @Override + void prepareDecodeInputs() { + if (codingBuffer == null) { + this.decodeInputs = new ECChunk[dataBlkNum + parityBlkNum]; + initDecodeInputs(alignedStripe); + } + } + + @Override + boolean prepareParityChunk(int index) { + Preconditions.checkState(index >= dataBlkNum && + alignedStripe.chunks[index] == null); + + alignedStripe.chunks[index] = + new StripingChunk(decodeInputs[index].getBuffer()); + + return true; + } + + @Override + void decode() { + finalizeDecodeInputs(); + decodeAndFillBuffer(true); + } + + void initDecodeInputs(AlignedStripe alignedStripe) { + int bufLen = (int) alignedStripe.getSpanInBlock(); + int bufCount = dataBlkNum + parityBlkNum; + codingBuffer = dfsStripedInputStream.getBufferPool(). + getBuffer(useDirectBuffer(), bufLen * bufCount); + ByteBuffer buffer; + for (int i = 0; i < decodeInputs.length; i++) { + buffer = codingBuffer.duplicate(); + decodeInputs[i] = new ECChunk(buffer, i * bufLen, bufLen); + } + + for (int i = 0; i < dataBlkNum; i++) { + if (alignedStripe.chunks[i] == null) { + alignedStripe.chunks[i] = + new StripingChunk(decodeInputs[i].getBuffer()); + } + } + } + + void close() { + if (decodeInputs != null) { + for (int i = 0; i < decodeInputs.length; i++) { + decodeInputs[i] = null; + } + } + + if (codingBuffer != null) { + dfsStripedInputStream.getBufferPool().putBuffer(codingBuffer); + codingBuffer = null; + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StatefulStripeReader.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StatefulStripeReader.java new file mode 100644 index 0000000000..88795144ef --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StatefulStripeReader.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.util.StripedBlockUtil; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe; +import org.apache.hadoop.io.erasurecode.ECChunk; +import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder; +import org.apache.hadoop.hdfs.DFSUtilClient.CorruptedBlocks; + +import java.nio.ByteBuffer; + +/** + * The reader for reading a complete {@link StripedBlockUtil.AlignedStripe} + * which belongs to a single stripe. + * Reading cross multiple strips is not supported in this reader. + */ +class StatefulStripeReader extends StripeReader { + + StatefulStripeReader(AlignedStripe alignedStripe, + ErasureCodingPolicy ecPolicy, LocatedBlock[] targetBlocks, + BlockReaderInfo[] readerInfos, CorruptedBlocks corruptedBlocks, + RawErasureDecoder decoder, DFSStripedInputStream dfsStripedInputStream) { + super(alignedStripe, ecPolicy, targetBlocks, readerInfos, + corruptedBlocks, decoder, dfsStripedInputStream); + } + + @Override + void prepareDecodeInputs() { + final ByteBuffer cur; + synchronized (dfsStripedInputStream) { + cur = dfsStripedInputStream.getCurStripeBuf().duplicate(); + } + + this.decodeInputs = new ECChunk[dataBlkNum + parityBlkNum]; + int bufLen = (int) alignedStripe.getSpanInBlock(); + int bufOff = (int) alignedStripe.getOffsetInBlock(); + for (int i = 0; i < dataBlkNum; i++) { + cur.limit(cur.capacity()); + int pos = bufOff % cellSize + cellSize * i; + cur.position(pos); + cur.limit(pos + bufLen); + decodeInputs[i] = new ECChunk(cur.slice(), 0, bufLen); + if (alignedStripe.chunks[i] == null) { + alignedStripe.chunks[i] = + new StripingChunk(decodeInputs[i].getBuffer()); + } + } + } + + @Override + boolean prepareParityChunk(int index) { + Preconditions.checkState(index >= dataBlkNum + && alignedStripe.chunks[index] == null); + if (readerInfos[index] != null && readerInfos[index].shouldSkip) { + alignedStripe.chunks[index] = new StripingChunk(StripingChunk.MISSING); + // we have failed the block reader before + return false; + } + final int parityIndex = index - dataBlkNum; + ByteBuffer buf = dfsStripedInputStream.getParityBuffer().duplicate(); + buf.position(cellSize * parityIndex); + buf.limit(cellSize * parityIndex + (int) alignedStripe.range.spanInBlock); + decodeInputs[index] = + new ECChunk(buf.slice(), 0, (int) alignedStripe.range.spanInBlock); + alignedStripe.chunks[index] = + new StripingChunk(decodeInputs[index].getBuffer()); + return true; + } + + @Override + void decode() { + finalizeDecodeInputs(); + decodeAndFillBuffer(false); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripeReader.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripeReader.java new file mode 100644 index 0000000000..5518752079 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripeReader.java @@ -0,0 +1,463 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.fs.ChecksumException; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.util.StripedBlockUtil; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe; +import org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunkReadResult; +import org.apache.hadoop.io.erasurecode.ECChunk; +import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder; +import org.apache.hadoop.hdfs.DFSUtilClient.CorruptedBlocks; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +/** + * The reader for reading a complete {@link StripedBlockUtil.AlignedStripe}. + * Note that an {@link StripedBlockUtil.AlignedStripe} may cross multiple + * stripes with cellSize width. + */ +abstract class StripeReader { + + static class ReaderRetryPolicy { + private int fetchEncryptionKeyTimes = 1; + private int fetchTokenTimes = 1; + + void refetchEncryptionKey() { + fetchEncryptionKeyTimes--; + } + + void refetchToken() { + fetchTokenTimes--; + } + + boolean shouldRefetchEncryptionKey() { + return fetchEncryptionKeyTimes > 0; + } + + boolean shouldRefetchToken() { + return fetchTokenTimes > 0; + } + } + + static class BlockReaderInfo { + final BlockReader reader; + final DatanodeInfo datanode; + /** + * when initializing block readers, their starting offsets are set to the + * same number: the smallest internal block offsets among all the readers. + * This is because it is possible that for some internal blocks we have to + * read "backwards" for decoding purpose. We thus use this offset array to + * track offsets for all the block readers so that we can skip data if + * necessary. + */ + long blockReaderOffset; + /** + * We use this field to indicate whether we should use this reader. In case + * we hit any issue with this reader, we set this field to true and avoid + * using it for the next stripe. + */ + boolean shouldSkip = false; + + BlockReaderInfo(BlockReader reader, DatanodeInfo dn, long offset) { + this.reader = reader; + this.datanode = dn; + this.blockReaderOffset = offset; + } + + void setOffset(long offset) { + this.blockReaderOffset = offset; + } + + void skip() { + this.shouldSkip = true; + } + } + + protected final Map, Integer> futures = new HashMap<>(); + protected final AlignedStripe alignedStripe; + protected final CompletionService service; + protected final LocatedBlock[] targetBlocks; + protected final CorruptedBlocks corruptedBlocks; + protected final BlockReaderInfo[] readerInfos; + protected final ErasureCodingPolicy ecPolicy; + protected final short dataBlkNum; + protected final short parityBlkNum; + protected final int cellSize; + protected final RawErasureDecoder decoder; + protected final DFSStripedInputStream dfsStripedInputStream; + + protected ECChunk[] decodeInputs; + + StripeReader(AlignedStripe alignedStripe, + ErasureCodingPolicy ecPolicy, LocatedBlock[] targetBlocks, + BlockReaderInfo[] readerInfos, CorruptedBlocks corruptedBlocks, + RawErasureDecoder decoder, + DFSStripedInputStream dfsStripedInputStream) { + this.alignedStripe = alignedStripe; + this.ecPolicy = ecPolicy; + this.dataBlkNum = (short)ecPolicy.getNumDataUnits(); + this.parityBlkNum = (short)ecPolicy.getNumParityUnits(); + this.cellSize = ecPolicy.getCellSize(); + this.targetBlocks = targetBlocks; + this.readerInfos = readerInfos; + this.corruptedBlocks = corruptedBlocks; + this.decoder = decoder; + this.dfsStripedInputStream = dfsStripedInputStream; + + service = new ExecutorCompletionService<>( + dfsStripedInputStream.getStripedReadsThreadPool()); + } + + /** + * Prepare all the data chunks. + */ + abstract void prepareDecodeInputs(); + + /** + * Prepare the parity chunk and block reader if necessary. + */ + abstract boolean prepareParityChunk(int index); + + /* + * Decode to get the missing data. + */ + abstract void decode(); + + /* + * Default close do nothing. + */ + void close() { + } + + void updateState4SuccessRead(StripingChunkReadResult result) { + Preconditions.checkArgument( + result.state == StripingChunkReadResult.SUCCESSFUL); + readerInfos[result.index].setOffset(alignedStripe.getOffsetInBlock() + + alignedStripe.getSpanInBlock()); + } + + private void checkMissingBlocks() throws IOException { + if (alignedStripe.missingChunksNum > parityBlkNum) { + clearFutures(); + throw new IOException(alignedStripe.missingChunksNum + + " missing blocks, the stripe is: " + alignedStripe + + "; locatedBlocks is: " + dfsStripedInputStream.getLocatedBlocks()); + } + } + + /** + * We need decoding. Thus go through all the data chunks and make sure we + * submit read requests for all of them. + */ + private void readDataForDecoding() throws IOException { + prepareDecodeInputs(); + for (int i = 0; i < dataBlkNum; i++) { + Preconditions.checkNotNull(alignedStripe.chunks[i]); + if (alignedStripe.chunks[i].state == StripingChunk.REQUESTED) { + if (!readChunk(targetBlocks[i], i)) { + alignedStripe.missingChunksNum++; + } + } + } + checkMissingBlocks(); + } + + void readParityChunks(int num) throws IOException { + for (int i = dataBlkNum, j = 0; i < dataBlkNum + parityBlkNum && j < num; + i++) { + if (alignedStripe.chunks[i] == null) { + if (prepareParityChunk(i) && readChunk(targetBlocks[i], i)) { + j++; + } else { + alignedStripe.missingChunksNum++; + } + } + } + checkMissingBlocks(); + } + + private ByteBufferStrategy[] getReadStrategies(StripingChunk chunk) { + if (chunk.useByteBuffer()) { + ByteBufferStrategy strategy = new ByteBufferStrategy( + chunk.getByteBuffer(), dfsStripedInputStream.getReadStatistics(), + dfsStripedInputStream.getDFSClient()); + return new ByteBufferStrategy[]{strategy}; + } + + ByteBufferStrategy[] strategies = + new ByteBufferStrategy[chunk.getChunkBuffer().getSlices().size()]; + for (int i = 0; i < strategies.length; i++) { + ByteBuffer buffer = chunk.getChunkBuffer().getSlice(i); + strategies[i] = new ByteBufferStrategy(buffer, + dfsStripedInputStream.getReadStatistics(), + dfsStripedInputStream.getDFSClient()); + } + return strategies; + } + + private int readToBuffer(BlockReader blockReader, + DatanodeInfo currentNode, ByteBufferStrategy strategy, + ExtendedBlock currentBlock) throws IOException { + final int targetLength = strategy.getTargetLength(); + int length = 0; + try { + while (length < targetLength) { + int ret = strategy.readFromBlock(blockReader); + if (ret < 0) { + throw new IOException("Unexpected EOS from the reader"); + } + length += ret; + } + return length; + } catch (ChecksumException ce) { + DFSClient.LOG.warn("Found Checksum error for " + + currentBlock + " from " + currentNode + + " at " + ce.getPos()); + // we want to remember which block replicas we have tried + corruptedBlocks.addCorruptedBlock(currentBlock, currentNode); + throw ce; + } catch (IOException e) { + DFSClient.LOG.warn("Exception while reading from " + + currentBlock + " of " + dfsStripedInputStream.getSrc() + " from " + + currentNode, e); + throw e; + } + } + + private Callable readCells(final BlockReader reader, + final DatanodeInfo datanode, final long currentReaderOffset, + final long targetReaderOffset, final ByteBufferStrategy[] strategies, + final ExtendedBlock currentBlock) { + return () -> { + // reader can be null if getBlockReaderWithRetry failed or + // the reader hit exception before + if (reader == null) { + throw new IOException("The BlockReader is null. " + + "The BlockReader creation failed or the reader hit exception."); + } + Preconditions.checkState(currentReaderOffset <= targetReaderOffset); + if (currentReaderOffset < targetReaderOffset) { + long skipped = reader.skip(targetReaderOffset - currentReaderOffset); + Preconditions.checkState( + skipped == targetReaderOffset - currentReaderOffset); + } + + for (ByteBufferStrategy strategy : strategies) { + readToBuffer(reader, datanode, strategy, currentBlock); + } + return null; + }; + } + + boolean readChunk(final LocatedBlock block, int chunkIndex) + throws IOException { + final StripingChunk chunk = alignedStripe.chunks[chunkIndex]; + if (block == null) { + chunk.state = StripingChunk.MISSING; + return false; + } + + if (readerInfos[chunkIndex] == null) { + if (!dfsStripedInputStream.createBlockReader(block, + alignedStripe.getOffsetInBlock(), targetBlocks, + readerInfos, chunkIndex)) { + chunk.state = StripingChunk.MISSING; + return false; + } + } else if (readerInfos[chunkIndex].shouldSkip) { + chunk.state = StripingChunk.MISSING; + return false; + } + + chunk.state = StripingChunk.PENDING; + Callable readCallable = readCells(readerInfos[chunkIndex].reader, + readerInfos[chunkIndex].datanode, + readerInfos[chunkIndex].blockReaderOffset, + alignedStripe.getOffsetInBlock(), getReadStrategies(chunk), + block.getBlock()); + + Future request = service.submit(readCallable); + futures.put(request, chunkIndex); + return true; + } + + /** + * read the whole stripe. do decoding if necessary + */ + void readStripe() throws IOException { + for (int i = 0; i < dataBlkNum; i++) { + if (alignedStripe.chunks[i] != null && + alignedStripe.chunks[i].state != StripingChunk.ALLZERO) { + if (!readChunk(targetBlocks[i], i)) { + alignedStripe.missingChunksNum++; + } + } + } + // There are missing block locations at this stage. Thus we need to read + // the full stripe and one more parity block. + if (alignedStripe.missingChunksNum > 0) { + checkMissingBlocks(); + readDataForDecoding(); + // read parity chunks + readParityChunks(alignedStripe.missingChunksNum); + } + // TODO: for a full stripe we can start reading (dataBlkNum + 1) chunks + + // Input buffers for potential decode operation, which remains null until + // first read failure + while (!futures.isEmpty()) { + try { + StripingChunkReadResult r = StripedBlockUtil + .getNextCompletedStripedRead(service, futures, 0); + if (DFSClient.LOG.isDebugEnabled()) { + DFSClient.LOG.debug("Read task returned: " + r + ", for stripe " + + alignedStripe); + } + StripingChunk returnedChunk = alignedStripe.chunks[r.index]; + Preconditions.checkNotNull(returnedChunk); + Preconditions.checkState(returnedChunk.state == StripingChunk.PENDING); + + if (r.state == StripingChunkReadResult.SUCCESSFUL) { + returnedChunk.state = StripingChunk.FETCHED; + alignedStripe.fetchedChunksNum++; + updateState4SuccessRead(r); + if (alignedStripe.fetchedChunksNum == dataBlkNum) { + clearFutures(); + break; + } + } else { + returnedChunk.state = StripingChunk.MISSING; + // close the corresponding reader + dfsStripedInputStream.closeReader(readerInfos[r.index]); + + final int missing = alignedStripe.missingChunksNum; + alignedStripe.missingChunksNum++; + checkMissingBlocks(); + + readDataForDecoding(); + readParityChunks(alignedStripe.missingChunksNum - missing); + } + } catch (InterruptedException ie) { + String err = "Read request interrupted"; + DFSClient.LOG.error(err); + clearFutures(); + // Don't decode if read interrupted + throw new InterruptedIOException(err); + } + } + + if (alignedStripe.missingChunksNum > 0) { + decode(); + } + } + + /** + * Some fetched {@link StripingChunk} might be stored in original application + * buffer instead of prepared decode input buffers. Some others are beyond + * the range of the internal blocks and should correspond to all zero bytes. + * When all pending requests have returned, this method should be called to + * finalize decode input buffers. + */ + + void finalizeDecodeInputs() { + for (int i = 0; i < alignedStripe.chunks.length; i++) { + final StripingChunk chunk = alignedStripe.chunks[i]; + if (chunk != null && chunk.state == StripingChunk.FETCHED) { + if (chunk.useChunkBuffer()) { + chunk.getChunkBuffer().copyTo(decodeInputs[i].getBuffer()); + } else { + chunk.getByteBuffer().flip(); + } + } else if (chunk != null && chunk.state == StripingChunk.ALLZERO) { + decodeInputs[i].setAllZero(true); + } + } + } + + /** + * Decode based on the given input buffers and erasure coding policy. + */ + void decodeAndFillBuffer(boolean fillBuffer) { + // Step 1: prepare indices and output buffers for missing data units + int[] decodeIndices = prepareErasedIndices(); + + final int decodeChunkNum = decodeIndices.length; + ECChunk[] outputs = new ECChunk[decodeChunkNum]; + for (int i = 0; i < decodeChunkNum; i++) { + outputs[i] = decodeInputs[decodeIndices[i]]; + decodeInputs[decodeIndices[i]] = null; + } + // Step 2: decode into prepared output buffers + decoder.decode(decodeInputs, decodeIndices, outputs); + + // Step 3: fill original application buffer with decoded data + if (fillBuffer) { + for (int i = 0; i < decodeIndices.length; i++) { + int missingBlkIdx = decodeIndices[i]; + StripingChunk chunk = alignedStripe.chunks[missingBlkIdx]; + if (chunk.state == StripingChunk.MISSING && chunk.useChunkBuffer()) { + chunk.getChunkBuffer().copyFrom(outputs[i].getBuffer()); + } + } + } + } + + /** + * Prepare erased indices. + */ + int[] prepareErasedIndices() { + int[] decodeIndices = new int[parityBlkNum]; + int pos = 0; + for (int i = 0; i < alignedStripe.chunks.length; i++) { + if (alignedStripe.chunks[i] != null && + alignedStripe.chunks[i].state == StripingChunk.MISSING){ + decodeIndices[pos++] = i; + } + } + + int[] erasedIndices = Arrays.copyOf(decodeIndices, pos); + return erasedIndices; + } + + void clearFutures() { + for (Future future : futures.keySet()) { + future.cancel(false); + } + futures.clear(); + } + + boolean useDirectBuffer() { + return decoder.preferDirectBuffer(); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java index 4dbbc3dd70..896ebc62b8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java @@ -22,7 +22,6 @@ import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.fs.StorageType; -import org.apache.hadoop.hdfs.DFSStripedOutputStream; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; @@ -32,7 +31,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock; import com.google.common.base.Preconditions; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; -import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder; +import org.apache.hadoop.hdfs.DFSStripedOutputStream; import org.apache.hadoop.security.token.Token; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,18 +75,6 @@ public class StripedBlockUtil { public static final Logger LOG = LoggerFactory.getLogger(StripedBlockUtil.class); - /** - * Parses a striped block group into individual blocks. - * @param bg The striped block group - * @param ecPolicy The erasure coding policy - * @return An array of the blocks in the group - */ - public static LocatedBlock[] parseStripedBlockGroup(LocatedStripedBlock bg, - ErasureCodingPolicy ecPolicy) { - return parseStripedBlockGroup(bg, ecPolicy.getCellSize(), - ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits()); - } - /** * This method parses a striped block group into individual blocks. * @@ -112,7 +99,7 @@ public class StripedBlockUtil { } /** - * This method creates an internal block at the given index of a block group + * This method creates an internal block at the given index of a block group. * * @param idxInReturnedLocs The index in the stored locations in the * {@link LocatedStripedBlock} object @@ -169,7 +156,7 @@ public class StripedBlockUtil { } /** - * Get the size of an internal block at the given index of a block group + * Get the size of an internal block at the given index of a block group. * * @param dataSize Size of the block group only counting data blocks * @param cellSize The size of a striping cell @@ -237,7 +224,7 @@ public class StripedBlockUtil { /** * Given a byte's offset in an internal block, calculate the offset in - * the block group + * the block group. */ public static long offsetInBlkToOffsetInBG(int cellSize, int dataBlkNum, long offsetInBlk, int idxInBlockGroup) { @@ -248,12 +235,12 @@ public class StripedBlockUtil { } /** - * Get the next completed striped read task + * Get the next completed striped read task. * - * @return {@link StripingChunkReadResult} indicating the status of the read task - * succeeded, and the block index of the task. If the method times - * out without getting any completed read tasks, -1 is returned as - * block index. + * @return {@link StripingChunkReadResult} indicating the status of the read + * task succeeded, and the block index of the task. If the method + * times out without getting any completed read tasks, -1 is + * returned as block index. * @throws InterruptedException */ public static StripingChunkReadResult getNextCompletedStripedRead( @@ -287,7 +274,7 @@ public class StripedBlockUtil { /** * Get the total usage of the striped blocks, which is the total of data - * blocks and parity blocks + * blocks and parity blocks. * * @param numDataBlkBytes * Size of the block group only counting data blocks @@ -307,91 +294,6 @@ public class StripedBlockUtil { return numDataBlkBytes + numParityBlkBytes; } - /** - * Initialize the decoding input buffers based on the chunk states in an - * {@link AlignedStripe}. For each chunk that was not initially requested, - * schedule a new fetch request with the decoding input buffer as transfer - * destination. - */ - public static ByteBuffer[] initDecodeInputs(AlignedStripe alignedStripe, - int dataBlkNum, int parityBlkNum) { - ByteBuffer[] decodeInputs = new ByteBuffer[dataBlkNum + parityBlkNum]; - for (int i = 0; i < decodeInputs.length; i++) { - decodeInputs[i] = ByteBuffer.allocate( - (int) alignedStripe.getSpanInBlock()); - } - // read the full data aligned stripe - for (int i = 0; i < dataBlkNum; i++) { - if (alignedStripe.chunks[i] == null) { - alignedStripe.chunks[i] = new StripingChunk(decodeInputs[i]); - } - } - return decodeInputs; - } - - /** - * Some fetched {@link StripingChunk} might be stored in original application - * buffer instead of prepared decode input buffers. Some others are beyond - * the range of the internal blocks and should correspond to all zero bytes. - * When all pending requests have returned, this method should be called to - * finalize decode input buffers. - */ - public static void finalizeDecodeInputs(final ByteBuffer[] decodeInputs, - AlignedStripe alignedStripe) { - for (int i = 0; i < alignedStripe.chunks.length; i++) { - final StripingChunk chunk = alignedStripe.chunks[i]; - if (chunk != null && chunk.state == StripingChunk.FETCHED) { - if (chunk.useChunkBuffer()) { - chunk.getChunkBuffer().copyTo(decodeInputs[i]); - } else { - chunk.getByteBuffer().flip(); - } - } else if (chunk != null && chunk.state == StripingChunk.ALLZERO) { - //ZERO it. Will be better handled in other following issue. - byte[] emptyBytes = new byte[decodeInputs[i].limit()]; - decodeInputs[i].put(emptyBytes); - decodeInputs[i].flip(); - } else { - decodeInputs[i] = null; - } - } - } - - /** - * Decode based on the given input buffers and erasure coding policy. - */ - public static void decodeAndFillBuffer(final ByteBuffer[] decodeInputs, - AlignedStripe alignedStripe, int dataBlkNum, int parityBlkNum, - RawErasureDecoder decoder) { - // Step 1: prepare indices and output buffers for missing data units - int[] decodeIndices = new int[parityBlkNum]; - int pos = 0; - for (int i = 0; i < dataBlkNum; i++) { - if (alignedStripe.chunks[i] != null && - alignedStripe.chunks[i].state == StripingChunk.MISSING){ - decodeIndices[pos++] = i; - } - } - decodeIndices = Arrays.copyOf(decodeIndices, pos); - ByteBuffer[] decodeOutputs = new ByteBuffer[decodeIndices.length]; - for (int i = 0; i < decodeOutputs.length; i++) { - decodeOutputs[i] = ByteBuffer.allocate( - (int) alignedStripe.getSpanInBlock()); - } - - // Step 2: decode into prepared output buffers - decoder.decode(decodeInputs, decodeIndices, decodeOutputs); - - // Step 3: fill original application buffer with decoded data - for (int i = 0; i < decodeIndices.length; i++) { - int missingBlkIdx = decodeIndices[i]; - StripingChunk chunk = alignedStripe.chunks[missingBlkIdx]; - if (chunk.state == StripingChunk.MISSING && chunk.useChunkBuffer()) { - chunk.getChunkBuffer().copyFrom(decodeOutputs[i]); - } - } - } - /** * Similar functionality with {@link #divideByteRangeIntoStripes}, but is used * by stateful read and uses ByteBuffer as reading target buffer. Besides the @@ -485,7 +387,7 @@ public class StripedBlockUtil { /** * Map the logical byte range to a set of inclusive {@link StripingCell} * instances, each representing the overlap of the byte range to a cell - * used by {@link DFSStripedOutputStream} in encoding + * used by {@link DFSStripedOutputStream} in encoding. */ @VisibleForTesting private static StripingCell[] getStripingCellsOfByteRange( @@ -530,7 +432,7 @@ public class StripedBlockUtil { int dataBlkNum = ecPolicy.getNumDataUnits(); int parityBlkNum = ecPolicy.getNumParityUnits(); - VerticalRange ranges[] = new VerticalRange[dataBlkNum + parityBlkNum]; + VerticalRange[] ranges = new VerticalRange[dataBlkNum + parityBlkNum]; long earliestStart = Long.MAX_VALUE; long latestEnd = -1; @@ -675,7 +577,7 @@ public class StripedBlockUtil { @VisibleForTesting static class StripingCell { final ErasureCodingPolicy ecPolicy; - /** Logical order in a block group, used when doing I/O to a block group */ + /** Logical order in a block group, used when doing I/O to a block group. */ final int idxInBlkGroup; final int idxInInternalBlk; final int idxInStripe; @@ -738,7 +640,7 @@ public class StripedBlockUtil { */ public static class AlignedStripe { public VerticalRange range; - /** status of each chunk in the stripe */ + /** status of each chunk in the stripe. */ public final StripingChunk[] chunks; public int fetchedChunksNum = 0; public int missingChunksNum = 0; @@ -790,9 +692,9 @@ public class StripedBlockUtil { * +-----+ */ public static class VerticalRange { - /** start offset in the block group (inclusive) */ + /** start offset in the block group (inclusive). */ public long offsetInBlock; - /** length of the stripe range */ + /** length of the stripe range. */ public long spanInBlock; public VerticalRange(long offsetInBlock, long length) { @@ -801,7 +703,7 @@ public class StripedBlockUtil { this.spanInBlock = length; } - /** whether a position is in the range */ + /** whether a position is in the range. */ public boolean include(long pos) { return pos >= offsetInBlock && pos < offsetInBlock + spanInBlock; } @@ -915,7 +817,7 @@ public class StripedBlockUtil { /** * Note: target will be ready-to-read state after the call. */ - void copyTo(ByteBuffer target) { + public void copyTo(ByteBuffer target) { for (ByteBuffer slice : slices) { slice.flip(); target.put(slice); @@ -923,7 +825,7 @@ public class StripedBlockUtil { target.flip(); } - void copyFrom(ByteBuffer src) { + public void copyFrom(ByteBuffer src) { ByteBuffer tmp; int len; for (ByteBuffer slice : slices) { @@ -970,6 +872,28 @@ public class StripedBlockUtil { } } + /** Used to indicate the buffered data's range in the block group. */ + public static class StripeRange { + /** start offset in the block group (inclusive). */ + final long offsetInBlock; + /** length of the stripe range. */ + final long length; + + public StripeRange(long offsetInBlock, long length) { + Preconditions.checkArgument(offsetInBlock >= 0 && length >= 0); + this.offsetInBlock = offsetInBlock; + this.length = length; + } + + public boolean include(long pos) { + return pos >= offsetInBlock && pos < offsetInBlock + length; + } + + public long getLength() { + return length; + } + } + /** * Check if the information such as IDs and generation stamps in block-i * match the block group. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java index 7d9d7dc540..999eb1faaa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java @@ -283,5 +283,4 @@ public class TestStripedBlockUtil { } } } - } From c6d1d742e70e7b8f1d89cf9a4780657646e6a367 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 20 Sep 2016 14:15:06 +0000 Subject: [PATCH 3/9] YARN-5655. TestContainerManagerSecurity#testNMTokens is asserting. Contributed by Robert Kanter --- .../hadoop/yarn/server/TestContainerManagerSecurity.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java index ee3396db60..408c1cc4a5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java @@ -68,6 +68,8 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.MockRMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.security.BaseNMTokenSecretManager; @@ -205,6 +207,9 @@ public class TestContainerManagerSecurity extends KerberosSecurityTestcase { Resource r = Resource.newInstance(1024, 1); ApplicationId appId = ApplicationId.newInstance(1, 1); + MockRMApp m = new MockRMApp(appId.getId(), appId.getClusterTimestamp(), + RMAppState.NEW); + yarnCluster.getResourceManager().getRMContext().getRMApps().put(appId, m); ApplicationAttemptId validAppAttemptId = ApplicationAttemptId.newInstance(appId, 1); From e45307c9a063248fcfb08281025d87c4abd343b1 Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Tue, 20 Sep 2016 11:21:01 -0700 Subject: [PATCH 4/9] Addendum patch for fix javadocs failure which is caused by YARN-3141. (wangda) --- .../resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java index f40ecd79ea..fd43e748fc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java @@ -328,7 +328,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt { * of the resources that will be allocated to and preempted from this * application. * - * @param rc + * @param resourceCalculator * @param clusterResource * @param minimumAllocation * @return an allocation From 9f03b403ec69658fc57bc0f6b832da0e3c746497 Mon Sep 17 00:00:00 2001 From: Arun Suresh Date: Tue, 20 Sep 2016 12:27:17 -0700 Subject: [PATCH 5/9] YARN-5656. Fix ReservationACLsTestBase. (Sean Po via asuresh) --- .../reservation/NoOverCommitPolicy.java | 12 ----- .../exceptions/MismatchedUserException.java | 46 ------------------- .../ReservationACLsTestBase.java | 2 + .../reservation/TestNoOverCommitPolicy.java | 21 --------- 4 files changed, 2 insertions(+), 79 deletions(-) delete mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/exceptions/MismatchedUserException.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/NoOverCommitPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/NoOverCommitPolicy.java index 814d4b51a8..55f1d00e0c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/NoOverCommitPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/NoOverCommitPolicy.java @@ -21,7 +21,6 @@ package org.apache.hadoop.yarn.server.resourcemanager.reservation; import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate; import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.yarn.api.records.ReservationId; -import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.MismatchedUserException; import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.PlanningException; import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.ResourceOverCommitException; @@ -39,17 +38,6 @@ public class NoOverCommitPolicy implements SharingPolicy { public void validate(Plan plan, ReservationAllocation reservation) throws PlanningException { - ReservationAllocation oldReservation = - plan.getReservationById(reservation.getReservationId()); - - // check updates are using same name - if (oldReservation != null - && !oldReservation.getUser().equals(reservation.getUser())) { - throw new MismatchedUserException( - "Updating an existing reservation with mismatching user:" - + oldReservation.getUser() + " != " + reservation.getUser()); - } - RLESparseResourceAllocation available = plan.getAvailableResourceOverTime( reservation.getUser(), reservation.getReservationId(), reservation.getStartTime(), reservation.getEndTime()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/exceptions/MismatchedUserException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/exceptions/MismatchedUserException.java deleted file mode 100644 index 7b4419baab..0000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/exceptions/MismatchedUserException.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions; - -import org.apache.hadoop.classification.InterfaceAudience.Public; -import org.apache.hadoop.classification.InterfaceStability.Unstable; - -/** - * Exception thrown when an update to an existing reservation is performed - * by a user that is not the reservation owner. - */ -@Public -@Unstable -public class MismatchedUserException extends PlanningException { - - private static final long serialVersionUID = 8313222590561668413L; - - public MismatchedUserException(String message) { - super(message); - } - - public MismatchedUserException(Throwable cause) { - super(cause); - } - - public MismatchedUserException(String message, Throwable cause) { - super(message, cause); - } - -} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/ReservationACLsTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/ReservationACLsTestBase.java index aa5acc6279..c536d8d327 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/ReservationACLsTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/ReservationACLsTestBase.java @@ -566,6 +566,8 @@ public class ReservationACLsTestBase extends ACLsTestBase { PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); out.println(""); out.println(""); + out.println(" drf" + + ""); out.println(" "); out.println(" " + "queueA_user,common_user " + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/TestNoOverCommitPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/TestNoOverCommitPolicy.java index 28dd62eed9..c5edaf000e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/TestNoOverCommitPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/TestNoOverCommitPolicy.java @@ -23,10 +23,8 @@ import static org.mockito.Mockito.mock; import java.io.IOException; import org.apache.hadoop.yarn.api.records.ReservationDefinition; -import org.apache.hadoop.yarn.api.records.ReservationId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; -import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.MismatchedUserException; import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.PlanningException; import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.ResourceOverCommitException; import org.apache.hadoop.yarn.server.resourcemanager.reservation.planning.ReservationAgent; @@ -127,25 +125,6 @@ public class TestNoOverCommitPolicy { .generateAllocation(initTime, step, f), res, minAlloc), false); } - @Test(expected = MismatchedUserException.class) - public void testUserMismatch() throws IOException, PlanningException { - // generate allocation from single tenant that exceed capacity - int[] f = generateData(3600, (int) (0.5 * totCont)); - ReservationDefinition rDef = - ReservationSystemTestUtil.createSimpleReservationDefinition( - initTime, initTime + f.length + 1, f.length); - ReservationId rid = ReservationSystemTestUtil.getNewReservationId(); - - plan.addReservation(new InMemoryReservationAllocation(rid, rDef, "u1", - "dedicated", initTime, initTime + f.length, ReservationSystemTestUtil - .generateAllocation(initTime, step, f), res, minAlloc), false); - - // trying to update a reservation with a mismatching user - plan.updateReservation(new InMemoryReservationAllocation(rid, rDef, "u2", - "dedicated", initTime, initTime + f.length, ReservationSystemTestUtil - .generateAllocation(initTime, step, f), res, minAlloc)); - } - @Test public void testMultiTenantPass() throws IOException, PlanningException { // generate allocation from multiple tenants that barely fit in tot capacity From e80386d69d5fb6a08aa3366e42d2518747af569f Mon Sep 17 00:00:00 2001 From: Mingliang Liu Date: Tue, 20 Sep 2016 13:19:44 -0700 Subject: [PATCH 6/9] HADOOP-13601. Fix a log message typo in AbstractDelegationTokenSecretManager. Contributed by Mehran Hassani. --- .../token/delegation/AbstractDelegationTokenSecretManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java index 1d7f2f5328..cc2efc907f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java @@ -528,7 +528,7 @@ extends AbstractDelegationTokenIdentifier> DataInputStream in = new DataInputStream(buf); TokenIdent id = createIdentifier(); id.readFields(in); - LOG.info("Token cancelation requested for identifier: "+id); + LOG.info("Token cancellation requested for identifier: " + id); if (id.getUser() == null) { throw new InvalidToken("Token with no owner"); From 0e918dff594e9ba5434fdee7fc1f6394b62b32cd Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Tue, 20 Sep 2016 16:52:05 -0700 Subject: [PATCH 7/9] HDFS-10879. TestEncryptionZonesWithKMS#testReadWrite fails intermittently. Contributed by Xiao Chen. --- .../hadoop/hdfs/TestEncryptionZones.java | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java index b634dd26bb..9168ca6c89 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java @@ -45,7 +45,9 @@ import org.apache.hadoop.crypto.CipherSuite; import org.apache.hadoop.crypto.CryptoProtocolVersion; import org.apache.hadoop.crypto.key.JavaKeyStoreProvider; import org.apache.hadoop.crypto.key.KeyProvider; +import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension; import org.apache.hadoop.crypto.key.KeyProviderFactory; +import org.apache.hadoop.crypto.key.kms.server.EagerKeyGeneratorKeyProviderCryptoExtension; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataOutputStream; @@ -734,14 +736,33 @@ public class TestEncryptionZones { // Roll the key of the encryption zone assertNumZones(1); String keyName = dfsAdmin.listEncryptionZones().next().getKeyName(); + FileEncryptionInfo feInfo1 = getFileEncryptionInfo(encFile1); cluster.getNamesystem().getProvider().rollNewVersion(keyName); + /** + * due to the cache on the server side, client may get old keys. + * @see EagerKeyGeneratorKeyProviderCryptoExtension#rollNewVersion(String) + */ + boolean rollSucceeded = false; + for (int i = 0; i <= EagerKeyGeneratorKeyProviderCryptoExtension + .KMS_KEY_CACHE_SIZE_DEFAULT + CommonConfigurationKeysPublic. + KMS_CLIENT_ENC_KEY_CACHE_SIZE_DEFAULT; ++i) { + KeyProviderCryptoExtension.EncryptedKeyVersion ekv2 = + cluster.getNamesystem().getProvider().generateEncryptedKey(TEST_KEY); + if (!(feInfo1.getEzKeyVersionName() + .equals(ekv2.getEncryptionKeyVersionName()))) { + rollSucceeded = true; + break; + } + } + Assert.assertTrue("rollover did not generate a new key even after" + + " queue is drained", rollSucceeded); + // Read them back in and compare byte-by-byte verifyFilesEqual(fs, baseFile, encFile1, len); // Write a new enc file and validate final Path encFile2 = new Path(zone, "myfile2"); DFSTestUtil.createFile(fs, encFile2, len, (short) 1, 0xFEED); // FEInfos should be different - FileEncryptionInfo feInfo1 = getFileEncryptionInfo(encFile1); FileEncryptionInfo feInfo2 = getFileEncryptionInfo(encFile2); assertFalse("EDEKs should be different", Arrays .equals(feInfo1.getEncryptedDataEncryptionKey(), From 5a58bfee30a662b1b556048504f66f9cf00d182a Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Tue, 20 Sep 2016 17:20:50 -0700 Subject: [PATCH 8/9] YARN-4591. YARN Web UIs should provide a robots.txt. (Sidharta Seethana via wangda) --- .../apache/hadoop/yarn/webapp/Dispatcher.java | 9 +++++ .../org/apache/hadoop/yarn/webapp/WebApp.java | 4 +- .../yarn/webapp/view/RobotsTextPage.java | 39 +++++++++++++++++++ .../apache/hadoop/yarn/webapp/TestWebApp.java | 26 +++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/RobotsTextPage.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/Dispatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/Dispatcher.java index 66dd21bbac..d519dbb4c0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/Dispatcher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/Dispatcher.java @@ -35,6 +35,7 @@ import org.apache.hadoop.http.HtmlQuoting; import org.apache.hadoop.yarn.webapp.Controller.RequestContext; import org.apache.hadoop.yarn.webapp.Router.Dest; import org.apache.hadoop.yarn.webapp.view.ErrorPage; +import org.apache.hadoop.yarn.webapp.view.RobotsTextPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,6 +118,14 @@ public class Dispatcher extends HttpServlet { } Controller.RequestContext rc = injector.getInstance(Controller.RequestContext.class); + + //short-circuit robots.txt serving for all YARN webapps. + if (uri.equals(RobotsTextPage.ROBOTS_TXT_PATH)) { + rc.setStatus(HttpServletResponse.SC_FOUND); + render(RobotsTextPage.class); + return; + } + if (setCookieParams(rc, req) > 0) { Cookie ec = rc.cookies().get(ERROR_COOKIE); if (ec != null) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java index 2c21d1b312..fe800f0852 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java @@ -29,6 +29,7 @@ import java.util.Map; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.http.HttpServer2; +import org.apache.hadoop.yarn.webapp.view.RobotsTextPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -158,7 +159,8 @@ public abstract class WebApp extends ServletModule { public void configureServlets() { setup(); - serve("/", "/__stop").with(Dispatcher.class); + serve("/", "/__stop", RobotsTextPage.ROBOTS_TXT_PATH) + .with(Dispatcher.class); for (String path : this.servePathSpecs) { serve(path).with(Dispatcher.class); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/RobotsTextPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/RobotsTextPage.java new file mode 100644 index 0000000000..b15d492d2f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/RobotsTextPage.java @@ -0,0 +1,39 @@ +/* + * * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * / + */ + +package org.apache.hadoop.yarn.webapp.view; + +/** + * Simple class that renders a robot.txt page that disallows crawling. + */ + +public class RobotsTextPage extends TextPage { + public static final String ROBOTS_TXT = "robots.txt"; + public static final String ROBOTS_TXT_PATH = "/" + ROBOTS_TXT; + + static final String USER_AGENT_LINE = "User-agent: *"; + static final String DISALLOW_LINE = "Disallow: /"; + + @Override + public void render() { + putWithoutEscapeHtml(USER_AGENT_LINE); + putWithoutEscapeHtml(DISALLOW_LINE); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/TestWebApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/TestWebApp.java index acec20524b..deef85590f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/TestWebApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/TestWebApp.java @@ -38,6 +38,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.yarn.MockApps; import org.apache.hadoop.yarn.webapp.view.HtmlPage; import org.apache.hadoop.yarn.webapp.view.JQueryUI; +import org.apache.hadoop.yarn.webapp.view.RobotsTextPage; import org.apache.hadoop.yarn.webapp.view.TextPage; import org.junit.Test; import org.slf4j.Logger; @@ -260,6 +261,31 @@ public class TestWebApp { } } + @Test public void testRobotsText() throws Exception { + WebApp app = + WebApps.$for("test", TestWebApp.class, this, "ws").start(new WebApp() { + @Override + public void setup() { + bind(MyTestJAXBContextResolver.class); + bind(MyTestWebService.class); + } + }); + String baseUrl = baseUrl(app); + try { + //using system line separator here since that is what + // TextView (via PrintWriter) seems to use. + String[] robotsTxtOutput = getContent(baseUrl + + RobotsTextPage.ROBOTS_TXT).trim().split(System.getProperty("line" + + ".separator")); + + assertEquals(2, robotsTxtOutput.length); + assertEquals("User-agent: *", robotsTxtOutput[0]); + assertEquals("Disallow: /", robotsTxtOutput[1]); + } finally { + app.stop(); + } + } + // This is to test the GuiceFilter should only be applied to webAppContext, // not to logContext; @Test public void testYARNWebAppContext() throws Exception { From 964e546ab1dba5f5d53b209ec6c9a70a85654765 Mon Sep 17 00:00:00 2001 From: Masatake Iwasaki Date: Wed, 21 Sep 2016 10:35:25 +0900 Subject: [PATCH 9/9] HDFS-9333. Some tests using MiniDFSCluster errored complaining port in use. (iwasakims) --- .../TestBlockTokenWithDFS.java | 8 ++++++- .../TestBlockTokenWithDFSStriped.java | 23 ++++++++++++++++++- .../tools/TestDFSZKFailoverController.java | 18 +++++++++++---- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java index e7e7739899..9374ae8efe 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java @@ -61,6 +61,7 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.net.ServerSocketUtil; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.test.GenericTestUtils; import org.apache.log4j.Level; @@ -349,7 +350,12 @@ public class TestBlockTokenWithDFS { Configuration conf = getConf(numDataNodes); try { - cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDataNodes).build(); + // prefer non-ephemeral port to avoid port collision on restartNameNode + cluster = new MiniDFSCluster.Builder(conf) + .nameNodePort(ServerSocketUtil.getPort(19820, 100)) + .nameNodeHttpPort(ServerSocketUtil.getPort(19870, 100)) + .numDataNodes(numDataNodes) + .build(); cluster.waitActive(); assertEquals(numDataNodes, cluster.getDataNodes().size()); doTestRead(conf, cluster, false); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFSStriped.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFSStriped.java index 64a48c234e..17145611cb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFSStriped.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFSStriped.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock; import org.apache.hadoop.hdfs.server.balancer.TestBalancer; import org.apache.hadoop.hdfs.util.StripedBlockUtil; +import org.apache.hadoop.net.ServerSocketUtil; import org.junit.Rule; import org.junit.Test; import org.junit.rules.Timeout; @@ -59,7 +60,27 @@ public class TestBlockTokenWithDFSStriped extends TestBlockTokenWithDFS { @Override public void testRead() throws Exception { conf = getConf(); - cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDNs).build(); + + /* + * prefer non-ephemeral port to avoid conflict with tests using + * ephemeral ports on MiniDFSCluster#restartDataNode(true). + */ + Configuration[] overlays = new Configuration[numDNs]; + for (int i = 0; i < overlays.length; i++) { + int offset = i * 10; + Configuration c = new Configuration(); + c.set(DFSConfigKeys.DFS_DATANODE_ADDRESS_KEY, "127.0.0.1:" + + ServerSocketUtil.getPort(19866 + offset, 100)); + c.set(DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_KEY, "127.0.0.1:" + + ServerSocketUtil.getPort(19867 + offset, 100)); + overlays[i] = c; + } + + cluster = new MiniDFSCluster.Builder(conf) + .nameNodePort(ServerSocketUtil.getPort(19820, 100)) + .nameNodeHttpPort(ServerSocketUtil.getPort(19870, 100)) + .numDataNodes(numDNs) + .build(); cluster.getFileSystem().getClient() .setErasureCodingPolicy("/", null); try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSZKFailoverController.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSZKFailoverController.java index dfdcf3483c..bbb787e1d7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSZKFailoverController.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSZKFailoverController.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream; import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; +import org.apache.hadoop.net.ServerSocketUtil; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread; @@ -75,14 +76,21 @@ public class TestDFSZKFailoverController extends ClientBaseWithFixes { conf.setInt( CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, 0); - - conf.setInt(DFSConfigKeys.DFS_HA_ZKFC_PORT_KEY + ".ns1.nn1", 10023); - conf.setInt(DFSConfigKeys.DFS_HA_ZKFC_PORT_KEY + ".ns1.nn2", 10024); + // Get random port numbers in advance. Because ZKFCs and DFSHAAdmin + // needs rpc port numbers of all ZKFCs, Setting 0 does not work here. + conf.setInt(DFSConfigKeys.DFS_HA_ZKFC_PORT_KEY + ".ns1.nn1", + ServerSocketUtil.getPort(10023, 100)); + conf.setInt(DFSConfigKeys.DFS_HA_ZKFC_PORT_KEY + ".ns1.nn2", + ServerSocketUtil.getPort(10024, 100)); + + // prefer non-ephemeral port to avoid port collision on restartNameNode MiniDFSNNTopology topology = new MiniDFSNNTopology() .addNameservice(new MiniDFSNNTopology.NSConf("ns1") - .addNN(new MiniDFSNNTopology.NNConf("nn1").setIpcPort(10021)) - .addNN(new MiniDFSNNTopology.NNConf("nn2").setIpcPort(10022))); + .addNN(new MiniDFSNNTopology.NNConf("nn1") + .setIpcPort(ServerSocketUtil.getPort(10021, 100))) + .addNN(new MiniDFSNNTopology.NNConf("nn2") + .setIpcPort(ServerSocketUtil.getPort(10022, 100)))); cluster = new MiniDFSCluster.Builder(conf) .nnTopology(topology) .numDataNodes(0)