From 2690d0db0e860a05b0e46c234a3f82a82178c6c7 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Sun, 23 Oct 2011 14:13:58 +0000 Subject: [PATCH] HDFS-2485 git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1187887 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../UnderReplicatedBlocks.java | 196 ++++++++++++++---- 2 files changed, 153 insertions(+), 46 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index b32d67188c..148726bd0e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -841,6 +841,9 @@ Release 0.23.0 - Unreleased HDFS-2471. Add federation documentation. (suresh) + HDFS-2485. Improve code layout and constants in UnderReplicatedBlocks + (stevel) + OPTIMIZATIONS HDFS-1458. Improve checkpoint performance by avoiding unnecessary image diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java index 7b39860d06..dc8d9e8db3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java @@ -26,19 +26,66 @@ import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.server.namenode.NameNode; -/** Keep track of under replication blocks. - * Blocks have replication priority, with priority 0 indicating the highest - * Blocks have only one replicas has the highest +/** + * Keep prioritized queues of under replicated blocks. + * Blocks have replication priority, with priority {@link #QUEUE_HIGHEST_PRIORITY} + * indicating the highest priority. + *

+ * Having a prioritized queue allows the {@link BlockManager} to select + * which blocks to replicate first; it tries to give priority to data + * that is most at risk or considered most valuable. + * + *

+ * The policy for choosing which priority to give added blocks + * is implemented in {@link #getPriority(Block, int, int, int)}. + *

+ *

The queue order is as follows:

+ *
    + *
  1. {@link #QUEUE_HIGHEST_PRIORITY}: the blocks that must be replicated + * first. That is, blocks with only one copy, or blocks with zero live + * copies but a copy in a node being decommissioned. These blocks + * are at risk of loss if the disk or server on which they + * remain fails.
  2. + *
  3. {@link #QUEUE_VERY_UNDER_REPLICATED}: blocks that are very + * under-replicated compared to their expected values. Currently + * that means the ratio of actual:expected replicas + * is less than 1:3.
  4. These blocks may not be at risk, + * but they are clearly considered "important". + *
  5. {@link #QUEUE_UNDER_REPLICATED}: blocks that are also under + * replicated, and the ratio of actual:expected is good enough that + * they do not need to go into the {@link #QUEUE_VERY_UNDER_REPLICATED} + * queue.
  6. + *
  7. {@link #QUEUE_REPLICAS_BADLY_DISTRIBUTED}: there are at least as + * many copies of a block as required, but the blocks are not adequately + * distributed. Loss of a rack/switch could take all copies off-line.
  8. + *
  9. {@link #QUEUE_WITH_CORRUPT_BLOCKS}: this is for blocks that are corrupt + * and for which there are no non-corrupt copies (currently) available. + * The policy here is to keep those corrupt blocks replicated, but give + * blocks that are not corrupt higher priority.
  10. + *
*/ class UnderReplicatedBlocks implements Iterable { + /** The total number of queues : {@value} */ static final int LEVEL = 5; + /** The queue with the highest priority: {@value} */ + static final int QUEUE_HIGHEST_PRIORITY = 0; + /** The queue for blocks that are way below their expected value : {@value} */ + static final int QUEUE_VERY_UNDER_REPLICATED = 1; + /** The queue for "normally" under-replicated blocks: {@value} */ + static final int QUEUE_UNDER_REPLICATED = 2; + /** The queue for blocks that have the right number of replicas, + * but which the block manager felt were badly distributed: {@value} + */ + static final int QUEUE_REPLICAS_BADLY_DISTRIBUTED = 3; + /** The queue for corrupt blocks: {@value} */ static final int QUEUE_WITH_CORRUPT_BLOCKS = 4; + /** the queues themselves */ private final List> priorityQueues - = new ArrayList>(); - + = new ArrayList>(LEVEL); + /** Create an object. */ UnderReplicatedBlocks() { - for(int i=0; i()); } } @@ -47,7 +94,7 @@ class UnderReplicatedBlocks implements Iterable { * Empty the queues. 
*/ void clear() { - for(int i=0; i set : priorityQueues) { - if(set.contains(block)) { return true; } + for (NavigableSet set : priorityQueues) { + if (set.contains(block)) { + return true; + } } return false; } - + /** Return the priority of a block - * @param block a under replication block + * @param block a under replicated block * @param curReplicas current number of replicas of the block * @param expectedReplicas expected number of replicas of the block + * @return the priority for the blocks, between 0 and ({@link #LEVEL}-1) */ - private int getPriority(Block block, + private int getPriority(Block block, int curReplicas, int decommissionedReplicas, int expectedReplicas) { assert curReplicas >= 0 : "Negative replicas!"; if (curReplicas >= expectedReplicas) { - return 3; // Block doesn't have enough racks - } else if(curReplicas==0) { - // If there are zero non-decommissioned replica but there are + // Block has enough copies, but not enough racks + return QUEUE_REPLICAS_BADLY_DISTRIBUTED; + } else if (curReplicas == 0) { + // If there are zero non-decommissioned replicas but there are // some decommissioned replicas, then assign them highest priority if (decommissionedReplicas > 0) { - return 0; + return QUEUE_HIGHEST_PRIORITY; } - return QUEUE_WITH_CORRUPT_BLOCKS; // keep these blocks in needed replication. - } else if(curReplicas==1) { - return 0; // highest priority - } else if(curReplicas*3Warning: This is not a synchronized method. + * @param block block to remove + * @param priLevel expected privilege level + * @return true if the block was found and removed from one of the priority queues + */ boolean remove(Block block, int priLevel) { if(priLevel >= 0 && priLevel < LEVEL && priorityQueues.get(priLevel).remove(block)) { @@ -164,8 +238,8 @@ boolean remove(Block block, int priLevel) { } else { // Try to remove the block from all queues if the block was // not found in the queue for the given priority level. 
- for(int i=0; i { private int level; private boolean isIteratorForLevel = false; private List> iterators = new ArrayList>(); + /** + * Construct an iterator over all queues. + */ private BlockIterator() { level=0; for(int i=0; i