From 2690d0db0e860a05b0e46c234a3f82a82178c6c7 Mon Sep 17 00:00:00 2001
From: Steve Loughran
Date: Sun, 23 Oct 2011 14:13:58 +0000
Subject: [PATCH] HDFS-2485
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1187887 13f79535-47bb-0310-9956-ffa450edef68
---
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 +
.../UnderReplicatedBlocks.java | 196 ++++++++++++++----
2 files changed, 153 insertions(+), 46 deletions(-)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index b32d67188c..148726bd0e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -841,6 +841,9 @@ Release 0.23.0 - Unreleased
HDFS-2471. Add federation documentation. (suresh)
+ HDFS-2485. Improve code layout and constants in UnderReplicatedBlocks
+ (stevel)
+
OPTIMIZATIONS
HDFS-1458. Improve checkpoint performance by avoiding unnecessary image
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java
index 7b39860d06..dc8d9e8db3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/UnderReplicatedBlocks.java
@@ -26,19 +26,66 @@
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
-/** Keep track of under replication blocks.
- * Blocks have replication priority, with priority 0 indicating the highest
- * Blocks have only one replicas has the highest
+/**
+ * Keep prioritized queues of under replicated blocks.
+ * Blocks have replication priority, with priority {@link #QUEUE_HIGHEST_PRIORITY}
+ * indicating the highest priority.
+ *
+ * Having a prioritized queue allows the {@link BlockManager} to select
+ * which blocks to replicate first; it tries to give priority to data
+ * that is most at risk or considered most valuable.
+ *
+ *
+ * The policy for choosing which priority to give added blocks
+ * is implemented in {@link #getPriority(Block, int, int, int)}.
+ *
+ * The queue order is as follows:
+ *
+ * - {@link #QUEUE_HIGHEST_PRIORITY}: the blocks that must be replicated
+ * first. That is blocks with only one copy, or blocks with zero live
+ * copies but a copy in a node being decommissioned. These blocks
+ * are at risk of loss if the disk or server on which they
+ * remain fails.
+ * - {@link #QUEUE_VERY_UNDER_REPLICATED}: blocks that are very
+ * under-replicated compared to their expected values. Currently
+ * that means the ratio of actual:expected replicas is
+ * less than 1:3. These blocks may not be at risk,
+ * but they are clearly considered "important".
+ * - {@link #QUEUE_UNDER_REPLICATED}: blocks that are also under
+ * replicated, and the ratio of actual:expected is good enough that
+ * they do not need to go into the {@link #QUEUE_VERY_UNDER_REPLICATED}
+ * queue.
+ * - {@link #QUEUE_REPLICAS_BADLY_DISTRIBUTED}: there are at least as
+ * many copies of a block as required, but the blocks are not adequately
+ * distributed. Loss of a rack/switch could take all copies off-line.
+ * - {@link #QUEUE_WITH_CORRUPT_BLOCKS}: this is for blocks that are corrupt
+ * and for which there are no non-corrupt copies (currently) available.
+ * The policy here is to keep those corrupt blocks replicated, but give
+ * blocks that are not corrupt higher priority.
+ *
*/
class UnderReplicatedBlocks implements Iterable {
+ /** The total number of queues : {@value} */
static final int LEVEL = 5;
+ /** The queue with the highest priority: {@value} */
+ static final int QUEUE_HIGHEST_PRIORITY = 0;
+ /** The queue for blocks that are way below their expected value : {@value} */
+ static final int QUEUE_VERY_UNDER_REPLICATED = 1;
+ /** The queue for "normally" under-replicated blocks: {@value} */
+ static final int QUEUE_UNDER_REPLICATED = 2;
+ /** The queue for blocks that have the right number of replicas,
+ * but which the block manager felt were badly distributed: {@value}
+ */
+ static final int QUEUE_REPLICAS_BADLY_DISTRIBUTED = 3;
+ /** The queue for corrupt blocks: {@value} */
static final int QUEUE_WITH_CORRUPT_BLOCKS = 4;
+ /** the queues themselves */
private final List> priorityQueues
- = new ArrayList>();
-
+ = new ArrayList>(LEVEL);
+
/** Create an object. */
UnderReplicatedBlocks() {
- for(int i=0; i());
}
}
@@ -47,7 +94,7 @@ class UnderReplicatedBlocks implements Iterable {
* Empty the queues.
*/
void clear() {
- for(int i=0; i set : priorityQueues) {
- if(set.contains(block)) { return true; }
+ for (NavigableSet set : priorityQueues) {
+ if (set.contains(block)) {
+ return true;
+ }
}
return false;
}
-
+
/** Return the priority of a block
- * @param block a under replication block
+ * @param block an under-replicated block
* @param curReplicas current number of replicas of the block
* @param expectedReplicas expected number of replicas of the block
+ * @return the priority for the blocks, between 0 and ({@link #LEVEL}-1)
*/
- private int getPriority(Block block,
+ private int getPriority(Block block,
int curReplicas,
int decommissionedReplicas,
int expectedReplicas) {
assert curReplicas >= 0 : "Negative replicas!";
if (curReplicas >= expectedReplicas) {
- return 3; // Block doesn't have enough racks
- } else if(curReplicas==0) {
- // If there are zero non-decommissioned replica but there are
+ // Block has enough copies, but not enough racks
+ return QUEUE_REPLICAS_BADLY_DISTRIBUTED;
+ } else if (curReplicas == 0) {
+ // If there are zero non-decommissioned replicas but there are
// some decommissioned replicas, then assign them highest priority
if (decommissionedReplicas > 0) {
- return 0;
+ return QUEUE_HIGHEST_PRIORITY;
}
- return QUEUE_WITH_CORRUPT_BLOCKS; // keep these blocks in needed replication.
- } else if(curReplicas==1) {
- return 0; // highest priority
- } else if(curReplicas*3Warning: This is not a synchronized method.
+ * @param block block to remove
+ * @param priLevel expected priority level
+ * @return true if the block was found and removed from one of the priority queues
+ */
boolean remove(Block block, int priLevel) {
if(priLevel >= 0 && priLevel < LEVEL
&& priorityQueues.get(priLevel).remove(block)) {
@@ -164,8 +238,8 @@ boolean remove(Block block, int priLevel) {
} else {
// Try to remove the block from all queues if the block was
// not found in the queue for the given priority level.
- for(int i=0; i {
private int level;
private boolean isIteratorForLevel = false;
private List> iterators = new ArrayList>();
+ /**
+ * Construct an iterator over all queues.
+ */
private BlockIterator() {
level=0;
for(int i=0; i