HDFS-8461. Erasure coding: fix priority level of UnderReplicatedBlocks for striped block. Contributed by Walter Su.

Jing Zhao 2015-07-06 16:39:47 -07:00
parent ee01a09500
commit 2c494a8436
5 changed files with 218 additions and 63 deletions

View File: CHANGES-HDFS-EC-7285.txt

@@ -329,3 +329,6 @@
HDFS-8684. Erasure Coding: fix some block number calculation for striped
block. (yliu)
HDFS-8461. Erasure coding: fix priority level of UnderReplicatedBlocks for
striped block. (Walter Su via jing9)

View File: UnderReplicatedBlocks.java

@@ -34,7 +34,7 @@
*
* <p/>
* The policy for choosing which priority to give added blocks
* is implemented in {@link #getPriority(int, int, int)}.
* is implemented in {@link #getPriority(BlockInfo, int, int, int)}.
* </p>
* <p>The queue order is as follows:</p>
* <ol>
@@ -144,14 +144,28 @@ synchronized boolean contains(BlockInfo block) {
* @param expectedReplicas expected number of replicas of the block
* @return the priority for the blocks, between 0 and ({@link #LEVEL}-1)
*/
private int getPriority(int curReplicas,
private int getPriority(BlockInfo block,
int curReplicas,
int decommissionedReplicas,
int expectedReplicas) {
assert curReplicas >= 0 : "Negative replicas!";
if (curReplicas >= expectedReplicas) {
// Block has enough copies, but not enough racks
return QUEUE_REPLICAS_BADLY_DISTRIBUTED;
} else if (curReplicas == 0) {
}
if (block.isStriped()) {
BlockInfoStriped sblk = (BlockInfoStriped) block;
return getPriorityStriped(curReplicas, decommissionedReplicas,
sblk.getRealDataBlockNum(), sblk.getParityBlockNum());
} else {
return getPriorityContiguous(curReplicas, decommissionedReplicas,
expectedReplicas);
}
}
private int getPriorityContiguous(int curReplicas, int decommissionedReplicas,
int expectedReplicas) {
if (curReplicas == 0) {
// If there are zero non-decommissioned replicas but there are
// some decommissioned replicas, then assign them highest priority
if (decommissionedReplicas > 0) {
@@ -160,7 +174,7 @@ private int getPriority(int curReplicas,
//all we have are corrupt blocks
return QUEUE_WITH_CORRUPT_BLOCKS;
} else if (curReplicas == 1) {
//only on replica -risk of loss
// only one replica, highest risk of loss
// highest priority
return QUEUE_HIGHEST_PRIORITY;
} else if ((curReplicas * 3) < expectedReplicas) {
@@ -173,6 +187,27 @@ private int getPriority(int curReplicas,
}
}
private int getPriorityStriped(int curReplicas, int decommissionedReplicas,
short dataBlkNum, short parityBlkNum) {
if (curReplicas < dataBlkNum) {
// There are some replicas on decommissioned nodes so it's not corrupted
if (curReplicas + decommissionedReplicas >= dataBlkNum) {
return QUEUE_HIGHEST_PRIORITY;
}
return QUEUE_WITH_CORRUPT_BLOCKS;
} else if (curReplicas == dataBlkNum) {
// highest risk of loss, highest priority
return QUEUE_HIGHEST_PRIORITY;
} else if ((curReplicas - dataBlkNum) * 3 < parityBlkNum + 1) {
// can only afford one replica loss
// this is considered very under-replicated
return QUEUE_VERY_UNDER_REPLICATED;
} else {
// add to the normal queue for under replicated blocks
return QUEUE_UNDER_REPLICATED;
}
}
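
To see what the new striped policy does in practice, here is a minimal standalone sketch of the logic above, with the queue constants inlined (values assumed to mirror UnderReplicatedBlocks, where 0 is the most urgent level) and a worked run for the default RS-6-3 schema. StripedPriorityDemo is an illustrative name, not part of the patch.

public class StripedPriorityDemo {
  // Queue levels, assumed to match UnderReplicatedBlocks (0 = most urgent).
  static final int QUEUE_HIGHEST_PRIORITY = 0;
  static final int QUEUE_VERY_UNDER_REPLICATED = 1;
  static final int QUEUE_UNDER_REPLICATED = 2;
  static final int QUEUE_WITH_CORRUPT_BLOCKS = 4;

  static int getPriorityStriped(int curReplicas, int decommissionedReplicas,
      short dataBlkNum, short parityBlkNum) {
    if (curReplicas < dataBlkNum) {
      // Fewer live internal blocks than data blocks: the group is only
      // still readable if the missing pieces sit on decommissioned nodes.
      return curReplicas + decommissionedReplicas >= dataBlkNum
          ? QUEUE_HIGHEST_PRIORITY : QUEUE_WITH_CORRUPT_BLOCKS;
    } else if (curReplicas == dataBlkNum) {
      // Exactly decodable: one more loss makes the group unreadable.
      return QUEUE_HIGHEST_PRIORITY;
    } else if ((curReplicas - dataBlkNum) * 3 < parityBlkNum + 1) {
      // Very little slack left relative to the parity budget.
      return QUEUE_VERY_UNDER_REPLICATED;
    } else {
      return QUEUE_UNDER_REPLICATED;
    }
  }

  public static void main(String[] args) {
    short data = 6, parity = 3; // RS-6-3, the system default schema
    // 5 live -> 4 (corrupt, nothing decommissioned), 6 -> 0 (highest),
    // 7 -> 1 since (7-6)*3 = 3 < 4, 8 -> 2 since (8-6)*3 = 6 >= 4.
    for (int live : new int[] {5, 6, 7, 8}) {
      System.out.println(live + " live -> queue "
          + getPriorityStriped(live, 0, data, parity));
    }
  }
}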
/** add a block to a under replication queue according to its priority
* @param block a under replication block
* @param curReplicas current number of replicas of the block
@@ -185,7 +220,7 @@ synchronized boolean add(BlockInfo block,
int decomissionedReplicas,
int expectedReplicas) {
assert curReplicas >= 0 : "Negative replicas!";
int priLevel = getPriority(curReplicas, decomissionedReplicas,
int priLevel = getPriority(block, curReplicas, decomissionedReplicas,
expectedReplicas);
if(priorityQueues.get(priLevel).add(block)) {
if (priLevel == QUEUE_WITH_CORRUPT_BLOCKS &&
@@ -208,7 +243,7 @@ synchronized boolean remove(BlockInfo block,
int oldReplicas,
int decommissionedReplicas,
int oldExpectedReplicas) {
int priLevel = getPriority(oldReplicas,
int priLevel = getPriority(block, oldReplicas,
decommissionedReplicas,
oldExpectedReplicas);
boolean removedBlock = remove(block, priLevel);
@@ -282,8 +317,10 @@ synchronized void update(BlockInfo block, int curReplicas,
int curReplicasDelta, int expectedReplicasDelta) {
int oldReplicas = curReplicas-curReplicasDelta;
int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta;
int curPri = getPriority(curReplicas, decommissionedReplicas, curExpectedReplicas);
int oldPri = getPriority(oldReplicas, decommissionedReplicas, oldExpectedReplicas);
int curPri = getPriority(block, curReplicas, decommissionedReplicas,
curExpectedReplicas);
int oldPri = getPriority(block, oldReplicas, decommissionedReplicas,
oldExpectedReplicas);
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("UnderReplicationBlocks.update " +
block +

View File: BlockManagerTestUtil.java

@@ -306,4 +306,12 @@ public static void recheckDecommissionState(DatanodeManager dm)
throws ExecutionException, InterruptedException {
dm.getDecomManager().runMonitor();
}
/**
* add block to the replicateBlocks queue of the Datanode
*/
public static void addBlockToBeReplicated(DatanodeDescriptor node,
Block block, DatanodeStorageInfo[] targets) {
node.addBlockToBeReplicated(block, targets);
}
}
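
This helper exists so the recovery test further down can saturate a datanode's replication queue: once a node holds more than maxReplicationStreams pending replications it counts as busy, and the block manager will skip it as a recovery source unless the block sits in the highest-priority queue. A sketch of the intended usage, where busyNode, storageInfos, and maxReplicationStreams stand in for the test's own locals:

for (int j = 0; j < maxReplicationStreams + 1; j++) {
  BlockManagerTestUtil.addBlockToBeReplicated(busyNode, new Block(j),
      new DatanodeStorageInfo[]{storageInfos[0]});
}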

View File: TestUnderReplicatedBlockQueues.java

@@ -19,6 +19,9 @@
package org.apache.hadoop.hdfs.server.blockmanagement;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.ErasureCodingSchemaManager;
import org.apache.hadoop.io.erasurecode.ECSchema;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
@@ -28,10 +31,21 @@
public class TestUnderReplicatedBlockQueues {
private final ECSchema ecSchema =
ErasureCodingSchemaManager.getSystemDefaultSchema();
private final int CELLSIZE = HdfsConstants.BLOCK_STRIPED_CELL_SIZE;
private BlockInfo genBlockInfo(long id) {
return new BlockInfoContiguous(new Block(id), (short) 3);
}
private BlockInfo genStripedBlockInfo(long id, long numBytes) {
BlockInfoStriped sblk = new BlockInfoStriped(new Block(id), ecSchema,
CELLSIZE);
sblk.setNumBytes(numBytes);
return sblk;
}
/**
* Test that adding blocks with different replication counts puts them
* into different queues
@@ -85,6 +99,54 @@ public void testBlockPriorities() throws Throwable {
assertEquals(2, queues.getCorruptReplOneBlockSize());
}
@Test
public void testStripedBlockPriorities() throws Throwable {
int dataBlkNum = ecSchema.getNumDataUnits();
int parityBlkNUm = ecSchema.getNumParityUnits();
doTestStripedBlockPriorities(1, parityBlkNUm);
doTestStripedBlockPriorities(dataBlkNum, parityBlkNUm);
}
private void doTestStripedBlockPriorities(int dataBlkNum, int parityBlkNum)
throws Throwable {
int groupSize = dataBlkNum + parityBlkNum;
long numBytes = CELLSIZE * dataBlkNum;
UnderReplicatedBlocks queues = new UnderReplicatedBlocks();
// add a striped block which been left NUM_DATA_BLOCKS internal blocks
BlockInfo block1 = genStripedBlockInfo(-100, numBytes);
assertAdded(queues, block1, dataBlkNum, 0, groupSize);
assertEquals(1, queues.getUnderReplicatedBlockCount());
assertEquals(1, queues.size());
assertInLevel(queues, block1, UnderReplicatedBlocks.QUEUE_HIGHEST_PRIORITY);
// add a striped block which been left NUM_DATA_BLOCKS+1 internal blocks
BlockInfo block2 = genStripedBlockInfo(-200, numBytes);
assertAdded(queues, block2, dataBlkNum + 1, 0, groupSize);
assertEquals(2, queues.getUnderReplicatedBlockCount());
assertEquals(2, queues.size());
assertInLevel(queues, block2,
UnderReplicatedBlocks.QUEUE_VERY_UNDER_REPLICATED);
// add a striped block which been left NUM_DATA_BLOCKS+2 internal blocks
BlockInfo block3 = genStripedBlockInfo(-300, numBytes);
assertAdded(queues, block3, dataBlkNum + 2, 0, groupSize);
assertEquals(3, queues.getUnderReplicatedBlockCount());
assertEquals(3, queues.size());
assertInLevel(queues, block3,
UnderReplicatedBlocks.QUEUE_UNDER_REPLICATED);
// add a corrupted block
BlockInfo block_corrupt = genStripedBlockInfo(-400, numBytes);
assertEquals(0, queues.getCorruptBlockSize());
assertAdded(queues, block_corrupt, dataBlkNum - 1, 0, groupSize);
assertEquals(4, queues.size());
assertEquals(3, queues.getUnderReplicatedBlockCount());
assertEquals(1, queues.getCorruptBlockSize());
assertInLevel(queues, block_corrupt,
UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS);
}
private void assertAdded(UnderReplicatedBlocks queues,
BlockInfo block,
int curReplicas,

View File: TestRecoverStripedBlocks.java

@@ -23,7 +23,7 @@
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoStriped;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
@@ -32,29 +32,26 @@
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.protocol.BlockECRecoveryCommand.BlockECRecoveryInfo;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import static org.apache.hadoop.hdfs.protocol.HdfsConstants.BLOCK_STRIPED_CELL_SIZE;
import static org.apache.hadoop.hdfs.protocol.HdfsConstants.NUM_DATA_BLOCKS;
import static org.apache.hadoop.hdfs.protocol.HdfsConstants.NUM_PARITY_BLOCKS;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class TestRecoverStripedBlocks {
private final short GROUP_SIZE =
NUM_DATA_BLOCKS + HdfsConstants.NUM_PARITY_BLOCKS;
NUM_DATA_BLOCKS + NUM_PARITY_BLOCKS;
private MiniDFSCluster cluster;
private final Path dirPath = new Path("/dir");
private Path filePath = new Path(dirPath, "file");
private int maxReplicationStreams =
DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT;
@Before
public void setup() throws IOException {
final Configuration conf = new HdfsConfiguration();
private void initConf(Configuration conf) {
// Large value to make sure the pending replication request can stay in
// DatanodeDescriptor.replicateBlocks before test timeout.
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 100);
@@ -62,63 +59,111 @@ public void setup() throws IOException {
// chooseUnderReplicatedBlocks at once.
conf.setInt(
DFSConfigKeys.DFS_NAMENODE_REPLICATION_WORK_MULTIPLIER_PER_ITERATION, 5);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(GROUP_SIZE + 1)
.build();
cluster.waitActive();
}
@After
public void tearDown() throws Exception {
if (cluster != null) {
cluster.shutdown();
}
}
@Test
public void testMissingStripedBlock() throws Exception {
final int numBlocks = 4;
DFSTestUtil.createStripedFile(cluster, filePath,
dirPath, numBlocks, 1, true);
doTestMissingStripedBlock(1, 0);
}
// make sure the file is complete in NN
final INodeFile fileNode = cluster.getNamesystem().getFSDirectory()
.getINode4Write(filePath.toString()).asFile();
assertFalse(fileNode.isUnderConstruction());
assertTrue(fileNode.isStriped());
BlockInfo[] blocks = fileNode.getBlocks();
assertEquals(numBlocks, blocks.length);
for (BlockInfo blk : blocks) {
assertTrue(blk.isStriped());
assertTrue(blk.isComplete());
assertEquals(BLOCK_STRIPED_CELL_SIZE * NUM_DATA_BLOCKS, blk.getNumBytes());
final BlockInfoStriped sb = (BlockInfoStriped) blk;
assertEquals(GROUP_SIZE, sb.numNodes());
}
@Test
public void testMissingStripedBlockWithBusyNode1() throws Exception {
doTestMissingStripedBlock(2, 1);
}
final BlockManager bm = cluster.getNamesystem().getBlockManager();
BlockInfo firstBlock = fileNode.getBlocks()[0];
DatanodeStorageInfo[] storageInfos = bm.getStorages(firstBlock);
@Test
public void testMissingStripedBlockWithBusyNode2() throws Exception {
doTestMissingStripedBlock(3, 1);
}
DatanodeDescriptor secondDn = storageInfos[1].getDatanodeDescriptor();
assertEquals(numBlocks, secondDn.numBlocks());
/**
* Start GROUP_SIZE + 1 datanodes.
* Inject striped blocks to first GROUP_SIZE datanodes.
* Then make numOfBusy datanodes busy, make numOfMissed datanodes missed.
* Then trigger BlockManager to compute recovery works. (so all recovery work
* will be scheduled to the last datanode)
* Finally, verify the recovery work of the last datanode.
*/
private void doTestMissingStripedBlock(int numOfMissed, int numOfBusy)
throws Exception {
Configuration conf = new HdfsConfiguration();
initConf(conf);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(GROUP_SIZE + 1)
.build();
bm.getDatanodeManager().removeDatanode(secondDn);
try {
cluster.waitActive();
final int numBlocks = 4;
DFSTestUtil.createStripedFile(cluster, filePath,
dirPath, numBlocks, 1, true);
// all blocks will be located at first GROUP_SIZE DNs, the last DN is
// empty because of the util function createStripedFile
BlockManagerTestUtil.getComputedDatanodeWork(bm);
// make sure the file is complete in NN
final INodeFile fileNode = cluster.getNamesystem().getFSDirectory()
.getINode4Write(filePath.toString()).asFile();
assertFalse(fileNode.isUnderConstruction());
assertTrue(fileNode.isStriped());
BlockInfo[] blocks = fileNode.getBlocks();
assertEquals(numBlocks, blocks.length);
for (BlockInfo blk : blocks) {
assertTrue(blk.isStriped());
assertTrue(blk.isComplete());
assertEquals(BLOCK_STRIPED_CELL_SIZE * NUM_DATA_BLOCKS,
blk.getNumBytes());
final BlockInfoStriped sb = (BlockInfoStriped) blk;
assertEquals(GROUP_SIZE, sb.numNodes());
}
// all the recovery work will be scheduled on the last DN
DataNode lastDn = cluster.getDataNodes().get(GROUP_SIZE);
DatanodeDescriptor last =
final BlockManager bm = cluster.getNamesystem().getBlockManager();
BlockInfo firstBlock = fileNode.getBlocks()[0];
DatanodeStorageInfo[] storageInfos = bm.getStorages(firstBlock);
// make numOfBusy nodes busy
int i = 0;
for (; i < numOfBusy; i++) {
DatanodeDescriptor busyNode = storageInfos[i].getDatanodeDescriptor();
for (int j = 0; j < maxReplicationStreams + 1; j++) {
BlockManagerTestUtil.addBlockToBeReplicated(busyNode, new Block(j),
new DatanodeStorageInfo[]{storageInfos[0]});
}
}
// make numOfMissed internal blocks missed
for (; i < numOfBusy + numOfMissed; i++) {
DatanodeDescriptor missedNode = storageInfos[i].getDatanodeDescriptor();
assertEquals(numBlocks, missedNode.numBlocks());
bm.getDatanodeManager().removeDatanode(missedNode);
}
BlockManagerTestUtil.getComputedDatanodeWork(bm);
// all the recovery work will be scheduled on the last DN
DataNode lastDn = cluster.getDataNodes().get(GROUP_SIZE);
DatanodeDescriptor last =
bm.getDatanodeManager().getDatanode(lastDn.getDatanodeId());
assertEquals("Counting the number of outstanding EC tasks", numBlocks,
last.getNumberOfBlocksToBeErasureCoded());
List<BlockECRecoveryInfo> recovery = last.getErasureCodeCommand(numBlocks);
for (BlockECRecoveryInfo info : recovery) {
assertEquals(1, info.getTargetDnInfos().length);
assertEquals(last, info.getTargetDnInfos()[0]);
assertEquals(GROUP_SIZE - 1, info.getSourceDnInfos().length);
assertEquals(GROUP_SIZE - 1, info.getLiveBlockIndices().length);
assertEquals("Counting the number of outstanding EC tasks", numBlocks,
last.getNumberOfBlocksToBeErasureCoded());
List<BlockECRecoveryInfo> recovery =
last.getErasureCodeCommand(numBlocks);
for (BlockECRecoveryInfo info : recovery) {
assertEquals(1, info.getTargetDnInfos().length);
assertEquals(last, info.getTargetDnInfos()[0]);
assertEquals(info.getSourceDnInfos().length,
info.getLiveBlockIndices().length);
if (GROUP_SIZE - numOfMissed == NUM_DATA_BLOCKS) {
// It's a QUEUE_HIGHEST_PRIORITY block, so the busy DNs will be chosen
// to make sure we have NUM_DATA_BLOCKS DNs to do recovery work.
assertEquals(NUM_DATA_BLOCKS, info.getSourceDnInfos().length);
} else {
// The block has no highest priority, so we don't use the busy DNs as
// sources
assertEquals(GROUP_SIZE - numOfMissed - numOfBusy,
info.getSourceDnInfos().length);
}
}
} finally {
cluster.shutdown();
}
}
}