HDFS-7300. The getMaxNodesPerRack() method in
BlockPlacementPolicyDefault is flawed. Contributed by Kihwal Lee.
parent d33e07dc49
commit 3ae84e1ba8
@@ -1116,6 +1116,10 @@ Release 2.6.0 - UNRELEASED
     HDFS-7287. The OfflineImageViewer (OIV) can output invalid XML depending on
     the filename (Ravi Prakash via Colin P. McCabe)
 
+    HDFS-7300. The getMaxNodesPerRack() method in BlockPlacementPolicyDefault
+    is flawed (kihwal)
+
+
   BREAKDOWN OF HDFS-6584 ARCHIVAL STORAGE
 
     HDFS-6677. Change INodeFile and FSImage to support storage policy ID.
@@ -139,13 +139,17 @@ DatanodeStorageInfo[] chooseTarget(String src,
       List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>();
       boolean avoidStaleNodes = stats != null
           && stats.isAvoidingStaleDataNodesForWrite();
+
+      int maxNodesAndReplicas[] = getMaxNodesPerRack(0, numOfReplicas);
+      numOfReplicas = maxNodesAndReplicas[0];
+      int maxNodesPerRack = maxNodesAndReplicas[1];
+
       for (int i = 0; i < favoredNodes.size() && results.size() < numOfReplicas; i++) {
         DatanodeDescriptor favoredNode = favoredNodes.get(i);
         // Choose a single node which is local to favoredNode.
         // 'results' is updated within chooseLocalNode
         final DatanodeStorageInfo target = chooseLocalStorage(favoredNode,
-            favoriteAndExcludedNodes, blocksize,
-            getMaxNodesPerRack(results.size(), numOfReplicas)[1],
+            favoriteAndExcludedNodes, blocksize, maxNodesPerRack,
             results, avoidStaleNodes, storageTypes, false);
         if (target == null) {
           LOG.warn("Could not find a target for file " + src
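The flaw in this favored-nodes path: the old code recomputed the per-rack cap inside the loop from results.size() and discarded index 0 of the returned array, so an over-large replica request was never clamped. Below is a minimal standalone sketch of the hoisted pattern the patch adopts; the class and the local maxNodesPerRack() stand-in are hypothetical, mirroring the patched formula rather than calling the real BlockPlacementPolicyDefault.

    // Hypothetical sketch of the hoisted calling pattern: compute both caps
    // once with zero chosen nodes, clamp the replica count via index 0, and
    // reuse index 1 across the whole placement loop.
    public class FavoredNodesPatternSketch {
      // Local stand-in mirroring the patched method; not the real implementation.
      static int[] maxNodesPerRack(int chosen, int replicas, int clusterSize, int racks) {
        int total = chosen + replicas;
        if (total > clusterSize) {
          replicas -= (total - clusterSize);   // clamp to available nodes
          total = clusterSize;
        }
        if (racks == 1 || total <= 1) {
          return new int[] {replicas, total};
        }
        int cap = (total - 1) / racks + 2;
        return new int[] {replicas, cap == total ? cap - 1 : cap};
      }

      public static void main(String[] args) {
        int numOfReplicas = 5;                 // ask for more than the 4 nodes
        int[] caps = maxNodesPerRack(0, numOfReplicas, 4, 2);
        numOfReplicas = caps[0];               // clamped to 4
        int maxPerRack = caps[1];              // (4-1)/2 + 2 = 3
        System.out.println(numOfReplicas + " replicas, <= " + maxPerRack + "/rack");
      }
    }

With four nodes on two racks, a request for five replicas is clamped to four, and the cap of three per rack is computed once for the whole loop instead of drifting as results fills up.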
@@ -221,6 +225,19 @@ private DatanodeStorageInfo[] chooseTarget(int numOfReplicas,
         results.toArray(new DatanodeStorageInfo[results.size()]));
   }
 
+  /**
+   * Calculate the maximum number of replicas to allocate per rack. It also
+   * limits the total number of replicas to the total number of nodes in the
+   * cluster. Caller should adjust the replica count to the return value.
+   *
+   * @param numOfChosen The number of already chosen nodes.
+   * @param numOfReplicas The number of additional nodes to allocate.
+   * @return integer array. Index 0: The number of nodes allowed to allocate
+   *         in addition to already chosen nodes.
+   *         Index 1: The maximum allowed number of nodes per rack. This
+   *         is independent of the number of chosen nodes, as it is calculated
+   *         using the target number of replicas.
+   */
   private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
     int clusterSize = clusterMap.getNumOfLeaves();
     int totalNumOfReplicas = numOfChosen + numOfReplicas;
@@ -228,7 +245,26 @@ private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
       numOfReplicas -= (totalNumOfReplicas-clusterSize);
       totalNumOfReplicas = clusterSize;
     }
-    int maxNodesPerRack = (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
+    // No calculation needed when there is only one rack or picking one node.
+    int numOfRacks = clusterMap.getNumOfRacks();
+    if (numOfRacks == 1 || totalNumOfReplicas <= 1) {
+      return new int[] {numOfReplicas, totalNumOfReplicas};
+    }
+
+    int maxNodesPerRack = (totalNumOfReplicas-1)/numOfRacks + 2;
+    // At this point, there are more than one racks and more than one replicas
+    // to store. Avoid all replicas being in the same rack.
+    //
+    // maxNodesPerRack has the following properties at this stage.
+    // 1) maxNodesPerRack >= 2
+    // 2) (maxNodesPerRack-1) * numOfRacks > totalNumOfReplicas
+    //    when numOfRacks > 1
+    //
+    // Thus, the following adjustment will still result in a value that forces
+    // multi-rack allocation and gives enough number of total nodes.
+    if (maxNodesPerRack == totalNumOfReplicas) {
+      maxNodesPerRack--;
+    }
     return new int[] {numOfReplicas, maxNodesPerRack};
   }
 
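To see what the new short-circuit and decrement change in practice, here is a standalone sketch (hypothetical class, not part of the patch) contrasting the old single-formula cap with the patched one:

    // Contrast the old cap formula with the patched version above.
    public class MaxNodesPerRackDemo {
      static int oldCap(int totalReplicas, int numOfRacks) {
        return (totalReplicas - 1) / numOfRacks + 2;
      }

      static int newCap(int totalReplicas, int numOfRacks) {
        if (numOfRacks == 1 || totalReplicas <= 1) {
          return totalReplicas;        // trivial cases need no rack spreading
        }
        int cap = (totalReplicas - 1) / numOfRacks + 2;
        if (cap == totalReplicas) {
          cap--;                       // keep at least two racks in play
        }
        return cap;
      }

      public static void main(String[] args) {
        // Replication 3 on two racks: the old cap of 3 lets a single rack
        // hold every replica; the new cap of 2 forces multi-rack placement.
        System.out.println(oldCap(3, 2) + " vs " + newCap(3, 2)); // 3 vs 2
        // A single rack: the new code skips the formula entirely.
        System.out.println(newCap(3, 1)); // 3
      }
    }

The decrement is what guarantees multi-rack placement: whenever the formula's result equals the total replica count, one rack could otherwise absorb every replica.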
@@ -188,7 +188,7 @@ public void testAppendWithPipelineRecovery() throws Exception {
     try {
       cluster = new MiniDFSCluster.Builder(conf).manageDataDfsDirs(true)
           .manageNameDfsDirs(true).numDataNodes(4)
-          .racks(new String[] { "/rack1", "/rack1", "/rack1", "/rack2" })
+          .racks(new String[] { "/rack1", "/rack1", "/rack2", "/rack2" })
           .build();
       cluster.waitActive();
 
@@ -281,7 +281,8 @@ private void doTestOneOfTwoRacksDecommissioned(int testIndex) throws Exception {
     assertTrue("Source of replication should be one of the nodes the block " +
         "was on. Was: " + pipeline[0],
         origStorages.contains(pipeline[0]));
-    assertEquals("Should have three targets", 3, pipeline.length);
+    // Only up to two nodes can be picked per rack when there are two racks.
+    assertEquals("Should have two targets", 2, pipeline.length);
 
     boolean foundOneOnRackB = false;
     for (int i = 1; i < pipeline.length; i++) {
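The changed assertion follows from the new cap arithmetic, as echoed by the comment added above. A hypothetical standalone check of the numbers (not part of the patch):

    // With two racks and replication 3, the patched cap is two nodes per rack.
    public class TwoRackCapCheck {
      public static void main(String[] args) {
        int totalReplicas = 3, numOfRacks = 2;
        int cap = (totalReplicas - 1) / numOfRacks + 2; // (3-1)/2 + 2 = 3
        if (cap == totalReplicas) {
          cap--;                                        // 3 == 3, so cap = 2
        }
        System.out.println("max nodes per rack = " + cap); // 2
      }
    }

With at most two nodes allowed on the surviving rack, re-replication after one of the two racks is decommissioned can reach only two targets, hence the assertion on pipeline.length.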
@@ -431,6 +431,55 @@ public void testChooseTarget5() throws Exception {
     assertFalse(isOnSameRack(targets[0], targets[1]));
   }
 
+  /**
+   * In this testcase, there are enough total number of nodes, but only
+   * one rack is actually available.
+   * @throws Exception
+   */
+  @Test
+  public void testChooseTarget6() throws Exception {
+    DatanodeStorageInfo storage = DFSTestUtil.createDatanodeStorageInfo(
+        "DS-xxxx", "7.7.7.7", "/d2/r3", "host7");
+    DatanodeDescriptor newDn = storage.getDatanodeDescriptor();
+    Set<Node> excludedNodes;
+    List<DatanodeStorageInfo> chosenNodes = new ArrayList<DatanodeStorageInfo>();
+
+    excludedNodes = new HashSet<Node>();
+    excludedNodes.add(dataNodes[0]);
+    excludedNodes.add(dataNodes[1]);
+    excludedNodes.add(dataNodes[2]);
+    excludedNodes.add(dataNodes[3]);
+
+    DatanodeStorageInfo[] targets;
+    // Only two nodes available in a rack. Try picking two nodes. Only one
+    // should return.
+    targets = chooseTarget(2, chosenNodes, excludedNodes);
+    assertEquals(1, targets.length);
+
+    // Make three nodes available in a rack.
+    final BlockManager bm = namenode.getNamesystem().getBlockManager();
+    bm.getDatanodeManager().getNetworkTopology().add(newDn);
+    bm.getDatanodeManager().getHeartbeatManager().addDatanode(newDn);
+    updateHeartbeatWithUsage(newDn,
+        2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L,
+        2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, 0L, 0L, 0, 0);
+
+    // Try picking three nodes. Only two should return.
+    excludedNodes.clear();
+    excludedNodes.add(dataNodes[0]);
+    excludedNodes.add(dataNodes[1]);
+    excludedNodes.add(dataNodes[2]);
+    excludedNodes.add(dataNodes[3]);
+    chosenNodes.clear();
+    try {
+      targets = chooseTarget(3, chosenNodes, excludedNodes);
+      assertEquals(2, targets.length);
+    } finally {
+      bm.getDatanodeManager().getNetworkTopology().remove(newDn);
+    }
+  }
+
+
   /**
    * In this testcase, it tries to choose more targets than available nodes and
    * check the result, with stale node avoidance on the write path enabled.
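The two assertions in testChooseTarget6 can be checked with the same arithmetic. The figures below assume the test fixture spans three racks (the "/d2/r3" literal above suggests a multi-rack map); that is an assumption about the fixture, not something stated in this hunk:

    // Hypothetical check of the testChooseTarget6 assertions, reusing the
    // patched cap formula with an assumed three-rack topology.
    public class OneAvailableRackCheck {
      static int cap(int totalReplicas, int numOfRacks) {
        if (numOfRacks == 1 || totalReplicas <= 1) {
          return totalReplicas;
        }
        int c = (totalReplicas - 1) / numOfRacks + 2;
        return (c == totalReplicas) ? c - 1 : c;
      }

      public static void main(String[] args) {
        // Two replicas over three racks: cap = (2-1)/3 + 2 = 2 == 2 -> 1.
        System.out.println(cap(2, 3)); // 1: only one target from the one free rack
        // Three replicas: cap = (3-1)/3 + 2 = 2, so two targets can return.
        System.out.println(cap(3, 3)); // 2
      }
    }

Picking two replicas yields a cap of one per rack, hence the single target; after a third node joins the free rack, picking three yields a cap of two, hence two targets.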