HDFS-14882. Consider DataNode load when #getBlockLocation. Contributed by Xiaoqiao He.

Signed-off-by: Wei-Chiu Chuang <weichiu@apache.org>

Reviewed-by: Inigo Goiri <inigoiri@apache.org>
Reviewed-by: Istvan Fajth <pifta@cloudera.com>
This commit is contained in:
He Xiaoqiao 2019-11-15 12:15:33 -08:00 committed by Wei-Chiu Chuang
parent 9f0610fb83
commit c892a879dd
6 changed files with 199 additions and 26 deletions

View File

@ -31,6 +31,7 @@ import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Consumer;
/** The class represents a cluster of computer with a tree hierarchical
* network topology.
@ -874,11 +875,33 @@ public class NetworkTopology {
* This method is called if the reader is a datanode,
* so nonDataNodeReader flag is set to false.
*/
sortByDistance(reader, nodes, activeLen, false);
sortByDistance(reader, nodes, activeLen, list -> Collections.shuffle(list));
}
/**
* Sort nodes array by network distance to <i>reader</i>.
* Sort nodes array by network distance to <i>reader</i> with secondary sort.
* <p>
* In a three-level topology, a node can be either local, on the same rack,
* or on a different rack from the reader. Sorting the nodes based on network
* distance from the reader reduces network traffic and improves
* performance.
* <p>
* As an additional twist, we also randomize the nodes at each network
* distance. This helps with load balancing when there is data skew.
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param activeLen Number of active nodes at the front of the array
* @param secondarySort a secondary sorting strategy which can inject into
* that point from outside to help sort the same distance.
*/
public <T extends Node> void sortByDistance(Node reader, T[] nodes,
int activeLen, Consumer<List<T>> secondarySort){
sortByDistance(reader, nodes, activeLen, secondarySort, false);
}
/**
* Sort nodes array by network distance to <i>reader</i> with secondary sort.
* <p> using network location. This is used when the reader
* is not a datanode. Sorting the nodes based on network distance
* from the reader reduces network traffic and improves
@ -895,7 +918,27 @@ public class NetworkTopology {
* This method is called if the reader is not a datanode,
* so nonDataNodeReader flag is set to true.
*/
sortByDistance(reader, nodes, activeLen, true);
sortByDistanceUsingNetworkLocation(reader, nodes, activeLen,
list -> Collections.shuffle(list));
}
/**
* Sort nodes array by network distance to <i>reader</i>.
* <p> using network location. This is used when the reader
* is not a datanode. Sorting the nodes based on network distance
* from the reader reduces network traffic and improves
* performance.
* <p>
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param activeLen Number of active nodes at the front of the array
* @param secondarySort a secondary sorting strategy which can inject into
* that point from outside to help sort the same distance.
*/
public <T extends Node> void sortByDistanceUsingNetworkLocation(Node reader,
T[] nodes, int activeLen, Consumer<List<T>> secondarySort) {
sortByDistance(reader, nodes, activeLen, secondarySort, true);
}
/**
@ -909,7 +952,8 @@ public class NetworkTopology {
* @param activeLen Number of active nodes at the front of the array
* @param nonDataNodeReader True if the reader is not a datanode
*/
private void sortByDistance(Node reader, Node[] nodes, int activeLen,
private <T extends Node> void sortByDistance(Node reader, T[] nodes,
int activeLen, Consumer<List<T>> secondarySort,
boolean nonDataNodeReader) {
/** Sort weights for the nodes array */
int[] weights = new int[activeLen];
@ -921,23 +965,23 @@ public class NetworkTopology {
}
}
// Add weight/node pairs to a TreeMap to sort
TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
TreeMap<Integer, List<T>> tree = new TreeMap<>();
for (int i=0; i<activeLen; i++) {
int weight = weights[i];
Node node = nodes[i];
List<Node> list = tree.get(weight);
T node = nodes[i];
List<T> list = tree.get(weight);
if (list == null) {
list = Lists.newArrayListWithExpectedSize(1);
tree.put(weight, list);
}
list.add(node);
}
// Sort nodes which have the same weight using secondarySort.
int idx = 0;
for (List<Node> list: tree.values()) {
for (List<T> list: tree.values()) {
if (list != null) {
Collections.shuffle(list, r);
for (Node n: list) {
secondarySort.accept(list);
for (T n: list) {
nodes[idx] = n;
idx++;
}

View File

@ -230,6 +230,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
HdfsClientConfigKeys.DeprecatedKeys.DFS_NAMENODE_REDUNDANCY_CONSIDERLOAD_KEY;
public static final boolean DFS_NAMENODE_REDUNDANCY_CONSIDERLOAD_DEFAULT =
true;
public static final String DFS_NAMENODE_READ_CONSIDERLOAD_KEY =
"dfs.namenode.read.considerLoad";
public static final boolean DFS_NAMENODE_READ_CONSIDERLOAD_DEFAULT =
false;
public static final String DFS_NAMENODE_REDUNDANCY_CONSIDERLOAD_FACTOR =
"dfs.namenode.redundancy.considerLoad.factor";
public static final double

View File

@ -30,7 +30,6 @@ import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
@ -66,6 +65,7 @@ import java.net.UnknownHostException;
import java.util.*;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
/**
* Manage datanodes, include decommission and other activities.
@ -134,6 +134,9 @@ public class DatanodeManager {
/** Whether or not to avoid using stale DataNodes for reading */
private final boolean avoidStaleDataNodesForRead;
/** Whether or not to consider lad for reading. */
private final boolean readConsiderLoad;
/**
* Whether or not to avoid using stale DataNodes for writing.
* Note that, even if this is configured, the policy may be
@ -314,6 +317,9 @@ public class DatanodeManager {
this.avoidStaleDataNodesForRead = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY,
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_DEFAULT);
this.readConsiderLoad = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_READ_CONSIDERLOAD_KEY,
DFSConfigKeys.DFS_NAMENODE_READ_CONSIDERLOAD_DEFAULT);
this.avoidStaleDataNodesForWrite = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY,
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT);
@ -530,9 +536,10 @@ public class DatanodeManager {
int activeLen = lastActiveIndex + 1;
if(nonDatanodeReader) {
networktopology.sortByDistanceUsingNetworkLocation(client,
lb.getLocations(), activeLen);
lb.getLocations(), activeLen, createSecondaryNodeSorter());
} else {
networktopology.sortByDistance(client, lb.getLocations(), activeLen);
networktopology.sortByDistance(client, lb.getLocations(), activeLen,
createSecondaryNodeSorter());
}
// move PROVIDED storage to the end to prefer local replicas.
lb.moveProvidedToEnd(activeLen);
@ -540,6 +547,17 @@ public class DatanodeManager {
lb.updateCachedStorageInfo();
}
private Consumer<List<DatanodeInfo>> createSecondaryNodeSorter() {
Consumer<List<DatanodeInfo>> secondarySort =
list -> Collections.shuffle(list);
if (readConsiderLoad) {
Comparator<DatanodeInfo> comp =
Comparator.comparingInt(DatanodeInfo::getXceiverCount);
secondarySort = list -> Collections.sort(list, comp);
}
return secondarySort;
}
/** @return the datanode descriptor for the host. */
public DatanodeDescriptor getDatanodeByHost(final String host) {
return host2DatanodeMap.getDatanodeByHost(host);

View File

@ -307,7 +307,9 @@
<property>
<name>dfs.namenode.redundancy.considerLoad</name>
<value>true</value>
<description>Decide if chooseTarget considers the target's load or not
<description>
Decide if chooseTarget considers the target's load or not when write.
Turn on by default.
</description>
</property>
@ -319,6 +321,15 @@
</description>
</property>
<property>
<name>dfs.namenode.read.considerLoad</name>
<value>false</value>
<description>
Decide if sort block locations considers the target's load or not when read.
Turn off by default.
</description>
</property>
<property>
<name>dfs.datanode.httpserver.filter.handlers</name>
<value>org.apache.hadoop.hdfs.server.datanode.web.RestCsrfPreventionFilterHandler</value>

View File

@ -518,6 +518,88 @@ public class TestDatanodeManager {
assertEquals(locs[4].getIpAddr(), sortedLocs2[0].getIpAddr());
}
@Test
public void testGetBlockLocationConsiderLoad()
throws IOException, URISyntaxException {
Configuration conf = new Configuration();
conf.setBoolean(
DFSConfigKeys.DFS_NAMENODE_READ_CONSIDERLOAD_KEY, true);
conf.setBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY, true);
FSNamesystem fsn = Mockito.mock(FSNamesystem.class);
Mockito.when(fsn.hasWriteLock()).thenReturn(true);
URL shellScript = getClass().getResource(
"/" + Shell.appendScriptExtension("topology-script"));
Path resourcePath = Paths.get(shellScript.toURI());
FileUtil.setExecutable(resourcePath.toFile(), true);
conf.set(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY,
resourcePath.toString());
DatanodeManager dm = mockDatanodeManager(fsn, conf);
int totalDNs = 5;
// Register 5 datanodes and 2 nodes per rack with different load.
DatanodeInfo[] locs = new DatanodeInfo[totalDNs];
String[] storageIDs = new String[totalDNs];
for (int i = 0; i < totalDNs; i++) {
// Register new datanode.
String uuid = "UUID-" + i;
String ip = "IP-" + i / 2 + "-" + i;
DatanodeRegistration dr = Mockito.mock(DatanodeRegistration.class);
Mockito.when(dr.getDatanodeUuid()).thenReturn(uuid);
Mockito.when(dr.getIpAddr()).thenReturn(ip);
dm.registerDatanode(dr);
// Get location and storage information.
locs[i] = dm.getDatanode(uuid);
storageIDs[i] = "storageID-" + i;
// Set load for datanodes.
locs[i].setXceiverCount(i);
}
// Set node 0 decommissioned.
locs[0].setDecommissioned();
// Create LocatedBlock with above locations.
ExtendedBlock b = new ExtendedBlock("somePoolID", 1234);
LocatedBlock block = new LocatedBlock(b, locs);
List<LocatedBlock> blocks = new ArrayList<>();
blocks.add(block);
// Test client located at locs[3] in cluster.
final String targetIpInCluster = locs[3].getIpAddr();
dm.sortLocatedBlocks(targetIpInCluster, blocks);
DatanodeInfo[] sortedLocs = block.getLocations();
assertEquals(totalDNs, sortedLocs.length);
// Ensure the local node is first.
assertEquals(targetIpInCluster, sortedLocs[0].getIpAddr());
// Ensure the lightweight node is more close when distance is same.
assertEquals(locs[3].getIpAddr(), sortedLocs[0].getIpAddr());
assertEquals(locs[2].getIpAddr(), sortedLocs[1].getIpAddr());
assertEquals(locs[1].getIpAddr(), sortedLocs[2].getIpAddr());
assertEquals(locs[4].getIpAddr(), sortedLocs[3].getIpAddr());
// Ensure the two decommissioned DNs were moved to the end.
assertThat(sortedLocs[4].getAdminState(),
is(DatanodeInfo.AdminStates.DECOMMISSIONED));
assertEquals(locs[0].getIpAddr(), sortedLocs[4].getIpAddr());
// Test client not in cluster but same rack with locs[3].
final String targetIpNotInCluster = locs[3].getIpAddr() + "-client";
dm.sortLocatedBlocks(targetIpNotInCluster, blocks);
DatanodeInfo[] sortedLocs2 = block.getLocations();
assertEquals(totalDNs, sortedLocs2.length);
// Ensure the local rack is first and lightweight node is first
// when distance is same.
assertEquals(locs[2].getIpAddr(), sortedLocs2[0].getIpAddr());
assertEquals(locs[3].getIpAddr(), sortedLocs2[1].getIpAddr());
assertEquals(locs[1].getIpAddr(), sortedLocs2[2].getIpAddr());
assertEquals(locs[4].getIpAddr(), sortedLocs2[3].getIpAddr());
// Ensure the two decommissioned DNs were moved to the end.
assertThat(sortedLocs[4].getAdminState(),
is(DatanodeInfo.AdminStates.DECOMMISSIONED));
assertEquals(locs[0].getIpAddr(), sortedLocs2[4].getIpAddr());
}
/**
* Test whether removing a host from the includes list without adding it to
* the excludes list will exclude it from data node reports.

View File

@ -241,10 +241,16 @@ public class TestNetworkTopology {
cluster.setRandomSeed(0xDEADBEEF);
cluster.sortByDistance(dataNodes[8], dtestNodes, dtestNodes.length - 2);
assertTrue(dtestNodes[0] == dataNodes[8]);
assertTrue(dtestNodes[1] == dataNodes[11]);
assertTrue(dtestNodes[2] == dataNodes[12]);
assertTrue(dtestNodes[3] == dataNodes[9]);
assertTrue(dtestNodes[4] == dataNodes[10]);
assertTrue(dtestNodes[1] != dtestNodes[2]);
assertTrue(dtestNodes[1] == dataNodes[11]
|| dtestNodes[1] == dataNodes[12]);
assertTrue(dtestNodes[2] == dataNodes[11]
|| dtestNodes[2] == dataNodes[12]);
assertTrue(dtestNodes[3] != dtestNodes[4]);
assertTrue(dtestNodes[3] == dataNodes[9]
|| dtestNodes[3] == dataNodes[10]);
assertTrue(dtestNodes[4] == dataNodes[9]
|| dtestNodes[4] == dataNodes[10]);
// array contains local node
testNodes[0] = dataNodes[1];
@ -331,10 +337,14 @@ public class TestNetworkTopology {
testNodes[2] = dataNodes[8];
Node rackClient = new NodeBase("/d3/r1/25.25.25");
cluster.setRandomSeed(0xDEADBEEF);
cluster.sortByDistance(rackClient, testNodes, testNodes.length);
cluster.sortByDistanceUsingNetworkLocation(rackClient, testNodes,
testNodes.length);
assertTrue(testNodes[0] == dataNodes[8]);
assertTrue(testNodes[1] == dataNodes[5]);
assertTrue(testNodes[2] == dataNodes[0]);
assertTrue(testNodes[1] != testNodes[2]);
assertTrue(testNodes[1] == dataNodes[0]
|| testNodes[1] == dataNodes[5]);
assertTrue(testNodes[2] == dataNodes[0]
|| testNodes[2] == dataNodes[5]);
//Reader is not a datanode , but is in one of the datanode's data center.
testNodes[0] = dataNodes[8];
@ -342,10 +352,14 @@ public class TestNetworkTopology {
testNodes[2] = dataNodes[0];
Node dcClient = new NodeBase("/d1/r2/25.25.25");
cluster.setRandomSeed(0xDEADBEEF);
cluster.sortByDistance(dcClient, testNodes, testNodes.length);
cluster.sortByDistanceUsingNetworkLocation(dcClient, testNodes,
testNodes.length);
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[5]);
assertTrue(testNodes[2] == dataNodes[8]);
assertTrue(testNodes[1] != testNodes[2]);
assertTrue(testNodes[1] == dataNodes[5]
|| testNodes[1] == dataNodes[8]);
assertTrue(testNodes[2] == dataNodes[5]
|| testNodes[2] == dataNodes[8]);
}