diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 08f240c665..013d53137f 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -12,6 +12,9 @@ Trunk (unreleased changes) HADOOP-8469. Make NetworkTopology class pluggable. (Junping Du via szetszwo) + HADOOP-8468. Add NetworkTopologyWithNodeGroup, a 4-layer implementation + of NetworkTopology. (Junping Du via szetszwo) + IMPROVEMENTS HADOOP-8017. Configure hadoop-main pom to get rid of M2E plugin execution diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java new file mode 100644 index 0000000000..6066cd2a61 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java @@ -0,0 +1,398 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.net; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * The class extends NetworkTopology to represents a cluster of computer with + * a 4-layers hierarchical network topology. + * In this network topology, leaves represent data nodes (computers) and inner + * nodes represent switches/routers that manage traffic in/out of data centers, + * racks or physical host (with virtual switch). + * + * @see NetworkTopology + */ +@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) +@InterfaceStability.Unstable +public class NetworkTopologyWithNodeGroup extends NetworkTopology { + + public final static String DEFAULT_NODEGROUP = "/default-nodegroup"; + + public NetworkTopologyWithNodeGroup() { + clusterMap = new InnerNodeWithNodeGroup(InnerNode.ROOT); + } + + @Override + protected Node getNodeForNetworkLocation(Node node) { + // if node only with default rack info, here we need to add default + // nodegroup info + if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) { + node.setNetworkLocation(node.getNetworkLocation() + + DEFAULT_NODEGROUP); + } + Node nodeGroup = getNode(node.getNetworkLocation()); + if (nodeGroup == null) { + nodeGroup = new InnerNode(node.getNetworkLocation()); + } + return getNode(nodeGroup.getNetworkLocation()); + } + + @Override + public String getRack(String loc) { + netlock.readLock().lock(); + try { + loc = InnerNode.normalize(loc); + Node locNode = getNode(loc); + if (locNode instanceof InnerNodeWithNodeGroup) { + InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode; + if (node.isRack()) { + return loc; + } else if (node.isNodeGroup()) { + return node.getNetworkLocation(); + } else { + // may be a data center + return null; + } + } else { + // not in cluster map, don't handle it + return loc; + } + } finally { + netlock.readLock().unlock(); + } + } + + /** + * Given a string representation of a node group for a specific network + * location + * + * @param loc + * a path-like string representation of a network location + * @return a node group string + */ + public String getNodeGroup(String loc) { + netlock.readLock().lock(); + try { + loc = InnerNode.normalize(loc); + Node locNode = getNode(loc); + if (locNode instanceof InnerNodeWithNodeGroup) { + InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode; + if (node.isNodeGroup()) { + return loc; + } else if (node.isRack()) { + // not sure the node group for a rack + return null; + } else { + // may be a leaf node + return getNodeGroup(node.getNetworkLocation()); + } + } else { + // not in cluster map, don't handle it + return loc; + } + } finally { + netlock.readLock().unlock(); + } + } + + @Override + public boolean isOnSameRack( Node node1, Node node2) { + if (node1 == null || node2 == null || + node1.getParent() == null || node2.getParent() == null) { + return false; + } + + netlock.readLock().lock(); + try { + return isSameParents(node1.getParent(), node2.getParent()); + } finally { + netlock.readLock().unlock(); + } + } + + /** + * Check if two nodes are on the same node group (hypervisor) The + * assumption here is: each nodes are leaf nodes. + * + * @param node1 + * one node (can be null) + * @param node2 + * another node (can be null) + * @return true if node1 and node2 are on the same node group; false + * otherwise + * @exception IllegalArgumentException + * when either node1 or node2 is null, or node1 or node2 do + * not belong to the cluster + */ + @Override + public boolean isOnSameNodeGroup(Node node1, Node node2) { + if (node1 == null || node2 == null) { + return false; + } + netlock.readLock().lock(); + try { + return isSameParents(node1, node2); + } finally { + netlock.readLock().unlock(); + } + } + + /** + * Check if network topology is aware of NodeGroup + */ + @Override + public boolean isNodeGroupAware() { + return true; + } + + /** Add a leaf node + * Update node counter & rack counter if necessary + * @param node node to be added; can be null + * @exception IllegalArgumentException if add a node to a leave + * or node to be added is not a leaf + */ + @Override + public void add(Node node) { + if (node==null) return; + if( node instanceof InnerNode ) { + throw new IllegalArgumentException( + "Not allow to add an inner node: "+NodeBase.getPath(node)); + } + netlock.writeLock().lock(); + try { + Node rack = null; + + // if node only with default rack info, here we need to add default + // nodegroup info + if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) { + node.setNetworkLocation(node.getNetworkLocation() + + NetworkTopologyWithNodeGroup.DEFAULT_NODEGROUP); + } + Node nodeGroup = getNode(node.getNetworkLocation()); + if (nodeGroup == null) { + nodeGroup = new InnerNodeWithNodeGroup(node.getNetworkLocation()); + } + rack = getNode(nodeGroup.getNetworkLocation()); + + if (rack != null && !(rack instanceof InnerNode)) { + throw new IllegalArgumentException("Unexpected data node " + + node.toString() + + " at an illegal network location"); + } + if (clusterMap.add(node)) { + LOG.info("Adding a new node: " + NodeBase.getPath(node)); + if (rack == null) { + // We only track rack number here + numOfRacks++; + } + } + if(LOG.isDebugEnabled()) { + LOG.debug("NetworkTopology became:\n" + this.toString()); + } + } finally { + netlock.writeLock().unlock(); + } + } + + /** Remove a node + * Update node counter and rack counter if necessary + * @param node node to be removed; can be null + */ + @Override + public void remove(Node node) { + if (node==null) return; + if( node instanceof InnerNode ) { + throw new IllegalArgumentException( + "Not allow to remove an inner node: "+NodeBase.getPath(node)); + } + LOG.info("Removing a node: "+NodeBase.getPath(node)); + netlock.writeLock().lock(); + try { + if (clusterMap.remove(node)) { + Node nodeGroup = getNode(node.getNetworkLocation()); + if (nodeGroup == null) { + nodeGroup = new InnerNode(node.getNetworkLocation()); + } + InnerNode rack = (InnerNode)getNode(nodeGroup.getNetworkLocation()); + if (rack == null) { + numOfRacks--; + } + } + if(LOG.isDebugEnabled()) { + LOG.debug("NetworkTopology became:\n" + this.toString()); + } + } finally { + netlock.writeLock().unlock(); + } + } + + /** Sort nodes array by their distances to reader + * It linearly scans the array, if a local node is found, swap it with + * the first element of the array. + * If a local node group node is found, swap it with the first element + * following the local node. + * If a local rack node is found, swap it with the first element following + * the local node group node. + * If neither local node, node group node or local rack node is found, put a + * random replica location at position 0. + * It leaves the rest nodes untouched. + * @param reader the node that wishes to read a block from one of the nodes + * @param nodes the list of nodes containing data for the reader + */ + @Override + public void pseudoSortByDistance( Node reader, Node[] nodes ) { + + if (reader != null && !this.contains(reader)) { + // if reader is not a datanode (not in NetworkTopology tree), we will + // replace this reader with a sibling leaf node in tree. + Node nodeGroup = getNode(reader.getNetworkLocation()); + if (nodeGroup != null && nodeGroup instanceof InnerNode) { + InnerNode parentNode = (InnerNode) nodeGroup; + // replace reader with the first children of its parent in tree + reader = parentNode.getLeaf(0, null); + } else { + return; + } + } + int tempIndex = 0; + int localRackNode = -1; + int localNodeGroupNode = -1; + if (reader != null) { + //scan the array to find the local node & local rack node + for (int i = 0; i < nodes.length; i++) { + if (tempIndex == 0 && reader == nodes[i]) { //local node + //swap the local node and the node at position 0 + if (i != 0) { + swap(nodes, tempIndex, i); + } + tempIndex=1; + + if (localRackNode != -1 && (localNodeGroupNode !=-1)) { + if (localRackNode == 0) { + localRackNode = i; + } + if (localNodeGroupNode == 0) { + localNodeGroupNode = i; + } + break; + } + } else if (localNodeGroupNode == -1 && isOnSameNodeGroup(reader, + nodes[i])) { + //local node group + localNodeGroupNode = i; + // node local and rack local are already found + if(tempIndex != 0 && localRackNode != -1) break; + } else if (localRackNode == -1 && isOnSameRack(reader, nodes[i])) { + localRackNode = i; + if (tempIndex != 0 && localNodeGroupNode != -1) break; + } + } + + // swap the local nodegroup node and the node at position tempIndex + if(localNodeGroupNode != -1 && localNodeGroupNode != tempIndex) { + swap(nodes, tempIndex, localNodeGroupNode); + if (localRackNode == tempIndex) { + localRackNode = localNodeGroupNode; + } + tempIndex++; + } + + // swap the local rack node and the node at position tempIndex + if(localRackNode != -1 && localRackNode != tempIndex) { + swap(nodes, tempIndex, localRackNode); + tempIndex++; + } + } + + // put a random node at position 0 if there is not a local/local-nodegroup/ + // local-rack node + if (tempIndex == 0 && localNodeGroupNode == -1 && localRackNode == -1 + && nodes.length != 0) { + swap(nodes, 0, r.nextInt(nodes.length)); + } + } + + /** InnerNodeWithNodeGroup represents a switch/router of a data center, rack + * or physical host. Different from a leaf node, it has non-null children. + */ + static class InnerNodeWithNodeGroup extends InnerNode { + public InnerNodeWithNodeGroup(String name, String location, + InnerNode parent, int level) { + super(name, location, parent, level); + } + + public InnerNodeWithNodeGroup(String name, String location) { + super(name, location); + } + + public InnerNodeWithNodeGroup(String path) { + super(path); + } + + @Override + boolean isRack() { + // it is node group + if (getChildren().isEmpty()) { + return false; + } + + Node firstChild = children.get(0); + + if (firstChild instanceof InnerNode) { + Node firstGrandChild = (((InnerNode) firstChild).children).get(0); + if (firstGrandChild instanceof InnerNode) { + // it is datacenter + return false; + } else { + return true; + } + } + return false; + } + + /** + * Judge if this node represents a node group + * + * @return true if it has no child or its children are not InnerNodes + */ + boolean isNodeGroup() { + if (children.isEmpty()) { + return true; + } + Node firstChild = children.get(0); + if (firstChild instanceof InnerNode) { + // it is rack or datacenter + return false; + } + return true; + } + + @Override + protected InnerNode createParentNode(String parentName) { + return new InnerNodeWithNodeGroup(parentName, getPath(this), this, + this.getLevel() + 1); + } + + @Override + protected boolean areChildrenLeaves() { + return isNodeGroup(); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 1e72e362e7..c968ff2be6 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -599,10 +599,9 @@ - - + - net.topology.node.switch.mapping.impl + net.topology.node.switch.mapping.impl org.apache.hadoop.net.ScriptBasedMapping The default implementation of the DNSToSwitchMapping. It invokes a script specified in net.topology.script.file.name to resolve @@ -611,6 +610,13 @@ + + net.topology.impl + org.apache.hadoop.net.NetworkTopology + The default implementation of NetworkTopology which is classic three layer one. + + + net.topology.script.file.name diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java new file mode 100644 index 0000000000..7dbd33ab95 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java @@ -0,0 +1,165 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.net; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; + +public class TestNetworkTopologyWithNodeGroup extends TestCase { + private final static NetworkTopologyWithNodeGroup cluster = new + NetworkTopologyWithNodeGroup(); + + private final static DatanodeDescriptor dataNodes[] = new DatanodeDescriptor[] { + DFSTestUtil.getDatanodeDescriptor("1.1.1.1", "/d1/r1/s1"), + DFSTestUtil.getDatanodeDescriptor("2.2.2.2", "/d1/r1/s1"), + DFSTestUtil.getDatanodeDescriptor("3.3.3.3", "/d1/r1/s2"), + DFSTestUtil.getDatanodeDescriptor("4.4.4.4", "/d1/r2/s3"), + DFSTestUtil.getDatanodeDescriptor("5.5.5.5", "/d1/r2/s3"), + DFSTestUtil.getDatanodeDescriptor("6.6.6.6", "/d1/r2/s4"), + DFSTestUtil.getDatanodeDescriptor("7.7.7.7", "/d2/r3/s5"), + DFSTestUtil.getDatanodeDescriptor("8.8.8.8", "/d2/r3/s6") + }; + + private final static NodeBase computeNode = new NodeBase("/d1/r1/s1/h9"); + + static { + for(int i=0; i pickNodesAtRandom(int numNodes, + String excludedScope) { + Map frequency = new HashMap(); + for (DatanodeDescriptor dnd : dataNodes) { + frequency.put(dnd, 0); + } + + for (int j = 0; j < numNodes; j++) { + Node random = cluster.chooseRandom(excludedScope); + frequency.put(random, frequency.get(random) + 1); + } + return frequency; + } + + /** + * This test checks that chooseRandom works for an excluded node. + */ + public void testChooseRandomExcludedNode() { + String scope = "~" + NodeBase.getPath(dataNodes[0]); + Map frequency = pickNodesAtRandom(100, scope); + + for (Node key : dataNodes) { + // all nodes except the first should be more than zero + assertTrue(frequency.get(key) > 0 || key == dataNodes[0]); + } + } + +}