HDFS-12905. [READ] Handle decommissioning and under-maintenance Datanodes with Provided storage.

Virajith Jalaparti (2017-12-08 10:07:40 -08:00), committed by Chris Douglas
parent 2298f2d76b
commit 0f6aa9564c
2 changed files with 107 additions and 1 deletion


@@ -342,15 +342,26 @@ DatanodeDescriptor choose(DatanodeDescriptor client,
        return dn;
      }
    }
    // prefer live nodes first.
    DatanodeDescriptor dn = chooseRandomNode(excludedUUids, true);
    if (dn == null) {
      dn = chooseRandomNode(excludedUUids, false);
    }
    return dn;
  }

  private DatanodeDescriptor chooseRandomNode(Set<String> excludedUUids,
      boolean preferLiveNodes) {
    Random r = new Random();
    // partial Fisher-Yates shuffle: draw a random not-yet-visited entry,
    // and swap it into the already-visited tail if it is rejected.
    for (int i = dnR.size() - 1; i >= 0; --i) {
      int pos = r.nextInt(i + 1);
      DatanodeDescriptor node = dnR.get(pos);
      String uuid = node.getDatanodeUuid();
      if (!excludedUUids.contains(uuid)) {
        // with preferLiveNodes, only nodes still in service qualify.
        if (!preferLiveNodes || node.getAdminState() == AdminStates.NORMAL) {
          return node;
        }
      }
      Collections.swap(dnR, i, pos);
    }
    return null;
  }
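
The loop above is a partial Fisher-Yates shuffle: each iteration draws one uniformly random, not-yet-visited entry of dnR and, if it is rejected, swaps it into the visited tail. A rejected candidate can never be drawn again, so the loop inspects each node at most once and terminates after dnR.size() iterations even when no node qualifies, unlike naive rejection sampling. Below is a minimal, self-contained sketch of the same two-pass pattern; the class, method, and variable names are illustrative, not part of the patch:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;
    import java.util.Set;
    import java.util.function.Predicate;

    public class PreferLiveChooser {
      // Visits each candidate at most once, in uniformly random order, by
      // swapping every rejected element into the already-visited tail.
      static <T> T chooseRandom(List<T> candidates, Set<T> excluded,
          Predicate<T> acceptable) {
        Random r = new Random();
        for (int i = candidates.size() - 1; i >= 0; --i) {
          int pos = r.nextInt(i + 1);           // random unvisited index
          T node = candidates.get(pos);
          if (!excluded.contains(node) && acceptable.test(node)) {
            return node;
          }
          Collections.swap(candidates, i, pos); // retire this candidate
        }
        return null;                            // nothing acceptable
      }

      public static void main(String[] args) {
        List<String> nodes = new ArrayList<>(List.of("dn0", "dn1", "dn2"));
        Set<String> excluded = Set.of("dn1");
        // First pass prefers "live" nodes (preferLiveNodes == true in the
        // patch); the second pass falls back to any non-excluded node.
        String dn = chooseRandom(nodes, excluded, n -> n.equals("dn2"));
        if (dn == null) {
          dn = chooseRandom(nodes, excluded, n -> true);
        }
        System.out.println("chose " + dn);
      }
    }

The in-place swap is the design point: it avoids copying the node list on every call while still giving each eligible node an equal chance of selection.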


@@ -56,6 +56,7 @@
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.ProvidedStorageMap;
@@ -795,4 +796,98 @@ public void testInMemoryAliasMap() throws Exception {
    FileUtils.deleteDirectory(tempDirectory);
  }

  private DatanodeDescriptor getDatanodeDescriptor(DatanodeManager dnm,
      int dnIndex) throws Exception {
    return dnm.getDatanode(cluster.getDataNodes().get(dnIndex).getDatanodeId());
  }

  private void startDecommission(FSNamesystem namesystem, DatanodeManager dnm,
      int dnIndex) throws Exception {
    // admin-state transitions mutate the descriptor, so they run under the
    // namesystem write lock.
    namesystem.writeLock();
    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
    dnm.getDatanodeAdminManager().startDecommission(dnDesc);
    namesystem.writeUnlock();
  }

  private void startMaintenance(FSNamesystem namesystem, DatanodeManager dnm,
      int dnIndex) throws Exception {
    namesystem.writeLock();
    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
    dnm.getDatanodeAdminManager().startMaintenance(dnDesc, Long.MAX_VALUE);
    namesystem.writeUnlock();
  }

  private void stopMaintenance(FSNamesystem namesystem, DatanodeManager dnm,
      int dnIndex) throws Exception {
    namesystem.writeLock();
    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
    dnm.getDatanodeAdminManager().stopMaintenance(dnDesc);
    namesystem.writeUnlock();
  }

  @Test
  public void testDatanodeLifeCycle() throws Exception {
    createImage(new FSTreeWalk(NAMEPATH, conf), NNDIRPATH,
        FixedBlockResolver.class);
    startCluster(NNDIRPATH, 3,
        new StorageType[] {StorageType.PROVIDED, StorageType.DISK},
        null, false);

    int fileIndex = numFiles - 1;

    final BlockManager blockManager = cluster.getNamesystem().getBlockManager();
    final DatanodeManager dnm = blockManager.getDatanodeManager();

    // to start, all 3 DNs are live in ProvidedDatanodeDescriptor.
    verifyFileLocation(fileIndex, 3);

    // decommission the first DN; still get 3 replicas.
    startDecommission(cluster.getNamesystem(), dnm, 0);
    verifyFileLocation(fileIndex, 3);

    // remains the same even after heartbeats.
    cluster.triggerHeartbeats();
    verifyFileLocation(fileIndex, 3);

    // start maintenance on the 2nd DN; still get 3 replicas.
    startMaintenance(cluster.getNamesystem(), dnm, 1);
    verifyFileLocation(fileIndex, 3);

    DataNode dn1 = cluster.getDataNodes().get(0);
    DataNode dn2 = cluster.getDataNodes().get(1);

    // stop the 1st DN while it is being decommissioned.
    MiniDFSCluster.DataNodeProperties dn1Properties = cluster.stopDataNode(0);
    BlockManagerTestUtil.noticeDeadDatanode(cluster.getNameNode(),
        dn1.getDatanodeId().getXferAddr());

    // only 2 locations are returned now.
    verifyFileLocation(fileIndex, 2);

    // stop dn2 while it is in maintenance.
    MiniDFSCluster.DataNodeProperties dn2Properties = cluster.stopDataNode(1);
    BlockManagerTestUtil.noticeDeadDatanode(cluster.getNameNode(),
        dn2.getDatanodeId().getXferAddr());

    // still 2 valid locations, as blocks on nodes that die during
    // maintenance are not marked for removal.
    verifyFileLocation(fileIndex, 2);

    // stop the maintenance; only 1 replica is returned.
    stopMaintenance(cluster.getNamesystem(), dnm, 0);
    verifyFileLocation(fileIndex, 1);

    // restart the first stopped DN; 2 locations are reported again.
    cluster.restartDataNode(dn1Properties, true);
    cluster.waitActive();
    verifyFileLocation(fileIndex, 2);

    // restart the second stopped DN; all 3 replicas are reported.
    cluster.restartDataNode(dn2Properties, true);
    cluster.waitActive();
    verifyFileLocation(fileIndex, 3);
  }
}
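
Every assertion in the test goes through verifyFileLocation, a helper defined elsewhere in this test class that the hunk does not include. Its contract is to fetch the block locations of the fileIndex-th generated file and check the location count on every block. The following is a hypothetical reconstruction of such a helper, not the committed code: filePrefix and fileSuffix are assumed fields naming the generated files, and it additionally needs imports for org.apache.hadoop.fs.Path, org.apache.hadoop.hdfs.protocol.LocatedBlock, and org.apache.hadoop.hdfs.protocol.LocatedBlocks:

    // hypothetical reconstruction; the committed helper lives outside this hunk.
    private void verifyFileLocation(int fileIndex, int expectedLocations)
        throws IOException {
      // filePrefix and fileSuffix are assumed fields naming the generated files.
      String filename = filePrefix + fileIndex + fileSuffix;
      LocatedBlocks lbs = cluster.getFileSystem().getClient().getLocatedBlocks(
          new Path("/", filename).toString(), 0, Long.MAX_VALUE);
      for (LocatedBlock lb : lbs.getLocatedBlocks()) {
        // every block of the file should report exactly the expected count.
        assertEquals(expectedLocations, lb.getLocations().length);
      }
    }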