HDFS-12905. [READ] Handle decommissioning and under-maintenance Datanodes with Provided storage.
commit 0f6aa9564c (parent 2298f2d76b)
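Two changes land together here. In ProvidedStorageMap, the random Datanode selection inside choose() is pulled out into a new chooseRandomNode() helper that is queried twice: first for a node in the NORMAL admin state, then, only if none qualifies, for any non-excluded node. Decommissioning and in-maintenance Datanodes are therefore returned only when no live replica is available. The test side gains testDatanodeLifeCycle, which walks a three-Datanode cluster with PROVIDED storage through decommission, maintenance, node death, and restart, verifying the reported block locations at each step.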
@@ -342,14 +342,25 @@ DatanodeDescriptor choose(DatanodeDescriptor client,
           return dn;
         }
       }
+      // prefer live nodes first.
+      DatanodeDescriptor dn = chooseRandomNode(excludedUUids, true);
+      if (dn == null) {
+        dn = chooseRandomNode(excludedUUids, false);
+      }
+      return dn;
+    }
+
+    private DatanodeDescriptor chooseRandomNode(Set<String> excludedUUids,
+        boolean preferLiveNodes) {
       Random r = new Random();
       for (int i = dnR.size() - 1; i >= 0; --i) {
         int pos = r.nextInt(i + 1);
         DatanodeDescriptor node = dnR.get(pos);
         String uuid = node.getDatanodeUuid();
         if (!excludedUUids.contains(uuid)) {
-          return node;
+          if (!preferLiveNodes || node.getAdminState() == AdminStates.NORMAL) {
+            return node;
+          }
         }
         Collections.swap(dnR, i, pos);
       }
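Stripped of the HDFS types, the selection above is two-pass sampling without replacement over a partial Fisher-Yates shuffle. The sketch below is illustrative only: Node, AdminState, and LiveFirstChooser are hypothetical stand-ins for the Hadoop classes, not the real API.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;

// Hypothetical stand-ins for DatanodeDescriptor and DatanodeInfo.AdminStates.
enum AdminState { NORMAL, DECOMMISSION_INPROGRESS, IN_MAINTENANCE }

class Node {
  final String uuid;
  final AdminState state;
  Node(String uuid, AdminState state) { this.uuid = uuid; this.state = state; }
}

public class LiveFirstChooser {
  private final List<Node> nodes;
  private final Random r = new Random();

  LiveFirstChooser(List<Node> nodes) {
    this.nodes = new ArrayList<>(nodes);
  }

  // Two passes: the first accepts only live (NORMAL) nodes; if it finds
  // nothing, the second accepts any node not in the excluded set.
  Node choose(Set<String> excludedUuids) {
    Node n = chooseRandom(excludedUuids, true);
    return (n != null) ? n : chooseRandom(excludedUuids, false);
  }

  // Partial Fisher-Yates shuffle: sample an index in [0, i], and swap each
  // rejected candidate to position i so it is never sampled twice per pass.
  private Node chooseRandom(Set<String> excludedUuids, boolean preferLive) {
    for (int i = nodes.size() - 1; i >= 0; --i) {
      int pos = r.nextInt(i + 1);
      Node node = nodes.get(pos);
      if (!excludedUuids.contains(node.uuid)
          && (!preferLive || node.state == AdminState.NORMAL)) {
        return node;
      }
      Collections.swap(nodes, i, pos);
    }
    return null; // every candidate was excluded or rejected
  }
}

The swap keeps each pass at O(n) without a visited set: once a candidate is rejected it sits beyond the shrinking sampling window. The HDFS version runs the same loop over its dnR list, so a decommissioning or in-maintenance replica is handed out only when no NORMAL node survives the exclusion filter.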
@@ -56,6 +56,7 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
 import org.apache.hadoop.hdfs.server.blockmanagement.ProvidedStorageMap;
@@ -795,4 +796,98 @@ public void testInMemoryAliasMap() throws Exception {
     FileUtils.deleteDirectory(tempDirectory);
   }
+
+  private DatanodeDescriptor getDatanodeDescriptor(DatanodeManager dnm,
+      int dnIndex) throws Exception {
+    return dnm.getDatanode(cluster.getDataNodes().get(dnIndex).getDatanodeId());
+  }
+
+  private void startDecommission(FSNamesystem namesystem, DatanodeManager dnm,
+      int dnIndex) throws Exception {
+    namesystem.writeLock();
+    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
+    dnm.getDatanodeAdminManager().startDecommission(dnDesc);
+    namesystem.writeUnlock();
+  }
+
+  private void startMaintenance(FSNamesystem namesystem, DatanodeManager dnm,
+      int dnIndex) throws Exception {
+    namesystem.writeLock();
+    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
+    dnm.getDatanodeAdminManager().startMaintenance(dnDesc, Long.MAX_VALUE);
+    namesystem.writeUnlock();
+  }
+
+  private void stopMaintenance(FSNamesystem namesystem, DatanodeManager dnm,
+      int dnIndex) throws Exception {
+    namesystem.writeLock();
+    DatanodeDescriptor dnDesc = getDatanodeDescriptor(dnm, dnIndex);
+    dnm.getDatanodeAdminManager().stopMaintenance(dnDesc);
+    namesystem.writeUnlock();
+  }
+
+  @Test
+  public void testDatanodeLifeCycle() throws Exception {
+    createImage(new FSTreeWalk(NAMEPATH, conf), NNDIRPATH,
+        FixedBlockResolver.class);
+    startCluster(NNDIRPATH, 3,
+        new StorageType[] {StorageType.PROVIDED, StorageType.DISK},
+        null, false);
+
+    int fileIndex = numFiles - 1;
+
+    final BlockManager blockManager = cluster.getNamesystem().getBlockManager();
+    final DatanodeManager dnm = blockManager.getDatanodeManager();
+
+    // to start, all 3 DNs are live in ProvidedDatanodeDescriptor.
+    verifyFileLocation(fileIndex, 3);
+
+    // decommission the first DN; still get 3 replicas.
+    startDecommission(cluster.getNamesystem(), dnm, 0);
+    verifyFileLocation(fileIndex, 3);
+
+    // remains the same even after heartbeats.
+    cluster.triggerHeartbeats();
+    verifyFileLocation(fileIndex, 3);
+
+    // start maintenance for the 2nd DN; still get 3 replicas.
+    startMaintenance(cluster.getNamesystem(), dnm, 1);
+    verifyFileLocation(fileIndex, 3);
+
+    DataNode dn1 = cluster.getDataNodes().get(0);
+    DataNode dn2 = cluster.getDataNodes().get(1);
+
+    // stop the 1st DN while it is being decommissioned.
+    MiniDFSCluster.DataNodeProperties dn1Properties = cluster.stopDataNode(0);
+    BlockManagerTestUtil.noticeDeadDatanode(cluster.getNameNode(),
+        dn1.getDatanodeId().getXferAddr());
+
+    // get 2 locations.
+    verifyFileLocation(fileIndex, 2);
+
+    // stop dn2 while it is in maintenance.
+    MiniDFSCluster.DataNodeProperties dn2Properties = cluster.stopDataNode(1);
+    BlockManagerTestUtil.noticeDeadDatanode(cluster.getNameNode(),
+        dn2.getDatanodeId().getXferAddr());
+
+    // 2 valid locations are still found, as blocks on nodes that die during
+    // maintenance are not marked for removal.
+    verifyFileLocation(fileIndex, 2);
+
+    // stop the maintenance; get only 1 replica.
+    stopMaintenance(cluster.getNamesystem(), dnm, 0);
+    verifyFileLocation(fileIndex, 1);
+
+    // restart the stopped DN.
+    cluster.restartDataNode(dn1Properties, true);
+    cluster.waitActive();
+
+    // reports 2 replicas, as dn2 is still stopped.
+    verifyFileLocation(fileIndex, 2);
+
+    cluster.restartDataNode(dn2Properties, true);
+    cluster.waitActive();
+
+    // reports all 3 replicas.
+    verifyFileLocation(fileIndex, 3);
+  }
 }
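Taken together, the assertions above pin down the intended semantics: a replica on a decommissioning or in-maintenance Datanode still counts as a valid location; a Datanode that dies while decommissioning loses its locations as soon as the NameNode notices; and one that dies while in maintenance keeps them until the maintenance window is ended, at which point the location count finally drops.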