HDFS-11030. TestDataNodeVolumeFailure#testVolumeFailure is flaky (though passing). Contributed by Mingliang Liu
commit 0c49f73a6c
parent 90dd3a8148
TestDataNodeVolumeFailure.java

@@ -29,6 +29,7 @@
 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.net.Socket;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -52,7 +53,6 @@
 import org.apache.hadoop.hdfs.client.impl.DfsClientConf;
 import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.protocol.Block;
-import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -66,14 +66,16 @@
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetTestUtil;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
-import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
-import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
-import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
-import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
+import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.security.token.Token;
+import org.apache.hadoop.test.GenericTestUtils;
+
+import com.google.common.base.Supplier;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.filefilter.TrueFileFilter;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -108,6 +110,7 @@ public void setUp() throws Exception {
     conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, block_size);
     // Allow a single volume failure (there are two volumes)
     conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 30);
     cluster = new MiniDFSCluster.Builder(conf).numDataNodes(dn_num).build();
     cluster.waitActive();
     fs = cluster.getFileSystem();
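
The added 30-second heartbeat interval is the heart of the de-flaking: with the default 3-second interval the DataNode heartbeats on its own schedule and can race the test's assertions, while at 30 seconds the test decides exactly when the NameNode hears about the failed volume. A minimal sketch of the resulting pattern (the class and main method are ours for illustration; DataNodeTestUtils and BlockManagerTestUtil are the Hadoop test helpers this patch uses further down):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
    import org.apache.hadoop.hdfs.server.datanode.DataNode;
    import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;

    public class HeartbeatControlSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Heartbeats every 30s: slow enough that none fires by itself
        // during the test window.
        conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 30);
        MiniDFSCluster cluster =
            new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        try {
          cluster.waitActive();
          DataNode dn = cluster.getDataNodes().get(1);
          // Drive the heartbeat by hand instead of waiting for the timer:
          // the DN sends one now, and the NN processes it now.
          DataNodeTestUtils.triggerHeartbeat(dn);
          BlockManagerTestUtil.checkHeartbeat(
              cluster.getNamesystem().getBlockManager());
        } finally {
          cluster.shutdown();
        }
      }
    }
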
@@ -135,7 +138,7 @@ public void tearDown() throws Exception {
    * and that we can replicate to both datanodes even after a single volume
    * failure if the configuration parameter allows this.
    */
-  @Test
+  @Test(timeout = 120000)
   public void testVolumeFailure() throws Exception {
     System.out.println("Data dir: is " + dataDir.getPath());
 
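
For reference, the JUnit 4 timeout parameter fails a test that has not finished within the given number of milliseconds, so a wait that never completes surfaces as a test failure rather than a hung build. A trivial sketch (hypothetical test class):

    import org.junit.Test;

    public class TimeoutSketch {
      @Test(timeout = 120000) // fail, rather than hang, after two minutes
      public void neverFinishes() throws Exception {
        Thread.sleep(Long.MAX_VALUE); // JUnit aborts and fails this test
      }
    }
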
@@ -155,7 +158,7 @@ public void testVolumeFailure() throws Exception {
     // fail the volume
     // delete/make non-writable one of the directories (failed volume)
     data_fail = new File(dataDir, "data3");
-    failedDir = MiniDFSCluster.getFinalizedDir(dataDir,
+    failedDir = MiniDFSCluster.getFinalizedDir(data_fail,
         cluster.getNamesystem().getBlockPoolId());
     if (failedDir.exists() &&
       //!FileUtil.fullyDelete(failedDir)
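
This hunk is a genuine bug fix, not just de-flaking: the finalized directory must be resolved under the specific volume being failed (data3), not under the parent directory that holds all volumes. A sketch of the path arithmetic, assuming the standard MiniDFSCluster layout (the base directory and block-pool id here are hypothetical):

    import java.io.File;
    import org.apache.hadoop.hdfs.MiniDFSCluster;

    public class FinalizedDirSketch {
      public static void main(String[] args) {
        // MiniDFSCluster lays volumes out as <base>/data1, <base>/data2, ...
        File dataDir = new File("/tmp/minidfs");    // hypothetical base dir
        String bpid = "BP-1-127.0.0.1-1";           // hypothetical pool id
        File dataFail = new File(dataDir, "data3"); // the volume to fail
        // Correct: data3/current/<bpid>/current/finalized. Passing dataDir
        // instead would build a path that is not inside any volume.
        File failedDir = MiniDFSCluster.getFinalizedDir(dataFail, bpid);
        System.out.println(failedDir);
      }
    }
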
@@ -171,29 +174,26 @@ public void testVolumeFailure() throws Exception {
     // we need to make sure that the "failed" volume is being accessed -
     // and that will cause failure, blocks removal, "emergency" block report
     triggerFailure(filename, filesize);
-
-    // make sure a block report is sent
-    DataNode dn = cluster.getDataNodes().get(1); //corresponds to dir data3
-    String bpid = cluster.getNamesystem().getBlockPoolId();
-    DatanodeRegistration dnR = dn.getDNRegistrationForBP(bpid);
-
-    Map<DatanodeStorage, BlockListAsLongs> perVolumeBlockLists =
-        dn.getFSDataset().getBlockReports(bpid);
-
-    // Send block report
-    StorageBlockReport[] reports =
-        new StorageBlockReport[perVolumeBlockLists.size()];
-
-    int reportIndex = 0;
-    for(Map.Entry<DatanodeStorage, BlockListAsLongs> kvPair : perVolumeBlockLists.entrySet()) {
-        DatanodeStorage dnStorage = kvPair.getKey();
-        BlockListAsLongs blockList = kvPair.getValue();
-        reports[reportIndex++] =
-            new StorageBlockReport(dnStorage, blockList);
-    }
-
-    cluster.getNameNodeRpc().blockReport(dnR, bpid, reports,
-        new BlockReportContext(1, 0, System.nanoTime(), 0, true));
+    // the DN eventually has the latest volume failure info for the next heartbeat
+    final DataNode dn = cluster.getDataNodes().get(1);
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        final VolumeFailureSummary summary =
+            dn.getFSDataset().getVolumeFailureSummary();
+        return summary != null &&
+            summary.getFailedStorageLocations() != null &&
+            summary.getFailedStorageLocations().length == 1;
+      }
+    }, 10, 30 * 1000);
+
+    // trigger the DN to send a heartbeat
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    final BlockManager bm = cluster.getNamesystem().getBlockManager();
+    // trigger the NN to handle the heartbeat
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    // the NN should now have the latest volume failure information
+    assertEquals(1, cluster.getNamesystem().getVolumeFailuresTotal());
 
     // verify number of blocks and files...
     verify(filename, filesize);
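
The waitFor block above is the standard Hadoop polling idiom: re-evaluate a Supplier<Boolean> every checkEveryMillis and throw a TimeoutException after waitForMillis. Together with the explicitly triggered heartbeat, it replaces the old hand-rolled block report, whose timing was the source of the flakiness. A self-contained sketch with a hypothetical condition, using the same 10 ms / 30 s parameters as the patch:

    import com.google.common.base.Supplier;
    import org.apache.hadoop.test.GenericTestUtils;

    public class WaitForSketch {
      private static volatile boolean ready = false;

      public static void main(String[] args) throws Exception {
        new Thread(() -> {
          try {
            Thread.sleep(200); // the condition becomes true asynchronously
          } catch (InterruptedException ignored) {
          }
          ready = true;
        }).start();
        // Poll every 10 ms; give up with a TimeoutException after 30 s.
        GenericTestUtils.waitFor(new Supplier<Boolean>() {
          @Override
          public Boolean get() {
            return ready;
          }
        }, 10, 30 * 1000);
        System.out.println("condition met");
      }
    }
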
@@ -492,9 +492,11 @@ private void triggerFailure(String path, long size) throws IOException {
    * @throws IOException
    */
   private boolean deteteBlocks(File dir) {
-    File [] fileList = dir.listFiles();
+    Collection<File> fileList = FileUtils.listFiles(dir,
+        TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
     for(File f : fileList) {
       if(f.getName().startsWith(Block.BLOCK_FILE_PREFIX)) {
+        System.out.println("Deleting file " + f);
         if(!f.delete())
           return false;
 
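
The last hunk makes deteteBlocks recursive: dir.listFiles() sees only the immediate children of the finalized directory, while FileUtils.listFiles with TrueFileFilter.INSTANCE for both the file filter and the directory filter walks the whole tree, so block files inside nested subdirectories are still found and deleted. A sketch against a hypothetical scratch layout:

    import java.io.File;
    import java.nio.file.Files;
    import java.util.Collection;
    import org.apache.commons.io.FileUtils;
    import org.apache.commons.io.filefilter.TrueFileFilter;

    public class RecursiveListSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical layout mimicking a finalized dir with nested subdirs.
        File root = Files.createTempDirectory("finalized").toFile();
        File nested = new File(root, "subdir0/subdir1");
        nested.mkdirs();
        new File(nested, "blk_1073741825").createNewFile();

        // dir.listFiles() would return only "subdir0"; this call recurses.
        Collection<File> files = FileUtils.listFiles(
            root, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
        for (File f : files) {
          System.out.println(f); // .../subdir0/subdir1/blk_1073741825
        }
      }
    }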