HDFS-16213. Flaky test TestFsDatasetImpl#testDnRestartWithHardLink (#3386)

Reviewed-by: Fei Hui <feihui.ustc@gmail.com>
Reviewed-by: Wei-Chiu Chuang <weichiu@apache.org>
Signed-off-by: Akira Ajisaka <aajisaka@apache.org>
Author: Viraj Jasani
Date: 2021-09-21 06:41:20 +05:30 (committed by GitHub)
parent 4df4389325
commit feee41aa00
5 changed files with 82 additions and 41 deletions

FileIoProvider.java

@@ -513,6 +513,7 @@ public boolean fullyDelete(@Nullable FsVolumeSpi volume, File dir) {
     try {
       faultInjectorEventHook.beforeMetadataOp(volume, DELETE);
       boolean deleted = FileUtil.fullyDelete(dir);
+      LOG.trace("Deletion of dir {} {}", dir, deleted ? "succeeded" : "failed");
       profilingEventHook.afterMetadataOp(volume, DELETE, begin);
       return deleted;
     } catch(Exception e) {
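Note: the added trace line uses SLF4J parameterized logging, so the message is only assembled when TRACE is enabled and no `isTraceEnabled()` guard is needed. A minimal standalone sketch of the same idiom (the class name is illustrative, not part of this patch):

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import java.io.File;

    // Illustrative class; only the logging idiom mirrors the patch.
    public class DeletionTraceExample {
      private static final Logger LOG =
          LoggerFactory.getLogger(DeletionTraceExample.class);

      public static boolean deleteWithTrace(File dir) {
        boolean deleted = dir.delete();
        // Parameterized message: no string concatenation unless TRACE is on.
        LOG.trace("Deletion of dir {} {}", dir, deleted ? "succeeded" : "failed");
        return deleted;
      }
    }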

BlockPoolSlice.java

@@ -102,7 +102,12 @@ class BlockPoolSlice {
   private final Runnable shutdownHook;
   private volatile boolean dfsUsedSaved = false;
   private static final int SHUTDOWN_HOOK_PRIORITY = 30;
-  private final boolean deleteDuplicateReplicas;
+
+  /**
+   * Only tests are allowed to modify the value. For source code,
+   * this should be treated as final only.
+   */
+  private boolean deleteDuplicateReplicas;
   private static final String REPLICA_CACHE_FILE = "replicas";
   private final long replicaCacheExpiry;
   private final File replicaCacheDir;
@@ -440,7 +445,7 @@ void getVolumeMap(ReplicaMap volumeMap,
           "Recovered " + numRecovered + " replicas from " + lazypersistDir);
     }

-    boolean  success = readReplicasFromCache(volumeMap, lazyWriteReplicaMap);
+    boolean success = readReplicasFromCache(volumeMap, lazyWriteReplicaMap);
     if (!success) {
       List<IOException> exceptions = Collections
           .synchronizedList(new ArrayList<IOException>());
@@ -1081,4 +1086,11 @@ public static void reInitializeAddReplicaThreadPool() {
     addReplicaThreadPool.shutdown();
     addReplicaThreadPool = null;
   }
+
+  @VisibleForTesting
+  void setDeleteDuplicateReplicasForTests(
+      boolean deleteDuplicateReplicasForTests) {
+    this.deleteDuplicateReplicas = deleteDuplicateReplicasForTests;
+  }
+
 }
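Note: the field loses `final` so tests can toggle it, with a javadoc warning that production code must treat it as final, and mutation is confined to a package-private setter annotated @VisibleForTesting. A generic sketch of that pattern (class and field names are hypothetical; the annotation import here is Guava's, which may differ from the one this codebase uses):

    import com.google.common.annotations.VisibleForTesting;

    // Hypothetical holder illustrating the test-only setter pattern.
    class FeatureToggleHolder {
      /**
       * Only tests are allowed to modify the value. For source code,
       * this should be treated as final only.
       */
      private boolean featureEnabled = true;

      boolean isFeatureEnabled() {
        return featureEnabled;
      }

      // Package-private: only tests in the same package can flip it.
      @VisibleForTesting
      void setFeatureEnabledForTests(boolean enabled) {
        this.featureEnabled = enabled;
      }
    }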

FsDatasetImpl.java

@@ -26,6 +26,7 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
 import java.nio.channels.ClosedChannelException;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -1049,12 +1050,17 @@ static File[] copyBlockFiles(ReplicaInfo srcReplica, File dstMeta,
   static File[] hardLinkBlockFiles(ReplicaInfo srcReplica, File dstMeta,
       File dstFile)
       throws IOException {
+    FsVolumeSpi srcReplicaVolume = srcReplica.getVolume();
+    File destParentFile = dstFile.getParentFile();
     // Create parent folder if not exists.
-    srcReplica.getFileIoProvider()
-        .mkdirs(srcReplica.getVolume(), dstFile.getParentFile());
+    boolean isDirCreated = srcReplica.getFileIoProvider()
+        .mkdirs(srcReplicaVolume, destParentFile);
+    LOG.trace("Dir creation of {} on volume {} {}", destParentFile,
+        srcReplicaVolume, isDirCreated ? "succeeded" : "failed");
+    URI srcReplicaUri = srcReplica.getBlockURI();
     try {
       HardLink.createHardLink(
-          new File(srcReplica.getBlockURI()), dstFile);
+          new File(srcReplicaUri), dstFile);
     } catch (IOException e) {
       throw new IOException("Failed to hardLink "
           + srcReplica + " block file to "
@@ -1068,9 +1074,8 @@ static File[] hardLinkBlockFiles(ReplicaInfo srcReplica, File dstMeta,
           + srcReplica + " metadata to "
           + dstMeta, e);
     }
-    if (LOG.isDebugEnabled()) {
-      LOG.info("Linked " + srcReplica.getBlockURI() + " to " + dstFile);
-    }
+    LOG.debug("Linked {} to {} . Dest meta file: {}", srcReplicaUri, dstFile,
+        dstMeta);
     return new File[]{dstMeta, dstFile};
   }


MiniDFSCluster.java

@@ -2483,12 +2483,8 @@ public boolean restartDataNode(DataNodeProperties dnprop) throws IOException {

   public void waitDatanodeFullyStarted(DataNode dn, int timeout)
       throws TimeoutException, InterruptedException {
-    GenericTestUtils.waitFor(new Supplier<Boolean>() {
-      @Override
-      public Boolean get() {
-        return dn.isDatanodeFullyStarted();
-      }
-    }, 100, timeout);
+    GenericTestUtils.waitFor(dn::isDatanodeFullyStarted, 100, timeout,
+        "Datanode is not started even after " + timeout + " ms of waiting");
   }

   private void waitDataNodeFullyStarted(final DataNode dn)
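Note: besides collapsing the anonymous Supplier into a method reference, the rewrite passes a failure message, so a timeout reports why the wait failed instead of a bare assertion error. A minimal hedged sketch of such a polling helper (hypothetical name and class, not the GenericTestUtils implementation):

    import java.util.concurrent.TimeoutException;
    import java.util.function.Supplier;

    public final class WaitUtil {
      // Polls the condition every intervalMs until it holds or timeoutMs
      // elapses; the message makes a timeout diagnosable.
      public static void awaitCondition(Supplier<Boolean> condition,
          long intervalMs, long timeoutMs, String message)
          throws InterruptedException, TimeoutException {
        long deadline = System.currentTimeMillis() + timeoutMs;
        while (!condition.get()) {
          if (System.currentTimeMillis() > deadline) {
            throw new TimeoutException(message);
          }
          Thread.sleep(intervalMs);
        }
      }
    }

Usage would mirror the patched call, e.g. WaitUtil.awaitCondition(dn::isDatanodeFullyStarted, 100, timeout, "Datanode is not started ...").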

TestFsDatasetImpl.java

@@ -50,10 +50,8 @@
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
-import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
-import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.datanode.BlockScanner;
 import org.apache.hadoop.hdfs.server.datanode.DNConf;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
@@ -78,7 +76,9 @@
 import org.apache.hadoop.util.StringUtils;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 import org.mockito.Mockito;

 import java.io.File;
@@ -124,7 +124,10 @@
 import org.slf4j.LoggerFactory;

 public class TestFsDatasetImpl {
-  Logger LOG = LoggerFactory.getLogger(TestFsDatasetImpl.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      TestFsDatasetImpl.class);
+
   private static final String BASE_DIR =
       new FileSystemTestHelper().getTestRootDir();
   private String replicaCacheRootDir = BASE_DIR + Path.SEPARATOR + "cache";
@@ -132,10 +135,6 @@ public class TestFsDatasetImpl {
   private static final String CLUSTER_ID = "cluser-id";
   private static final String[] BLOCK_POOL_IDS = {"bpid-0", "bpid-1"};

-  // Use to generate storageUuid
-  private static final DataStorage dsForStorageUuid = new DataStorage(
-      new StorageInfo(HdfsServerConstants.NodeType.DATA_NODE));
-
   private Configuration conf;
   private DataNode datanode;
   private DataStorage storage;
@@ -143,6 +142,9 @@ public class TestFsDatasetImpl {

   private final static String BLOCKPOOL = "BP-TEST";

+  @Rule
+  public TestName name = new TestName();
+
   private static Storage.StorageDirectory createStorageDirectory(File root,
       Configuration conf)
       throws SecurityException, IOException {
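Note: the TestName rule is what lets every test below replace the shared literal "testData" path with name.getMethodName(), so tests no longer collide on the same HDFS file. A minimal JUnit 4 sketch of the rule (class name illustrative):

    import static org.junit.Assert.assertEquals;

    import org.junit.Rule;
    import org.junit.Test;
    import org.junit.rules.TestName;

    public class TestNameRuleExample {
      @Rule
      public TestName name = new TestName();

      @Test
      public void testUsesOwnName() {
        // Each test sees its own method name, so paths derived from it
        // cannot collide across tests in the class.
        assertEquals("testUsesOwnName", name.getMethodName());
      }
    }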
@@ -228,6 +230,7 @@ public void setUp() throws IOException {
     assertEquals(NUM_INIT_VOLUMES, getNumVolumes());
     assertEquals(0, dataset.getNumFailedVolumes());
   }
+
   @Test(timeout=10000)
   public void testReadLockEnabledByDefault()
       throws Exception {
@@ -1132,7 +1135,7 @@ public void testReportBadBlocks() throws Exception {
       Assert.assertEquals(0, cluster.getNamesystem().getCorruptReplicaBlocks());

       FileSystem fs = cluster.getFileSystem();
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       DFSTestUtil.createFile(fs, filePath, 1, (short) 1, 0);

       block = DFSTestUtil.getFirstBlock(fs, filePath);
@@ -1179,7 +1182,7 @@ private void testMoveBlockFailure(Configuration config) {
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
@@ -1227,7 +1230,7 @@ public void testMoveBlockSuccess() {
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
       ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
@@ -1266,7 +1269,7 @@ public void testDnRestartWithHardLinkInTmp() {
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
@@ -1307,23 +1310,34 @@ public void testDnRestartWithHardLinkInTmp() {
    * DiskScanner should clean up the hardlink correctly.
    */
   @Test(timeout = 30000)
-  public void testDnRestartWithHardLink() {
+  public void testDnRestartWithHardLink() throws Exception {
     MiniDFSCluster cluster = null;
+    boolean isReplicaDeletionEnabled =
+        conf.getBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+            DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION_DEFAULT);
     try {
       conf.setBoolean(DFSConfigKeys
           .DFS_DATANODE_ALLOW_SAME_DISK_TIERING, true);
       conf.setDouble(DFSConfigKeys
           .DFS_DATANODE_RESERVE_FOR_ARCHIVE_DEFAULT_PERCENTAGE, 0.5);
+      // Since Datanode restart in the middle of block movement may leave
+      // uncleaned hardlink, disabling this config (i.e. deletion of duplicate
+      // replica) will prevent this edge-case from happening.
+      // We also re-enable deletion of duplicate replica just before starting
+      // Dir Scanner using setDeleteDuplicateReplicasForTests (HDFS-16213).
+      conf.setBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+          false);
       cluster = new MiniDFSCluster.Builder(conf)
           .numDataNodes(1)
           .storageTypes(
               new StorageType[]{StorageType.DISK, StorageType.ARCHIVE})
           .storagesPerDatanode(2)
           .build();
+      cluster.waitActive();
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
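Note: because `conf` is shared by every test in the class, the test snapshots the duplicate-replica-deletion flag before overriding it and restores it in the finally block shown further below. A hedged generic sketch of that save-set-restore pattern (method and key names are illustrative):

    import org.apache.hadoop.conf.Configuration;

    public class ConfigRestoreExample {
      // Save the current value, override it for the test body, and restore
      // it in finally so later tests see the original setting.
      static void runWithFlagDisabled(Configuration conf, String key,
          boolean defaultValue, Runnable body) {
        boolean saved = conf.getBoolean(key, defaultValue);
        try {
          conf.setBoolean(key, false);
          body.run();
        } finally {
          conf.setBoolean(key, saved);
        }
      }
    }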
@@ -1331,11 +1345,15 @@ public void testDnRestartWithHardLink() {

       FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
       final ReplicaInfo oldReplicaInfo = fsDataSetImpl.getReplicaInfo(block);
+      StorageType oldStorageType = oldReplicaInfo.getVolume().getStorageType();

       fsDataSetImpl.finalizeNewReplica(
           createNewReplicaObjWithLink(block, fsDataSetImpl), block);

       ReplicaInfo newReplicaInfo = fsDataSetImpl.getReplicaInfo(block);
+      StorageType newStorageType = newReplicaInfo.getVolume().getStorageType();
+      assertEquals(StorageType.DISK, oldStorageType);
+      assertEquals(StorageType.ARCHIVE, newStorageType);

       cluster.restartDataNode(0);
       cluster.waitDatanodeFullyStarted(cluster.getDataNodes().get(0), 60000);
@@ -1344,26 +1362,35 @@ public void testDnRestartWithHardLink() {

       assertTrue(Files.exists(Paths.get(newReplicaInfo.getBlockURI())));
       assertTrue(Files.exists(Paths.get(oldReplicaInfo.getBlockURI())));

-      DirectoryScanner scanner = new DirectoryScanner(
-          cluster.getDataNodes().get(0).getFSDataset(), conf);
+      // Before starting Dir Scanner, we should enable deleteDuplicateReplicas.
+      FsDatasetSpi<?> fsDataset = cluster.getDataNodes().get(0).getFSDataset();
+      DirectoryScanner scanner = new DirectoryScanner(fsDataset, conf);
+      FsVolumeImpl fsVolume =
+          (FsVolumeImpl) fsDataset.getFsVolumeReferences().get(0);
+      fsVolume.getBlockPoolSlice(fsVolume.getBlockPoolList()[0])
+          .setDeleteDuplicateReplicasForTests(true);
       scanner.start();
       scanner.run();

-      GenericTestUtils.waitFor(new Supplier<Boolean>() {
-        @Override public Boolean get() {
-          return !Files.exists(Paths.get(oldReplicaInfo.getBlockURI()));
-        }
-      }, 100, 10000);
+      GenericTestUtils.waitFor(
+          () -> !Files.exists(Paths.get(oldReplicaInfo.getBlockURI())),
+          100, 10000, "Old replica is not deleted by DirScanner even after "
+              + "10s of waiting has elapsed");
       assertTrue(Files.exists(Paths.get(newReplicaInfo.getBlockURI())));

       validateFileLen(fs, fileLen, filePath);
-    } catch (Exception ex) {
-      LOG.info("Exception in testDnRestartWithHardLink ", ex);
-      fail("Exception while testing testDnRestartWithHardLink ");
+      // Additional tests to ensure latest replica gets deleted after file
+      // deletion.
+      fs.delete(filePath, false);
+      GenericTestUtils.waitFor(
+          () -> !Files.exists(Paths.get(newReplicaInfo.getBlockURI())),
+          100, 10000);
     } finally {
-      if (cluster.isClusterUp()) {
-        cluster.shutdown();
+      conf.setBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+          isReplicaDeletionEnabled);
+      if (cluster != null && cluster.isClusterUp()) {
+        cluster.shutdown(true, true);
       }
     }
   }
@@ -1384,7 +1411,7 @@ public void testMoveBlockSuccessWithSameMountMove() {
           .build();
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
@@ -1432,7 +1459,7 @@ public void testMoveBlockWithSameMountMoveWithoutSpace() {
           .build();
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);

-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);