From feee41aa00c3d33096013674aef4e8f26af4f364 Mon Sep 17 00:00:00 2001
From: Viraj Jasani
Date: Tue, 21 Sep 2021 06:41:20 +0530
Subject: [PATCH] HDFS-16213. Flaky test TestFsDatasetImpl#testDnRestartWithHardLink (#3386)

Reviewed-by: Fei Hui
Reviewed-by: Wei-Chiu Chuang
Signed-off-by: Akira Ajisaka
---
 .../hdfs/server/datanode/FileIoProvider.java  |  1 +
 .../fsdataset/impl/BlockPoolSlice.java        | 16 +++-
 .../fsdataset/impl/FsDatasetImpl.java         | 17 ++--
 .../apache/hadoop/hdfs/MiniDFSCluster.java    |  8 +-
 .../fsdataset/impl/TestFsDatasetImpl.java     | 81 ++++++++++++-------
 5 files changed, 82 insertions(+), 41 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FileIoProvider.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FileIoProvider.java
index 11577c3d7b..7d6435ea3f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FileIoProvider.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FileIoProvider.java
@@ -513,6 +513,7 @@ public boolean fullyDelete(@Nullable FsVolumeSpi volume, File dir) {
     try {
       faultInjectorEventHook.beforeMetadataOp(volume, DELETE);
       boolean deleted = FileUtil.fullyDelete(dir);
+      LOG.trace("Deletion of dir {} {}", dir, deleted ? "succeeded" : "failed");
       profilingEventHook.afterMetadataOp(volume, DELETE, begin);
       return deleted;
     } catch(Exception e) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
index 20df0e986b..b70199f962 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
@@ -102,7 +102,12 @@ class BlockPoolSlice {
   private final Runnable shutdownHook;
   private volatile boolean dfsUsedSaved = false;
   private static final int SHUTDOWN_HOOK_PRIORITY = 30;
-  private final boolean deleteDuplicateReplicas;
+
+  /**
+   * Only tests are allowed to modify the value. For source code,
+   * this should be treated as final only.
+   */
+  private boolean deleteDuplicateReplicas;
   private static final String REPLICA_CACHE_FILE = "replicas";
   private final long replicaCacheExpiry;
   private final File replicaCacheDir;
@@ -440,7 +445,7 @@ void getVolumeMap(ReplicaMap volumeMap,
           "Recovered " + numRecovered + " replicas from " + lazypersistDir);
     }
 
-    boolean success = readReplicasFromCache(volumeMap, lazyWriteReplicaMap);
+    boolean success = readReplicasFromCache(volumeMap, lazyWriteReplicaMap);
     if (!success) {
       List<IOException> exceptions = Collections
           .synchronizedList(new ArrayList<IOException>());
@@ -1081,4 +1086,11 @@ public static void reInitializeAddReplicaThreadPool() {
     addReplicaThreadPool.shutdown();
     addReplicaThreadPool = null;
   }
+
+  @VisibleForTesting
+  void setDeleteDuplicateReplicasForTests(
+      boolean deleteDuplicateReplicasForTests) {
+    this.deleteDuplicateReplicas = deleteDuplicateReplicasForTests;
+  }
+
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
index 597ffbe806..09d48461e9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
@@ -26,6 +26,7 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
 import java.nio.channels.ClosedChannelException;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -1049,12 +1050,17 @@ static File[] copyBlockFiles(ReplicaInfo srcReplica, File dstMeta,
 
   static File[] hardLinkBlockFiles(ReplicaInfo srcReplica, File dstMeta,
       File dstFile) throws IOException {
+    FsVolumeSpi srcReplicaVolume = srcReplica.getVolume();
+    File destParentFile = dstFile.getParentFile();
     // Create parent folder if not exists.
-    srcReplica.getFileIoProvider()
-        .mkdirs(srcReplica.getVolume(), dstFile.getParentFile());
+    boolean isDirCreated = srcReplica.getFileIoProvider()
+        .mkdirs(srcReplicaVolume, destParentFile);
+    LOG.trace("Dir creation of {} on volume {} {}", destParentFile,
+        srcReplicaVolume, isDirCreated ? "succeeded" : "failed");
+    URI srcReplicaUri = srcReplica.getBlockURI();
     try {
       HardLink.createHardLink(
-          new File(srcReplica.getBlockURI()), dstFile);
+          new File(srcReplicaUri), dstFile);
     } catch (IOException e) {
       throw new IOException("Failed to hardLink "
           + srcReplica + " block file to "
@@ -1068,9 +1074,8 @@ static File[] hardLinkBlockFiles(ReplicaInfo srcReplica, File dstMeta,
           + srcReplica + " metadata to "
           + dstMeta, e);
     }
-    if (LOG.isDebugEnabled()) {
-      LOG.info("Linked " + srcReplica.getBlockURI() + " to " + dstFile);
-    }
+    LOG.debug("Linked {} to {} . Dest meta file: {}", srcReplicaUri, dstFile,
+        dstMeta);
     return new File[]{dstMeta, dstFile};
   }
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index 4de55d6406..04a7611032 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -2483,12 +2483,8 @@ public boolean restartDataNode(DataNodeProperties dnprop) throws IOException {
 
   public void waitDatanodeFullyStarted(DataNode dn, int timeout)
       throws TimeoutException, InterruptedException {
-    GenericTestUtils.waitFor(new Supplier<Boolean>() {
-      @Override
-      public Boolean get() {
-        return dn.isDatanodeFullyStarted();
-      }
-    }, 100, timeout);
+    GenericTestUtils.waitFor(dn::isDatanodeFullyStarted, 100, timeout,
+        "Datanode is not started even after " + timeout + " ms of waiting");
   }
 
   private void waitDataNodeFullyStarted(final DataNode dn)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java
index 9b659d9150..1ae7582a32 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java
@@ -50,10 +50,8 @@
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
-import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
-import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.datanode.BlockScanner;
 import org.apache.hadoop.hdfs.server.datanode.DNConf;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
@@ -78,7 +76,9 @@
 import org.apache.hadoop.util.StringUtils;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 import org.mockito.Mockito;
 
 import java.io.File;
@@ -124,7 +124,10 @@
 import org.slf4j.LoggerFactory;
 
 public class TestFsDatasetImpl {
-  Logger LOG = LoggerFactory.getLogger(TestFsDatasetImpl.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      TestFsDatasetImpl.class);
+
   private static final String BASE_DIR =
       new FileSystemTestHelper().getTestRootDir();
   private String replicaCacheRootDir = BASE_DIR + Path.SEPARATOR + "cache";
@@ -132,10 +135,6 @@ public class TestFsDatasetImpl {
   private static final String CLUSTER_ID = "cluser-id";
   private static final String[] BLOCK_POOL_IDS = {"bpid-0", "bpid-1"};
 
-  // Use to generate storageUuid
-  private static final DataStorage dsForStorageUuid = new DataStorage(
-      new StorageInfo(HdfsServerConstants.NodeType.DATA_NODE));
-
   private Configuration conf;
   private DataNode datanode;
   private DataStorage storage;
@@ -143,6 +142,9 @@ public class TestFsDatasetImpl {
 
   private final static String BLOCKPOOL = "BP-TEST";
 
+  @Rule
+  public TestName name = new TestName();
+
   private static Storage.StorageDirectory createStorageDirectory(File root,
       Configuration conf)
       throws SecurityException, IOException {
@@ -228,6 +230,7 @@ public void setUp() throws IOException {
 
     assertEquals(NUM_INIT_VOLUMES, getNumVolumes());
     assertEquals(0, dataset.getNumFailedVolumes());
   }
+
   @Test(timeout=10000)
   public void testReadLockEnabledByDefault() throws Exception {
@@ -1132,7 +1135,7 @@ public void testReportBadBlocks() throws Exception {
       Assert.assertEquals(0, cluster.getNamesystem().getCorruptReplicaBlocks());
 
       FileSystem fs = cluster.getFileSystem();
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       DFSTestUtil.createFile(fs, filePath, 1, (short) 1, 0);
 
       block = DFSTestUtil.getFirstBlock(fs, filePath);
@@ -1179,7 +1182,7 @@ private void testMoveBlockFailure(Configuration config) {
 
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
 
@@ -1227,7 +1230,7 @@ public void testMoveBlockSuccess() {
 
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
 
       ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
@@ -1266,7 +1269,7 @@ public void testDnRestartWithHardLinkInTmp() {
 
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
 
@@ -1307,23 +1310,34 @@ public void testDnRestartWithHardLinkInTmp() {
    * DiskScanner should clean up the hardlink correctly.
    */
   @Test(timeout = 30000)
-  public void testDnRestartWithHardLink() {
+  public void testDnRestartWithHardLink() throws Exception {
     MiniDFSCluster cluster = null;
+    boolean isReplicaDeletionEnabled =
+        conf.getBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+            DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION_DEFAULT);
     try {
       conf.setBoolean(DFSConfigKeys
          .DFS_DATANODE_ALLOW_SAME_DISK_TIERING, true);
       conf.setDouble(DFSConfigKeys
          .DFS_DATANODE_RESERVE_FOR_ARCHIVE_DEFAULT_PERCENTAGE, 0.5);
+      // Since Datanode restart in the middle of block movement may leave
+      // uncleaned hardlink, disabling this config (i.e. deletion of duplicate
+      // replica) will prevent this edge-case from happening.
+      // We also re-enable deletion of duplicate replica just before starting
+      // Dir Scanner using setDeleteDuplicateReplicasForTests (HDFS-16213).
+      conf.setBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+          false);
       cluster = new MiniDFSCluster.Builder(conf)
           .numDataNodes(1)
           .storageTypes(
               new StorageType[]{StorageType.DISK, StorageType.ARCHIVE})
           .storagesPerDatanode(2)
           .build();
+      cluster.waitActive();
 
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
 
@@ -1331,11 +1345,15 @@ public void testDnRestartWithHardLink() {
 
       FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
       final ReplicaInfo oldReplicaInfo = fsDataSetImpl.getReplicaInfo(block);
+      StorageType oldStorageType = oldReplicaInfo.getVolume().getStorageType();
 
       fsDataSetImpl.finalizeNewReplica(
           createNewReplicaObjWithLink(block, fsDataSetImpl), block);
 
       ReplicaInfo newReplicaInfo = fsDataSetImpl.getReplicaInfo(block);
+      StorageType newStorageType = newReplicaInfo.getVolume().getStorageType();
+      assertEquals(StorageType.DISK, oldStorageType);
+      assertEquals(StorageType.ARCHIVE, newStorageType);
 
       cluster.restartDataNode(0);
       cluster.waitDatanodeFullyStarted(cluster.getDataNodes().get(0), 60000);
@@ -1344,26 +1362,35 @@ public void testDnRestartWithHardLink() {
 
       assertTrue(Files.exists(Paths.get(newReplicaInfo.getBlockURI())));
       assertTrue(Files.exists(Paths.get(oldReplicaInfo.getBlockURI())));
 
-      DirectoryScanner scanner = new DirectoryScanner(
-          cluster.getDataNodes().get(0).getFSDataset(), conf);
+      // Before starting Dir Scanner, we should enable deleteDuplicateReplicas.
+      FsDatasetSpi<?> fsDataset = cluster.getDataNodes().get(0).getFSDataset();
+      DirectoryScanner scanner = new DirectoryScanner(fsDataset, conf);
+      FsVolumeImpl fsVolume =
+          (FsVolumeImpl) fsDataset.getFsVolumeReferences().get(0);
+      fsVolume.getBlockPoolSlice(fsVolume.getBlockPoolList()[0])
+          .setDeleteDuplicateReplicasForTests(true);
       scanner.start();
       scanner.run();
 
-      GenericTestUtils.waitFor(new Supplier<Boolean>() {
-        @Override public Boolean get() {
-          return !Files.exists(Paths.get(oldReplicaInfo.getBlockURI()));
-        }
-      }, 100, 10000);
+      GenericTestUtils.waitFor(
+          () -> !Files.exists(Paths.get(oldReplicaInfo.getBlockURI())),
+          100, 10000, "Old replica is not deleted by DirScanner even after "
+              + "10s of waiting has elapsed");
 
       assertTrue(Files.exists(Paths.get(newReplicaInfo.getBlockURI())));
       validateFileLen(fs, fileLen, filePath);
-    } catch (Exception ex) {
-      LOG.info("Exception in testDnRestartWithHardLink ", ex);
-      fail("Exception while testing testDnRestartWithHardLink ");
+      // Additional tests to ensure latest replica gets deleted after file
+      // deletion.
+      fs.delete(filePath, false);
+      GenericTestUtils.waitFor(
+          () -> !Files.exists(Paths.get(newReplicaInfo.getBlockURI())),
+          100, 10000);
     } finally {
-      if (cluster.isClusterUp()) {
-        cluster.shutdown();
+      conf.setBoolean(DFSConfigKeys.DFS_DATANODE_DUPLICATE_REPLICA_DELETION,
+          isReplicaDeletionEnabled);
+      if (cluster != null && cluster.isClusterUp()) {
+        cluster.shutdown(true, true);
       }
     }
   }
@@ -1384,7 +1411,7 @@ public void testMoveBlockSuccessWithSameMountMove() {
           .build();
       FileSystem fs = cluster.getFileSystem();
       DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
 
@@ -1432,7 +1459,7 @@ public void testMoveBlockWithSameMountMoveWithoutSpace() {
           .build();
       FileSystem fs = cluster.getFileSystem();
      DataNode dataNode = cluster.getDataNodes().get(0);
-      Path filePath = new Path("testData");
+      Path filePath = new Path(name.getMethodName());
       long fileLen = 100;
       ExtendedBlock block = createTestFile(fs, fileLen, filePath);
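
Editor's sketch (not part of the committed patch): the test above re-enables duplicate replica deletion only on the first volume's first block pool before kicking off the DirectoryScanner. Under the assumption that a test living in the same org.apache.hadoop.hdfs.server.datanode.fsdataset.impl package wanted to do the same for every volume, a hypothetical helper could look like the following. The class and method names (DuplicateReplicaTestUtil, enableDuplicateReplicaDeletion) are illustrative only; the calls it makes (getFsVolumeReferences, getBlockPoolList, getBlockPoolSlice, setDeleteDuplicateReplicasForTests) are the same ones exercised by the hunks above.

package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;

import java.io.IOException;

import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;

final class DuplicateReplicaTestUtil {

  private DuplicateReplicaTestUtil() {
  }

  // Re-enable deletion of duplicate replicas on every block pool of every
  // volume of the given dataset, mirroring what testDnRestartWithHardLink
  // does for a single volume. Assumes the volumes are FsVolumeImpl instances,
  // as they are in MiniDFSCluster-based tests.
  static void enableDuplicateReplicaDeletion(FsDatasetSpi<?> fsDataset)
      throws IOException {
    try (FsDatasetSpi.FsVolumeReferences volumes =
        fsDataset.getFsVolumeReferences()) {
      for (FsVolumeSpi volume : volumes) {
        FsVolumeImpl fsVolume = (FsVolumeImpl) volume;
        for (String bpid : fsVolume.getBlockPoolList()) {
          fsVolume.getBlockPoolSlice(bpid)
              .setDeleteDuplicateReplicasForTests(true);
        }
      }
    }
  }
}

A test could call such a helper right before constructing the DirectoryScanner, in place of the single-volume getBlockPoolSlice call shown in the hunk above.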