HDFS-17342. Fix DataNode may invalidate normal block causing missing block (#6464). Contributed by Haiyang Hu.

Reviewed-by: ZanderXu <zanderxu@apache.org>
Reviewed-by: Chengwei Wang <1139557635@qq.com>
Signed-off-by: Shuyan Zhang <zhangshuyan@apache.org>
This commit is contained in:
huhaiyang 2024-02-06 17:52:52 +08:00 committed by GitHub
parent 9a7eeadaac
commit 5ad7737132
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 114 additions and 5 deletions

View File

@ -167,4 +167,9 @@ public void delayDeleteReplica() {}
* Just delay run diff record a while. * Just delay run diff record a while.
*/ */
public void delayDiffRecord() {} public void delayDiffRecord() {}
/**
* Fault-injection hook: delay getMetaDataInputStream for a while.
* No-op by default; tests override this to widen race windows
* (e.g. a block read racing with block finalization).
*/
public void delayGetMetaDataInputStream() {}
} }

View File

@ -63,6 +63,7 @@
import org.apache.hadoop.hdfs.server.common.AutoCloseDataSetLock; import org.apache.hadoop.hdfs.server.common.AutoCloseDataSetLock;
import org.apache.hadoop.hdfs.server.common.DataNodeLockManager; import org.apache.hadoop.hdfs.server.common.DataNodeLockManager;
import org.apache.hadoop.hdfs.server.common.DataNodeLockManager.LockLevel; import org.apache.hadoop.hdfs.server.common.DataNodeLockManager.LockLevel;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
import org.apache.hadoop.hdfs.server.datanode.DataSetLockManager; import org.apache.hadoop.hdfs.server.datanode.DataSetLockManager;
import org.apache.hadoop.hdfs.server.datanode.FileIoProvider; import org.apache.hadoop.hdfs.server.datanode.FileIoProvider;
import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
@ -247,6 +248,7 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
if (info == null || !info.metadataExists()) { if (info == null || !info.metadataExists()) {
return null; return null;
} }
DataNodeFaultInjector.get().delayGetMetaDataInputStream();
return info.getMetadataInputStream(0); return info.getMetadataInputStream(0);
} }
@ -2403,8 +2405,9 @@ public void invalidate(String bpid, ReplicaInfo block) {
* *
* @param bpid the block pool ID. * @param bpid the block pool ID.
* @param block The block to be invalidated. * @param block The block to be invalidated.
* @param checkFiles Whether to check data and meta files.
*/ */
public void invalidateMissingBlock(String bpid, Block block) { public void invalidateMissingBlock(String bpid, Block block, boolean checkFiles) {
// The replica seems is on its volume map but not on disk. // The replica seems is on its volume map but not on disk.
// We can't confirm here is block file lost or disk failed. // We can't confirm here is block file lost or disk failed.
@ -2416,11 +2419,21 @@ public void invalidateMissingBlock(String bpid, Block block) {
// So remove it from volume map notify namenode is ok. // So remove it from volume map notify namenode is ok.
try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl, try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl,
bpid)) { bpid)) {
ReplicaInfo replica = volumeMap.remove(bpid, block); // Check if this block is on the volume map.
invalidate(bpid, replica); ReplicaInfo replica = volumeMap.get(bpid, block);
// Double-check block or meta file existence when checkFiles as true.
if (replica != null && (!checkFiles ||
(!replica.blockDataExists() || !replica.metadataExists()))) {
volumeMap.remove(bpid, block);
invalidate(bpid, replica);
}
} }
} }
/**
* Invalidate a missing block, double-checking block and meta file
* existence first (checkFiles = true) so that a replica whose files are
* actually present on disk is never removed by mistake.
*
* Delegates to {@code invalidateMissingBlock(bpid, block, true)}.
*
* @param bpid the block pool ID.
* @param block the block to be invalidated.
*/
public void invalidateMissingBlock(String bpid, Block block) {
invalidateMissingBlock(bpid, block, true);
}
/** /**
* Remove Replica from ReplicaMap. * Remove Replica from ReplicaMap.
* *

View File

@ -1962,7 +1962,7 @@ public void delayDeleteReplica() {
* 4. block would be recovered when disk back to normal. * 4. block would be recovered when disk back to normal.
*/ */
@Test @Test
public void tesInvalidateMissingBlock() throws Exception { public void testInvalidateMissingBlock() throws Exception {
long blockSize = 1024; long blockSize = 1024;
int heartbeatInterval = 1; int heartbeatInterval = 1;
HdfsConfiguration c = new HdfsConfiguration(); HdfsConfiguration c = new HdfsConfiguration();
@ -1988,7 +1988,7 @@ public void tesInvalidateMissingBlock() throws Exception {
File metaFile = new File(metaPath); File metaFile = new File(metaPath);
// Mock local block file not found when disk with some exception. // Mock local block file not found when disk with some exception.
fsdataset.invalidateMissingBlock(bpid, replicaInfo); fsdataset.invalidateMissingBlock(bpid, replicaInfo, false);
// Assert local block file wouldn't be deleted from disk. // Assert local block file wouldn't be deleted from disk.
assertTrue(blockFile.exists()); assertTrue(blockFile.exists());
@ -2011,4 +2011,95 @@ public void tesInvalidateMissingBlock() throws Exception {
cluster.shutdown(); cluster.shutdown();
} }
} }
/**
* Regression test for HDFS-17342: a block read that races with
* finalizeBlock() must not cause the DataNode to invalidate a healthy
* replica. The read path is delayed inside getMetaDataInputStream (via
* the fault injector) while the replica is finalized concurrently; the
* reader may then hit FileNotFoundException on the stale meta file path,
* but the replica itself must survive in the volume map.
*/
@Test
public void testCheckFilesWhenInvalidateMissingBlock() throws Exception {
// Small block size and fast heartbeats keep the test quick.
long blockSize = 1024;
int heartbeatInterval = 1;
HdfsConfiguration c = new HdfsConfiguration();
c.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, heartbeatInterval);
c.setLong(DFS_BLOCK_SIZE_KEY, blockSize);
MiniDFSCluster cluster = new MiniDFSCluster.Builder(c).
numDataNodes(1).build();
// Remember the current injector so it can be restored in the finally block.
DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get();
try {
cluster.waitActive();
// Capture DataNode logs to later assert the expected read-side exception.
GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer.
captureLogs(DataNode.LOG);
BlockReaderTestUtil util = new BlockReaderTestUtil(cluster, new
HdfsConfiguration(conf));
Path path = new Path("/testFile");
util.writeFile(path, 1);
String bpid = cluster.getNameNode().getNamesystem().getBlockPoolId();
DataNode dn = cluster.getDataNodes().get(0);
FsDatasetImpl dnFSDataset = (FsDatasetImpl) dn.getFSDataset();
// Exactly one finalized replica is expected for the single written block.
List<ReplicaInfo> replicaInfos = dnFSDataset.getFinalizedBlocks(bpid);
assertEquals(1, replicaInfos.size());
DFSTestUtil.readFile(cluster.getFileSystem(), path);
LocatedBlock blk = util.getFileBlocks(path, 512).get(0);
ExtendedBlock block = blk.getBlock();
// Append a new block with an incremented generation stamp.
long newGS = block.getGenerationStamp() + 1;
dnFSDataset.append(block, newGS, 1024);
block.setGenerationStamp(newGS);
// Snapshot the RBW replica's meta URI; its path appears in the expected
// FileNotFoundException message once finalization renames the files.
ReplicaInfo tmpReplicaInfo = dnFSDataset.getReplicaInfo(blk.getBlock());
DataNodeFaultInjector injector = new DataNodeFaultInjector() {
@Override
public void delayGetMetaDataInputStream() {
try {
// Hold the read inside getMetaDataInputStream long enough for the
// concurrent finalizeBlock() below to win the race.
Thread.sleep(8000);
} catch (InterruptedException e) {
// Ignore exception.
}
}
};
// Install the injector so getMetaDataInputStream is delayed,
// widening the read/finalize race window.
DataNodeFaultInjector.set(injector);
ExecutorService executorService = Executors.newFixedThreadPool(2);
try {
Future<?> blockReaderFuture = executorService.submit(() -> {
try {
// Submit tasks for reading block.
BlockReader blockReader = BlockReaderTestUtil.getBlockReader(
cluster.getFileSystem(), blk, 0, 512);
blockReader.close();
} catch (IOException e) {
// Ignore exception.
}
});
Future<?> finalizeBlockFuture = executorService.submit(() -> {
try {
// Submit tasks for finalizing block; the 1s sleep lets the reader
// enter the delayed getMetaDataInputStream first.
Thread.sleep(1000);
dnFSDataset.finalizeBlock(block, false);
} catch (Exception e) {
// Ignore exception
}
});
// Wait for both tasks to complete.
blockReaderFuture.get();
finalizeBlockFuture.get();
} finally {
executorService.shutdown();
}
// Validate that the replica still exists in the volume map — the racing
// read must not have invalidated the healthy, finalized replica.
assertNotNull(dnFSDataset.getReplicaInfo(blk.getBlock()));
// Check DN log for the FileNotFoundException on the stale meta path.
String expectedMsg = String.format("opReadBlock %s received exception " +
"java.io.FileNotFoundException: %s (No such file or directory)",
blk.getBlock(), tmpReplicaInfo.getMetadataURI().getPath());
assertTrue("Expected log message not found in DN log.",
logCapturer.getOutput().contains(expectedMsg));
} finally {
cluster.shutdown();
// Restore the original injector so later tests are unaffected.
DataNodeFaultInjector.set(oldDnInjector);
}
}
} }