HDFS-16841. Enhance the function of DebugAdmin#VerifyECCommand (#5137)

This commit is contained in:
huhaiyang 2022-11-24 09:17:27 +08:00 committed by GitHub
parent bcc3d2a20e
commit ef84d21867
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 82 additions and 19 deletions

View File

@ -432,8 +432,13 @@ private class VerifyECCommand extends DebugCommand {
VerifyECCommand() {
super("verifyEC",
"verifyEC -file <file>",
" Verify HDFS erasure coding on all block groups of the file.");
"verifyEC -file <file> [-blockId <blk_Id>] [-skipFailureBlocks]",
" -file Verify HDFS erasure coding on all block groups of the file." +
System.lineSeparator() +
" -skipFailureBlocks specify will skip any block group failures during verify," +
" and continues verify all block groups of the file," + System.lineSeparator() +
" the default is not to skip failure blocks." + System.lineSeparator() +
" -blockId specify blk_Id to verify for a specific one block group.");
}
int run(List<String> args) throws IOException {
@ -480,30 +485,48 @@ int run(List<String> args) throws IOException {
this.parityBlkNum = ecPolicy.getNumParityUnits();
this.cellSize = ecPolicy.getCellSize();
this.encoder = CodecUtil.createRawEncoder(getConf(), ecPolicy.getCodecName(),
new ErasureCoderOptions(
ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits()));
new ErasureCoderOptions(dataBlkNum, parityBlkNum));
int blockNum = dataBlkNum + parityBlkNum;
this.readService = new ExecutorCompletionService<>(
DFSUtilClient.getThreadPoolExecutor(blockNum, blockNum, 60,
new LinkedBlockingQueue<>(), "read-", false));
this.blockReaders = new BlockReader[dataBlkNum + parityBlkNum];
this.blockReaders = new BlockReader[blockNum];
String needToVerifyBlockId = StringUtils.popOptionWithArgument("-blockId", args);
boolean skipFailureBlocks = StringUtils.popOption("-skipFailureBlocks", args);
boolean isHealthy = true;
for (LocatedBlock locatedBlock : locatedBlocks.getLocatedBlocks()) {
System.out.println("Checking EC block group: blk_" + locatedBlock.getBlock().getBlockId());
LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
String blockName = locatedBlock.getBlock().getBlockName();
if (needToVerifyBlockId == null || needToVerifyBlockId.equals(blockName)) {
System.out.println("Checking EC block group: " + blockName);
LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
try {
verifyBlockGroup(blockGroup);
System.out.println("Status: OK");
} catch (Exception e) {
System.err.println("Status: ERROR, message: " + e.getMessage());
return 1;
} finally {
closeBlockReaders();
try {
verifyBlockGroup(blockGroup);
System.out.println("Status: OK");
} catch (Exception e) {
System.err.println("Status: ERROR, message: " + e.getMessage());
isHealthy = false;
if (!skipFailureBlocks) {
break;
}
} finally {
closeBlockReaders();
}
if (needToVerifyBlockId != null) {
break;
}
}
}
System.out.println("\nAll EC block group status: OK");
return 0;
if (isHealthy) {
if (needToVerifyBlockId == null) {
System.out.println("\nAll EC block group status: OK");
}
return 0;
}
return 1;
}
private void verifyBlockGroup(LocatedStripedBlock blockGroup) throws Exception {

View File

@ -194,8 +194,13 @@ public void testVerifyECCommand() throws Exception {
cluster.waitActive();
DistributedFileSystem fs = cluster.getFileSystem();
assertEquals("ret: 1, verifyEC -file <file> Verify HDFS erasure coding on " +
"all block groups of the file.", runCmd(new String[]{"verifyEC"}));
assertEquals("ret: 1, verifyEC -file <file> [-blockId <blk_Id>] " +
"[-skipFailureBlocks] -file Verify HDFS erasure coding on all block groups of the file." +
" -skipFailureBlocks specify will skip any block group failures during verify," +
" and continues verify all block groups of the file," +
" the default is not to skip failure blocks." +
" -blockId specify blk_Id to verify for a specific one block group.",
runCmd(new String[]{"verifyEC"}));
assertEquals("ret: 1, File /bar does not exist.",
runCmd(new String[]{"verifyEC", "-file", "/bar"}));
@ -270,6 +275,41 @@ public void testVerifyECCommand() throws Exception {
"-out", metaFile.getAbsolutePath()});
assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_corrupt"})
.contains("Status: ERROR, message: EC compute result not match."));
// Specify -blockId.
Path newFile = new Path(ecDir, "foo_new");
DFSTestUtil.createFile(fs, newFile, (int) k, 6 * m, m, repl, seed);
blocks = DFSTestUtil.getAllBlocks(fs, newFile);
assertEquals(2, blocks.size());
blockGroup = (LocatedStripedBlock) blocks.get(0);
String blockName = blockGroup.getBlock().getBlockName();
assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-blockId", blockName})
.contains("ret: 0, Checking EC block group: " + blockName + "Status: OK"));
// Specify -skipFailureBlocks.
indexedBlocks = StripedBlockUtil.parseStripedBlockGroup(blockGroup,
ecPolicy.getCellSize(), ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits());
// Try to corrupt block 0 in the block group.
toCorruptLocatedBlock = indexedBlocks[0];
toCorruptBlock = toCorruptLocatedBlock.getBlock();
datanode = cluster.getDataNode(toCorruptLocatedBlock.getLocations()[0].getIpcPort());
blockFile = getBlockFile(datanode.getFSDataset(),
toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
metaFile = getMetaFile(datanode.getFSDataset(),
toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
metaFile.delete();
// Write error bytes to block file and re-generate meta checksum.
errorBytes = new byte[1048576];
new Random(0x12345678L).nextBytes(errorBytes);
FileUtils.writeByteArrayToFile(blockFile, errorBytes);
runCmd(new String[]{"computeMeta", "-block", blockFile.getAbsolutePath(),
"-out", metaFile.getAbsolutePath()});
// VerifyEC and set skipFailureBlocks.
LocatedStripedBlock blockGroup2 = (LocatedStripedBlock) blocks.get(1);
assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-skipFailureBlocks"})
.contains("ret: 1, Checking EC block group: " + blockGroup.getBlock().getBlockName() +
"Status: ERROR, message: EC compute result not match." +
"Checking EC block group: " + blockGroup2.getBlock().getBlockName() + "Status: OK"));
}
}