From ef84d2186734ff08cefcb1d669ebad06927fa3b9 Mon Sep 17 00:00:00 2001
From: huhaiyang
Date: Thu, 24 Nov 2022 09:17:27 +0800
Subject: [PATCH] HDFS-16841. Enhance the function of
 DebugAdmin#VerifyECCommand (#5137)

---
 .../apache/hadoop/hdfs/tools/DebugAdmin.java  | 57 +++++++++++++------
 .../hadoop/hdfs/tools/TestDebugAdmin.java     | 44 +++++++++++++-
 2 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DebugAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DebugAdmin.java
index 32e8248adc..7116c2578c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DebugAdmin.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DebugAdmin.java
@@ -432,8 +432,13 @@ public class DebugAdmin extends Configured implements Tool {
 
     VerifyECCommand() {
       super("verifyEC",
-          "verifyEC -file <file>",
-          "  Verify HDFS erasure coding on all block groups of the file.");
+          "verifyEC -file <file> [-blockId <blk_Id>] [-skipFailureBlocks]",
+          "  -file Verify HDFS erasure coding on all block groups of the file." +
+              System.lineSeparator() +
+          "  -skipFailureBlocks specify will skip any block group failures during verify," +
+              " and continues verify all block groups of the file," + System.lineSeparator() +
+          "  the default is not to skip failure blocks." + System.lineSeparator() +
+          "  -blockId specify blk_Id to verify for a specific one block group.");
     }
 
     int run(List<String> args) throws IOException {
@@ -480,30 +485,48 @@ public class DebugAdmin extends Configured implements Tool {
       this.parityBlkNum = ecPolicy.getNumParityUnits();
       this.cellSize = ecPolicy.getCellSize();
       this.encoder = CodecUtil.createRawEncoder(getConf(), ecPolicy.getCodecName(),
-          new ErasureCoderOptions(
-              ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits()));
+          new ErasureCoderOptions(dataBlkNum, parityBlkNum));
       int blockNum = dataBlkNum + parityBlkNum;
       this.readService = new ExecutorCompletionService<>(
           DFSUtilClient.getThreadPoolExecutor(blockNum, blockNum, 60,
               new LinkedBlockingQueue<>(), "read-", false));
-      this.blockReaders = new BlockReader[dataBlkNum + parityBlkNum];
+      this.blockReaders = new BlockReader[blockNum];
+
+      String needToVerifyBlockId = StringUtils.popOptionWithArgument("-blockId", args);
+      boolean skipFailureBlocks = StringUtils.popOption("-skipFailureBlocks", args);
+      boolean isHealthy = true;
 
       for (LocatedBlock locatedBlock : locatedBlocks.getLocatedBlocks()) {
-        System.out.println("Checking EC block group: blk_" + locatedBlock.getBlock().getBlockId());
-        LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
+        String blockName = locatedBlock.getBlock().getBlockName();
+        if (needToVerifyBlockId == null || needToVerifyBlockId.equals(blockName)) {
+          System.out.println("Checking EC block group: " + blockName);
+          LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
 
-        try {
-          verifyBlockGroup(blockGroup);
-          System.out.println("Status: OK");
-        } catch (Exception e) {
-          System.err.println("Status: ERROR, message: " + e.getMessage());
-          return 1;
-        } finally {
-          closeBlockReaders();
+          try {
+            verifyBlockGroup(blockGroup);
+            System.out.println("Status: OK");
+          } catch (Exception e) {
+            System.err.println("Status: ERROR, message: " + e.getMessage());
+            isHealthy = false;
+            if (!skipFailureBlocks) {
+              break;
+            }
+          } finally {
+            closeBlockReaders();
+          }
+
+          if (needToVerifyBlockId != null) {
+            break;
+          }
         }
       }
-      System.out.println("\nAll EC block group status: OK");
-      return 0;
+      if (isHealthy) {
+        if (needToVerifyBlockId == null) {
+          System.out.println("\nAll EC block group status: OK");
+        }
+        return 0;
+      }
+      return 1;
     }
 
     private void verifyBlockGroup(LocatedStripedBlock blockGroup) throws Exception {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDebugAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDebugAdmin.java
index 8dd303d84d..37cd38eedb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDebugAdmin.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDebugAdmin.java
@@ -194,8 +194,13 @@ public class TestDebugAdmin {
     cluster.waitActive();
     DistributedFileSystem fs = cluster.getFileSystem();
 
-    assertEquals("ret: 1, verifyEC -file <file>  Verify HDFS erasure coding on " +
-        "all block groups of the file.", runCmd(new String[]{"verifyEC"}));
+    assertEquals("ret: 1, verifyEC -file <file> [-blockId <blk_Id>] " +
+        "[-skipFailureBlocks]  -file Verify HDFS erasure coding on all block groups of the file." +
+        "  -skipFailureBlocks specify will skip any block group failures during verify," +
+        " and continues verify all block groups of the file," +
+        "  the default is not to skip failure blocks." +
+        "  -blockId specify blk_Id to verify for a specific one block group.",
+        runCmd(new String[]{"verifyEC"}));
 
     assertEquals("ret: 1, File /bar does not exist.",
         runCmd(new String[]{"verifyEC", "-file", "/bar"}));
@@ -270,6 +275,41 @@ public class TestDebugAdmin {
         "-out", metaFile.getAbsolutePath()});
     assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_corrupt"})
         .contains("Status: ERROR, message: EC compute result not match."));
+
+    // Specify -blockId.
+    Path newFile = new Path(ecDir, "foo_new");
+    DFSTestUtil.createFile(fs, newFile, (int) k, 6 * m, m, repl, seed);
+    blocks = DFSTestUtil.getAllBlocks(fs, newFile);
+    assertEquals(2, blocks.size());
+    blockGroup = (LocatedStripedBlock) blocks.get(0);
+    String blockName = blockGroup.getBlock().getBlockName();
+    assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-blockId", blockName})
+        .contains("ret: 0, Checking EC block group: " + blockName + "Status: OK"));
+
+    // Specify -skipFailureBlocks.
+    indexedBlocks = StripedBlockUtil.parseStripedBlockGroup(blockGroup,
+        ecPolicy.getCellSize(), ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits());
+    // Corrupt block 0 in the block group.
+    toCorruptLocatedBlock = indexedBlocks[0];
+    toCorruptBlock = toCorruptLocatedBlock.getBlock();
+    datanode = cluster.getDataNode(toCorruptLocatedBlock.getLocations()[0].getIpcPort());
+    blockFile = getBlockFile(datanode.getFSDataset(),
+        toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
+    metaFile = getMetaFile(datanode.getFSDataset(),
+        toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
+    metaFile.delete();
+    // Write error bytes to the block file and re-generate the meta checksum.
+    errorBytes = new byte[1048576];
+    new Random(0x12345678L).nextBytes(errorBytes);
+    FileUtils.writeByteArrayToFile(blockFile, errorBytes);
+    runCmd(new String[]{"computeMeta", "-block", blockFile.getAbsolutePath(),
+        "-out", metaFile.getAbsolutePath()});
+    // Run verifyEC with -skipFailureBlocks set.
+    LocatedStripedBlock blockGroup2 = (LocatedStripedBlock) blocks.get(1);
+    assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-skipFailureBlocks"})
+        .contains("ret: 1, Checking EC block group: " + blockGroup.getBlock().getBlockName()
+            + "Status: ERROR, message: EC compute result not match."
+            + "Checking EC block group: " + blockGroup2.getBlock().getBlockName() + "Status: OK"));
   }
 }
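
For reference, a minimal sketch of driving the two new options through the Tool interface, assuming the DebugAdmin(Configuration) constructor that its own main() uses; the file path and block name below are illustrative only, and the same invocations are normally written as "hdfs debug verifyEC ..." on the command line:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.tools.DebugAdmin;
import org.apache.hadoop.util.ToolRunner;

public class VerifyECExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();

    // Shell equivalent: hdfs debug verifyEC -file /ec/foo -skipFailureBlocks
    // With -skipFailureBlocks, every block group is checked even after a
    // failure; the return value is still 1 if any group failed to verify.
    int allGroups = ToolRunner.run(conf, new DebugAdmin(conf),
        new String[]{"verifyEC", "-file", "/ec/foo", "-skipFailureBlocks"});

    // Shell equivalent: hdfs debug verifyEC -file /ec/foo -blockId blk_1073741825
    // With -blockId, only the named block group is verified; the name is the
    // "blk_..." string printed by "Checking EC block group:" (illustrative here).
    int oneGroup = ToolRunner.run(conf, new DebugAdmin(conf),
        new String[]{"verifyEC", "-file", "/ec/foo", "-blockId", "blk_1073741825"});

    System.exit(allGroups != 0 || oneGroup != 0 ? 1 : 0);
  }
}

Without -skipFailureBlocks the command keeps its previous behavior of stopping at the first failing block group; the "All EC block group status: OK" summary line is printed only when all groups were requested and all verified clean.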