From 78ae2aed8f84d2d3983f81a5219e8b1f1ec59dca Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 19 Jan 2017 16:05:56 +0800 Subject: [PATCH] HDFS-11259. Update fsck to display maintenance state info. (Manoj Govindassamy via lei) --- .../hdfs/server/namenode/NamenodeFsck.java | 68 +++- .../org/apache/hadoop/hdfs/tools/DFSck.java | 10 +- .../src/site/markdown/HDFSCommands.md | 4 +- .../hadoop/hdfs/server/namenode/TestFsck.java | 383 ++++++++++++++++-- 4 files changed, 419 insertions(+), 46 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java index d8441f8fe6..1ae75f09da 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java @@ -117,6 +117,9 @@ public class NamenodeFsck implements DataEncryptionKeyFactory { public static final String HEALTHY_STATUS = "is HEALTHY"; public static final String DECOMMISSIONING_STATUS = "is DECOMMISSIONING"; public static final String DECOMMISSIONED_STATUS = "is DECOMMISSIONED"; + public static final String ENTERING_MAINTENANCE_STATUS = + "is ENTERING MAINTENANCE"; + public static final String IN_MAINTENANCE_STATUS = "is IN MAINTENANCE"; public static final String NONEXISTENT_STATUS = "does not exist"; public static final String FAILURE_STATUS = "FAILED"; public static final String UNDEFINED = "undefined"; @@ -144,6 +147,7 @@ public class NamenodeFsck implements DataEncryptionKeyFactory { private boolean showReplicaDetails = false; private boolean showUpgradeDomains = false; + private boolean showMaintenanceState = false; private long staleInterval; private Tracer tracer; @@ -227,6 +231,8 @@ else if (key.equals("replicadetails")) { this.showReplicaDetails = true; } else if (key.equals("upgradedomains")) { this.showUpgradeDomains = true; + } else if (key.equals("maintenance")) { + this.showMaintenanceState = true; } else if (key.equals("storagepolicies")) { this.showStoragePolcies = true; } else if (key.equals("showprogress")) { @@ -280,6 +286,12 @@ public void blockIdCK(String blockId) { + numberReplicas.decommissioned()); out.println("No. of decommissioning Replica: " + numberReplicas.decommissioning()); + if (this.showMaintenanceState) { + out.println("No. of entering maintenance Replica: " + + numberReplicas.liveEnteringMaintenanceReplicas()); + out.println("No. of in maintenance Replica: " + + numberReplicas.maintenanceNotForReadReplicas()); + } out.println("No. of corrupted Replica: " + numberReplicas.corruptReplicas()); //record datanodes that have corrupted block replica @@ -300,6 +312,10 @@ public void blockIdCK(String blockId) { out.print(DECOMMISSIONED_STATUS); } else if (dn.isDecommissionInProgress()) { out.print(DECOMMISSIONING_STATUS); + } else if (this.showMaintenanceState && dn.isEnteringMaintenance()) { + out.print(ENTERING_MAINTENANCE_STATUS); + } else if (this.showMaintenanceState && dn.isInMaintenance()) { + out.print(IN_MAINTENANCE_STATUS); } else { out.print(HEALTHY_STATUS); } @@ -598,6 +614,12 @@ private String getReplicaInfo(BlockInfo storedBlock) { sb.append("DECOMMISSIONED)"); } else if (dnDesc.isDecommissionInProgress()) { sb.append("DECOMMISSIONING)"); + } else if (this.showMaintenanceState && + dnDesc.isEnteringMaintenance()) { + sb.append("ENTERING MAINTENANCE)"); + } else if (this.showMaintenanceState && + dnDesc.isInMaintenance()) { + sb.append("IN MAINTENANCE)"); } else if (corruptReplicas != null && corruptReplicas.contains(dnDesc)) { sb.append("CORRUPT)"); @@ -620,7 +642,7 @@ private String getReplicaInfo(BlockInfo storedBlock) { } private void collectBlocksSummary(String parent, HdfsFileStatus file, - Result res, LocatedBlocks blocks) throws IOException { + Result res, LocatedBlocks blocks) throws IOException { String path = file.getFullName(parent); boolean isOpen = blocks.isUnderConstruction(); if (isOpen && !showOpenFiles) { @@ -651,13 +673,21 @@ private void collectBlocksSummary(String parent, HdfsFileStatus file, NumberReplicas numberReplicas = blockManager.countNodes(storedBlock); int decommissionedReplicas = numberReplicas.decommissioned(); int decommissioningReplicas = numberReplicas.decommissioning(); + int enteringMaintenanceReplicas = + numberReplicas.liveEnteringMaintenanceReplicas(); + int inMaintenanceReplicas = + numberReplicas.maintenanceNotForReadReplicas(); res.decommissionedReplicas += decommissionedReplicas; res.decommissioningReplicas += decommissioningReplicas; + res.enteringMaintenanceReplicas += enteringMaintenanceReplicas; + res.inMaintenanceReplicas += inMaintenanceReplicas; // count total replicas int liveReplicas = numberReplicas.liveReplicas(); - int totalReplicasPerBlock = liveReplicas + decommissionedReplicas + - decommissioningReplicas; + int totalReplicasPerBlock = liveReplicas + decommissionedReplicas + + decommissioningReplicas + + enteringMaintenanceReplicas + + inMaintenanceReplicas; res.totalReplicas += totalReplicasPerBlock; boolean isMissing; @@ -711,12 +741,14 @@ private void collectBlocksSummary(String parent, HdfsFileStatus file, if (!showFiles) { out.print("\n" + path + ": "); } - out.println(" Under replicated " + block + - ". Target Replicas is " + - targetFileReplication + " but found " + - liveReplicas + " live replica(s), " + - decommissionedReplicas + " decommissioned replica(s) and " + - decommissioningReplicas + " decommissioning replica(s)."); + out.println(" Under replicated " + block + ". Target Replicas is " + + targetFileReplication + " but found " + + liveReplicas+ " live replica(s), " + + decommissionedReplicas + " decommissioned replica(s), " + + decommissioningReplicas + " decommissioning replica(s)" + + (this.showMaintenanceState ? (enteringMaintenanceReplicas + + ", entering maintenance replica(s) and " + inMaintenanceReplicas + + " in maintenance replica(s).") : ".")); } // count mis replicated blocks @@ -1095,6 +1127,8 @@ static class Result { long missingReplicas = 0L; long decommissionedReplicas = 0L; long decommissioningReplicas = 0L; + long enteringMaintenanceReplicas = 0L; + long inMaintenanceReplicas = 0L; long numUnderMinReplicatedBlocks = 0L; long numOverReplicatedBlocks = 0L; long numUnderReplicatedBlocks = 0L; @@ -1243,6 +1277,14 @@ public String toString() { res.append("\n DecommissioningReplicas:\t").append( decommissioningReplicas); } + if (enteringMaintenanceReplicas > 0) { + res.append("\n EnteringMaintenanceReplicas:\t").append( + enteringMaintenanceReplicas); + } + if (inMaintenanceReplicas > 0) { + res.append("\n InMaintenanceReplicas:\t").append( + inMaintenanceReplicas); + } return res.toString(); } } @@ -1349,6 +1391,14 @@ public String toString() { res.append("\n Decommissioning internal blocks:\t").append( decommissioningReplicas); } + if (enteringMaintenanceReplicas > 0) { + res.append("\n EnteringMaintenanceReplicas:\t").append( + enteringMaintenanceReplicas); + } + if (inMaintenanceReplicas > 0) { + res.append("\n InMaintenanceReplicas:\t").append( + inMaintenanceReplicas); + } return res.toString(); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java index 9cf234af14..96fca2448e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java @@ -80,7 +80,8 @@ public class DFSck extends Configured implements Tool { + "[-files [-blocks [-locations | -racks | -replicaDetails | " + "-upgradedomains]]]] " + "[-includeSnapshots] [-showprogress] " - + "[-storagepolicies] [-blockId ]\n" + + "[-storagepolicies] [-maintenance] " + + "[-blockId ]\n" + "\t\tstart checking from this path\n" + "\t-move\tmove corrupted files to /lost+found\n" + "\t-delete\tdelete corrupted files\n" @@ -99,6 +100,7 @@ public class DFSck extends Configured implements Tool { + "\t-files -blocks -upgradedomains\tprint out upgrade domains for " + "every block\n" + "\t-storagepolicies\tprint out storage policy summary for the blocks\n" + + "\t-maintenance\tprint out maintenance state node details\n" + "\t-showprogress\tshow progress in output. Default is OFF (no progress)\n" + "\t-blockId\tprint out which file this blockId belongs to, locations" + " (nodes, racks) of this block, and other diagnostics info" @@ -286,6 +288,8 @@ else if (args[idx].equals("-replicaDetails")) { doListCorruptFileBlocks = true; } else if (args[idx].equals("-includeSnapshots")) { url.append("&includeSnapshots=1"); + } else if (args[idx].equals("-maintenance")) { + url.append("&maintenance=1"); } else if (args[idx].equals("-blockId")) { StringBuilder sb = new StringBuilder(); idx++; @@ -372,6 +376,10 @@ else if (args[idx].equals("-replicaDetails")) { errCode = 2; } else if (lastLine.endsWith(NamenodeFsck.DECOMMISSIONING_STATUS)) { errCode = 3; + } else if (lastLine.endsWith(NamenodeFsck.IN_MAINTENANCE_STATUS)) { + errCode = 4; + } else if (lastLine.endsWith(NamenodeFsck.ENTERING_MAINTENANCE_STATUS)) { + errCode = 5; } return errCode; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSCommands.md b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSCommands.md index 152225d5d6..8825820fb4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSCommands.md +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSCommands.md @@ -121,7 +121,8 @@ Usage: [-move | -delete | -openforwrite] [-files [-blocks [-locations | -racks | -replicaDetails | -upgradedomains]]] [-includeSnapshots] [-showprogress] - [-storagepolicies] [-blockId ] + [-storagepolicies] [-maintenance] + [-blockId ] | COMMAND\_OPTION | Description | |:---- |:---- | @@ -139,6 +140,7 @@ Usage: | `-openforwrite` | Print out files opened for write. | | `-showprogress` | Print out dots for progress in output. Default is OFF (no progress). | | `-storagepolicies` | Print out storage policy summary for the blocks. | +| `-maintenance` | Print out maintenance state node details. | | `-blockId` | Print out information about the block. | Runs the HDFS filesystem checking utility. See [fsck](./HdfsUserGuide.html#fsck) for more info. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java index 81a25fdd25..61ecae3d63 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java @@ -54,6 +54,7 @@ import java.util.Random; import java.util.Set; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -168,11 +169,11 @@ static String runFsck(Configuration conf, int expectedErrCode, PrintStream out = new PrintStream(bStream, true); GenericTestUtils.setLogLevel(FSPermissionChecker.LOG, Level.ALL); int errCode = ToolRunner.run(new DFSck(conf, out), path); + LOG.info("OUTPUT = " + bStream.toString()); if (checkErrorCode) { assertEquals(expectedErrCode, errCode); } GenericTestUtils.setLogLevel(FSPermissionChecker.LOG, Level.INFO); - LOG.info("OUTPUT = " + bStream.toString()); return bStream.toString(); } @@ -878,14 +879,13 @@ public void testUnderMinReplicatedBlock() throws Exception { assertTrue(outStr.contains("dfs.namenode.replication.min:\t2")); } - @Test(timeout = 60000) + @Test(timeout = 90000) public void testFsckReplicaDetails() throws Exception { final short replFactor = 1; short numDn = 1; final long blockSize = 512; final long fileSize = 1024; - boolean checkDecommissionInProgress = false; String[] racks = {"/rack1"}; String[] hosts = {"host1"}; @@ -906,53 +906,132 @@ public void testFsckReplicaDetails() throws Exception { DFSTestUtil.waitReplication(dfs, path, replFactor); // make sure datanode that has replica is fine before decommission - String fsckOut = runFsck(conf, 0, true, testFile, "-files", "-blocks", - "-replicaDetails"); + String fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-maintenance", "-blocks", "-replicaDetails"); assertTrue(fsckOut.contains(NamenodeFsck.HEALTHY_STATUS)); assertTrue(fsckOut.contains("(LIVE)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); // decommission datanode - ExtendedBlock eb = DFSTestUtil.getFirstBlock(dfs, path); FSNamesystem fsn = cluster.getNameNode().getNamesystem(); BlockManager bm = fsn.getBlockManager(); - BlockCollection bc = null; - try { - fsn.writeLock(); - BlockInfo bi = bm.getStoredBlock(eb.getLocalBlock()); - bc = fsn.getBlockCollection(bi); - } finally { - fsn.writeUnlock(); - } - DatanodeDescriptor dn = bc.getBlocks()[0] - .getDatanode(0); - bm.getDatanodeManager().getDecomManager().startDecommission(dn); - String dnName = dn.getXferAddr(); + final DatanodeManager dnm = bm.getDatanodeManager(); + DatanodeDescriptor dnDesc0 = dnm.getDatanode( + cluster.getDataNodes().get(0).getDatanodeId()); + + bm.getDatanodeManager().getDecomManager().startDecommission(dnDesc0); + final String dn0Name = dnDesc0.getXferAddr(); // check the replica status while decommissioning - fsckOut = runFsck(conf, 0, true, testFile, "-files", "-blocks", - "-replicaDetails"); + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-maintenance", "-blocks", "-replicaDetails"); assertTrue(fsckOut.contains("(DECOMMISSIONING)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); - // Start 2nd Datanode and wait for decommission to start - cluster.startDataNodes(conf, 1, true, null, null, null); - DatanodeInfo datanodeInfo = null; - do { - Thread.sleep(2000); - for (DatanodeInfo info : dfs.getDataNodeStats()) { - if (dnName.equals(info.getXferAddr())) { - datanodeInfo = info; + // Start 2nd DataNode + cluster.startDataNodes(conf, 1, true, null, + new String[] {"/rack2"}, new String[] {"host2"}, null, false); + + // Wait for decommission to start + final AtomicBoolean checkDecommissionInProgress = + new AtomicBoolean(false); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + DatanodeInfo datanodeInfo = null; + try { + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dn0Name.equals(info.getXferAddr())) { + datanodeInfo = info; + } + } + if (!checkDecommissionInProgress.get() && datanodeInfo != null + && datanodeInfo.isDecommissionInProgress()) { + checkDecommissionInProgress.set(true); + } + if (datanodeInfo != null && datanodeInfo.isDecommissioned()) { + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; } + return false; } - if (!checkDecommissionInProgress && datanodeInfo != null - && datanodeInfo.isDecommissionInProgress()) { - checkDecommissionInProgress = true; - } - } while (datanodeInfo != null && !datanodeInfo.isDecommissioned()); + }, 500, 30000); // check the replica status after decommission is done - fsckOut = runFsck(conf, 0, true, testFile, "-files", "-blocks", - "-replicaDetails"); + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-maintenance", "-blocks", "-replicaDetails"); assertTrue(fsckOut.contains("(DECOMMISSIONED)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); + + DatanodeDescriptor dnDesc1 = dnm.getDatanode( + cluster.getDataNodes().get(1).getDatanodeId()); + final String dn1Name = dnDesc1.getXferAddr(); + + bm.getDatanodeManager().getDecomManager().startMaintenance(dnDesc1, + Long.MAX_VALUE); + + // check the replica status while entering maintenance + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-maintenance", "-blocks", "-replicaDetails"); + assertTrue(fsckOut.contains("(DECOMMISSIONED)")); + assertTrue(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); + + // check entering maintenance replicas are printed only when requested + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-blocks", "-replicaDetails"); + assertTrue(fsckOut.contains("(DECOMMISSIONED)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); + + + // Start 3rd DataNode + cluster.startDataNodes(conf, 1, true, null, + new String[] {"/rack3"}, new String[] {"host3"}, null, false); + + // Wait for the 2nd node to reach in maintenance state + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + DatanodeInfo dnInfo = null; + try { + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dn1Name.equals(info.getXferAddr())) { + dnInfo = info; + } + } + if (dnInfo != null && dnInfo.isInMaintenance()) { + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; + } + return false; + } + }, 500, 30000); + + // check the replica status after decommission is done + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-maintenance", "-blocks", "-replicaDetails"); + assertTrue(fsckOut.contains("(DECOMMISSIONED)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertTrue(fsckOut.contains("(IN MAINTENANCE)")); + + // check in maintenance replicas are not printed when not requested + fsckOut = runFsck(conf, 0, true, testFile, "-files", + "-blocks", "-replicaDetails"); + assertTrue(fsckOut.contains("(DECOMMISSIONED)")); + assertFalse(fsckOut.contains("(ENTERING MAINTENANCE)")); + assertFalse(fsckOut.contains("(IN MAINTENANCE)")); + + } /** Test if fsck can return -1 in case of failure. @@ -1459,6 +1538,125 @@ public void testBlockIdCKDecommission() throws Exception { assertTrue(fsckOut.contains(NamenodeFsck.DECOMMISSIONED_STATUS)); } + /** + * Test for blockIdCK with datanode maintenance. + */ + @Test (timeout = 90000) + public void testBlockIdCKMaintenance() throws Exception { + final short replFactor = 2; + short numDn = 2; + final long blockSize = 512; + String[] hosts = {"host1", "host2"}; + String[] racks = {"/rack1", "/rack2"}; + + conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize); + conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, replFactor); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY, replFactor); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY, + replFactor); + + DistributedFileSystem dfs; + cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(numDn) + .hosts(hosts) + .racks(racks) + .build(); + + assertNotNull("Failed Cluster Creation", cluster); + cluster.waitClusterUp(); + dfs = cluster.getFileSystem(); + assertNotNull("Failed to get FileSystem", dfs); + + DFSTestUtil util = new DFSTestUtil.Builder(). + setName(getClass().getSimpleName()).setNumFiles(1).build(); + //create files + final String pathString = new String("/testfile"); + final Path path = new Path(pathString); + util.createFile(dfs, path, 1024, replFactor, 1000L); + util.waitReplication(dfs, path, replFactor); + StringBuilder sb = new StringBuilder(); + for (LocatedBlock lb: util.getAllBlocks(dfs, path)){ + sb.append(lb.getBlock().getLocalBlock().getBlockName()+" "); + } + String[] bIds = sb.toString().split(" "); + + //make sure datanode that has replica is fine before maintenance + String outStr = runFsck(conf, 0, true, "/", + "-maintenance", "-blockId", bIds[0]); + System.out.println(outStr); + assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS)); + + FSNamesystem fsn = cluster.getNameNode().getNamesystem(); + BlockManager bm = fsn.getBlockManager(); + DatanodeManager dnm = bm.getDatanodeManager(); + DatanodeDescriptor dn = dnm.getDatanode(cluster.getDataNodes().get(0) + .getDatanodeId()); + bm.getDatanodeManager().getDecomManager().startMaintenance(dn, + Long.MAX_VALUE); + final String dnName = dn.getXferAddr(); + + //wait for the node to enter maintenance state + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + DatanodeInfo datanodeInfo = null; + try { + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dnName.equals(info.getXferAddr())) { + datanodeInfo = info; + } + } + if (datanodeInfo != null && datanodeInfo.isEnteringMaintenance()) { + String fsckOut = runFsck(conf, 5, false, "/", + "-maintenance", "-blockId", bIds[0]); + assertTrue(fsckOut.contains( + NamenodeFsck.ENTERING_MAINTENANCE_STATUS)); + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; + } + return false; + } + }, 500, 30000); + + // Start 3rd DataNode + cluster.startDataNodes(conf, 1, true, null, + new String[] {"/rack3"}, new String[] {"host3"}, null, false); + + // Wait for 1st node to reach in maintenance state + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + try { + DatanodeInfo datanodeInfo = null; + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dnName.equals(info.getXferAddr())) { + datanodeInfo = info; + } + } + if (datanodeInfo != null && datanodeInfo.isInMaintenance()) { + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; + } + return false; + } + }, 500, 30000); + + //check in maintenance node + String fsckOut = runFsck(conf, 4, false, "/", + "-maintenance", "-blockId", bIds[0]); + assertTrue(fsckOut.contains(NamenodeFsck.IN_MAINTENANCE_STATUS)); + + //check in maintenance node are not printed when not requested + fsckOut = runFsck(conf, 4, false, "/", "-blockId", bIds[0]); + assertFalse(fsckOut.contains(NamenodeFsck.IN_MAINTENANCE_STATUS)); + } + /** * Test for blockIdCK with block corruption. */ @@ -1655,6 +1853,121 @@ public void testFsckWithDecommissionedReplicas() throws Exception { String fsckOut = runFsck(conf, 0, true, testFile); } + /** + * Test for blocks on maintenance hosts are not shown as missing. + */ + @Test (timeout = 90000) + public void testFsckWithMaintenanceReplicas() throws Exception { + final short replFactor = 2; + short numDn = 2; + final long blockSize = 512; + String[] hosts = {"host1", "host2"}; + String[] racks = {"/rack1", "/rack2"}; + + conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize); + conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, replFactor); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY, replFactor); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY, + replFactor); + + DistributedFileSystem dfs; + cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(numDn) + .hosts(hosts) + .racks(racks) + .build(); + + assertNotNull("Failed Cluster Creation", cluster); + cluster.waitClusterUp(); + dfs = cluster.getFileSystem(); + assertNotNull("Failed to get FileSystem", dfs); + + DFSTestUtil util = new DFSTestUtil.Builder(). + setName(getClass().getSimpleName()).setNumFiles(1).build(); + //create files + final String testFile = new String("/testfile"); + final Path path = new Path(testFile); + util.createFile(dfs, path, 1024, replFactor, 1000L); + util.waitReplication(dfs, path, replFactor); + StringBuilder sb = new StringBuilder(); + for (LocatedBlock lb: util.getAllBlocks(dfs, path)){ + sb.append(lb.getBlock().getLocalBlock().getBlockName()+" "); + } + String[] bIds = sb.toString().split(" "); + + //make sure datanode that has replica is fine before maintenance + String outStr = runFsck(conf, 0, true, testFile); + System.out.println(outStr); + assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS)); + + FSNamesystem fsn = cluster.getNameNode().getNamesystem(); + BlockManager bm = fsn.getBlockManager(); + DatanodeManager dnm = bm.getDatanodeManager(); + DatanodeDescriptor dn = dnm.getDatanode(cluster.getDataNodes().get(0) + .getDatanodeId()); + bm.getDatanodeManager().getDecomManager().startMaintenance(dn, + Long.MAX_VALUE); + final String dnName = dn.getXferAddr(); + + //wait for the node to enter maintenance state + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + DatanodeInfo datanodeInfo = null; + try { + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dnName.equals(info.getXferAddr())) { + datanodeInfo = info; + } + } + if (datanodeInfo != null && datanodeInfo.isEnteringMaintenance()) { + // verify fsck returns Healthy status + String fsckOut = runFsck(conf, 0, true, testFile, "-maintenance"); + assertTrue(fsckOut.contains(NamenodeFsck.HEALTHY_STATUS)); + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; + } + return false; + } + }, 500, 30000); + + // Start 3rd DataNode and wait for node to reach in maintenance state + cluster.startDataNodes(conf, 1, true, null, + new String[] {"/rack3"}, new String[] {"host3"}, null, false); + + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + DatanodeInfo datanodeInfo = null; + try { + for (DatanodeInfo info : dfs.getDataNodeStats()) { + if (dnName.equals(info.getXferAddr())) { + datanodeInfo = info; + } + } + if (datanodeInfo != null && datanodeInfo.isInMaintenance()) { + return true; + } + } catch (Exception e) { + LOG.warn("Unexpected exception: " + e); + return false; + } + return false; + } + }, 500, 30000); + + // verify fsck returns Healthy status + String fsckOut = runFsck(conf, 0, true, testFile, "-maintenance"); + assertTrue(fsckOut.contains(NamenodeFsck.HEALTHY_STATUS)); + + // verify fsck returns Healthy status even without maintenance option + fsckOut = runFsck(conf, 0, true, testFile); + assertTrue(fsckOut.contains(NamenodeFsck.HEALTHY_STATUS)); + } + @Test public void testECFsck() throws Exception { FileSystem fs = null;