HDFS-6917. Add an hdfs debug command to validate blocks, call recoverlease, etc. (cmccabe)

Colin Patrick Mccabe 2014-10-31 13:15:17 -07:00
parent 256697acd5
commit 7b026c50f1
4 changed files with 486 additions and 0 deletions


@@ -325,6 +325,9 @@ Release 2.7.0 - UNRELEASED
    HDFS-7035. Make adding a new data directory to the DataNode an atomic
    operation and improve error handling (Lei Xu via Colin P. McCabe)

    HDFS-6917. Add an hdfs debug command to validate blocks, call recoverlease,
    etc. (cmccabe)

  OPTIMIZATIONS

  BUG FIXES


@@ -53,6 +53,7 @@ function hadoop_usage
echo " zkfc run the ZK Failover Controller daemon"
echo ""
echo "Most commands print help when invoked w/o parameters."
# There are also debug commands, but they don't show up in this listing.
}
# let's locate libexec...
@@ -121,6 +122,9 @@ case ${COMMAND} in
CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
fi
;;
debug)
CLASS='org.apache.hadoop.hdfs.tools.DebugAdmin'
;;
dfs)
CLASS=org.apache.hadoop.fs.FsShell
hadoop_debug "Appending HADOOP_CLIENT_OPTS onto HADOOP_OPTS"


@@ -0,0 +1,361 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.tools;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.google.common.util.concurrent.Uninterruptibles;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
/**
* This class implements debug operations on the HDFS command-line.
*
* These operations are only for debugging, and may change or disappear
* between HDFS versions.
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class DebugAdmin extends Configured implements Tool {
/**
* All the debug commands we can run.
*/
private final DebugCommand[] DEBUG_COMMANDS = {
new VerifyBlockChecksumCommand(),
new RecoverLeaseCommand(),
new HelpCommand()
};
/**
* The base class for debug commands.
*/
private abstract class DebugCommand {
final String name;
final String usageText;
final String helpText;
DebugCommand(String name, String usageText, String helpText) {
this.name = name;
this.usageText = usageText;
this.helpText = helpText;
}
abstract int run(List<String> args) throws IOException;
}
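// A hypothetical sketch (not part of this commit): a new command plugs in by
// extending DebugCommand and adding an instance to DEBUG_COMMANDS above. The
// "echo" name and behavior below are invented purely for illustration.
//
// private class EchoCommand extends DebugCommand {
//   EchoCommand() {
//     super("echo",
//         "echo [args...]",
//         "  Print the remaining arguments.");
//   }
//   int run(List<String> args) {
//     System.out.println(args);
//     return 0;
//   }
// }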
private static final int HEADER_LEN = 7;
/**
* The command for verifying a block metadata file and possibly block file.
*/
private class VerifyBlockChecksumCommand extends DebugCommand {
VerifyBlockChecksumCommand() {
super("verify",
"verify [-meta <metadata-file>] [-block <block-file>]",
" Verify HDFS metadata and block files. If a block file is specified, we\n" +
" will verify that the checksums in the metadata file match the block\n" +
" file.");
}
int run(List<String> args) throws IOException {
if (args.size() == 0) {
System.out.println(usageText);
System.out.println(helpText + "\n");
return 1;
}
String blockFile = StringUtils.popOptionWithArgument("-block", args);
String metaFile = StringUtils.popOptionWithArgument("-meta", args);
if (metaFile == null) {
System.err.println("You must specify a meta file with -meta");
return 1;
}
FileInputStream metaStream = null, dataStream = null;
FileChannel metaChannel = null, dataChannel = null;
DataInputStream checksumStream = null;
try {
BlockMetadataHeader header;
try {
metaStream = new FileInputStream(metaFile);
checksumStream = new DataInputStream(metaStream);
header = BlockMetadataHeader.readHeader(checksumStream);
metaChannel = metaStream.getChannel();
metaChannel.position(HEADER_LEN);
} catch (RuntimeException | IOException e) {
System.err.println("Failed to read HDFS metadata file header for " +
metaFile + ": " + StringUtils.stringifyException(e));
return 1;
}
DataChecksum checksum = header.getChecksum();
System.out.println("Checksum type: " + checksum.toString());
if (blockFile == null) {
return 0;
}
ByteBuffer metaBuf, dataBuf;
try {
dataStream = new FileInputStream(blockFile);
dataChannel = dataStream.getChannel();
final int CHECKSUMS_PER_BUF = 1024 * 32;
metaBuf = ByteBuffer.allocate(checksum.
getChecksumSize() * CHECKSUMS_PER_BUF);
dataBuf = ByteBuffer.allocate(checksum.
getBytesPerChecksum() * CHECKSUMS_PER_BUF);
} catch (IOException e) {
System.err.println("Failed to open HDFS block file for " +
blockFile + ": " + StringUtils.stringifyException(e));
return 1;
}
long offset = 0;
while (true) {
dataBuf.clear();
int dataRead = -1;
try {
dataRead = dataChannel.read(dataBuf);
if (dataRead < 0) {
break;
}
} catch (IOException e) {
System.err.println("Got I/O error reading block file " +
blockFile + "from disk at offset " + dataChannel.position() +
": " + StringUtils.stringifyException(e));
return 1;
}
try {
int csumToRead =
(((checksum.getBytesPerChecksum() - 1) + dataRead) /
checksum.getBytesPerChecksum()) *
checksum.getChecksumSize();
metaBuf.clear();
metaBuf.limit(csumToRead);
metaChannel.read(metaBuf);
dataBuf.flip();
metaBuf.flip();
} catch (IOException e) {
System.err.println("Got I/O error reading metadata file " +
metaFile + "from disk at offset " + metaChannel.position() +
": " + StringUtils.stringifyException(e));
return 1;
}
try {
checksum.verifyChunkedSums(dataBuf, metaBuf,
blockFile, offset);
} catch (IOException e) {
System.out.println("verifyChunkedSums error: " +
StringUtils.stringifyException(e));
return 1;
}
offset += dataRead;
}
System.out.println("Checksum verification succeeded on block file " +
blockFile);
return 0;
} finally {
IOUtils.cleanup(null, metaStream, dataStream, checksumStream);
}
}
}
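// Worked example of the csumToRead arithmetic above (values illustrative,
// matching the CRC32C defaults seen in TestDebugAdmin): with
// bytesPerChecksum = 512 and checksumSize = 4, a read of dataRead = 1300
// bytes spans ceil(1300 / 512) = 3 chunks, so
// (((512 - 1) + 1300) / 512) * 4 = 3 * 4 = 12 checksum bytes are consumed
// from the metadata file for that data buffer.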
/**
* The command for recovering a file lease.
*/
private class RecoverLeaseCommand extends DebugCommand {
RecoverLeaseCommand() {
super("recoverLease",
"recoverLease [-path <path>] [-retries <num-retries>]",
" Recover the lease on the specified path. The path must reside on an\n" +
" HDFS filesystem. The default number of retries is 1.");
}
private static final int TIMEOUT_MS = 5000;
int run(List<String> args) throws IOException {
if (args.size() == 0) {
System.out.println(usageText);
System.out.println(helpText + "\n");
return 1;
}
String pathStr = StringUtils.popOptionWithArgument("-path", args);
String retriesStr = StringUtils.popOptionWithArgument("-retries", args);
if (pathStr == null) {
System.err.println("You must supply a -path argument to " +
"recoverLease.");
return 1;
}
int maxRetries = 1;
if (retriesStr != null) {
try {
maxRetries = Integer.parseInt(retriesStr);
} catch (NumberFormatException e) {
System.err.println("Failed to parse the argument to -retries: " +
StringUtils.stringifyException(e));
return 1;
}
}
FileSystem fs;
try {
fs = FileSystem.newInstance(new URI(pathStr), getConf(), null);
} catch (URISyntaxException e) {
System.err.println("URISyntaxException for " + pathStr + ":" +
StringUtils.stringifyException(e));
return 1;
} catch (InterruptedException e) {
System.err.println("InterruptedException for " + pathStr + ":" +
StringUtils.stringifyException(e));
return 1;
}
if (!(fs instanceof DistributedFileSystem)) {
System.err.println("Invalid filesystem for path " + pathStr + ": " +
"needed scheme hdfs, but got: " + fs.getScheme());
return 1;
}
DistributedFileSystem dfs = (DistributedFileSystem) fs;
for (int retry = 0; true; ) {
boolean recovered = false;
IOException ioe = null;
try {
recovered = dfs.recoverLease(new Path(pathStr));
} catch (IOException e) {
ioe = e;
}
if (recovered) {
System.out.println("recoverLease SUCCEEDED on " + pathStr);
return 0;
}
if (ioe != null) {
System.err.println("recoverLease got exception: ");
ioe.printStackTrace();
} else {
System.err.println("recoverLease returned false.");
}
retry++;
if (retry >= maxRetries) {
break;
}
System.err.println("Retrying in " + TIMEOUT_MS + " ms...");
Uninterruptibles.sleepUninterruptibly(TIMEOUT_MS,
TimeUnit.MILLISECONDS);
System.err.println("Retry #" + retry);
}
System.err.println("Giving up on recoverLease for " + pathStr + " after " +
maxRetries + (maxRetries == 1 ? " try." : " tries."));
return 1;
}
}
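// For reference: DistributedFileSystem#recoverLease returns true once the
// lease has been released and the file is closed; false means recovery was
// initiated but has not yet completed, which is why the loop above retries.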
/**
* The command for getting help about other commands.
*/
private class HelpCommand extends DebugCommand {
HelpCommand() {
super("help",
"help [command-name]",
" Get help about a command.");
}
int run(List<String> args) {
DebugCommand command = popCommand(args);
if (command == null) {
printUsage();
return 0;
}
System.out.println(command.usageText);
System.out.println(command.helpText + "\n");
return 0;
}
}
public DebugAdmin(Configuration conf) {
super(conf);
}
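/**
* Pop the command name off the front of the argument list, tolerating a
* single leading dash (so both "help" and "-help" match); returns null if
* the first argument does not name a known command.
*/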
private DebugCommand popCommand(List<String> args) {
String commandStr = (args.size() == 0) ? "" : args.get(0);
if (commandStr.startsWith("-")) {
commandStr = commandStr.substring(1);
}
for (DebugCommand command : DEBUG_COMMANDS) {
if (command.name.equals(commandStr)) {
args.remove(0);
return command;
}
}
return null;
}
public int run(String[] argv) {
LinkedList<String> args = new LinkedList<String>();
for (int j = 0; j < argv.length; ++j) {
args.add(argv[j]);
}
DebugCommand command = popCommand(args);
if (command == null) {
printUsage();
return 0;
}
try {
return command.run(args);
} catch (IOException e) {
System.err.println("IOException: " +
StringUtils.stringifyException(e));
return 1;
} catch (RuntimeException e) {
System.err.println("RuntimeException: " +
StringUtils.stringifyException(e));
return 1;
}
}
private void printUsage() {
System.out.println("Usage: hdfs debug <command> [arguments]\n");
for (DebugCommand command : DEBUG_COMMANDS) {
if (!command.name.equals("help")) {
System.out.println(command.usageText);
}
}
}
public static void main(String[] argsArray) throws IOException {
DebugAdmin debugAdmin = new DebugAdmin(new Configuration());
System.exit(debugAdmin.run(argsArray));
}
}
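
Since DebugAdmin implements Tool, it can also be driven programmatically
through ToolRunner. A minimal sketch, assuming a reachable HDFS in the
default configuration; the driver class name, path, and retry count are
illustrative, not part of this commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.tools.DebugAdmin;
import org.apache.hadoop.util.ToolRunner;

public class DebugAdminDriver {  // hypothetical driver class
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // ToolRunner strips generic Hadoop options before delegating to run().
    int exit = ToolRunner.run(conf, new DebugAdmin(conf),
        new String[] {"recoverLease", "-path", "/foo", "-retries", "3"});
    System.exit(exit);
  }
}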


@@ -0,0 +1,118 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import static org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetTestUtil.*;
import static org.junit.Assert.assertEquals;
public class TestDebugAdmin {
private MiniDFSCluster cluster;
private DistributedFileSystem fs;
private DebugAdmin admin;
private DataNode datanode;
@Before
public void setUp() throws Exception {
Configuration conf = new Configuration();
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
fs = cluster.getFileSystem();
admin = new DebugAdmin(conf);
datanode = cluster.getDataNodes().get(0);
}
@After
public void tearDown() throws Exception {
if (cluster != null) {
cluster.shutdown();
cluster = null;
}
}
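/**
* Run one DebugAdmin command with stdout and stderr redirected into a single
* buffer, returning the exit code together with the captured output.
*/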
private String runCmd(String[] cmd) throws Exception {
final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
final PrintStream out = new PrintStream(bytes);
final PrintStream oldErr = System.err;
final PrintStream oldOut = System.out;
System.setErr(out);
System.setOut(out);
int ret;
try {
ret = admin.run(cmd);
} finally {
System.setErr(oldErr);
System.setOut(oldOut);
IOUtils.closeStream(out);
}
return "ret: " + ret + ", " + bytes.toString();
}
@Test(timeout = 60000)
public void testRecoverLease() throws Exception {
assertEquals("ret: 1, You must supply a -path argument to recoverLease.\n",
runCmd(new String[]{"recoverLease", "-retries", "1"}));
FSDataOutputStream out = fs.create(new Path("/foo"));
out.write(123);
out.close();
assertEquals("ret: 0, recoverLease SUCCEEDED on /foo\n",
runCmd(new String[]{"recoverLease", "-path", "/foo"}));
}
@Test(timeout = 60000)
public void testVerifyBlockChecksumCommand() throws Exception {
DFSTestUtil.createFile(fs, new Path("/bar"), 1234, (short) 1, 0xdeadbeef);
FsDatasetSpi<?> fsd = datanode.getFSDataset();
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, new Path("/bar"));
File blockFile = getBlockFile(fsd,
block.getBlockPoolId(), block.getLocalBlock());
assertEquals("ret: 1, You must specify a meta file with -meta\n",
runCmd(new String[]{"verify", "-block", blockFile.getAbsolutePath()}));
File metaFile = getMetaFile(fsd,
block.getBlockPoolId(), block.getLocalBlock());
assertEquals("ret: 0, Checksum type: " +
"DataChecksum(type=CRC32C, chunkSize=512)\n",
runCmd(new String[]{"verify",
"-meta", metaFile.getAbsolutePath()}));
assertEquals("ret: 0, Checksum type: " +
"DataChecksum(type=CRC32C, chunkSize=512)\n" +
"Checksum verification succeeded on block file " +
blockFile.getAbsolutePath() + "\n",
runCmd(new String[]{"verify",
"-meta", metaFile.getAbsolutePath(),
"-block", blockFile.getAbsolutePath()})
);
}
}