HDFS-5428. Under construction files deletion after snapshot+checkpoint+nn restart leads nn safemode. Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1543329 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jing Zhao 2013-11-19 07:31:40 +00:00
parent 512475e56f
commit aa002344d0
6 changed files with 229 additions and 22 deletions

View File

@ -576,6 +576,9 @@ Release 2.3.0 - UNRELEASED
HDFS-5502. Fix HTTPS support in HsftpFileSystem. (Haohui Mai via jing9)
HDFS-5428. Under construction files deletion after snapshot+checkpoint+nn restart
leads nn safemode. (jing9)
Release 2.2.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -32,6 +32,7 @@
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -48,6 +49,7 @@
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
@ -678,6 +680,12 @@ INode loadINode(final byte[] localName, boolean isSnapshotINode,
if (underConstruction) {
clientName = FSImageSerialization.readString(in);
clientMachine = FSImageSerialization.readString(in);
// convert the last block to BlockUC
if (blocks != null && blocks.length > 0) {
BlockInfo lastBlk = blocks[blocks.length - 1];
blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
lastBlk, replication);
}
}
}
}
@ -690,10 +698,15 @@ INode loadINode(final byte[] localName, boolean isSnapshotINode,
}
final INodeFile file = new INodeFile(inodeId, localName, permissions,
modificationTime, atime, blocks, replication, blockSize);
return fileDiffs != null? new INodeFileWithSnapshot(file, fileDiffs)
: underConstruction? new INodeFileUnderConstruction(
file, clientName, clientMachine, null)
: file;
if (underConstruction) {
INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction(
file, clientName, clientMachine, null);
return fileDiffs == null ? fileUC :
new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs);
} else {
return fileDiffs == null ? file :
new INodeFileWithSnapshot(file, fileDiffs);
}
} else if (numBlocks == -1) {
//directory
@ -821,8 +834,20 @@ private void loadFilesUnderConstruction(DataInput in,
// verify that file exists in namespace
String path = cons.getLocalName();
final INodesInPath iip = fsDir.getLastINodeInPath(path);
INodeFile oldnode = INodeFile.valueOf(iip.getINode(0), path);
INodeFile oldnode = null;
boolean inSnapshot = false;
if (path != null && FSDirectory.isReservedName(path) &&
LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
// TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
// snapshot. If we support INode ID in the layout version, we can use
// the inode id to find the oldnode.
oldnode = namesystem.dir.getInode(cons.getId()).asFile();
inSnapshot = true;
} else {
final INodesInPath iip = fsDir.getLastINodeInPath(path);
oldnode = INodeFile.valueOf(iip.getINode(0), path);
}
cons.setLocalName(oldnode.getLocalNameBytes());
INodeReference parentRef = oldnode.getParentReference();
if (parentRef != null) {
@ -833,11 +858,23 @@ private void loadFilesUnderConstruction(DataInput in,
if (oldnode instanceof INodeFileWithSnapshot) {
cons = new INodeFileUnderConstructionWithSnapshot(cons,
((INodeFileWithSnapshot)oldnode).getDiffs());
((INodeFileWithSnapshot) oldnode).getDiffs());
}
fsDir.replaceINodeFile(path, oldnode, cons);
namesystem.leaseManager.addLease(cons.getClientName(), path);
if (!inSnapshot) {
fsDir.replaceINodeFile(path, oldnode, cons);
namesystem.leaseManager.addLease(cons.getClientName(), path);
} else {
if (parentRef != null) {
// replace oldnode with cons
parentRef.setReferredINode(cons);
} else {
// replace old node in its parent's children list and deleted list
oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons);
namesystem.dir.addToInodeMap(cons);
updateBlocksMap(cons);
}
}
}
}
@ -917,6 +954,9 @@ static class Saver {
private MD5Hash savedDigest;
private final ReferenceMap referenceMap = new ReferenceMap();
private final Map<Long, INodeFileUnderConstruction> snapshotUCMap =
new HashMap<Long, INodeFileUnderConstruction>();
/** @throws IllegalStateException if the instance has not yet saved an image */
private void checkSaved() {
if (!saved) {
@ -992,14 +1032,22 @@ void save(File newFile, FSImageCompression compression) throws IOException {
// save the root
saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
// save the rest of the nodes
saveImage(fsDir.rootDir, out, true, counter);
saveImage(fsDir.rootDir, out, true, false, counter);
prog.endStep(Phase.SAVING_CHECKPOINT, step);
// Now that the step is finished, set counter equal to total to adjust
// for possible under-counting due to reference inodes.
prog.setCount(Phase.SAVING_CHECKPOINT, step,
fsDir.rootDir.numItemsInTree());
// save files under construction
sourceNamesystem.saveFilesUnderConstruction(out);
// TODO: for HDFS-5428, since we cannot break the compatibility of
// fsimage, we store part of the under-construction files that are only
// in snapshots in this "under-construction-file" section. As a
// temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
// paths, so that when loading fsimage we do not put them into the lease
// map. In the future, we can remove this hack when we can bump the
// layout version.
sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
context.checkCancelled();
sourceNamesystem.saveSecretManagerState(out, sdPath);
context.checkCancelled();
@ -1024,20 +1072,31 @@ void save(File newFile, FSImageCompression compression) throws IOException {
* Save children INodes.
* @param children The list of children INodes
* @param out The DataOutputStream to write
* @param inSnapshot Whether the parent directory or its ancestor is in
* the deleted list of some snapshot (caused by rename or
* deletion)
* @param counter Counter to increment for namenode startup progress
* @return Number of children that are directory
*/
private int saveChildren(ReadOnlyList<INode> children, DataOutputStream out,
Counter counter) throws IOException {
private int saveChildren(ReadOnlyList<INode> children,
DataOutputStream out, boolean inSnapshot, Counter counter)
throws IOException {
// Write normal children INode.
out.writeInt(children.size());
int dirNum = 0;
int i = 0;
for(INode child : children) {
// print all children first
// TODO: for HDFS-5428, we cannot change the format/content of fsimage
// here, thus even if the parent directory is in snapshot, we still
// do not handle INodeUC as those stored in deleted list
saveINode2Image(child, out, false, referenceMap, counter);
if (child.isDirectory()) {
dirNum++;
} else if (inSnapshot && child.isFile()
&& child.asFile().isUnderConstruction()) {
this.snapshotUCMap.put(child.getId(),
(INodeFileUnderConstruction) child.asFile());
}
if (i++ % 50 == 0) {
context.checkCancelled();
@ -1054,14 +1113,15 @@ private int saveChildren(ReadOnlyList<INode> children, DataOutputStream out,
*
* @param current The current node
* @param out The DataoutputStream to write the image
* @param snapshot The possible snapshot associated with the current node
* @param toSaveSubtree Whether or not to save the subtree to fsimage. For
* reference node, its subtree may already have been
* saved before.
* @param inSnapshot Whether the current directory is in snapshot
* @param counter Counter to increment for namenode startup progress
*/
private void saveImage(INodeDirectory current, DataOutputStream out,
boolean toSaveSubtree, Counter counter) throws IOException {
boolean toSaveSubtree, boolean inSnapshot, Counter counter)
throws IOException {
// write the inode id of the directory
out.writeLong(current.getId());
@ -1090,7 +1150,7 @@ private void saveImage(INodeDirectory current, DataOutputStream out,
}
// 3. Write children INode
dirNum += saveChildren(children, out, counter);
dirNum += saveChildren(children, out, inSnapshot, counter);
// 4. Write DirectoryDiff lists, if there is any.
SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
@ -1105,14 +1165,14 @@ private void saveImage(INodeDirectory current, DataOutputStream out,
// make sure we only save the subtree under a reference node once
boolean toSave = child.isReference() ?
referenceMap.toProcessSubtree(child.getId()) : true;
saveImage(child.asDirectory(), out, toSave, counter);
saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
}
if (snapshotDirs != null) {
for (INodeDirectory subDir : snapshotDirs) {
// make sure we only save the subtree under a reference node once
boolean toSave = subDir.getParentReference() != null ?
referenceMap.toProcessSubtree(subDir.getId()) : true;
saveImage(subDir, out, toSave, counter);
saveImage(subDir, out, toSave, true, counter);
}
}
}

View File

@ -5929,19 +5929,40 @@ void unprotectedChangeLease(String src, String dst) {
/**
* Serializes leases.
*/
void saveFilesUnderConstruction(DataOutputStream out) throws IOException {
void saveFilesUnderConstruction(DataOutputStream out,
Map<Long, INodeFileUnderConstruction> snapshotUCMap) throws IOException {
// This is run by an inferior thread of saveNamespace, which holds a read
// lock on our behalf. If we took the read lock here, we could block
// for fairness if a writer is waiting on the lock.
synchronized (leaseManager) {
Map<String, INodeFileUnderConstruction> nodes =
leaseManager.getINodesUnderConstruction();
out.writeInt(nodes.size()); // write the size
for (Map.Entry<String, INodeFileUnderConstruction> entry
: nodes.entrySet()) {
// TODO: for HDFS-5428, because of rename operations, some
// under-construction files that are
// in the current fs directory can also be captured in the
// snapshotUCMap. We should remove them from the snapshotUCMap.
snapshotUCMap.remove(entry.getValue().getId());
}
out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
for (Map.Entry<String, INodeFileUnderConstruction> entry
: nodes.entrySet()) {
FSImageSerialization.writeINodeUnderConstruction(
out, entry.getValue(), entry.getKey());
}
for (Map.Entry<Long, INodeFileUnderConstruction> entry
: snapshotUCMap.entrySet()) {
// for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
// as their paths
StringBuilder b = new StringBuilder();
b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
.append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
.append(Path.SEPARATOR).append(entry.getValue().getId());
FSImageSerialization.writeINodeUnderConstruction(
out, entry.getValue(), b.toString());
}
}
}

View File

@ -205,6 +205,23 @@ private final <N extends INodeDirectory> N replaceSelf(final N newDir,
return newDir;
}
/**
* Used when load fileUC from fsimage. The file to be replaced is actually
* only in snapshot, thus may not be contained in the children list.
* See HDFS-5428 for details.
*/
public void replaceChildFileInSnapshot(INodeFile oldChild,
final INodeFile newChild) {
if (children != null) {
final int i = searchChildren(newChild.getLocalNameBytes());
if (i >= 0 && children.get(i).getId() == oldChild.getId()) {
// no need to consider reference node here, since we already do the
// replacement in FSImageFormat.Loader#loadFilesUnderConstruction
children.set(i, newChild);
}
}
}
/** Replace the given child with a new child. */
public void replaceChild(INode oldChild, final INode newChild,
final INodeMap inodeMap) {

View File

@ -38,6 +38,7 @@
import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
import org.apache.hadoop.hdfs.server.namenode.INodeDirectoryAttributes;
import org.apache.hadoop.hdfs.server.namenode.INodeDirectoryWithQuota;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.INodeMap;
import org.apache.hadoop.hdfs.server.namenode.INodeReference;
import org.apache.hadoop.hdfs.server.namenode.Quota;
@ -75,7 +76,7 @@ private final boolean replace(final ListType type,
final INode oldChild, final INode newChild) {
final List<INode> list = getList(type);
final int i = search(list, oldChild.getLocalNameBytes());
if (i < 0) {
if (i < 0 || list.get(i).getId() != oldChild.getId()) {
return false;
}
@ -592,6 +593,14 @@ public boolean removeChild(INode child, Snapshot latest,
return removed;
}
@Override
public void replaceChildFileInSnapshot(final INodeFile oldChild,
final INodeFile newChild) {
super.replaceChildFileInSnapshot(oldChild, newChild);
diffs.replaceChild(ListType.DELETED, oldChild, newChild);
diffs.replaceChild(ListType.CREATED, oldChild, newChild);
}
@Override
public void replaceChild(final INode oldChild, final INode newChild,
final INodeMap inodeMap) {

View File

@ -28,6 +28,11 @@
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.client.HdfsDataOutputStream.SyncFlag;
import org.apache.hadoop.hdfs.server.namenode.INodeId;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.security.AccessControlException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@ -76,6 +81,47 @@ public void testParentDirWithUCFileDeleteWithSnapShot() throws Exception {
cluster.restartNameNode();
}
@Test
public void testWithCheckpoint() throws Exception {
Path path = new Path("/test");
doWriteAndAbort(fs, path);
fs.delete(new Path("/test/test"), true);
NameNode nameNode = cluster.getNameNode();
NameNodeAdapter.enterSafeMode(nameNode, false);
NameNodeAdapter.saveNamespace(nameNode);
NameNodeAdapter.leaveSafeMode(nameNode);
cluster.restartNameNode(true);
// read snapshot file after restart
String test2snapshotPath = Snapshot.getSnapshotPath(path.toString(),
"s1/test/test2");
DFSTestUtil.readFile(fs, new Path(test2snapshotPath));
String test3snapshotPath = Snapshot.getSnapshotPath(path.toString(),
"s1/test/test3");
DFSTestUtil.readFile(fs, new Path(test3snapshotPath));
}
@Test
public void testFilesDeletionWithCheckpoint() throws Exception {
Path path = new Path("/test");
doWriteAndAbort(fs, path);
fs.delete(new Path("/test/test/test2"), true);
fs.delete(new Path("/test/test/test3"), true);
NameNode nameNode = cluster.getNameNode();
NameNodeAdapter.enterSafeMode(nameNode, false);
NameNodeAdapter.saveNamespace(nameNode);
NameNodeAdapter.leaveSafeMode(nameNode);
cluster.restartNameNode(true);
// read snapshot file after restart
String test2snapshotPath = Snapshot.getSnapshotPath(path.toString(),
"s1/test/test2");
DFSTestUtil.readFile(fs, new Path(test2snapshotPath));
String test3snapshotPath = Snapshot.getSnapshotPath(path.toString(),
"s1/test/test3");
DFSTestUtil.readFile(fs, new Path(test3snapshotPath));
}
private void doWriteAndAbort(DistributedFileSystem fs, Path path)
throws IOException {
fs.mkdirs(path);
@ -110,4 +156,55 @@ private void doWriteAndAbort(DistributedFileSystem fs, Path path)
DFSTestUtil.abortStream((DFSOutputStream) out2.getWrappedStream());
fs.createSnapshot(path, "s1");
}
@Test
public void testOpenFilesWithMultipleSnapshots() throws Exception {
doTestMultipleSnapshots(true);
}
@Test
public void testOpenFilesWithMultipleSnapshotsWithoutCheckpoint()
throws Exception {
doTestMultipleSnapshots(false);
}
private void doTestMultipleSnapshots(boolean saveNamespace)
throws IOException, AccessControlException {
Path path = new Path("/test");
doWriteAndAbort(fs, path);
fs.createSnapshot(path, "s2");
fs.delete(new Path("/test/test"), true);
fs.deleteSnapshot(path, "s2");
if (saveNamespace) {
NameNode nameNode = cluster.getNameNode();
NameNodeAdapter.enterSafeMode(nameNode, false);
NameNodeAdapter.saveNamespace(nameNode);
NameNodeAdapter.leaveSafeMode(nameNode);
}
cluster.restartNameNode(true);
}
@Test
public void testOpenFilesWithRename() throws Exception {
Path path = new Path("/test");
doWriteAndAbort(fs, path);
// check for zero sized blocks
Path fileWithEmptyBlock = new Path("/test/test/test4");
fs.create(fileWithEmptyBlock);
NamenodeProtocols nameNodeRpc = cluster.getNameNodeRpc();
String clientName = fs.getClient().getClientName();
// create one empty block
nameNodeRpc.addBlock(fileWithEmptyBlock.toString(), clientName, null, null,
INodeId.GRANDFATHER_INODE_ID, null);
fs.createSnapshot(path, "s2");
fs.rename(new Path("/test/test"), new Path("/test/test-renamed"));
fs.delete(new Path("/test/test-renamed"), true);
NameNode nameNode = cluster.getNameNode();
NameNodeAdapter.enterSafeMode(nameNode, false);
NameNodeAdapter.saveNamespace(nameNode);
NameNodeAdapter.leaveSafeMode(nameNode);
cluster.restartNameNode(true);
}
}