HDFS-2507. Allow saveNamespace operations to be canceled. Contributed by Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1190060 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
072ef952c3
commit
646e855f6e
@ -824,6 +824,8 @@ Release 0.23.0 - Unreleased
|
|||||||
HDFS-1869. mkdirs should use the supplied permission for all of the created
|
HDFS-1869. mkdirs should use the supplied permission for all of the created
|
||||||
directories. (Daryn Sharp via szetszwo)
|
directories. (Daryn Sharp via szetszwo)
|
||||||
|
|
||||||
|
HDFS-2507. Allow saveNamespace operations to be canceled. (todd)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HDFS-1458. Improve checkpoint performance by avoiding unnecessary image
|
HDFS-1458. Improve checkpoint performance by avoiding unnecessary image
|
||||||
|
@ -85,6 +85,8 @@ public class FSImage implements Closeable {
|
|||||||
|
|
||||||
private final NNStorageRetentionManager archivalManager;
|
private final NNStorageRetentionManager archivalManager;
|
||||||
|
|
||||||
|
private SaveNamespaceContext curSaveNamespaceContext = null;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct an FSImage
|
* Construct an FSImage
|
||||||
@ -715,14 +717,15 @@ private void loadFSImage(File curFile, MD5Hash expectedMd5,
|
|||||||
/**
|
/**
|
||||||
* Save the contents of the FS image to the file.
|
* Save the contents of the FS image to the file.
|
||||||
*/
|
*/
|
||||||
void saveFSImage(FSNamesystem source, StorageDirectory sd, long txid)
|
void saveFSImage(SaveNamespaceContext context, StorageDirectory sd)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
long txid = context.getTxId();
|
||||||
File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
|
File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
|
||||||
File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
|
File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
|
||||||
|
|
||||||
FSImageFormat.Saver saver = new FSImageFormat.Saver();
|
FSImageFormat.Saver saver = new FSImageFormat.Saver(context);
|
||||||
FSImageCompression compression = FSImageCompression.createCompression(conf);
|
FSImageCompression compression = FSImageCompression.createCompression(conf);
|
||||||
saver.save(newFile, txid, source, compression);
|
saver.save(newFile, compression);
|
||||||
|
|
||||||
MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
|
MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
|
||||||
storage.setMostRecentCheckpointTxId(txid);
|
storage.setMostRecentCheckpointTxId(txid);
|
||||||
@ -740,25 +743,24 @@ void saveFSImage(FSNamesystem source, StorageDirectory sd, long txid)
|
|||||||
* and writing it out.
|
* and writing it out.
|
||||||
*/
|
*/
|
||||||
private class FSImageSaver implements Runnable {
|
private class FSImageSaver implements Runnable {
|
||||||
|
private final SaveNamespaceContext context;
|
||||||
private StorageDirectory sd;
|
private StorageDirectory sd;
|
||||||
private List<StorageDirectory> errorSDs;
|
|
||||||
private final long txid;
|
|
||||||
private final FSNamesystem source;
|
|
||||||
|
|
||||||
FSImageSaver(FSNamesystem source, StorageDirectory sd,
|
public FSImageSaver(SaveNamespaceContext context, StorageDirectory sd) {
|
||||||
List<StorageDirectory> errorSDs, long txid) {
|
this.context = context;
|
||||||
this.source = source;
|
|
||||||
this.sd = sd;
|
this.sd = sd;
|
||||||
this.errorSDs = errorSDs;
|
|
||||||
this.txid = txid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
saveFSImage(source, sd, txid);
|
saveFSImage(context, sd);
|
||||||
|
} catch (SaveNamespaceCancelledException snce) {
|
||||||
|
LOG.info("Cancelled image saving for " + sd.getRoot() +
|
||||||
|
": " + snce.getMessage());
|
||||||
|
// don't report an error on the storage dir!
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
LOG.error("Unable to save image for " + sd.getRoot(), t);
|
LOG.error("Unable to save image for " + sd.getRoot(), t);
|
||||||
errorSDs.add(sd);
|
context.reportErrorOnStorageDirectory(sd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -784,7 +786,7 @@ private void waitForThreads(List<Thread> threads) {
|
|||||||
* Save the contents of the FS image to a new image file in each of the
|
* Save the contents of the FS image to a new image file in each of the
|
||||||
* current storage directories.
|
* current storage directories.
|
||||||
*/
|
*/
|
||||||
void saveNamespace(FSNamesystem source) throws IOException {
|
synchronized void saveNamespace(FSNamesystem source) throws IOException {
|
||||||
assert editLog != null : "editLog must be initialized";
|
assert editLog != null : "editLog must be initialized";
|
||||||
storage.attemptRestoreRemovedStorage();
|
storage.attemptRestoreRemovedStorage();
|
||||||
|
|
||||||
@ -800,46 +802,71 @@ void saveNamespace(FSNamesystem source) throws IOException {
|
|||||||
} finally {
|
} finally {
|
||||||
if (editLogWasOpen) {
|
if (editLogWasOpen) {
|
||||||
editLog.startLogSegment(imageTxId + 1, true);
|
editLog.startLogSegment(imageTxId + 1, true);
|
||||||
// Take this opportunity to note the current transaction
|
// Take this opportunity to note the current transaction.
|
||||||
|
// Even if the namespace save was cancelled, this marker
|
||||||
|
// is only used to determine what transaction ID is required
|
||||||
|
// for startup. So, it doesn't hurt to update it unnecessarily.
|
||||||
storage.writeTransactionIdFileToStorage(imageTxId + 1);
|
storage.writeTransactionIdFileToStorage(imageTxId + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void saveFSImageInAllDirs(FSNamesystem source, long txid)
|
void cancelSaveNamespace(String reason)
|
||||||
|
throws InterruptedException {
|
||||||
|
SaveNamespaceContext ctx = curSaveNamespaceContext;
|
||||||
|
if (ctx != null) {
|
||||||
|
ctx.cancel(reason); // waits until complete
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected synchronized void saveFSImageInAllDirs(FSNamesystem source, long txid)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
|
if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
|
||||||
throw new IOException("No image directories available!");
|
throw new IOException("No image directories available!");
|
||||||
}
|
}
|
||||||
|
|
||||||
List<StorageDirectory> errorSDs =
|
SaveNamespaceContext ctx = new SaveNamespaceContext(
|
||||||
Collections.synchronizedList(new ArrayList<StorageDirectory>());
|
source, txid);
|
||||||
|
curSaveNamespaceContext = ctx;
|
||||||
|
|
||||||
List<Thread> saveThreads = new ArrayList<Thread>();
|
try {
|
||||||
// save images into current
|
List<Thread> saveThreads = new ArrayList<Thread>();
|
||||||
for (Iterator<StorageDirectory> it
|
// save images into current
|
||||||
= storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
|
for (Iterator<StorageDirectory> it
|
||||||
StorageDirectory sd = it.next();
|
= storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
|
||||||
FSImageSaver saver = new FSImageSaver(source, sd, errorSDs, txid);
|
StorageDirectory sd = it.next();
|
||||||
Thread saveThread = new Thread(saver, saver.toString());
|
FSImageSaver saver = new FSImageSaver(ctx, sd);
|
||||||
saveThreads.add(saveThread);
|
Thread saveThread = new Thread(saver, saver.toString());
|
||||||
saveThread.start();
|
saveThreads.add(saveThread);
|
||||||
|
saveThread.start();
|
||||||
|
}
|
||||||
|
waitForThreads(saveThreads);
|
||||||
|
saveThreads.clear();
|
||||||
|
storage.reportErrorsOnDirectories(ctx.getErrorSDs());
|
||||||
|
|
||||||
|
if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
|
||||||
|
throw new IOException(
|
||||||
|
"Failed to save in any storage directories while saving namespace.");
|
||||||
|
}
|
||||||
|
if (ctx.isCancelled()) {
|
||||||
|
deleteCancelledCheckpoint(txid);
|
||||||
|
ctx.checkCancelled(); // throws
|
||||||
|
assert false : "should have thrown above!";
|
||||||
|
}
|
||||||
|
|
||||||
|
renameCheckpoint(txid);
|
||||||
|
|
||||||
|
// Since we now have a new checkpoint, we can clean up some
|
||||||
|
// old edit logs and checkpoints.
|
||||||
|
purgeOldStorage();
|
||||||
|
} finally {
|
||||||
|
// Notify any threads waiting on the checkpoint to be canceled
|
||||||
|
// that it is complete.
|
||||||
|
ctx.markComplete();
|
||||||
|
ctx = null;
|
||||||
}
|
}
|
||||||
waitForThreads(saveThreads);
|
|
||||||
saveThreads.clear();
|
|
||||||
storage.reportErrorsOnDirectories(errorSDs);
|
|
||||||
|
|
||||||
if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
|
|
||||||
throw new IOException(
|
|
||||||
"Failed to save in any storage directories while saving namespace.");
|
|
||||||
}
|
|
||||||
|
|
||||||
renameCheckpoint(txid);
|
|
||||||
|
|
||||||
// Since we now have a new checkpoint, we can clean up some
|
|
||||||
// old edit logs and checkpoints.
|
|
||||||
purgeOldStorage();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -874,6 +901,24 @@ private void renameCheckpoint(long txid) throws IOException {
|
|||||||
if(al != null) storage.reportErrorsOnDirectories(al);
|
if(al != null) storage.reportErrorsOnDirectories(al);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deletes the checkpoint file in every storage directory,
|
||||||
|
* since the checkpoint was cancelled.
|
||||||
|
*/
|
||||||
|
private void deleteCancelledCheckpoint(long txid) throws IOException {
|
||||||
|
ArrayList<StorageDirectory> al = Lists.newArrayList();
|
||||||
|
|
||||||
|
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
|
||||||
|
File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
|
||||||
|
if (ckpt.exists() && !ckpt.delete()) {
|
||||||
|
LOG.warn("Unable to delete cancelled checkpoint in " + sd);
|
||||||
|
al.add(sd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
storage.reportErrorsOnDirectories(al);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void renameCheckpointInDir(StorageDirectory sd, long txid)
|
private void renameCheckpointInDir(StorageDirectory sd, long txid)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
|
File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
|
||||||
@ -1055,4 +1100,5 @@ public String getBlockPoolID() {
|
|||||||
public synchronized long getLastAppliedTxId() {
|
public synchronized long getLastAppliedTxId() {
|
||||||
return lastAppliedTxId;
|
return lastAppliedTxId;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -508,6 +508,7 @@ byte[][] getParent(byte[][] path) {
|
|||||||
* functions may be used to retrieve information about the file that was written.
|
* functions may be used to retrieve information about the file that was written.
|
||||||
*/
|
*/
|
||||||
static class Saver {
|
static class Saver {
|
||||||
|
private final SaveNamespaceContext context;
|
||||||
/** Set to true once an image has been written */
|
/** Set to true once an image has been written */
|
||||||
private boolean saved = false;
|
private boolean saved = false;
|
||||||
|
|
||||||
@ -530,6 +531,11 @@ private void checkNotSaved() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Saver(SaveNamespaceContext context) {
|
||||||
|
this.context = context;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the MD5 checksum of the image file that was saved.
|
* Return the MD5 checksum of the image file that was saved.
|
||||||
*/
|
*/
|
||||||
@ -539,12 +545,11 @@ MD5Hash getSavedDigest() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void save(File newFile,
|
void save(File newFile,
|
||||||
long txid,
|
|
||||||
FSNamesystem sourceNamesystem,
|
|
||||||
FSImageCompression compression)
|
FSImageCompression compression)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
checkNotSaved();
|
checkNotSaved();
|
||||||
|
|
||||||
|
final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
|
||||||
FSDirectory fsDir = sourceNamesystem.dir;
|
FSDirectory fsDir = sourceNamesystem.dir;
|
||||||
long startTime = now();
|
long startTime = now();
|
||||||
//
|
//
|
||||||
@ -565,7 +570,7 @@ void save(File newFile,
|
|||||||
.getNamespaceID());
|
.getNamespaceID());
|
||||||
out.writeLong(fsDir.rootDir.numItemsInTree());
|
out.writeLong(fsDir.rootDir.numItemsInTree());
|
||||||
out.writeLong(sourceNamesystem.getGenerationStamp());
|
out.writeLong(sourceNamesystem.getGenerationStamp());
|
||||||
out.writeLong(txid);
|
out.writeLong(context.getTxId());
|
||||||
|
|
||||||
// write compression info and set up compressed stream
|
// write compression info and set up compressed stream
|
||||||
out = compression.writeHeaderAndWrapStream(fos);
|
out = compression.writeHeaderAndWrapStream(fos);
|
||||||
@ -581,10 +586,12 @@ void save(File newFile,
|
|||||||
saveImage(strbuf, fsDir.rootDir, out);
|
saveImage(strbuf, fsDir.rootDir, out);
|
||||||
// save files under construction
|
// save files under construction
|
||||||
sourceNamesystem.saveFilesUnderConstruction(out);
|
sourceNamesystem.saveFilesUnderConstruction(out);
|
||||||
|
context.checkCancelled();
|
||||||
sourceNamesystem.saveSecretManagerState(out);
|
sourceNamesystem.saveSecretManagerState(out);
|
||||||
strbuf = null;
|
strbuf = null;
|
||||||
|
context.checkCancelled();
|
||||||
out.flush();
|
out.flush();
|
||||||
|
context.checkCancelled();
|
||||||
fout.getChannel().force(true);
|
fout.getChannel().force(true);
|
||||||
} finally {
|
} finally {
|
||||||
out.close();
|
out.close();
|
||||||
@ -603,9 +610,10 @@ void save(File newFile,
|
|||||||
* This is a recursive procedure, which first saves all children of
|
* This is a recursive procedure, which first saves all children of
|
||||||
* a current directory and then moves inside the sub-directories.
|
* a current directory and then moves inside the sub-directories.
|
||||||
*/
|
*/
|
||||||
private static void saveImage(ByteBuffer currentDirName,
|
private void saveImage(ByteBuffer currentDirName,
|
||||||
INodeDirectory current,
|
INodeDirectory current,
|
||||||
DataOutputStream out) throws IOException {
|
DataOutputStream out) throws IOException {
|
||||||
|
context.checkCancelled();
|
||||||
List<INode> children = current.getChildrenRaw();
|
List<INode> children = current.getChildrenRaw();
|
||||||
if (children == null || children.isEmpty())
|
if (children == null || children.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
@ -2728,6 +2728,27 @@ void saveNamespace() throws AccessControlException, IOException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cancel an ongoing saveNamespace operation and wait for its
|
||||||
|
* threads to exit, if one is currently in progress.
|
||||||
|
*
|
||||||
|
* If no such operation is in progress, this call does nothing.
|
||||||
|
*
|
||||||
|
* @param reason a reason to be communicated to the caller saveNamespace
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
void cancelSaveNamespace(String reason) throws IOException {
|
||||||
|
readLock();
|
||||||
|
try {
|
||||||
|
checkSuperuserPrivilege();
|
||||||
|
getFSImage().cancelSaveNamespace(reason);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new IOException(e);
|
||||||
|
} finally {
|
||||||
|
readUnlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
|
* Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
|
||||||
* Requires superuser privilege.
|
* Requires superuser privilege.
|
||||||
|
@ -0,0 +1,28 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.namenode;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
class SaveNamespaceCancelledException extends IOException {
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
|
SaveNamespaceCancelledException(String cancelReason) {
|
||||||
|
super(cancelReason);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,98 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.namenode;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.CountDownLatch;
|
||||||
|
|
||||||
|
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Context for an ongoing SaveNamespace operation. This class
|
||||||
|
* allows cancellation, and also is responsible for accumulating
|
||||||
|
* failed storage directories.
|
||||||
|
*/
|
||||||
|
class SaveNamespaceContext {
|
||||||
|
private final FSNamesystem sourceNamesystem;
|
||||||
|
private final long txid;
|
||||||
|
private final List<StorageDirectory> errorSDs =
|
||||||
|
Collections.synchronizedList(new ArrayList<StorageDirectory>());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the operation has been canceled, set to the reason why
|
||||||
|
* it has been canceled (eg standby moving to active)
|
||||||
|
*/
|
||||||
|
private volatile String cancelReason = null;
|
||||||
|
|
||||||
|
private CountDownLatch completionLatch = new CountDownLatch(1);
|
||||||
|
|
||||||
|
SaveNamespaceContext(
|
||||||
|
FSNamesystem sourceNamesystem,
|
||||||
|
long txid) {
|
||||||
|
this.sourceNamesystem = sourceNamesystem;
|
||||||
|
this.txid = txid;
|
||||||
|
}
|
||||||
|
|
||||||
|
FSNamesystem getSourceNamesystem() {
|
||||||
|
return sourceNamesystem;
|
||||||
|
}
|
||||||
|
|
||||||
|
long getTxId() {
|
||||||
|
return txid;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reportErrorOnStorageDirectory(StorageDirectory sd) {
|
||||||
|
errorSDs.add(sd);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<StorageDirectory> getErrorSDs() {
|
||||||
|
return errorSDs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Requests that the current saveNamespace operation be
|
||||||
|
* canceled if it is still running.
|
||||||
|
* @param reason the reason why cancellation is requested
|
||||||
|
* @throws InterruptedException
|
||||||
|
*/
|
||||||
|
void cancel(String reason) throws InterruptedException {
|
||||||
|
this.cancelReason = reason;
|
||||||
|
completionLatch.await();
|
||||||
|
}
|
||||||
|
|
||||||
|
void markComplete() {
|
||||||
|
Preconditions.checkState(completionLatch.getCount() == 1,
|
||||||
|
"Context already completed!");
|
||||||
|
completionLatch.countDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkCancelled() throws SaveNamespaceCancelledException {
|
||||||
|
if (cancelReason != null) {
|
||||||
|
throw new SaveNamespaceCancelledException(
|
||||||
|
cancelReason);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isCancelled() {
|
||||||
|
return cancelReason != null;
|
||||||
|
}
|
||||||
|
}
|
@ -42,7 +42,7 @@ public abstract class MD5FileUtils {
|
|||||||
private static final Log LOG = LogFactory.getLog(
|
private static final Log LOG = LogFactory.getLog(
|
||||||
MD5FileUtils.class);
|
MD5FileUtils.class);
|
||||||
|
|
||||||
private static final String MD5_SUFFIX = ".md5";
|
public static final String MD5_SUFFIX = ".md5";
|
||||||
private static final Pattern LINE_REGEX =
|
private static final Pattern LINE_REGEX =
|
||||||
Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
|
Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
|
||||||
|
|
||||||
|
@ -29,6 +29,10 @@
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
@ -44,6 +48,9 @@
|
|||||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
||||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
|
||||||
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
|
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
|
||||||
|
import org.apache.hadoop.hdfs.util.MD5FileUtils;
|
||||||
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
|
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
|
||||||
import org.apache.log4j.Level;
|
import org.apache.log4j.Level;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
@ -124,22 +131,25 @@ private void saveNamespaceWithInjectedFault(Fault fault) throws Exception {
|
|||||||
case SAVE_SECOND_FSIMAGE_RTE:
|
case SAVE_SECOND_FSIMAGE_RTE:
|
||||||
// The spy throws a RuntimeException when writing to the second directory
|
// The spy throws a RuntimeException when writing to the second directory
|
||||||
doAnswer(new FaultySaveImage(true)).
|
doAnswer(new FaultySaveImage(true)).
|
||||||
when(spyImage).saveFSImage(Mockito.eq(fsn),
|
when(spyImage).saveFSImage(
|
||||||
(StorageDirectory)anyObject(), anyLong());
|
(SaveNamespaceContext)anyObject(),
|
||||||
|
(StorageDirectory)anyObject());
|
||||||
shouldFail = false;
|
shouldFail = false;
|
||||||
break;
|
break;
|
||||||
case SAVE_SECOND_FSIMAGE_IOE:
|
case SAVE_SECOND_FSIMAGE_IOE:
|
||||||
// The spy throws an IOException when writing to the second directory
|
// The spy throws an IOException when writing to the second directory
|
||||||
doAnswer(new FaultySaveImage(false)).
|
doAnswer(new FaultySaveImage(false)).
|
||||||
when(spyImage).saveFSImage(Mockito.eq(fsn),
|
when(spyImage).saveFSImage(
|
||||||
(StorageDirectory)anyObject(), anyLong());
|
(SaveNamespaceContext)anyObject(),
|
||||||
|
(StorageDirectory)anyObject());
|
||||||
shouldFail = false;
|
shouldFail = false;
|
||||||
break;
|
break;
|
||||||
case SAVE_ALL_FSIMAGES:
|
case SAVE_ALL_FSIMAGES:
|
||||||
// The spy throws IOException in all directories
|
// The spy throws IOException in all directories
|
||||||
doThrow(new RuntimeException("Injected")).
|
doThrow(new RuntimeException("Injected")).
|
||||||
when(spyImage).saveFSImage(Mockito.eq(fsn),
|
when(spyImage).saveFSImage(
|
||||||
(StorageDirectory)anyObject(), anyLong());
|
(SaveNamespaceContext)anyObject(),
|
||||||
|
(StorageDirectory)anyObject());
|
||||||
shouldFail = true;
|
shouldFail = true;
|
||||||
break;
|
break;
|
||||||
case WRITE_STORAGE_ALL:
|
case WRITE_STORAGE_ALL:
|
||||||
@ -363,9 +373,9 @@ public void doTestFailedSaveNamespace(boolean restoreStorageAfterFailure)
|
|||||||
FSNamesystem.getNamespaceEditsDirs(conf));
|
FSNamesystem.getNamespaceEditsDirs(conf));
|
||||||
|
|
||||||
doThrow(new IOException("Injected fault: saveFSImage")).
|
doThrow(new IOException("Injected fault: saveFSImage")).
|
||||||
when(spyImage).saveFSImage(
|
when(spyImage).saveFSImage(
|
||||||
Mockito.eq(fsn), (StorageDirectory)anyObject(),
|
(SaveNamespaceContext)anyObject(),
|
||||||
Mockito.anyLong());
|
(StorageDirectory)anyObject());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
doAnEdit(fsn, 1);
|
doAnEdit(fsn, 1);
|
||||||
@ -479,6 +489,84 @@ public void testTxIdPersistence() throws Exception {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(timeout=20000)
|
||||||
|
public void testCancelSaveNamespace() throws Exception {
|
||||||
|
Configuration conf = getConf();
|
||||||
|
NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
|
||||||
|
DFSTestUtil.formatNameNode(conf);
|
||||||
|
FSNamesystem fsn = FSNamesystem.loadFromDisk(conf);
|
||||||
|
|
||||||
|
// Replace the FSImage with a spy
|
||||||
|
final FSImage image = fsn.dir.fsImage;
|
||||||
|
NNStorage storage = image.getStorage();
|
||||||
|
storage.close(); // unlock any directories that FSNamesystem's initialization may have locked
|
||||||
|
storage.setStorageDirectories(
|
||||||
|
FSNamesystem.getNamespaceDirs(conf),
|
||||||
|
FSNamesystem.getNamespaceEditsDirs(conf));
|
||||||
|
|
||||||
|
FSNamesystem spyFsn = spy(fsn);
|
||||||
|
final FSNamesystem finalFsn = spyFsn;
|
||||||
|
DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG);
|
||||||
|
doAnswer(delayer).when(spyFsn).getGenerationStamp();
|
||||||
|
|
||||||
|
ExecutorService pool = Executors.newFixedThreadPool(2);
|
||||||
|
|
||||||
|
try {
|
||||||
|
doAnEdit(fsn, 1);
|
||||||
|
|
||||||
|
// Save namespace
|
||||||
|
fsn.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
|
||||||
|
try {
|
||||||
|
Future<Void> saverFuture = pool.submit(new Callable<Void>() {
|
||||||
|
@Override
|
||||||
|
public Void call() throws Exception {
|
||||||
|
image.saveNamespace(finalFsn);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Wait until saveNamespace calls getGenerationStamp
|
||||||
|
delayer.waitForCall();
|
||||||
|
// then cancel the saveNamespace
|
||||||
|
Future<Void> cancelFuture = pool.submit(new Callable<Void>() {
|
||||||
|
public Void call() throws Exception {
|
||||||
|
image.cancelSaveNamespace("cancelled");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
// give the cancel call time to run
|
||||||
|
Thread.sleep(500);
|
||||||
|
|
||||||
|
// allow saveNamespace to proceed - it should check the cancel flag after
|
||||||
|
// this point and throw an exception
|
||||||
|
delayer.proceed();
|
||||||
|
|
||||||
|
cancelFuture.get();
|
||||||
|
saverFuture.get();
|
||||||
|
fail("saveNamespace did not fail even though cancelled!");
|
||||||
|
} catch (Throwable t) {
|
||||||
|
GenericTestUtils.assertExceptionContains(
|
||||||
|
"SaveNamespaceCancelledException", t);
|
||||||
|
}
|
||||||
|
LOG.info("Successfully cancelled a saveNamespace");
|
||||||
|
|
||||||
|
|
||||||
|
// Check that we have only the original image and not any
|
||||||
|
// cruft left over from half-finished images
|
||||||
|
FSImageTestUtil.logStorageContents(LOG, storage);
|
||||||
|
for (StorageDirectory sd : storage.dirIterable(null)) {
|
||||||
|
File curDir = sd.getCurrentDir();
|
||||||
|
GenericTestUtils.assertGlobEquals(curDir, "fsimage_.*",
|
||||||
|
NNStorage.getImageFileName(0),
|
||||||
|
NNStorage.getImageFileName(0) + MD5FileUtils.MD5_SUFFIX);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
if (fsn != null) {
|
||||||
|
fsn.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void doAnEdit(FSNamesystem fsn, int id) throws IOException {
|
private void doAnEdit(FSNamesystem fsn, int id) throws IOException {
|
||||||
// Make an edit
|
// Make an edit
|
||||||
fsn.mkdirs(
|
fsn.mkdirs(
|
||||||
|
Loading…
Reference in New Issue
Block a user