HDFS-4591. HA clients can fail to fail over while Standby NN is performing long checkpoint. Contributed by Aaron T. Myers.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1456107 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2013-03-13 19:51:19 +00:00
parent 1f3a18ff44
commit 3bf09c5150
10 changed files with 151 additions and 17 deletions

View File

@ -162,6 +162,9 @@ public static class DelayAnswer implements Answer<Object> {
private final CountDownLatch waitLatch = new CountDownLatch(1); private final CountDownLatch waitLatch = new CountDownLatch(1);
private final CountDownLatch resultLatch = new CountDownLatch(1); private final CountDownLatch resultLatch = new CountDownLatch(1);
private final AtomicInteger fireCounter = new AtomicInteger(0);
private final AtomicInteger resultCounter = new AtomicInteger(0);
// Result fields set after proceed() is called. // Result fields set after proceed() is called.
private volatile Throwable thrown; private volatile Throwable thrown;
private volatile Object returnValue; private volatile Object returnValue;
@ -188,6 +191,7 @@ public void proceed() {
@Override @Override
public Object answer(InvocationOnMock invocation) throws Throwable { public Object answer(InvocationOnMock invocation) throws Throwable {
LOG.info("DelayAnswer firing fireLatch"); LOG.info("DelayAnswer firing fireLatch");
fireCounter.getAndIncrement();
fireLatch.countDown(); fireLatch.countDown();
try { try {
LOG.info("DelayAnswer waiting on waitLatch"); LOG.info("DelayAnswer waiting on waitLatch");
@ -208,6 +212,7 @@ protected Object passThrough(InvocationOnMock invocation) throws Throwable {
thrown = t; thrown = t;
throw t; throw t;
} finally { } finally {
resultCounter.incrementAndGet();
resultLatch.countDown(); resultLatch.countDown();
} }
} }
@ -235,6 +240,14 @@ public Throwable getThrown() {
public Object getReturnValue() { public Object getReturnValue() {
return returnValue; return returnValue;
} }
public int getFireCount() {
return fireCounter.get();
}
public int getResultCount() {
return resultCounter.get();
}
} }
/** /**

View File

@ -411,6 +411,9 @@ Release 2.0.5-beta - UNRELEASED
HDFS-4583. TestNodeCount fails. (Ivan Mitic via suresh) HDFS-4583. TestNodeCount fails. (Ivan Mitic via suresh)
HDFS-4591. HA clients can fail to fail over while Standby NN is performing
long checkpoint. (atm)
Release 2.0.4-alpha - UNRELEASED Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -78,6 +78,7 @@ public DelegationTokenIdentifier createIdentifier() {
@Override //SecretManager @Override //SecretManager
public void checkAvailableForRead() throws StandbyException { public void checkAvailableForRead() throws StandbyException {
namesystem.checkOperation(OperationCategory.READ);
namesystem.readLock(); namesystem.readLock();
try { try {
namesystem.checkOperation(OperationCategory.READ); namesystem.checkOperation(OperationCategory.READ);

View File

@ -62,6 +62,7 @@
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats; import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
@ -874,9 +875,10 @@ public void verifyReplication(String src,
*/ */
public BlocksWithLocations getBlocks(DatanodeID datanode, long size public BlocksWithLocations getBlocks(DatanodeID datanode, long size
) throws IOException { ) throws IOException {
namesystem.checkOperation(OperationCategory.READ);
namesystem.readLock(); namesystem.readLock();
try { try {
namesystem.checkSuperuserPrivilege(); namesystem.checkOperation(OperationCategory.READ);
return getBlocksWithLocations(datanode, size); return getBlocksWithLocations(datanode, size);
} finally { } finally {
namesystem.readUnlock(); namesystem.readUnlock();

View File

@ -1129,8 +1129,10 @@ public boolean isInStandbyState() {
*/ */
void metaSave(String filename) throws IOException { void metaSave(String filename) throws IOException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.UNCHECKED);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.UNCHECKED);
File file = new File(System.getProperty("hadoop.log.dir"), filename); File file = new File(System.getProperty("hadoop.log.dir"), filename);
PrintWriter out = new PrintWriter(new BufferedWriter( PrintWriter out = new PrintWriter(new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(file, true), Charsets.UTF_8))); new OutputStreamWriter(new FileOutputStream(file, true), Charsets.UTF_8)));
@ -1204,6 +1206,7 @@ private void setPermissionInt(String src, FsPermission permission)
UnresolvedLinkException, IOException { UnresolvedLinkException, IOException {
HdfsFileStatus resultingStat = null; HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1241,6 +1244,7 @@ private void setOwnerInt(String src, String username, String group)
UnresolvedLinkException, IOException { UnresolvedLinkException, IOException {
HdfsFileStatus resultingStat = null; HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1351,13 +1355,20 @@ private LocatedBlocks getBlockLocationsUpdateTimes(String src,
throws FileNotFoundException, UnresolvedLinkException, IOException { throws FileNotFoundException, UnresolvedLinkException, IOException {
for (int attempt = 0; attempt < 2; attempt++) { for (int attempt = 0; attempt < 2; attempt++) {
if (attempt == 0) { // first attempt is with readlock boolean isReadOp = (attempt == 0);
if (isReadOp) { // first attempt is with readlock
checkOperation(OperationCategory.READ);
readLock(); readLock();
} else { // second attempt is with write lock } else { // second attempt is with write lock
checkOperation(OperationCategory.WRITE);
writeLock(); // writelock is needed to set accesstime writeLock(); // writelock is needed to set accesstime
} }
try { try {
checkOperation(OperationCategory.READ); if (isReadOp) {
checkOperation(OperationCategory.READ);
} else {
checkOperation(OperationCategory.WRITE);
}
// if the namenode is in safemode, then do not update access time // if the namenode is in safemode, then do not update access time
if (isInSafeMode()) { if (isInSafeMode()) {
@ -1370,7 +1381,7 @@ private LocatedBlocks getBlockLocationsUpdateTimes(String src,
if (now <= inode.getAccessTime() + getAccessTimePrecision()) { if (now <= inode.getAccessTime() + getAccessTimePrecision()) {
// if we have to set access time but we only have the readlock, then // if we have to set access time but we only have the readlock, then
// restart this entire operation with the writeLock. // restart this entire operation with the writeLock.
if (attempt == 0) { if (isReadOp) {
continue; continue;
} }
} }
@ -1380,7 +1391,7 @@ private LocatedBlocks getBlockLocationsUpdateTimes(String src,
inode.computeFileSize(false), inode.isUnderConstruction(), inode.computeFileSize(false), inode.isUnderConstruction(),
offset, length, needBlockToken); offset, length, needBlockToken);
} finally { } finally {
if (attempt == 0) { if (isReadOp) {
readUnlock(); readUnlock();
} else { } else {
writeUnlock(); writeUnlock();
@ -1436,6 +1447,7 @@ private void concatInt(String target, String [] srcs)
HdfsFileStatus resultingStat = null; HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1578,6 +1590,7 @@ private void setTimesInt(String src, long mtime, long atime)
} }
HdfsFileStatus resultingStat = null; HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1618,6 +1631,7 @@ private void createSymlinkInt(String target, String link,
throws IOException, UnresolvedLinkException { throws IOException, UnresolvedLinkException {
HdfsFileStatus resultingStat = null; HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1693,6 +1707,7 @@ private boolean setReplicationInt(final String src, final short replication)
blockManager.verifyReplication(src, replication, null); blockManager.verifyReplication(src, replication, null);
final boolean isFile; final boolean isFile;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1723,6 +1738,7 @@ private boolean setReplicationInt(final String src, final short replication)
long getPreferredBlockSize(String filename) long getPreferredBlockSize(String filename)
throws IOException, UnresolvedLinkException { throws IOException, UnresolvedLinkException {
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ); checkOperation(OperationCategory.READ);
@ -1785,6 +1801,7 @@ private HdfsFileStatus startFileInt(String src, PermissionStatus permissions,
boolean skipSync = false; boolean skipSync = false;
final HdfsFileStatus stat; final HdfsFileStatus stat;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -1982,6 +1999,7 @@ boolean recoverLease(String src, String holder, String clientMachine)
throws IOException { throws IOException {
boolean skipSync = false; boolean skipSync = false;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2119,6 +2137,7 @@ private LocatedBlock appendFileInt(String src, String holder, String clientMachi
} }
LocatedBlock lb = null; LocatedBlock lb = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2185,8 +2204,10 @@ LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
} }
// Part I. Analyze the state of the file with respect to the input data. // Part I. Analyze the state of the file with respect to the input data.
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ);
LocatedBlock[] onRetryBlock = new LocatedBlock[1]; LocatedBlock[] onRetryBlock = new LocatedBlock[1];
final INode[] inodes = analyzeFileState( final INode[] inodes = analyzeFileState(
src, fileId, clientName, previous, onRetryBlock).getINodes(); src, fileId, clientName, previous, onRetryBlock).getINodes();
@ -2213,8 +2234,10 @@ LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
// Allocate a new block, add it to the INode and the BlocksMap. // Allocate a new block, add it to the INode and the BlocksMap.
Block newBlock = null; Block newBlock = null;
long offset; long offset;
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE);
// Run the full analysis again, since things could have changed // Run the full analysis again, since things could have changed
// while chooseTarget() was executing. // while chooseTarget() was executing.
LocatedBlock[] onRetryBlock = new LocatedBlock[1]; LocatedBlock[] onRetryBlock = new LocatedBlock[1];
@ -2368,9 +2391,10 @@ LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk,
final DatanodeDescriptor clientnode; final DatanodeDescriptor clientnode;
final long preferredblocksize; final long preferredblocksize;
final List<DatanodeDescriptor> chosen; final List<DatanodeDescriptor> chosen;
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.READ);
//check safe mode //check safe mode
if (isInSafeMode()) { if (isInSafeMode()) {
throw new SafeModeException("Cannot add datanode; src=" + src throw new SafeModeException("Cannot add datanode; src=" + src
@ -2410,6 +2434,7 @@ LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk,
boolean abandonBlock(ExtendedBlock b, String src, String holder) boolean abandonBlock(ExtendedBlock b, String src, String holder)
throws LeaseExpiredException, FileNotFoundException, throws LeaseExpiredException, FileNotFoundException,
UnresolvedLinkException, IOException { UnresolvedLinkException, IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2487,6 +2512,7 @@ boolean completeFile(String src, String holder, ExtendedBlock last)
throws SafeModeException, UnresolvedLinkException, IOException { throws SafeModeException, UnresolvedLinkException, IOException {
checkBlock(last); checkBlock(last);
boolean success = false; boolean success = false;
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2653,6 +2679,7 @@ private boolean renameToInt(String src, String dst)
" to " + dst); " to " + dst);
} }
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2709,6 +2736,7 @@ void renameTo(String src, String dst, Options.Rename... options)
+ src + " to " + dst); + src + " to " + dst);
} }
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2795,6 +2823,7 @@ private boolean deleteInternal(String src, boolean recursive,
IOException { IOException {
BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo(); BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -2923,6 +2952,7 @@ HdfsFileStatus getFileInfo(String src, boolean resolveLink)
StandbyException, IOException { StandbyException, IOException {
HdfsFileStatus stat = null; HdfsFileStatus stat = null;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ); checkOperation(OperationCategory.READ);
@ -2965,6 +2995,7 @@ private boolean mkdirsInt(String src, PermissionStatus permissions,
NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src); NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
} }
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -3025,6 +3056,7 @@ ContentSummary getContentSummary(String src) throws AccessControlException,
FileNotFoundException, UnresolvedLinkException, StandbyException { FileNotFoundException, UnresolvedLinkException, StandbyException {
FSPermissionChecker pc = new FSPermissionChecker(fsOwnerShortUserName, FSPermissionChecker pc = new FSPermissionChecker(fsOwnerShortUserName,
supergroup); supergroup);
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ); checkOperation(OperationCategory.READ);
@ -3045,6 +3077,7 @@ ContentSummary getContentSummary(String src) throws AccessControlException,
void setQuota(String path, long nsQuota, long dsQuota) void setQuota(String path, long nsQuota, long dsQuota)
throws IOException, UnresolvedLinkException { throws IOException, UnresolvedLinkException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -3068,6 +3101,7 @@ void setQuota(String path, long nsQuota, long dsQuota)
void fsync(String src, String clientName, long lastBlockLength) void fsync(String src, String clientName, long lastBlockLength)
throws IOException, UnresolvedLinkException { throws IOException, UnresolvedLinkException {
NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName); NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -3272,6 +3306,7 @@ void commitBlockSynchronization(ExtendedBlock lastblock,
String[] newtargetstorages) String[] newtargetstorages)
throws IOException, UnresolvedLinkException { throws IOException, UnresolvedLinkException {
String src = ""; String src = "";
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -3375,6 +3410,7 @@ void commitBlockSynchronization(ExtendedBlock lastblock,
* Renew the lease(s) held by the given client * Renew the lease(s) held by the given client
*/ */
void renewLease(String holder) throws IOException { void renewLease(String holder) throws IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -3416,6 +3452,7 @@ private DirectoryListing getListingInt(String src, byte[] startAfter,
throws AccessControlException, UnresolvedLinkException, IOException { throws AccessControlException, UnresolvedLinkException, IOException {
DirectoryListing dl; DirectoryListing dl;
FSPermissionChecker pc = getPermissionChecker(); FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ); checkOperation(OperationCategory.READ);
@ -3702,10 +3739,12 @@ int getNumberOfDatanodes(DatanodeReportType type) {
} }
DatanodeInfo[] datanodeReport(final DatanodeReportType type DatanodeInfo[] datanodeReport(final DatanodeReportType type
) throws AccessControlException { ) throws AccessControlException, StandbyException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.UNCHECKED);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.UNCHECKED);
final DatanodeManager dm = getBlockManager().getDatanodeManager(); final DatanodeManager dm = getBlockManager().getDatanodeManager();
final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type); final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
@ -3729,8 +3768,10 @@ DatanodeInfo[] datanodeReport(final DatanodeReportType type
*/ */
void saveNamespace() throws AccessControlException, IOException { void saveNamespace() throws AccessControlException, IOException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.UNCHECKED);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.UNCHECKED);
if (!isInSafeMode()) { if (!isInSafeMode()) {
throw new IOException("Safe mode should be turned ON " + throw new IOException("Safe mode should be turned ON " +
"in order to create namespace image."); "in order to create namespace image.");
@ -3748,10 +3789,13 @@ void saveNamespace() throws AccessControlException, IOException {
* *
* @throws AccessControlException if superuser privilege is violated. * @throws AccessControlException if superuser privilege is violated.
*/ */
boolean restoreFailedStorage(String arg) throws AccessControlException { boolean restoreFailedStorage(String arg) throws AccessControlException,
StandbyException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.UNCHECKED);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.UNCHECKED);
// if it is disabled - enable it and vice versa. // if it is disabled - enable it and vice versa.
if(arg.equals("check")) if(arg.equals("check"))
@ -3772,6 +3816,7 @@ Date getStartTime() {
void finalizeUpgrade() throws IOException { void finalizeUpgrade() throws IOException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -4511,6 +4556,7 @@ String getSafeModeTip() {
CheckpointSignature rollEditLog() throws IOException { CheckpointSignature rollEditLog() throws IOException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.JOURNAL);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.JOURNAL); checkOperation(OperationCategory.JOURNAL);
@ -4528,6 +4574,7 @@ NamenodeCommand startCheckpoint(
NamenodeRegistration bnReg, // backup node NamenodeRegistration bnReg, // backup node
NamenodeRegistration nnReg) // active name-node NamenodeRegistration nnReg) // active name-node
throws IOException { throws IOException {
checkOperation(OperationCategory.CHECKPOINT);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.CHECKPOINT); checkOperation(OperationCategory.CHECKPOINT);
@ -4546,6 +4593,7 @@ NamenodeCommand startCheckpoint(
void endCheckpoint(NamenodeRegistration registration, void endCheckpoint(NamenodeRegistration registration,
CheckpointSignature sig) throws IOException { CheckpointSignature sig) throws IOException {
checkOperation(OperationCategory.CHECKPOINT);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.CHECKPOINT); checkOperation(OperationCategory.CHECKPOINT);
@ -4834,6 +4882,7 @@ private INodeFileUnderConstruction checkUCBlock(ExtendedBlock block,
* Client is reporting some bad block locations. * Client is reporting some bad block locations.
*/ */
void reportBadBlocks(LocatedBlock[] blocks) throws IOException { void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -4868,6 +4917,7 @@ void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
LocatedBlock updateBlockForPipeline(ExtendedBlock block, LocatedBlock updateBlockForPipeline(ExtendedBlock block,
String clientName) throws IOException { String clientName) throws IOException {
LocatedBlock locatedBlock; LocatedBlock locatedBlock;
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -4899,6 +4949,7 @@ LocatedBlock updateBlockForPipeline(ExtendedBlock block,
void updatePipeline(String clientName, ExtendedBlock oldBlock, void updatePipeline(String clientName, ExtendedBlock oldBlock,
ExtendedBlock newBlock, DatanodeID[] newNodes) ExtendedBlock newBlock, DatanodeID[] newNodes)
throws IOException { throws IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -5026,8 +5077,10 @@ void registerBackupNode(NamenodeRegistration bnReg,
*/ */
void releaseBackupNode(NamenodeRegistration registration) void releaseBackupNode(NamenodeRegistration registration)
throws IOException { throws IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE);
if(getFSImage().getStorage().getNamespaceID() if(getFSImage().getStorage().getNamespaceID()
!= registration.getNamespaceID()) != registration.getNamespaceID())
throw new IOException("Incompatible namespaceIDs: " throw new IOException("Incompatible namespaceIDs: "
@ -5066,6 +5119,7 @@ public String toString() {
Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path, Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
String[] cookieTab) throws IOException { String[] cookieTab) throws IOException {
checkSuperuserPrivilege(); checkSuperuserPrivilege();
checkOperation(OperationCategory.READ);
readLock(); readLock();
try { try {
checkOperation(OperationCategory.READ); checkOperation(OperationCategory.READ);
@ -5158,6 +5212,7 @@ DelegationTokenSecretManager getDelegationTokenSecretManager() {
Token<DelegationTokenIdentifier> getDelegationToken(Text renewer) Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
throws IOException { throws IOException {
Token<DelegationTokenIdentifier> token; Token<DelegationTokenIdentifier> token;
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -5204,6 +5259,7 @@ Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
long renewDelegationToken(Token<DelegationTokenIdentifier> token) long renewDelegationToken(Token<DelegationTokenIdentifier> token)
throws InvalidToken, IOException { throws InvalidToken, IOException {
long expiryTime; long expiryTime;
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);
@ -5236,6 +5292,7 @@ long renewDelegationToken(Token<DelegationTokenIdentifier> token)
*/ */
void cancelDelegationToken(Token<DelegationTokenIdentifier> token) void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
throws IOException { throws IOException {
checkOperation(OperationCategory.WRITE);
writeLock(); writeLock();
try { try {
checkOperation(OperationCategory.WRITE); checkOperation(OperationCategory.WRITE);

View File

@ -337,7 +337,6 @@ public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
throw new IllegalArgumentException( throw new IllegalArgumentException(
"Unexpected not positive size: "+size); "Unexpected not positive size: "+size);
} }
namesystem.checkOperation(OperationCategory.READ);
namesystem.checkSuperuserPrivilege(); namesystem.checkSuperuserPrivilege();
return namesystem.getBlockManager().getBlocks(datanode, size); return namesystem.getBlockManager().getBlocks(datanode, size);
} }
@ -707,7 +706,6 @@ public long[] getStats() throws IOException {
@Override // ClientProtocol @Override // ClientProtocol
public DatanodeInfo[] getDatanodeReport(DatanodeReportType type) public DatanodeInfo[] getDatanodeReport(DatanodeReportType type)
throws IOException { throws IOException {
namesystem.checkOperation(OperationCategory.UNCHECKED);
DatanodeInfo results[] = namesystem.datanodeReport(type); DatanodeInfo results[] = namesystem.datanodeReport(type);
if (results == null ) { if (results == null ) {
throw new IOException("Cannot find datanode report"); throw new IOException("Cannot find datanode report");
@ -732,19 +730,16 @@ public boolean setSafeMode(SafeModeAction action, boolean isChecked)
@Override // ClientProtocol @Override // ClientProtocol
public boolean restoreFailedStorage(String arg) throws IOException { public boolean restoreFailedStorage(String arg) throws IOException {
namesystem.checkOperation(OperationCategory.UNCHECKED);
return namesystem.restoreFailedStorage(arg); return namesystem.restoreFailedStorage(arg);
} }
@Override // ClientProtocol @Override // ClientProtocol
public void saveNamespace() throws IOException { public void saveNamespace() throws IOException {
namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.saveNamespace(); namesystem.saveNamespace();
} }
@Override // ClientProtocol @Override // ClientProtocol
public long rollEdits() throws AccessControlException, IOException { public long rollEdits() throws AccessControlException, IOException {
namesystem.checkOperation(OperationCategory.JOURNAL);
CheckpointSignature sig = namesystem.rollEditLog(); CheckpointSignature sig = namesystem.rollEditLog();
return sig.getCurSegmentTxId(); return sig.getCurSegmentTxId();
} }
@ -789,7 +784,6 @@ public void finalizeUpgrade() throws IOException {
@Override // ClientProtocol @Override // ClientProtocol
public void metaSave(String filename) throws IOException { public void metaSave(String filename) throws IOException {
namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.metaSave(filename); namesystem.metaSave(filename);
} }

View File

@ -18,7 +18,9 @@
package org.apache.hadoop.hdfs.server.namenode; package org.apache.hadoop.hdfs.server.namenode;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
import org.apache.hadoop.hdfs.util.RwLock; import org.apache.hadoop.hdfs.util.RwLock;
import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.AccessControlException;
/** Namesystem operations. */ /** Namesystem operations. */
@ -38,4 +40,6 @@ public interface Namesystem extends RwLock, SafeMode {
public boolean isGenStampInFuture(long generationStamp); public boolean isGenStampInFuture(long generationStamp);
public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal); public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal);
public void checkOperation(OperationCategory read) throws StandbyException;
} }

View File

@ -64,9 +64,17 @@ public interface HAContext {
void writeUnlock(); void writeUnlock();
/** /**
 * Verify that the given operation category is allowed in the current state.
 * This is to allow NN implementations (eg BackupNode) to override it with
 * node-specific handling.
*
* If the operation which is being checked will be taking the FSNS lock, it's
* advisable to check the operation category both immediately before and after
* taking the lock. This is because clients rely on the StandbyException
* thrown by this method in order to trigger client failover, and if a client
* first tries to contact the Standby NN, it could block for a long time if
* the Standby is holding the lock for a while, e.g. when performing a
* checkpoint. See HDFS-4591 for more details.
*/ */
void checkOperation(OperationCategory op) throws StandbyException; void checkOperation(OperationCategory op) throws StandbyException;

View File

@ -19,6 +19,7 @@
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -26,6 +27,8 @@
import java.net.URI; import java.net.URI;
import java.util.List; import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@ -43,7 +46,10 @@
import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.apache.hadoop.util.ThreadUtil;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -59,6 +65,8 @@ public class TestStandbyCheckpoints {
protected MiniDFSCluster cluster; protected MiniDFSCluster cluster;
protected NameNode nn0, nn1; protected NameNode nn0, nn1;
protected FileSystem fs; protected FileSystem fs;
private static final Log LOG = LogFactory.getLog(TestStandbyCheckpoints.class);
@SuppressWarnings("rawtypes") @SuppressWarnings("rawtypes")
@Before @Before
@ -231,6 +239,49 @@ public void testCheckpointCancellation() throws Exception {
assertTrue(canceledOne); assertTrue(canceledOne);
} }
/**
 * Make sure that clients will receive StandbyExceptions even when a
 * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
 * thread will have FSNS lock. Regression test for HDFS-4591.
 *
 * Flow: delay the SBN's FSImage#saveNamespace via a Mockito DelayAnswer,
 * trigger a checkpoint, then issue a client RPC to the SBN mid-checkpoint
 * and verify it fails fast with a StandbyException instead of blocking on
 * the namesystem lock held by the checkpointing thread.
 */
@Test(timeout=120000)
public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
// Set it up so that we know when the SBN checkpoint starts and ends.
// spyOnFsImage installs a Mockito spy on nn1's FSImage so saveNamespace
// calls can be intercepted below.
FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
DelayAnswer answerer = new DelayAnswer(LOG);
Mockito.doAnswer(answerer).when(spyImage1)
.saveNamespace(Mockito.any(FSNamesystem.class),
Mockito.any(Canceler.class));
// Perform some edits and wait for a checkpoint to start on the SBN.
doEdits(0, 2000);
// Rolling the edit log on the active NN prompts the standby to checkpoint.
nn0.getRpcServer().rollEditLog();
// Blocks until the spied saveNamespace is actually invoked on the SBN.
answerer.waitForCall();
// Let the real saveNamespace proceed; it will run for a while, holding
// the FSNS lock, before counting down the result latch.
answerer.proceed();
// fireCount == 1: saveNamespace was entered exactly once;
// resultCount == 0: it has not yet completed, i.e. checkpoint in progress.
assertTrue("SBN is not performing checkpoint but it should be.",
answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
// Make sure that the lock has actually been taken by the checkpointing
// thread.
// NOTE(review): a fixed 1s sleep is a heuristic here — it assumes the
// checkpointer acquires the FSNS lock within that window.
ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
try {
// Perform an RPC to the SBN and make sure it throws a StandbyException.
// This must return promptly (pre-lock checkOperation) rather than block
// behind the checkpoint's FSNS lock — the behavior HDFS-4591 fixed.
nn1.getRpcServer().getFileInfo("/");
fail("Should have thrown StandbyException, but instead succeeded.");
} catch (StandbyException se) {
GenericTestUtils.assertExceptionContains("is not supported", se);
}
// Make sure that the checkpoint is still going on, implying that the client
// RPC to the SBN happened during the checkpoint.
assertTrue("SBN should have still been checkpointing.",
answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
// Wait for saveNamespace to finish, then confirm exactly one checkpoint
// ran to completion.
answerer.waitForResult();
assertTrue("SBN should have finished checkpointing.",
answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
}
private void doEdits(int start, int stop) throws IOException { private void doEdits(int start, int stop) throws IOException {
for (int i = start; i < stop; i++) { for (int i = start; i < stop; i++) {

View File

@ -143,6 +143,7 @@ public void testDatanodeRestarts() throws Exception {
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
// We read from the standby to watch block locations // We read from the standby to watch block locations
HAUtil.setAllowStandbyReads(conf, true); HAUtil.setAllowStandbyReads(conf, true);
conf.setLong(DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
.nnTopology(MiniDFSNNTopology.simpleHATopology()) .nnTopology(MiniDFSNNTopology.simpleHATopology())