HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in checkLeases (Ravi Prakash via Colin P. McCabe)

This commit is contained in:
Colin Patrick Mccabe 2014-11-24 10:46:33 -08:00
parent 555fa2d9d0
commit daacbc18d7
3 changed files with 91 additions and 36 deletions

View File

@ -6226,26 +6226,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
// Calculate number of blocks under construction // Calculate number of blocks under construction
long numUCBlocks = 0; long numUCBlocks = 0;
readLock(); readLock();
numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
try { try {
for (Lease lease : leaseManager.getSortedLeases()) {
for (String path : lease.getPaths()) {
final INodeFile cons;
try {
cons = dir.getINode(path).asFile();
Preconditions.checkState(cons.isUnderConstruction());
} catch (UnresolvedLinkException e) {
throw new AssertionError("Lease files should reside on this FS");
}
BlockInfo[] blocks = cons.getBlocks();
if(blocks == null)
continue;
for(BlockInfo b : blocks) {
if(!b.isComplete())
numUCBlocks++;
}
}
}
LOG.info("Number of blocks under construction: " + numUCBlocks);
return getBlocksTotal() - numUCBlocks; return getBlocksTotal() - numUCBlocks;
} finally { } finally {
readUnlock(); readUnlock();

View File

@ -25,8 +25,9 @@ import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NavigableSet;
import java.util.NoSuchElementException;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
@ -36,6 +37,7 @@ import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.Daemon;
@ -79,7 +81,7 @@ public class LeaseManager {
// //
private final SortedMap<String, Lease> leases = new TreeMap<String, Lease>(); private final SortedMap<String, Lease> leases = new TreeMap<String, Lease>();
// Set of: Lease // Set of: Lease
private final SortedSet<Lease> sortedLeases = new TreeSet<Lease>(); private final NavigableSet<Lease> sortedLeases = new TreeSet<Lease>();
// //
// Map path names to leases. It is protected by the sortedLeases lock. // Map path names to leases. It is protected by the sortedLeases lock.
@ -95,8 +97,41 @@ public class LeaseManager {
Lease getLease(String holder) { Lease getLease(String holder) {
return leases.get(holder); return leases.get(holder);
} }
SortedSet<Lease> getSortedLeases() {return sortedLeases;} @VisibleForTesting
int getNumSortedLeases() {return sortedLeases.size();}
/**
* This method iterates through all the leases and counts the number of blocks
* which are not COMPLETE. The FSNamesystem read lock MUST be held before
* calling this method.
* @return
*/
synchronized long getNumUnderConstructionBlocks() {
assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
+ "acquired before counting under construction blocks";
long numUCBlocks = 0;
for (Lease lease : sortedLeases) {
for (String path : lease.getPaths()) {
final INodeFile cons;
try {
cons = this.fsnamesystem.getFSDirectory().getINode(path).asFile();
Preconditions.checkState(cons.isUnderConstruction());
} catch (UnresolvedLinkException e) {
throw new AssertionError("Lease files should reside on this FS");
}
BlockInfo[] blocks = cons.getBlocks();
if(blocks == null)
continue;
for(BlockInfo b : blocks) {
if(!b.isComplete())
numUCBlocks++;
}
}
}
LOG.info("Number of blocks under construction: " + numUCBlocks);
return numUCBlocks;
}
/** @return the lease containing src */ /** @return the lease containing src */
public Lease getLeaseByPath(String src) {return sortedLeasesByPath.get(src);} public Lease getLeaseByPath(String src) {return sortedLeasesByPath.get(src);}
@ -421,33 +456,38 @@ public class LeaseManager {
/** Check the leases beginning from the oldest. /** Check the leases beginning from the oldest.
* @return true is sync is needed. * @return true is sync is needed.
*/ */
private synchronized boolean checkLeases() { @VisibleForTesting
synchronized boolean checkLeases() {
boolean needSync = false; boolean needSync = false;
assert fsnamesystem.hasWriteLock(); assert fsnamesystem.hasWriteLock();
for(; sortedLeases.size() > 0; ) { Lease leaseToCheck = null;
final Lease oldest = sortedLeases.first(); try {
if (!oldest.expiredHardLimit()) { leaseToCheck = sortedLeases.first();
return needSync; } catch(NoSuchElementException e) {}
while(leaseToCheck != null) {
if (!leaseToCheck.expiredHardLimit()) {
break;
} }
LOG.info(oldest + " has expired hard limit"); LOG.info(leaseToCheck + " has expired hard limit");
final List<String> removing = new ArrayList<String>(); final List<String> removing = new ArrayList<String>();
// need to create a copy of the oldest lease paths, becuase // need to create a copy of the oldest lease paths, because
// internalReleaseLease() removes paths corresponding to empty files, // internalReleaseLease() removes paths corresponding to empty files,
// i.e. it needs to modify the collection being iterated over // i.e. it needs to modify the collection being iterated over
// causing ConcurrentModificationException // causing ConcurrentModificationException
String[] leasePaths = new String[oldest.getPaths().size()]; String[] leasePaths = new String[leaseToCheck.getPaths().size()];
oldest.getPaths().toArray(leasePaths); leaseToCheck.getPaths().toArray(leasePaths);
for(String p : leasePaths) { for(String p : leasePaths) {
try { try {
boolean completed = fsnamesystem.internalReleaseLease(oldest, p, boolean completed = fsnamesystem.internalReleaseLease(leaseToCheck, p,
HdfsServerConstants.NAMENODE_LEASE_HOLDER); HdfsServerConstants.NAMENODE_LEASE_HOLDER);
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
if (completed) { if (completed) {
LOG.debug("Lease recovery for " + p + " is complete. File closed."); LOG.debug("Lease recovery for " + p + " is complete. File closed.");
} else { } else {
LOG.debug("Started block recovery " + p + " lease " + oldest); LOG.debug("Started block recovery " + p + " lease " + leaseToCheck);
} }
} }
// If a lease recovery happened, we need to sync later. // If a lease recovery happened, we need to sync later.
@ -456,15 +496,23 @@ public class LeaseManager {
} }
} catch (IOException e) { } catch (IOException e) {
LOG.error("Cannot release the path " + p + " in the lease " LOG.error("Cannot release the path " + p + " in the lease "
+ oldest, e); + leaseToCheck, e);
removing.add(p); removing.add(p);
} }
} }
for(String p : removing) { for(String p : removing) {
removeLease(oldest, p); removeLease(leaseToCheck, p);
} }
leaseToCheck = sortedLeases.higher(leaseToCheck);
} }
try {
if(leaseToCheck != sortedLeases.first()) {
LOG.warn("Unable to release hard-limit expired lease: "
+ sortedLeases.first());
}
} catch(NoSuchElementException e) {}
return needSync; return needSync;
} }

View File

@ -17,6 +17,7 @@
*/ */
package org.apache.hadoop.hdfs.server.namenode; package org.apache.hadoop.hdfs.server.namenode;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull; import static org.junit.Assert.assertNull;
@ -54,4 +55,28 @@ public class TestLeaseManager {
assertNull(lm.getLeaseByPath("/a/b")); assertNull(lm.getLeaseByPath("/a/b"));
assertNull(lm.getLeaseByPath("/a/c")); assertNull(lm.getLeaseByPath("/a/c"));
} }
/** Check that even if LeaseManager.checkLease is not able to relinquish
* leases, the Namenode does't enter an infinite loop while holding the FSN
* write lock and thus become unresponsive
*/
@Test (timeout=1000)
public void testCheckLeaseNotInfiniteLoop() {
FSNamesystem fsn = Mockito.mock(FSNamesystem.class);
Mockito.when(fsn.isRunning()).thenReturn(true);
Mockito.when(fsn.hasWriteLock()).thenReturn(true);
LeaseManager lm = new LeaseManager(fsn);
//Make sure the leases we are going to add exceed the hard limit
lm.setLeasePeriod(0,0);
//Add some leases to the LeaseManager
lm.addLease("holder1", "src1");
lm.addLease("holder2", "src2");
lm.addLease("holder3", "src3");
assertEquals(lm.getNumSortedLeases(), 3);
//Initiate a call to checkLease. This should exit within the test timeout
lm.checkLeases();
}
} }