HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.
This commit is contained in:
parent
2fa7963c3d
commit
451265a83d
@@ -307,10 +307,8 @@ void syncBlock(List<BlockRecord> syncList) throws IOException {
|
||||
}
|
||||
}
|
||||
|
||||
// If any of the data-nodes failed, the recovery fails, because
|
||||
// we never know the actual state of the replica on failed data-nodes.
|
||||
// The recovery should be started over.
|
||||
if (!failedList.isEmpty()) {
|
||||
// Abort if all failed.
|
||||
if (successList.isEmpty()) {
|
||||
throw new IOException("Cannot recover " + block
|
||||
+ ", the following datanodes failed: " + failedList);
|
||||
}
|
||||
|
@@ -227,6 +227,50 @@ public void testBlockRecoveryWithLessMetafile() throws Exception {
|
||||
assertEquals(newFileLen, expectedNewFileLen);
|
||||
}
|
||||
|
||||
/**
|
||||
* Block/lease recovery should be retried with failed nodes from the second
|
||||
* stage removed to avoid perpetual recovery failures.
|
||||
*/
|
||||
@Test
|
||||
public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
|
||||
Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
|
||||
DistributedFileSystem dfs = cluster.getFileSystem();
|
||||
|
||||
// Create a file.
|
||||
FSDataOutputStream out = dfs.create(file);
|
||||
final int FILE_SIZE = 128 * 1024;
|
||||
int count = 0;
|
||||
while (count < FILE_SIZE) {
|
||||
out.writeBytes("DE K9SUL");
|
||||
count += 8;
|
||||
}
|
||||
out.hsync();
|
||||
|
||||
// Abort the original stream.
|
||||
((DFSOutputStream) out.getWrappedStream()).abort();
|
||||
|
||||
LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
|
||||
file.toString(), 0, count);
|
||||
ExtendedBlock block = locations.get(0).getBlock();
|
||||
|
||||
// Finalize one replica to simulate a partial close failure.
|
||||
cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
|
||||
// Delete the meta file to simulate a rename/move failure.
|
||||
cluster.deleteMeta(0, block);
|
||||
|
||||
// Try to recover the lease.
|
||||
DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
|
||||
.newInstance(cluster.getConfiguration(0));
|
||||
count = 0;
|
||||
while (count++ < 15 && !newDfs.recoverLease(file)) {
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
// The lease should have been recovered.
|
||||
assertTrue("File should be closed", newDfs.recoverLease(file));
|
||||
}
|
||||
|
||||
/**
|
||||
* Recover the lease on a file and append file from another client.
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user