HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.
This commit is contained in:
parent
2fa7963c3d
commit
451265a83d
@@ -307,10 +307,8 @@ void syncBlock(List<BlockRecord> syncList) throws IOException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If any of the data-nodes failed, the recovery fails, because
|
// Abort if all failed.
|
||||||
// we never know the actual state of the replica on failed data-nodes.
|
if (successList.isEmpty()) {
|
||||||
// The recovery should be started over.
|
|
||||||
if (!failedList.isEmpty()) {
|
|
||||||
throw new IOException("Cannot recover " + block
|
throw new IOException("Cannot recover " + block
|
||||||
+ ", the following datanodes failed: " + failedList);
|
+ ", the following datanodes failed: " + failedList);
|
||||||
}
|
}
|
||||||
|
@@ -227,6 +227,50 @@ public void testBlockRecoveryWithLessMetafile() throws Exception {
|
|||||||
assertEquals(newFileLen, expectedNewFileLen);
|
assertEquals(newFileLen, expectedNewFileLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Block/lease recovery should be retried with failed nodes from the second
|
||||||
|
* stage removed to avoid perpetual recovery failures.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
|
||||||
|
Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
|
||||||
|
DistributedFileSystem dfs = cluster.getFileSystem();
|
||||||
|
|
||||||
|
// Create a file.
|
||||||
|
FSDataOutputStream out = dfs.create(file);
|
||||||
|
final int FILE_SIZE = 128 * 1024;
|
||||||
|
int count = 0;
|
||||||
|
while (count < FILE_SIZE) {
|
||||||
|
out.writeBytes("DE K9SUL");
|
||||||
|
count += 8;
|
||||||
|
}
|
||||||
|
out.hsync();
|
||||||
|
|
||||||
|
// Abort the original stream.
|
||||||
|
((DFSOutputStream) out.getWrappedStream()).abort();
|
||||||
|
|
||||||
|
LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
|
||||||
|
file.toString(), 0, count);
|
||||||
|
ExtendedBlock block = locations.get(0).getBlock();
|
||||||
|
|
||||||
|
// Finalize one replica to simulate a partial close failure.
|
||||||
|
cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
|
||||||
|
// Delete the meta file to simulate a rename/move failure.
|
||||||
|
cluster.deleteMeta(0, block);
|
||||||
|
|
||||||
|
// Try to recover the lease.
|
||||||
|
DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
|
||||||
|
.newInstance(cluster.getConfiguration(0));
|
||||||
|
count = 0;
|
||||||
|
while (count++ < 15 && !newDfs.recoverLease(file)) {
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
// The lease should have been recovered.
|
||||||
|
assertTrue("File should be closed", newDfs.recoverLease(file));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recover the lease on a file and append file from another client.
|
* Recover the lease on a file and append file from another client.
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user