HDFS-3493. Invalidate corrupted blocks as long as minimum replication is satisfied. Contributed by Juan Yu and Vinayakumar B.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1602291 13f79535-47bb-0310-9956-ffa450edef68
Andrew Wang 2014-06-12 21:06:31 +00:00
parent befa4bb1ed
commit be01103af7
4 changed files with 96 additions and 8 deletions

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -432,6 +432,9 @@ Release 2.5.0 - UNRELEASED
     HDFS-6395. Skip checking xattr limits for non-user-visible namespaces.
     (Yi Liu via wang).
 
+    HDFS-3493. Invalidate corrupted blocks as long as minimum replication is
+    satisfied. (Juan Yu and Vinayakumar B via wang)
+
   OPTIMIZATIONS
 
     HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -1096,8 +1096,9 @@ public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk,
           + blk + " not found");
       return;
     }
-    markBlockAsCorrupt(new BlockToMarkCorrupt(storedBlock, reason,
-        Reason.CORRUPTION_REPORTED), dn, storageID);
+    markBlockAsCorrupt(new BlockToMarkCorrupt(storedBlock,
+        blk.getGenerationStamp(), reason, Reason.CORRUPTION_REPORTED),
+        dn, storageID);
   }
 
   private void markBlockAsCorrupt(BlockToMarkCorrupt b,
@@ -1123,7 +1124,25 @@ private void markBlockAsCorrupt(BlockToMarkCorrupt b,
     // Add this replica to corruptReplicas Map
     corruptReplicas.addToCorruptReplicasMap(b.corrupted, node, b.reason,
         b.reasonCode);
-    if (countNodes(b.stored).liveReplicas() >= bc.getBlockReplication()) {
+
+    NumberReplicas numberOfReplicas = countNodes(b.stored);
+    boolean hasEnoughLiveReplicas = numberOfReplicas.liveReplicas() >= bc
+        .getBlockReplication();
+    boolean minReplicationSatisfied =
+        numberOfReplicas.liveReplicas() >= minReplication;
+    boolean hasMoreCorruptReplicas = minReplicationSatisfied &&
+        (numberOfReplicas.liveReplicas() + numberOfReplicas.corruptReplicas()) >
+            bc.getBlockReplication();
+    boolean corruptedDuringWrite = minReplicationSatisfied &&
+        (b.stored.getGenerationStamp() > b.corrupted.getGenerationStamp());
+    // case 1: there are enough live replicas
+    // case 2: corrupt replicas + live replicas > replication factor
+    // case 3: the block was marked corrupt by a failure during the write; its
+    //         genstamp differs from that of the valid block
+    // In all these cases the replica can be deleted; in case 3 the rbw block
+    // is deleted and the valid block can then be re-replicated.
+    if (hasEnoughLiveReplicas || hasMoreCorruptReplicas
+        || corruptedDuringWrite) {
       // the block is over-replicated so invalidate the replicas immediately
       invalidateBlock(b, node);
     } else if (namesystem.isPopulatingReplQueues()) {
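
Restated outside the diff: with this change, a newly reported corrupt replica is invalidated immediately whenever deleting it cannot drop the block below minimum replication; previously only case 1 (enough live replicas) triggered immediate invalidation. A minimal sketch of the predicate, paraphrasing the hunk above (the class and method names here are illustrative, not Hadoop's):

// Paraphrase of the new markBlockAsCorrupt() decision (HDFS-3493).
// 'replication' is the file's replication factor, 'minReplication' the
// configured dfs.namenode.replication.min.
public final class CorruptReplicaPolicySketch {
  static boolean canInvalidateNow(int live, int corrupt, int replication,
      int minReplication, long storedGenStamp, long corruptedGenStamp) {
    boolean hasEnoughLiveReplicas = live >= replication;           // case 1
    boolean minReplicationSatisfied = live >= minReplication;
    boolean hasMoreCorruptReplicas = minReplicationSatisfied
        && (live + corrupt) > replication;                         // case 2
    boolean corruptedDuringWrite = minReplicationSatisfied
        && storedGenStamp > corruptedGenStamp;                     // case 3
    return hasEnoughLiveReplicas || hasMoreCorruptReplicas
        || corruptedDuringWrite;
  }
}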

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java

@@ -25,13 +25,16 @@
 import java.io.OutputStream;
 import java.io.RandomAccessFile;
 import java.net.InetSocketAddress;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.util.concurrent.TimeoutException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -453,4 +456,66 @@ private void changeBlockLen(MiniDFSCluster cluster, int lenDelta)
     }
     fs.delete(fileName, true);
   }
+
+  /**
+   * Test that corrupted blocks get re-replicated as long as the number of
+   * good replicas is at least equal to minReplication.
+   *
+   * Simulates rbw blocks by creating dummy copies, then restarts the DNs
+   * so that the corrupted blocks are detected as soon as possible.
+   */
+  @Test(timeout=30000)
+  public void testReplicationWhenBlockCorruption() throws Exception {
+    MiniDFSCluster cluster = null;
+    try {
+      Configuration conf = new HdfsConfiguration();
+      conf.setLong(
+          DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, 1);
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+      FileSystem fs = cluster.getFileSystem();
+      FSDataOutputStream create = fs.create(new Path("/test"));
+      fs.setReplication(new Path("/test"), (short) 1);
+      create.write(new byte[1024]);
+      create.close();
+
+      List<File> nonParticipatedNodeDirs = new ArrayList<File>();
+      File participatedNodeDirs = null;
+      for (int i = 0; i < cluster.getDataNodes().size(); i++) {
+        File storageDir = cluster.getInstanceStorageDir(i, 0);
+        String bpid = cluster.getNamesystem().getBlockPoolId();
+        File data_dir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
+        if (data_dir.listFiles().length == 0) {
+          nonParticipatedNodeDirs.add(data_dir);
+        } else {
+          participatedNodeDirs = data_dir;
+        }
+      }
+
+      String blockFile = null;
+      File[] listFiles = participatedNodeDirs.listFiles();
+      for (File file : listFiles) {
+        if (file.getName().startsWith("blk_")
+            && !file.getName().endsWith("meta")) {
+          blockFile = file.getName();
+          for (File file1 : nonParticipatedNodeDirs) {
+            file1.mkdirs();
+            new File(file1, blockFile).createNewFile();
+            new File(file1, blockFile + "_1000.meta").createNewFile();
+          }
+          break;
+        }
+      }
+
+      fs.setReplication(new Path("/test"), (short) 3);
+      // Restart the DNs so that all of them detect the dummy copied blocks.
+      cluster.restartDataNodes();
+      cluster.waitActive();
+      cluster.triggerBlockReports();
+      DFSTestUtil.waitReplication(fs, new Path("/test"), (short) 3);
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
 }
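
For context on what the dummy files imitate: a finalized HDFS replica on disk is a pair of files, the block data and a checksum file whose name embeds the generation stamp. The test reuses whatever blk_* name it finds on the participating node and hardcodes generation stamp 1000 for the fabricated meta file. A self-contained sketch of the same trick (the directory and block id below are hypothetical):

import java.io.File;
import java.io.IOException;

// Illustrative only: fabricate an empty finalized-replica file pair the way
// the test above does. A restarted DataNode will report it as a replica,
// but its contents cannot match the NameNode's record of the block.
public class FakeReplicaSketch {
  public static void main(String[] args) throws IOException {
    File dataDir = new File("/tmp/finalized");    // stands in for a DN data dir
    dataDir.mkdirs();
    new File(dataDir, "blk_1073741825").createNewFile();           // block data
    new File(dataDir, "blk_1073741825_1000.meta").createNewFile(); // genstamp 1000
  }
}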

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java

@@ -410,6 +410,7 @@ public void blockReport_06() throws Exception {
    * The second datanode is started in the cluster.
    * As soon as the replication process is completed test finds a block from
    * the second DN and sets its GS to be < of original one.
+   * This is markBlockAsCorrupt case 3, so we expect one pending deletion.
    * Block report is forced and the check for # of corrupted blocks is performed.
    * Another block is chosen and its length is set to a lesser than original.
    * A check for another corrupted block is performed after yet another
@@ -436,20 +437,20 @@ public void blockReport_07() throws Exception {
     printStats();
 
     assertThat("Wrong number of corrupt blocks",
-               cluster.getNamesystem().getCorruptReplicaBlocks(), is(1L));
+               cluster.getNamesystem().getCorruptReplicaBlocks(), is(0L));
     assertThat("Wrong number of PendingDeletion blocks",
-               cluster.getNamesystem().getPendingDeletionBlocks(), is(0L));
+               cluster.getNamesystem().getPendingDeletionBlocks(), is(1L));
     assertThat("Wrong number of PendingReplication blocks",
                cluster.getNamesystem().getPendingReplicationBlocks(), is(0L));
 
-    reports = getBlockReports(dn, poolId, true, true);
+    reports = getBlockReports(dn, poolId, false, true);
     sendBlockReports(dnR, poolId, reports);
     printStats();
 
     assertThat("Wrong number of corrupt blocks",
-               cluster.getNamesystem().getCorruptReplicaBlocks(), is(2L));
+               cluster.getNamesystem().getCorruptReplicaBlocks(), is(1L));
     assertThat("Wrong number of PendingDeletion blocks",
-               cluster.getNamesystem().getPendingDeletionBlocks(), is(0L));
+               cluster.getNamesystem().getPendingDeletionBlocks(), is(1L));
     assertThat("Wrong number of PendingReplication blocks",
                cluster.getNamesystem().getPendingReplicationBlocks(), is(0L));
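
The flipped expectations in blockReport_07 follow directly from the new markBlockAsCorrupt() logic: the replica whose GS was lowered is case 3, so while minimum replication is still satisfied the NameNode now queues it for deletion instead of counting it as corrupt. Restating the new accounting (ns stands in for cluster.getNamesystem()):

// After the first report (stale-genstamp replica, i.e. case 3):
assertThat(ns.getCorruptReplicaBlocks(), is(0L));   // invalidated right away
assertThat(ns.getPendingDeletionBlocks(), is(1L));  // queued for deletion instead
// After the second report (replica with a shortened length, not case 3):
assertThat(ns.getCorruptReplicaBlocks(), is(1L));   // stays in the corrupt map
assertThat(ns.getPendingDeletionBlocks(), is(1L));  // case-3 replica still pending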