From cdec12d1b84d444e13bf997c817643ec24aaa832 Mon Sep 17 00:00:00 2001
From: Tsz-Wo Nicholas Sze
Date: Wed, 13 May 2015 11:57:49 -0700
Subject: [PATCH] HDFS-8143. Mover should exit after some retry when failed to
 move blocks. Contributed by surendra singh lilhore

---
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java |  2 +
 .../hadoop/hdfs/server/mover/Mover.java       | 30 +++++++++++---
 .../hadoop/hdfs/server/mover/TestMover.java   | 39 ++++++++++++++++++-
 3 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index f8e9f3a7c7..ae056fab20 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -353,6 +353,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long    DFS_MOVER_MOVEDWINWIDTH_DEFAULT = 5400*1000L;
   public static final String  DFS_MOVER_MOVERTHREADS_KEY = "dfs.mover.moverThreads";
   public static final int     DFS_MOVER_MOVERTHREADS_DEFAULT = 1000;
+  public static final String  DFS_MOVER_RETRY_MAX_ATTEMPTS_KEY = "dfs.mover.retry.max.attempts";
+  public static final int     DFS_MOVER_RETRY_MAX_ATTEMPTS_DEFAULT = 10;
 
   public static final String  DFS_DATANODE_ADDRESS_KEY = "dfs.datanode.address";
   public static final int     DFS_DATANODE_DEFAULT_PORT = 50010;
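
Note on the configuration change above: dfs.mover.retry.max.attempts bounds how
many consecutive failed Mover iterations are tolerated before the tool gives up,
and it defaults to 10. As a minimal sketch (not part of the patch; the class name
is hypothetical), the limit could be tightened programmatically before starting a
Mover run; the same property can also be set in hdfs-site.xml:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class MoverRetryConfigExample {
      public static void main(String[] args) {
        // Load the standard HDFS configuration (core-site.xml, hdfs-site.xml).
        Configuration conf = new HdfsConfiguration();
        // Sketch only: tolerate at most 3 consecutive failed iterations
        // instead of the default 10.
        conf.setInt(DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_KEY, 3);
        System.out.println("mover retry limit = "
            + conf.getInt(DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_KEY,
                DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_DEFAULT));
      }
    }
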
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
index b32220c31e..0710f3e0cc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
@@ -58,6 +58,7 @@
 import java.net.URI;
 import java.text.DateFormat;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicInteger;
 
 @InterfaceAudience.Private
 public class Mover {
@@ -107,10 +108,12 @@ private List<StorageGroup> getTargetStorages(StorageType t) {
   private final Dispatcher dispatcher;
   private final StorageMap storages;
   private final List<Path> targetPaths;
+  private final int retryMaxAttempts;
+  private final AtomicInteger retryCount;
 
   private final BlockStoragePolicy[] blockStoragePolicies;
 
-  Mover(NameNodeConnector nnc, Configuration conf) {
+  Mover(NameNodeConnector nnc, Configuration conf, AtomicInteger retryCount) {
     final long movedWinWidth = conf.getLong(
         DFSConfigKeys.DFS_MOVER_MOVEDWINWIDTH_KEY,
         DFSConfigKeys.DFS_MOVER_MOVEDWINWIDTH_DEFAULT);
@@ -120,7 +123,10 @@ private List<StorageGroup> getTargetStorages(StorageType t) {
     final int maxConcurrentMovesPerNode = conf.getInt(
         DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,
         DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);
-
+    this.retryMaxAttempts = conf.getInt(
+        DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_KEY,
+        DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_DEFAULT);
+    this.retryCount = retryCount;
     this.dispatcher = new Dispatcher(nnc, Collections.<String> emptySet(),
         Collections.<String> emptySet(), movedWinWidth, moverThreads, 0,
         maxConcurrentMovesPerNode, conf);
@@ -255,14 +261,27 @@ private boolean isSnapshotPathInCurrent(String path) throws IOException {
   /**
    * @return whether there is still remaining migration work for the next
    *         round
    */
-  private boolean processNamespace() {
+  private boolean processNamespace() throws IOException {
     getSnapshottableDirs();
     boolean hasRemaining = false;
     for (Path target : targetPaths) {
       hasRemaining |= processPath(target.toUri().getPath());
     }
     // wait for pending move to finish and retry the failed migration
-    hasRemaining |= Dispatcher.waitForMoveCompletion(storages.targets.values());
+    boolean hasFailed = Dispatcher.waitForMoveCompletion(storages.targets
+        .values());
+    if (hasFailed) {
+      if (retryCount.get() == retryMaxAttempts) {
+        throw new IOException("Failed to move some blocks after "
+            + retryMaxAttempts + " retries.");
+      } else {
+        retryCount.incrementAndGet();
+      }
+    } else {
+      // Reset the retry count if this iteration had no failures.
+      retryCount.set(0);
+    }
+    hasRemaining |= hasFailed;
     return hasRemaining;
   }
@@ -528,6 +547,7 @@ static int run(Map<URI, List<Path>> namenodes, Configuration conf)
         DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 2000 +
         conf.getLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
         DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000;
+    AtomicInteger retryCount = new AtomicInteger(0);
     LOG.info("namenodes = " + namenodes);
 
     List<NameNodeConnector> connectors = Collections.emptyList();
@@ -541,7 +561,7 @@ static int run(Map<URI, List<Path>> namenodes, Configuration conf)
       Iterator<NameNodeConnector> iter = connectors.iterator();
       while (iter.hasNext()) {
         NameNodeConnector nnc = iter.next();
-        final Mover m = new Mover(nnc, conf);
+        final Mover m = new Mover(nnc, conf, retryCount);
         final ExitStatus r = m.run();
 
         if (r == ExitStatus.SUCCESS) {
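
The Mover.java changes above amount to a consecutive-failure counter shared
across scheduling iterations: a failed waitForMoveCompletion() increments it, a
clean iteration resets it to zero, and reaching the configured ceiling raises an
IOException, which run() reports as ExitStatus.IO_EXCEPTION. The following
self-contained sketch restates just that accounting; attemptMoves() is a
hypothetical stand-in for one dispatch round that returns true when some
scheduled moves failed:

    import java.io.IOException;
    import java.util.concurrent.atomic.AtomicInteger;

    public class RetryCapSketch {
      private final int retryMaxAttempts = 10;  // mirrors the new default
      private final AtomicInteger retryCount = new AtomicInteger(0);

      /** One iteration; returns true while work (or a retry) remains. */
      boolean processOnce() throws IOException {
        boolean hasFailed = attemptMoves();
        if (hasFailed) {
          if (retryCount.get() == retryMaxAttempts) {
            // The same moves have failed retryMaxAttempts times in a row.
            throw new IOException("Failed to move some blocks after "
                + retryMaxAttempts + " retries.");
          }
          retryCount.incrementAndGet();
        } else {
          // Any fully successful iteration resets the failure count.
          retryCount.set(0);
        }
        return hasFailed;
      }

      /** Hypothetical stand-in for one dispatch round. */
      private boolean attemptMoves() {
        return false;
      }
    }
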
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java
index b2f9fce8c3..f4bedabf82 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java
@@ -20,12 +20,14 @@
 import java.io.IOException;
 import java.net.URI;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import com.google.common.collect.Maps;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.StorageType;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.DFSUtil;
@@ -34,6 +36,7 @@
 import org.apache.hadoop.hdfs.MiniDFSNNTopology;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DBlock;
+import org.apache.hadoop.hdfs.server.balancer.ExitStatus;
 import org.apache.hadoop.hdfs.server.balancer.NameNodeConnector;
 import org.apache.hadoop.hdfs.server.mover.Mover.MLocation;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
@@ -54,7 +57,7 @@ static Mover newMover(Configuration conf) throws IOException {
     final List<NameNodeConnector> nncs = NameNodeConnector.newNameNodeConnectors(
         nnMap, Mover.class.getSimpleName(), Mover.MOVER_ID_PATH, conf,
         NameNodeConnector.DEFAULT_MAX_IDLE_ITERATIONS);
-    return new Mover(nncs.get(0), conf);
+    return new Mover(nncs.get(0), conf, new AtomicInteger(0));
   }
 
   @Test
@@ -324,4 +327,38 @@ public void testTwoReplicaSameStorageTypeShouldNotSelect() throws Exception {
       cluster.shutdown();
     }
   }
+
+  @Test
+  public void testMoverFailedRetry() throws Exception {
+    // HDFS-8147
+    final Configuration conf = new HdfsConfiguration();
+    conf.set(DFSConfigKeys.DFS_MOVER_RETRY_MAX_ATTEMPTS_KEY, "2");
+    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(3)
+        .storageTypes(
+            new StorageType[][] {{StorageType.DISK, StorageType.ARCHIVE},
+                {StorageType.DISK, StorageType.ARCHIVE},
+                {StorageType.DISK, StorageType.ARCHIVE}}).build();
+    try {
+      cluster.waitActive();
+      final DistributedFileSystem dfs = cluster.getFileSystem();
+      final String file = "/testMoverFailedRetry";
+      // Write the file to DISK storage.
+      final FSDataOutputStream out = dfs.create(new Path(file), (short) 2);
+      out.writeChars("testMoverFailedRetry");
+      out.close();
+
+      // Delete the block files so that block moves fail with FileNotFoundException.
+      LocatedBlock lb = dfs.getClient().getLocatedBlocks(file, 0).get(0);
+      cluster.corruptBlockOnDataNodesByDeletingBlockFile(lb.getBlock());
+      // Try to move the blocks to ARCHIVE storage.
+      dfs.setStoragePolicy(new Path(file), "COLD");
+      int rc = ToolRunner.run(conf, new Mover.Cli(),
+          new String[] {"-p", file});
+      Assert.assertEquals("Movement should fail after the retries are exhausted",
+          ExitStatus.IO_EXCEPTION.getExitCode(), rc);
+    } finally {
+      cluster.shutdown();
+    }
+  }
 }
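
The test above ties the pieces together end to end: deleting the block's files on
the datanodes guarantees that every scheduled move fails, the limit of 2 keeps the
test short, and the Mover is expected to give up with ExitStatus.IO_EXCEPTION
instead of retrying forever. Since the Mover CLI runs through ToolRunner, the
limit should also be adjustable per invocation via the generic -D option; an
assumed example (where /cold-data is a placeholder path):

    hdfs mover -Ddfs.mover.retry.max.attempts=2 -p /cold-data
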