HDFS-12349. Improve log message when it could not alloc enough blocks for EC. (Lei (Eddy) Xu)

Lei Xu 2017-09-15 12:12:42 -07:00
parent 3a8d57a0a2
commit fbe06b5880
8 changed files with 63 additions and 46 deletions

View File

@@ -260,6 +260,7 @@ private void flipDataBuffers() {
   private final Coordinator coordinator;
   private final CellBuffers cellBuffers;
+  private final ErasureCodingPolicy ecPolicy;
   private final RawErasureEncoder encoder;
   private final List<StripedDataStreamer> streamers;
   private final DFSPacket[] currentPackets; // current Packet of each streamer
@@ -286,7 +287,7 @@ private void flipDataBuffers() {
       LOG.debug("Creating DFSStripedOutputStream for " + src);
     }
-    final ErasureCodingPolicy ecPolicy = stat.getErasureCodingPolicy();
+    ecPolicy = stat.getErasureCodingPolicy();
     final int numParityBlocks = ecPolicy.getNumParityUnits();
     cellSize = ecPolicy.getCellSize();
     numDataBlocks = ecPolicy.getNumDataUnits();
@@ -478,11 +479,6 @@ private void allocateNewBlock() throws IOException {
     final LocatedBlock lb = addBlock(excludedNodes, dfsClient, src,
         currentBlockGroup, fileId, favoredNodes, getAddBlockFlags());
     assert lb.isStriped();
-    if (lb.getLocations().length < numDataBlocks) {
-      throw new IOException("Failed to get " + numDataBlocks
-          + " nodes from namenode: blockGroupSize= " + numAllBlocks
-          + ", blocks.length= " + lb.getLocations().length);
-    }
     // assign the new block to the current block group
     currentBlockGroup = lb.getBlock();
     blockGroupIndex++;
@@ -494,11 +490,16 @@ private void allocateNewBlock() throws IOException {
       StripedDataStreamer si = getStripedDataStreamer(i);
       assert si.isHealthy();
       if (blocks[i] == null) {
+        // allocBlock() should guarantee that all data blocks are successfully
+        // allocated.
+        assert i >= numDataBlocks;
         // Set exception and close streamer as there is no block locations
         // found for the parity block.
-        LOG.warn("Failed to get block location for parity block, index=" + i);
+        LOG.warn("Cannot allocate parity block(index={}, policy={}). " +
+            "Not enough datanodes? Exclude nodes={}", i, ecPolicy.getName(),
+            excludedNodes);
         si.getLastException().set(
-            new IOException("Failed to get following block, i=" + i));
+            new IOException("Failed to get parity block, index=" + i));
         si.getErrorState().setInternalError();
         si.close(true);
       } else {
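
For reference, a minimal standalone sketch (not part of this change) of how the new parameterized warning renders. The index, policy name, and excluded-node list below are hypothetical values chosen for illustration; only the message format comes from the hunk above.

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParityWarnDemo {
  private static final Logger LOG = LoggerFactory.getLogger(ParityWarnDemo.class);

  public static void main(String[] args) {
    int i = 7;                                  // parity block index (hypothetical)
    String policyName = "RS-6-3-1024k";         // ecPolicy.getName() (hypothetical)
    String excludedNodes = "[127.0.0.1:9866]";  // excluded datanodes (hypothetical)
    // Same message format as the LOG.warn added above.
    LOG.warn("Cannot allocate parity block(index={}, policy={}). " +
        "Not enough datanodes? Exclude nodes={}", i, policyName, excludedNodes);
    // Logs: Cannot allocate parity block(index=7, policy=RS-6-3-1024k).
    //       Not enough datanodes? Exclude nodes=[127.0.0.1:9866]
  }
}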

View File

@@ -2057,6 +2057,7 @@ public DatanodeStorageInfo[] chooseTarget4NewBlock(final String src,
       final List<String> favoredNodes,
       final byte storagePolicyID,
       final BlockType blockType,
+      final ErasureCodingPolicy ecPolicy,
       final EnumSet<AddBlockFlag> flags) throws IOException {
     List<DatanodeDescriptor> favoredDatanodeDescriptors =
         getDatanodeDescriptors(favoredNodes);
@@ -2067,14 +2068,23 @@ public DatanodeStorageInfo[] chooseTarget4NewBlock(final String src,
     final DatanodeStorageInfo[] targets = blockplacement.chooseTarget(src,
         numOfReplicas, client, excludedNodes, blocksize,
         favoredDatanodeDescriptors, storagePolicy, flags);
-    if (targets.length < minReplication) {
-      throw new IOException("File " + src + " could only be replicated to "
-          + targets.length + " nodes instead of minReplication (="
-          + minReplication + "). There are "
-          + getDatanodeManager().getNetworkTopology().getNumOfLeaves()
-          + " datanode(s) running and "
-          + (excludedNodes == null? "no": excludedNodes.size())
-          + " node(s) are excluded in this operation.");
+
+    final String errorMessage = "File %s could only be written to %d of " +
+        "the %d %s. There are %d datanode(s) running and %s " +
+        "node(s) are excluded in this operation.";
+    if (blockType == BlockType.CONTIGUOUS && targets.length < minReplication) {
+      throw new IOException(String.format(errorMessage, src,
+          targets.length, minReplication, "minReplication nodes",
+          getDatanodeManager().getNetworkTopology().getNumOfLeaves(),
+          (excludedNodes == null? "no": excludedNodes.size())));
+    } else if (blockType == BlockType.STRIPED &&
+        targets.length < ecPolicy.getNumDataUnits()) {
+      throw new IOException(
+          String.format(errorMessage, src, targets.length,
+              ecPolicy.getNumDataUnits(),
+              String.format("required nodes for %s", ecPolicy.getName()),
+              getDatanodeManager().getNetworkTopology().getNumOfLeaves(),
+              (excludedNodes == null ? "no" : excludedNodes.size())));
     }
     return targets;
   }
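
To illustrate the wording change, a small standalone sketch (not part of this change) that renders the new errorMessage format string for the striped case. The file path, target count, datanode count, and policy name are hypothetical; only the format string comes from the hunk above.

public class ErrorMessageDemo {
  public static void main(String[] args) {
    // Same format string introduced in the namenode change above.
    final String errorMessage = "File %s could only be written to %d of " +
        "the %d %s. There are %d datanode(s) running and %s " +
        "node(s) are excluded in this operation.";
    // Hypothetical values: only 5 targets found for a 6-data-unit policy.
    String striped = String.format(errorMessage, "/user/foo/ecfile", 5, 6,
        String.format("required nodes for %s", "RS-6-3-1024k"), 5, "no");
    System.out.println(striped);
    // File /user/foo/ecfile could only be written to 5 of the 6 required
    // nodes for RS-6-3-1024k. There are 5 datanode(s) running and no
    // node(s) are excluded in this operation.
  }
}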

View File

@@ -201,7 +201,7 @@ static ValidateAddBlockResult validateAddBlock(
     }
     storagePolicyID = pendingFile.getStoragePolicyID();
     return new ValidateAddBlockResult(blockSize, numTargets, storagePolicyID,
-                                      clientMachine, blockType);
+                                      clientMachine, blockType, ecPolicy);
   }

   static LocatedBlock makeLocatedBlock(FSNamesystem fsn, BlockInfo blk,
@@ -286,7 +286,7 @@ static DatanodeStorageInfo[] chooseTargetForNewBlock(
     return bm.chooseTarget4NewBlock(src, r.numTargets, clientNode,
                                     excludedNodesSet, r.blockSize,
                                     favoredNodesList, r.storagePolicyID,
-                                    r.blockType, flags);
+                                    r.blockType, r.ecPolicy, flags);
   }

   /**
@@ -831,20 +831,28 @@ private static class FileState {
   }

   static class ValidateAddBlockResult {
-    final long blockSize;
-    final int numTargets;
-    final byte storagePolicyID;
-    final String clientMachine;
-    final BlockType blockType;
+    private final long blockSize;
+    private final int numTargets;
+    private final byte storagePolicyID;
+    private final String clientMachine;
+    private final BlockType blockType;
+    private final ErasureCodingPolicy ecPolicy;

     ValidateAddBlockResult(
         long blockSize, int numTargets, byte storagePolicyID,
-        String clientMachine, BlockType blockType) {
+        String clientMachine, BlockType blockType,
+        ErasureCodingPolicy ecPolicy) {
       this.blockSize = blockSize;
       this.numTargets = numTargets;
       this.storagePolicyID = storagePolicyID;
       this.clientMachine = clientMachine;
       this.blockType = blockType;
+      this.ecPolicy = ecPolicy;
+
+      if (blockType == BlockType.STRIPED) {
+        Preconditions.checkArgument(ecPolicy != null,
+            "ecPolicy is not specified for striped block");
+      }
     }
   }
 }

View File

@@ -42,6 +42,7 @@
 import org.apache.hadoop.io.erasurecode.rawcoder.NativeRSRawErasureCoderFactory;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.LambdaTestUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.log4j.Level;
 import org.junit.Assert;
@@ -284,7 +285,7 @@ public void testBlockTokenExpired() throws Exception {
   @Test(timeout = 90000)
   public void testAddBlockWhenNoSufficientDataBlockNumOfNodes()
-      throws IOException {
+      throws Exception {
     HdfsConfiguration conf = new HdfsConfiguration();
     conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize);
     try {
@@ -303,20 +304,18 @@ public void testAddBlockWhenNoSufficientDataBlockNumOfNodes()
           DatanodeReportType.LIVE);
       assertEquals("Mismatches number of live Dns ", numDatanodes, info.length);
       final Path dirFile = new Path(dir, "ecfile");
-      FSDataOutputStream out;
-      try {
-        out = dfs.create(dirFile, true);
-        out.write("something".getBytes());
-        out.flush();
-        out.close();
-        Assert.fail("Failed to validate available dns against blkGroupSize");
-      } catch (IOException ioe) {
-        // expected
-        GenericTestUtils.assertExceptionContains("Failed to get " +
-            dataBlocks + " nodes from namenode: blockGroupSize= " +
-            (dataBlocks + parityBlocks) + ", blocks.length= " +
-            numDatanodes, ioe);
-      }
+      LambdaTestUtils.intercept(
+          IOException.class,
+          "File " + dirFile + " could only be written to " +
+              numDatanodes + " of the " + dataBlocks + " required nodes for " +
+              getEcPolicy().getName(),
+          () -> {
+            try (FSDataOutputStream out = dfs.create(dirFile, true)) {
+              out.write("something".getBytes());
+              out.flush();
+            }
+            return 0;
+          });
     } finally {
       tearDown();
     }

View File

@@ -118,7 +118,7 @@ public void testClientAndServerDoNotHaveCommonQop() throws Exception {
     HdfsConfiguration clientConf = new HdfsConfiguration(clusterConf);
     clientConf.set(DFS_DATA_TRANSFER_PROTECTION_KEY, "authentication");
     exception.expect(IOException.class);
-    exception.expectMessage("could only be replicated to 0 nodes");
+    exception.expectMessage("could only be written to 0");
     doTest(clientConf);
   }
@@ -140,7 +140,7 @@ public void testServerSaslNoClientSasl() throws Exception {
           "configured or not supported in client");
     } catch (IOException e) {
       GenericTestUtils.assertMatches(e.getMessage(),
-          "could only be replicated to 0 nodes");
+          "could only be written to 0");
     } finally {
       logs.stopCapturing();
     }

View File

@ -1030,8 +1030,7 @@ public void testStorageWithRemainingCapacity() throws Exception {
0x1BAD5EED); 0x1BAD5EED);
} }
catch (RemoteException re) { catch (RemoteException re) {
GenericTestUtils.assertExceptionContains("nodes instead of " GenericTestUtils.assertExceptionContains("of the 1 minReplication", re);
+ "minReplication", re);
} }
} }
finally { finally {

View File

@@ -182,7 +182,7 @@ public void testStorageTypeStatsWhenStorageFailed() throws Exception {
       fail("Should throw exception, becuase no DISK storage available");
     } catch (Exception e) {
       assertTrue(e.getMessage().contains(
-          "could only be replicated to 0 nodes instead"));
+          "could only be written to 0 of the 1 minReplication"));
     }
     // wait for heartbeat
     Thread.sleep(6000);

View File

@@ -175,8 +175,8 @@ public void testDeadNodeAsBlockTarget() throws Exception {
     // choose the targets, but local node should not get selected as this is not
     // part of the cluster anymore
     DatanodeStorageInfo[] results = bm.chooseTarget4NewBlock("/hello", 3,
-        clientNode, new HashSet<Node>(), 256 * 1024 * 1024L, null, (byte) 7,
-        BlockType.CONTIGUOUS, null);
+        clientNode, new HashSet<>(), 256 * 1024 * 1024L, null, (byte) 7,
+        BlockType.CONTIGUOUS, null, null);
     for (DatanodeStorageInfo datanodeStorageInfo : results) {
       assertFalse("Dead node should not be choosen", datanodeStorageInfo
           .getDatanodeDescriptor().equals(clientNode));