HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.

Kihwal Lee 2015-09-28 13:29:19 -05:00
parent fb2e525c07
commit 4c9497cbf0
2 changed files with 45 additions and 14 deletions


@@ -1208,22 +1208,46 @@ class DataStreamer extends Daemon {
       return;
     }

-    //get a new datanode
+    int tried = 0;
     final DatanodeInfo[] original = nodes;
-    final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(
-        src, stat.getFileId(), block, nodes, storageIDs,
-        failed.toArray(new DatanodeInfo[failed.size()]),
-        1, dfsClient.clientName);
-    setPipeline(lb);
+    final StorageType[] originalTypes = storageTypes;
+    final String[] originalIDs = storageIDs;
+    IOException caughtException = null;
+    ArrayList<DatanodeInfo> exclude = new ArrayList<DatanodeInfo>(failed);
+    while (tried < 3) {
+      LocatedBlock lb;
+      //get a new datanode
+      lb = dfsClient.namenode.getAdditionalDatanode(
+          src, stat.getFileId(), block, nodes, storageIDs,
+          exclude.toArray(new DatanodeInfo[exclude.size()]),
+          1, dfsClient.clientName);
+      // a new node was allocated by the namenode. Update nodes.
+      setPipeline(lb);

-    //find the new datanode
-    final int d = findNewDatanode(original);
+      //find the new datanode
+      final int d = findNewDatanode(original);
+      //transfer replica. pick a source from the original nodes
+      final DatanodeInfo src = original[tried % original.length];
+      final DatanodeInfo[] targets = {nodes[d]};
+      final StorageType[] targetStorageTypes = {storageTypes[d]};

-    //transfer replica
-    final DatanodeInfo src = d == 0? nodes[1]: nodes[d - 1];
-    final DatanodeInfo[] targets = {nodes[d]};
-    final StorageType[] targetStorageTypes = {storageTypes[d]};
-    transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+      try {
+        transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+      } catch (IOException ioe) {
+        DFSClient.LOG.warn("Error transferring data from " + src + " to " +
+            nodes[d] + ": " + ioe.getMessage());
+        caughtException = ioe;
+        // add the allocated node to the exclude list.
+        exclude.add(nodes[d]);
+        setPipeline(original, originalTypes, originalIDs);
+        tried++;
+        continue;
+      }
+      return; // finished successfully
+    }
+    // All retries failed
+    throw (caughtException != null) ? caughtException :
+        new IOException("Failed to add a node");
   }

   private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
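The hunk above turns the one-shot node replacement into a bounded retry: request a replacement datanode, attempt to transfer the replica to it, and on failure exclude that node, restore the original pipeline, and try again (up to three times), rethrowing the last error only if every attempt fails. Below is a minimal standalone sketch of that retry-with-exclusion pattern, not HDFS code; the helpers requestReplacementNode and copyReplicaTo are hypothetical stand-ins for getAdditionalDatanode and transfer.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class PipelineRecoveryRetrySketch {

  static final int MAX_TRIES = 3;

  // Keep asking for a replacement node until one accepts the replica,
  // excluding every node that already failed, as the patch does.
  static String recoverPipeline(List<String> failed) throws IOException {
    List<String> exclude = new ArrayList<>(failed);
    IOException caught = null;
    for (int tried = 0; tried < MAX_TRIES; tried++) {
      String candidate = requestReplacementNode(exclude);   // ~ getAdditionalDatanode
      try {
        copyReplicaTo(candidate, tried);                    // ~ transfer(...)
        return candidate;                                   // finished successfully
      } catch (IOException ioe) {
        caught = ioe;
        exclude.add(candidate);                             // don't pick it again
        // the real patch also rolls the pipeline back to the original nodes here
      }
    }
    throw caught != null ? caught : new IOException("Failed to add a node");
  }

  // Hypothetical stand-ins so the sketch compiles and runs.
  static String requestReplacementNode(List<String> exclude) {
    return "dn-" + exclude.size();
  }

  static void copyReplicaTo(String node, int attempt) throws IOException {
    if (attempt == 0) {
      throw new IOException("simulated transfer failure to " + node);
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println("recovered with " + recoverPipeline(new ArrayList<>()));
  }
}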
@@ -1236,7 +1260,11 @@ class DataStreamer extends Daemon {
     try {
       sock = createSocketForPipeline(src, 2, dfsClient);
       final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);
-      final long readTimeout = dfsClient.getDatanodeReadTimeout(2);
+
+      // transfer timeout multiplier based on the transfer size
+      // One per 200 packets = 12.8MB. Minimum is 2.
+      int multi = 2 + (int)(bytesSent/dfsClient.getConf().getWritePacketSize())/200;
+      final long readTimeout = dfsClient.getDatanodeReadTimeout(multi);

       OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
       InputStream unbufIn = NetUtils.getInputStream(sock, readTimeout);
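The second hunk scales the read timeout with the amount of data already written, since a larger partial block takes longer to copy to the newly added node. A rough worked example of the same formula, assuming the stock 64 KB write packet size and a hypothetical 512 MB already sent (both values are illustrative, not taken from this commit):

public class TransferTimeoutSketch {
  public static void main(String[] args) {
    final long packetSize = 64L * 1024;          // assumed default write packet size
    final long bytesSent = 512L * 1024 * 1024;   // hypothetical bytes already written

    // Same shape as the patch: one extra multiplier unit per 200 packets
    // (200 * 64 KB = 12.8 MB), never less than 2.
    int multi = 2 + (int) (bytesSent / packetSize) / 200;

    System.out.println("packets sent       = " + bytesSent / packetSize); // 8192
    System.out.println("timeout multiplier = " + multi);                  // 2 + 40 = 42
  }
}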


@@ -1488,6 +1488,9 @@ Release 2.7.2 - UNRELEASED
     HDFS-9043. Doc updation for commands in HDFS Federation
     (J.Andreina via vinayakumab)

+    HDFS-9106. Transfer failure during pipeline recovery causes permanent
+    write failures (kihwal)
+
 Release 2.7.1 - 2015-07-06

   INCOMPATIBLE CHANGES