HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.
This commit is contained in:
parent
fb2e525c07
commit
4c9497cbf0
@@ -1208,22 +1208,46 @@ class DataStreamer extends Daemon {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//get a new datanode
|
int tried = 0;
|
||||||
final DatanodeInfo[] original = nodes;
|
final DatanodeInfo[] original = nodes;
|
||||||
final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(
|
final StorageType[] originalTypes = storageTypes;
|
||||||
src, stat.getFileId(), block, nodes, storageIDs,
|
final String[] originalIDs = storageIDs;
|
||||||
failed.toArray(new DatanodeInfo[failed.size()]),
|
IOException caughtException = null;
|
||||||
1, dfsClient.clientName);
|
ArrayList<DatanodeInfo> exclude = new ArrayList<DatanodeInfo>(failed);
|
||||||
setPipeline(lb);
|
while (tried < 3) {
|
||||||
|
LocatedBlock lb;
|
||||||
|
//get a new datanode
|
||||||
|
lb = dfsClient.namenode.getAdditionalDatanode(
|
||||||
|
src, stat.getFileId(), block, nodes, storageIDs,
|
||||||
|
exclude.toArray(new DatanodeInfo[exclude.size()]),
|
||||||
|
1, dfsClient.clientName);
|
||||||
|
// a new node was allocated by the namenode. Update nodes.
|
||||||
|
setPipeline(lb);
|
||||||
|
|
||||||
//find the new datanode
|
//find the new datanode
|
||||||
final int d = findNewDatanode(original);
|
final int d = findNewDatanode(original);
|
||||||
|
//transfer replica. pick a source from the original nodes
|
||||||
|
final DatanodeInfo src = original[tried % original.length];
|
||||||
|
final DatanodeInfo[] targets = {nodes[d]};
|
||||||
|
final StorageType[] targetStorageTypes = {storageTypes[d]};
|
||||||
|
|
||||||
//transfer replica
|
try {
|
||||||
final DatanodeInfo src = d == 0? nodes[1]: nodes[d - 1];
|
transfer(src, targets, targetStorageTypes, lb.getBlockToken());
|
||||||
final DatanodeInfo[] targets = {nodes[d]};
|
} catch (IOException ioe) {
|
||||||
final StorageType[] targetStorageTypes = {storageTypes[d]};
|
DFSClient.LOG.warn("Error transferring data from " + src + " to " +
|
||||||
transfer(src, targets, targetStorageTypes, lb.getBlockToken());
|
nodes[d] + ": " + ioe.getMessage());
|
||||||
|
caughtException = ioe;
|
||||||
|
// add the allocated node to the exclude list.
|
||||||
|
exclude.add(nodes[d]);
|
||||||
|
setPipeline(original, originalTypes, originalIDs);
|
||||||
|
tried++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return; // finished successfully
|
||||||
|
}
|
||||||
|
// All retries failed
|
||||||
|
throw (caughtException != null) ? caughtException :
|
||||||
|
new IOException("Failed to add a node");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
|
private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
|
||||||
@@ -1236,7 +1260,11 @@ class DataStreamer extends Daemon {
|
|||||||
try {
|
try {
|
||||||
sock = createSocketForPipeline(src, 2, dfsClient);
|
sock = createSocketForPipeline(src, 2, dfsClient);
|
||||||
final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);
|
final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);
|
||||||
final long readTimeout = dfsClient.getDatanodeReadTimeout(2);
|
|
||||||
|
// transfer timeout multiplier based on the transfer size
|
||||||
|
// One per 200 packets = 12.8MB. Minimum is 2.
|
||||||
|
int multi = 2 + (int)(bytesSent/dfsClient.getConf().getWritePacketSize())/200;
|
||||||
|
final long readTimeout = dfsClient.getDatanodeReadTimeout(multi);
|
||||||
|
|
||||||
OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
|
OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
|
||||||
InputStream unbufIn = NetUtils.getInputStream(sock, readTimeout);
|
InputStream unbufIn = NetUtils.getInputStream(sock, readTimeout);
|
||||||
|
@@ -1488,6 +1488,9 @@ Release 2.7.2 - UNRELEASED
|
|||||||
HDFS-9043. Doc updation for commands in HDFS Federation
|
HDFS-9043. Doc updation for commands in HDFS Federation
|
||||||
(J.Andreina via vinayakumab)
|
(J.Andreina via vinayakumab)
|
||||||
|
|
||||||
|
HDFS-9106. Transfer failure during pipeline recovery causes permanent
|
||||||
|
write failures (kihwal)
|
||||||
|
|
||||||
Release 2.7.1 - 2015-07-06
|
Release 2.7.1 - 2015-07-06
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
Loading…
x
Reference in New Issue
Block a user