HDFS-5583. Make DN send an OOB Ack on shutdown before restarting. Contributed by Kihwal Lee.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-5535@1571491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kihwal Lee 2014-02-24 23:38:04 +00:00
parent c066cef587
commit 1c6b5d2b58
9 changed files with 416 additions and 70 deletions

View File

@ -82,3 +82,4 @@ HDFS-5535 subtasks:
HDFS-6004. Change DFSAdmin for rolling upgrade commands. (szetszwo via HDFS-6004. Change DFSAdmin for rolling upgrade commands. (szetszwo via
Arpit Agarwal) Arpit Agarwal)
HDFS-5583. Make DN send an OOB Ack on shutdown before restarting. (kihwal)

View File

@ -225,6 +225,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final boolean DFS_DATANODE_SYNCONCLOSE_DEFAULT = false; public static final boolean DFS_DATANODE_SYNCONCLOSE_DEFAULT = false;
public static final String DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_KEY = "dfs.datanode.socket.reuse.keepalive"; public static final String DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_KEY = "dfs.datanode.socket.reuse.keepalive";
public static final int DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT = 4000; public static final int DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT = 4000;
public static final String DFS_DATANODE_OOB_TIMEOUT_KEY = "dfs.datanode.oob.timeout-ms";
public static final String DFS_DATANODE_OOB_TIMEOUT_DEFAULT = "1500,0,0,0"; // OOB_TYPE1, OOB_TYPE2, OOB_TYPE3, OOB_TYPE4
public static final String DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY = "dfs.namenode.datanode.registration.ip-hostname-check"; public static final String DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY = "dfs.namenode.datanode.registration.ip-hostname-check";
public static final boolean DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT = true; public static final boolean DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT = true;

View File

@ -26,10 +26,12 @@ import java.util.Arrays;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_OOB_TIMEOUT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_OOB_TIMEOUT_DEFAULT;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos; import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.PipelineAckProto; import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.PipelineAckProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status; import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import com.google.protobuf.TextFormat; import com.google.protobuf.TextFormat;
/** Pipeline Acknowledgment **/ /** Pipeline Acknowledgment **/
@ -38,6 +40,21 @@ import com.google.protobuf.TextFormat;
public class PipelineAck { public class PipelineAck {
PipelineAckProto proto; PipelineAckProto proto;
public final static long UNKOWN_SEQNO = -2; public final static long UNKOWN_SEQNO = -2;
/** Numeric value of the first out-of-band (OOB) status type. */
final static int OOB_START = Status.OOB_RESTART_VALUE;
/** Numeric value of the last out-of-band (OOB) status type. */
final static int OOB_END = Status.OOB_RESERVED3_VALUE;
/** Number of OOB status types in the range [OOB_START, OOB_END]. */
final static int NUM_OOB_TYPES = OOB_END - OOB_START + 1;
/**
 * Timeout in milliseconds for each OOB type, indexed by
 * (status number - OOB_START). Filled in once when the class is loaded.
 */
final static long[] OOB_TIMEOUT;

static {
  OOB_TIMEOUT = new long[NUM_OOB_TYPES];
  HdfsConfiguration conf = new HdfsConfiguration();
  // The setting is a comma separated list with one entry per OOB type,
  // in the order OOB_TYPE1, OOB_TYPE2, OOB_TYPE3, OOB_TYPE4.
  String[] ele = conf.get(DFS_DATANODE_OOB_TIMEOUT_KEY,
      DFS_DATANODE_OOB_TIMEOUT_DEFAULT).split(",");
  for (int i = 0; i < NUM_OOB_TYPES; i++) {
    // Trim every entry so that a value written as "1500, 0, 0, 0" does not
    // abort class initialization with a NumberFormatException, and parse
    // straight to a primitive instead of boxing through Long.valueOf().
    // Types without a configured entry get a timeout of 0.
    OOB_TIMEOUT[i] = (i < ele.length) ? Long.parseLong(ele[i].trim()) : 0;
  }
}
/** default constructor **/ /** default constructor **/
public PipelineAck() { public PipelineAck() {
@ -103,14 +120,57 @@ public class PipelineAck {
* @return true if all statuses are SUCCESS * @return true if all statuses are SUCCESS
*/ */
public boolean isSuccess() { public boolean isSuccess() {
for (DataTransferProtos.Status reply : proto.getStatusList()) { for (Status reply : proto.getStatusList()) {
if (reply != DataTransferProtos.Status.SUCCESS) { if (reply != Status.SUCCESS) {
return false; return false;
} }
} }
return true; return true;
} }
/**
 * Looks for an out-of-band (OOB) status in this ack.
 *
 * @return the first status whose number lies in [OOB_START, OOB_END], or
 *         null when this ack carries a regular sequence number or holds
 *         no OOB status.
 */
public Status getOOBStatus() {
  // Only acks sent with UNKOWN_SEQNO are examined; normal data transfer
  // acks carry a valid sequence number and are rejected right away.
  final boolean hasRegularSeqno = getSeqno() != UNKOWN_SEQNO;
  if (hasRegularSeqno) {
    return null;
  }
  for (Status candidate : proto.getStatusList()) {
    // The OOB types occupy one range of enum numbers, so a range check
    // on the number identifies them.
    final int number = candidate.getNumber();
    if (number >= OOB_START && number <= OOB_END) {
      return candidate;
    }
  }
  return null;
}
/**
 * Looks up the timeout to be used for transmitting the given OOB type.
 *
 * @param status an OOB status
 * @return the timeout in milliseconds
 * @throws IOException if {@code status} is not one of the OOB types
 */
public static long getOOBTimeout(Status status) throws IOException {
  final int slot = status.getNumber() - OOB_START;
  if (slot < 0 || slot >= NUM_OOB_TYPES) {
    // Not an OOB.
    throw new IOException("Not an OOB status: " + status);
  }
  return OOB_TIMEOUT[slot];
}
/**
 * Get the Restart OOB ack status.
 *
 * @return {@code Status.OOB_RESTART}
 */
public static Status getRestartOOBStatus() {
  return Status.OOB_RESTART;
}
/**
 * Tests whether the given status is the restart OOB status code.
 *
 * @param st the status to test; may be null
 * @return true only if {@code st} is {@code Status.OOB_RESTART}
 */
public static boolean isRestartOOBStatus(Status st) {
  // Enum constants are singletons, so an identity comparison is exact and,
  // unlike st.equals(...), does not throw when st is null.
  return st == Status.OOB_RESTART;
}
/**** Writable interface ****/ /**** Writable interface ****/
public void readFields(InputStream in) throws IOException { public void readFields(InputStream in) throws IOException {
proto = PipelineAckProto.parseFrom(vintPrefixed(in)); proto = PipelineAckProto.parseFrom(vintPrefixed(in));

View File

@ -723,14 +723,40 @@ class BlockReceiver implements Closeable {
} }
} catch (IOException ioe) { } catch (IOException ioe) {
LOG.info("Exception for " + block, ioe); if (datanode.isRestarting()) {
throw ioe; // Do not throw if shutting down for restart. Otherwise, it will cause
// premature termination of responder.
LOG.info("Shutting down for restart (" + block + ").");
} else {
LOG.info("Exception for " + block, ioe);
throw ioe;
}
} finally { } finally {
if (!responderClosed) { // Abnormal termination of the flow above // Clear the previous interrupt state of this thread.
IOUtils.closeStream(this); Thread.interrupted();
// If a shutdown for restart was initiated, upstream needs to be notified.
// There is no need to do anything special if the responder was closed
// normally.
if (!responderClosed) { // Data transfer was not complete.
if (responder != null) { if (responder != null) {
// In case this datanode is shutting down for quick restart,
// send a special ack upstream.
if (datanode.isRestarting()) {
try {
((PacketResponder) responder.getRunnable()).
sendOOBResponse(PipelineAck.getRestartOOBStatus());
} catch (InterruptedException ie) {
// It is already going down. Ignore this.
} catch (IOException ioe) {
LOG.info("Error sending OOB Ack.", ioe);
// The OOB ack could not be sent. Since the datanode is going
// down, this is ignored.
}
}
responder.interrupt(); responder.interrupt();
} }
IOUtils.closeStream(this);
cleanupBlock(); cleanupBlock();
} }
if (responder != null) { if (responder != null) {
@ -744,7 +770,10 @@ class BlockReceiver implements Closeable {
} }
} catch (InterruptedException e) { } catch (InterruptedException e) {
responder.interrupt(); responder.interrupt();
throw new IOException("Interrupted receiveBlock"); // do not throw if shutting down for restart.
if (!datanode.isRestarting()) {
throw new IOException("Interrupted receiveBlock");
}
} }
responder = null; responder = null;
} }
@ -862,6 +891,7 @@ class BlockReceiver implements Closeable {
private final PacketResponderType type; private final PacketResponderType type;
/** for log and error messages */ /** for log and error messages */
private final String myString; private final String myString;
private boolean sending = false;
@Override @Override
public String toString() { public String toString() {
@ -887,7 +917,9 @@ class BlockReceiver implements Closeable {
} }
private boolean isRunning() { private boolean isRunning() {
return running && datanode.shouldRun; // When preparing for a restart, it should continue to run until
// interrupted by the receiver thread.
return running && (datanode.shouldRun || datanode.isRestarting());
} }
/** /**
@ -903,44 +935,96 @@ class BlockReceiver implements Closeable {
if(LOG.isDebugEnabled()) { if(LOG.isDebugEnabled()) {
LOG.debug(myString + ": enqueue " + p); LOG.debug(myString + ": enqueue " + p);
} }
synchronized(this) { synchronized(ackQueue) {
if (running) { if (running) {
ackQueue.addLast(p); ackQueue.addLast(p);
notifyAll(); ackQueue.notifyAll();
}
}
}
/**
* Send an OOB response. If all acks have been sent already for the block
* and the responder is about to close, the delivery is not guaranteed.
* This is because the other end can close the connection independently.
* An OOB coming from downstream will be automatically relayed upstream
* by the responder. This method is used only by originating datanode.
*
* Returns without sending anything when the responder is not running.
*
* @param ackStatus the type of ack to be sent
* @throws IOException if another sender is still active after waiting for
* the timeout returned by PipelineAck.getOOBTimeout(ackStatus), or if that
* lookup or the send itself fails
* @throws InterruptedException if interrupted while waiting for a turn
*/
void sendOOBResponse(final Status ackStatus) throws IOException,
InterruptedException {
if (!running) {
LOG.info("Cannot send OOB response " + ackStatus +
". Responder not running.");
return;
}
// The "sending" flag, guarded by this object's monitor, marks that a
// thread is currently writing an ack upstream.
synchronized(this) {
if (sending) {
// NOTE(review): Object.wait(0) blocks until notified, so a configured
// timeout of 0 means "no time limit" here - confirm this is intended.
wait(PipelineAck.getOOBTimeout(ackStatus));
// Didn't get my turn in time. Give up.
if (sending) {
throw new IOException("Could not send OOB reponse in time: "
+ ackStatus);
}
}
sending = true;
}
LOG.info("Sending an out of band ack of type " + ackStatus);
try {
// A null ack is passed because this OOB originates at this node and is
// not built from an ack received from downstream.
sendAckUpstreamUnprotected(null, PipelineAck.UNKOWN_SEQNO, 0L, 0L,
ackStatus);
} finally {
// Let others send ack. Unless there are multiple OOB send
// calls, there can be only one waiter, the responder thread.
// In any case, only one needs to be notified.
synchronized(this) {
sending = false;
notify();
} }
} }
} }
/** Wait for a packet with given {@code seqno} to be enqueued to ackQueue */ /** Wait for a packet with given {@code seqno} to be enqueued to ackQueue */
synchronized Packet waitForAckHead(long seqno) throws InterruptedException { Packet waitForAckHead(long seqno) throws InterruptedException {
while (isRunning() && ackQueue.size() == 0) { synchronized(ackQueue) {
if (LOG.isDebugEnabled()) { while (isRunning() && ackQueue.size() == 0) {
LOG.debug(myString + ": seqno=" + seqno + if (LOG.isDebugEnabled()) {
" waiting for local datanode to finish write."); LOG.debug(myString + ": seqno=" + seqno +
" waiting for local datanode to finish write.");
}
ackQueue.wait();
} }
wait(); return isRunning() ? ackQueue.getFirst() : null;
} }
return isRunning() ? ackQueue.getFirst() : null;
} }
/** /**
* wait for all pending packets to be acked. Then shutdown thread. * wait for all pending packets to be acked. Then shutdown thread.
*/ */
@Override @Override
public synchronized void close() { public void close() {
while (isRunning() && ackQueue.size() != 0) { synchronized(ackQueue) {
try { while (isRunning() && ackQueue.size() != 0) {
wait(); try {
} catch (InterruptedException e) { ackQueue.wait();
running = false; } catch (InterruptedException e) {
Thread.currentThread().interrupt(); running = false;
Thread.currentThread().interrupt();
}
} }
if(LOG.isDebugEnabled()) {
LOG.debug(myString + ": closing");
}
running = false;
ackQueue.notifyAll();
} }
if(LOG.isDebugEnabled()) {
LOG.debug(myString + ": closing"); synchronized(this) {
notifyAll();
} }
running = false;
notifyAll();
} }
/** /**
@ -968,6 +1052,14 @@ class BlockReceiver implements Closeable {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(myString + " got " + ack); LOG.debug(myString + " got " + ack);
} }
// Process an OOB ACK.
Status oobStatus = ack.getOOBStatus();
if (oobStatus != null) {
LOG.info("Relaying an out of band ack of type " + oobStatus);
sendAckUpstream(ack, PipelineAck.UNKOWN_SEQNO, 0L, 0L,
Status.SUCCESS);
continue;
}
seqno = ack.getSeqno(); seqno = ack.getSeqno();
} }
if (seqno != PipelineAck.UNKOWN_SEQNO if (seqno != PipelineAck.UNKOWN_SEQNO
@ -1025,6 +1117,9 @@ class BlockReceiver implements Closeable {
* status back to the client because this datanode has a problem. * status back to the client because this datanode has a problem.
* The upstream datanode will detect that this datanode is bad, and * The upstream datanode will detect that this datanode is bad, and
* rightly so. * rightly so.
*
* The receiver thread can also interrupt this thread for sending
* an out-of-band response upstream.
*/ */
LOG.info(myString + ": Thread is interrupted."); LOG.info(myString + ": Thread is interrupted.");
running = false; running = false;
@ -1094,17 +1189,64 @@ class BlockReceiver implements Closeable {
} }
/** /**
* The wrapper for the unprotected version. This is only called by
* the responder's run() method.
*
* @param ack Ack received from downstream * @param ack Ack received from downstream
* @param seqno sequence number of ack to be sent upstream * @param seqno sequence number of ack to be sent upstream
* @param totalAckTimeNanos total ack time including all the downstream * @param totalAckTimeNanos total ack time including all the downstream
* nodes * nodes
* @param offsetInBlock offset in block for the data in packet * @param offsetInBlock offset in block for the data in packet
* @param myStatus the local ack status
*/ */
private void sendAckUpstream(PipelineAck ack, long seqno, private void sendAckUpstream(PipelineAck ack, long seqno,
long totalAckTimeNanos, long offsetInBlock, long totalAckTimeNanos, long offsetInBlock,
Status myStatus) throws IOException { Status myStatus) throws IOException {
try {
// Wait for other sender to finish. Unless there is an OOB being sent,
// the responder won't have to wait.
synchronized(this) {
while(sending) {
wait();
}
sending = true;
}
try {
if (!running) return;
sendAckUpstreamUnprotected(ack, seqno, totalAckTimeNanos,
offsetInBlock, myStatus);
} finally {
synchronized(this) {
sending = false;
notify();
}
}
} catch (InterruptedException ie) {
// The responder was interrupted. Make it go down without
// interrupting the receiver(writer) thread.
running = false;
}
}
/**
* @param ack Ack received from downstream
* @param seqno sequence number of ack to be sent upstream
* @param totalAckTimeNanos total ack time including all the downstream
* nodes
* @param offsetInBlock offset in block for the data in packet
* @param myStatus the local ack status
*/
private void sendAckUpstreamUnprotected(PipelineAck ack, long seqno,
long totalAckTimeNanos, long offsetInBlock, Status myStatus)
throws IOException {
Status[] replies = null; Status[] replies = null;
if (mirrorError) { // ack read error if (ack == null) {
// A new OOB response is being sent from this node. Regardless of
// downstream nodes, reply should contain one reply.
replies = new Status[1];
replies[0] = myStatus;
} else if (mirrorError) { // ack read error
replies = MIRROR_ERROR_STATUS; replies = MIRROR_ERROR_STATUS;
} else { } else {
short ackLen = type == PacketResponderType.LAST_IN_PIPELINE ? 0 : ack short ackLen = type == PacketResponderType.LAST_IN_PIPELINE ? 0 : ack
@ -1152,9 +1294,11 @@ class BlockReceiver implements Closeable {
* *
* This should be called only when the ack queue is not empty * This should be called only when the ack queue is not empty
*/ */
private synchronized void removeAckHead() { private void removeAckHead() {
ackQueue.removeFirst(); synchronized(ackQueue) {
notifyAll(); ackQueue.removeFirst();
ackQueue.notifyAll();
}
} }
} }

View File

@ -175,6 +175,8 @@ public class DataNode extends Configured
} }
volatile boolean shouldRun = true; volatile boolean shouldRun = true;
volatile boolean shutdownForUpgrade = false;
private boolean shutdownInProgress = false;
private BlockPoolManager blockPoolManager; private BlockPoolManager blockPoolManager;
volatile FsDatasetSpi<? extends FsVolumeSpi> data = null; volatile FsDatasetSpi<? extends FsVolumeSpi> data = null;
private String clusterId = null; private String clusterId = null;
@ -1190,9 +1192,31 @@ public class DataNode extends Configured
// offerServices may be modified. // offerServices may be modified.
BPOfferService[] bposArray = this.blockPoolManager == null ? null BPOfferService[] bposArray = this.blockPoolManager == null ? null
: this.blockPoolManager.getAllNamenodeThreads(); : this.blockPoolManager.getAllNamenodeThreads();
this.shouldRun = false; // If shutdown is not for restart, set shouldRun to false early.
if (!shutdownForUpgrade) {
shouldRun = false;
}
// When shutting down for restart, DataXceiverServer is interrupted
// in order to avoid any further acceptance of requests, but the peers
// for block writes are not closed until the clients are notified.
if (dataXceiverServer != null) {
((DataXceiverServer) this.dataXceiverServer.getRunnable()).kill();
this.dataXceiverServer.interrupt();
}
// Record the time of initial notification
long timeNotified = Time.now();
if (localDataXceiverServer != null) {
((DataXceiverServer) this.localDataXceiverServer.getRunnable()).kill();
this.localDataXceiverServer.interrupt();
}
// Terminate directory scanner and block scanner
shutdownPeriodicScanners(); shutdownPeriodicScanners();
// Stop the web server
if (infoServer != null) { if (infoServer != null) {
try { try {
infoServer.stop(); infoServer.stop();
@ -1200,26 +1224,24 @@ public class DataNode extends Configured
LOG.warn("Exception shutting down DataNode", e); LOG.warn("Exception shutting down DataNode", e);
} }
} }
if (ipcServer != null) {
ipcServer.stop();
}
if (pauseMonitor != null) { if (pauseMonitor != null) {
pauseMonitor.stop(); pauseMonitor.stop();
} }
// shouldRun is set to false here to prevent certain threads from exiting
// before the restart prep is done.
this.shouldRun = false;
if (dataXceiverServer != null) {
((DataXceiverServer) this.dataXceiverServer.getRunnable()).kill();
this.dataXceiverServer.interrupt();
}
if (localDataXceiverServer != null) {
((DataXceiverServer) this.localDataXceiverServer.getRunnable()).kill();
this.localDataXceiverServer.interrupt();
}
// wait for all data receiver threads to exit // wait for all data receiver threads to exit
if (this.threadGroup != null) { if (this.threadGroup != null) {
int sleepMs = 2; int sleepMs = 2;
while (true) { while (true) {
this.threadGroup.interrupt(); // When shutting down for restart, wait 2.5 seconds before forcing
// termination of receiver threads.
if (!this.shutdownForUpgrade ||
(this.shutdownForUpgrade && (Time.now() - timeNotified > 2500))) {
this.threadGroup.interrupt();
}
LOG.info("Waiting for threadgroup to exit, active threads is " + LOG.info("Waiting for threadgroup to exit, active threads is " +
this.threadGroup.activeCount()); this.threadGroup.activeCount());
if (this.threadGroup.activeCount() == 0) { if (this.threadGroup.activeCount() == 0) {
@ -1249,7 +1271,13 @@ public class DataNode extends Configured
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
} }
} }
// IPC server needs to be shutdown late in the process, otherwise
// shutdown command response won't get sent.
if (ipcServer != null) {
ipcServer.stop();
}
if(blockPoolManager != null) { if(blockPoolManager != null) {
try { try {
this.blockPoolManager.shutDownAll(bposArray); this.blockPoolManager.shutDownAll(bposArray);
@ -1275,6 +1303,11 @@ public class DataNode extends Configured
MBeans.unregister(dataNodeInfoBeanName); MBeans.unregister(dataNodeInfoBeanName);
dataNodeInfoBeanName = null; dataNodeInfoBeanName = null;
} }
LOG.info("Shutdown complete.");
synchronized(this) {
// Notify the main thread.
notifyAll();
}
} }
@ -1775,7 +1808,11 @@ public class DataNode extends Configured
&& blockPoolManager.getAllNamenodeThreads().length == 0) { && blockPoolManager.getAllNamenodeThreads().length == 0) {
shouldRun = false; shouldRun = false;
} }
Thread.sleep(2000); // Terminate if shutdown is complete or 2 seconds after all BPs
// are shutdown.
synchronized(this) {
wait(2000);
}
} catch (InterruptedException ex) { } catch (InterruptedException ex) {
LOG.warn("Received exception in Datanode#join: " + ex); LOG.warn("Received exception in Datanode#join: " + ex);
} }
@ -2411,17 +2448,27 @@ public class DataNode extends Configured
} }
@Override // ClientDatanodeProtocol @Override // ClientDatanodeProtocol
public void shutdownDatanode(boolean forUpgrade) throws IOException { public synchronized void shutdownDatanode(boolean forUpgrade) throws IOException {
LOG.info("shutdownDatanode command received (upgrade=" + forUpgrade + LOG.info("shutdownDatanode command received (upgrade=" + forUpgrade +
"). Shutting down Datanode..."); "). Shutting down Datanode...");
// Delay start the shutdown process so that the rpc response can be // Shutdown can be called only once.
if (shutdownInProgress) {
throw new IOException("Shutdown already in progress.");
}
shutdownInProgress = true;
shutdownForUpgrade = forUpgrade;
// Asynchronously start the shutdown process so that the rpc response can be
// sent back. // sent back.
Thread shutdownThread = new Thread() { Thread shutdownThread = new Thread() {
@Override public void run() { @Override public void run() {
try { if (!shutdownForUpgrade) {
Thread.sleep(1000); // Delay the shutdown a bit if not doing for restart.
} catch (InterruptedException ie) { } try {
Thread.sleep(1000);
} catch (InterruptedException ie) { }
}
shutdown(); shutdown();
} }
}; };
@ -2462,6 +2509,10 @@ public class DataNode extends Configured
return bp != null ? bp.isAlive() : false; return bp != null ? bp.isAlive() : false;
} }
/**
 * Tells whether this datanode is shutting down in order to be restarted.
 *
 * @return the current value of {@code shutdownForUpgrade}
 */
boolean isRestarting() {
  return shutdownForUpgrade;
}
/** /**
* A datanode is considered to be fully started if all the BP threads are * A datanode is considered to be fully started if all the BP threads are
* alive and all the block pools are initialized. * alive and all the block pools are initialized.

View File

@ -158,8 +158,8 @@ class DataXceiver extends Receiver implements Runnable {
int opsProcessed = 0; int opsProcessed = 0;
Op op = null; Op op = null;
dataXceiverServer.addPeer(peer);
try { try {
dataXceiverServer.addPeer(peer, Thread.currentThread());
peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout); peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout);
InputStream input = socketIn; InputStream input = socketIn;
if ((!peer.hasSecureChannel()) && dnConf.encryptDataTransfer) { if ((!peer.hasSecureChannel()) && dnConf.encryptDataTransfer) {

View File

@ -20,8 +20,7 @@ package org.apache.hadoop.hdfs.server.datanode;
import java.io.IOException; import java.io.IOException;
import java.net.SocketTimeoutException; import java.net.SocketTimeoutException;
import java.nio.channels.AsynchronousCloseException; import java.nio.channels.AsynchronousCloseException;
import java.util.HashSet; import java.util.HashMap;
import java.util.Set;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -45,7 +44,8 @@ class DataXceiverServer implements Runnable {
private final PeerServer peerServer; private final PeerServer peerServer;
private final DataNode datanode; private final DataNode datanode;
private final Set<Peer> peers = new HashSet<Peer>(); private final HashMap<Peer, Thread> peers = new HashMap<Peer, Thread>();
private boolean closed = false;
/** /**
* Maximal number of concurrent xceivers per node. * Maximal number of concurrent xceivers per node.
@ -127,7 +127,7 @@ class DataXceiverServer implements Runnable {
@Override @Override
public void run() { public void run() {
Peer peer = null; Peer peer = null;
while (datanode.shouldRun) { while (datanode.shouldRun && !datanode.shutdownForUpgrade) {
try { try {
peer = peerServer.accept(); peer = peerServer.accept();
@ -147,7 +147,7 @@ class DataXceiverServer implements Runnable {
} catch (AsynchronousCloseException ace) { } catch (AsynchronousCloseException ace) {
// another thread closed our listener socket - that's expected during shutdown, // another thread closed our listener socket - that's expected during shutdown,
// but not in other circumstances // but not in other circumstances
if (datanode.shouldRun) { if (datanode.shouldRun && !datanode.shutdownForUpgrade) {
LOG.warn(datanode.getDisplayName() + ":DataXceiverServer: ", ace); LOG.warn(datanode.getDisplayName() + ":DataXceiverServer: ", ace);
} }
} catch (IOException ie) { } catch (IOException ie) {
@ -170,35 +170,82 @@ class DataXceiverServer implements Runnable {
datanode.shouldRun = false; datanode.shouldRun = false;
} }
} }
synchronized (this) {
for (Peer p : peers) { // Close the server to stop reception of more requests.
IOUtils.cleanup(LOG, p);
}
}
try { try {
peerServer.close(); peerServer.close();
closed = true;
} catch (IOException ie) { } catch (IOException ie) {
LOG.warn(datanode.getDisplayName() LOG.warn(datanode.getDisplayName()
+ " :DataXceiverServer: close exception", ie); + " :DataXceiverServer: close exception", ie);
} }
// if in restart prep stage, notify peers before closing them.
if (datanode.shutdownForUpgrade) {
restartNotifyPeers();
// Each thread needs some time to process it. If a thread needs
// to send an OOB message to the client, but blocked on network for
// long time, we need to force its termination.
LOG.info("Shutting down DataXceiverServer before restart");
// Allow roughly up to 2 seconds.
for (int i = 0; getNumPeers() > 0 && i < 10; i++) {
try {
Thread.sleep(200);
} catch (InterruptedException e) {
// ignore
}
}
}
// Close all peers.
closeAllPeers();
} }
void kill() { void kill() {
assert datanode.shouldRun == false : assert (datanode.shouldRun == false || datanode.shutdownForUpgrade) :
"shoudRun should be set to false before killing"; "shoudRun should be set to false or restarting should be true"
+ " before killing";
try { try {
this.peerServer.close(); this.peerServer.close();
this.closed = true;
} catch (IOException ie) { } catch (IOException ie) {
LOG.warn(datanode.getDisplayName() + ":DataXceiverServer.kill(): ", ie); LOG.warn(datanode.getDisplayName() + ":DataXceiverServer.kill(): ", ie);
} }
} }
synchronized void addPeer(Peer peer) { synchronized void addPeer(Peer peer, Thread t) throws IOException {
peers.add(peer); if (closed) {
throw new IOException("Server closed.");
}
peers.put(peer, t);
} }
synchronized void closePeer(Peer peer) { synchronized void closePeer(Peer peer) {
peers.remove(peer); peers.remove(peer);
IOUtils.cleanup(null, peer); IOUtils.cleanup(null, peer);
} }
/**
 * Notify all peers of the shutdown and restart by interrupting the thread
 * registered for each of them.
 * datanode.shouldRun should still be true and datanode.shutdownForUpgrade
 * should be set true before calling this method.
 */
synchronized void restartNotifyPeers() {
  assert (datanode.shouldRun == true && datanode.shutdownForUpgrade);
  // Only the threads are needed, so walk the map's values directly
  // instead of doing a lookup for every key.
  for (Thread xceiver : peers.values()) {
    // interrupt each and every DataXceiver thread.
    xceiver.interrupt();
  }
}
/** Close every registered peer, then empty the peer map. */
synchronized void closeAllPeers() {
  LOG.info("Closing all peers.");
  for (Peer registered : peers.keySet()) {
    IOUtils.cleanup(LOG, registered);
  }
  peers.clear();
}
/** @return the number of peers currently held in the peer map. */
synchronized int getNumPeers() {
  return peers.size();
}
} }

View File

@ -157,6 +157,10 @@ enum Status {
ERROR_ACCESS_TOKEN = 5; ERROR_ACCESS_TOKEN = 5;
CHECKSUM_OK = 6; CHECKSUM_OK = 6;
ERROR_UNSUPPORTED = 7; ERROR_UNSUPPORTED = 7;
OOB_RESTART = 8; // Quick restart
OOB_RESERVED1 = 9; // Reserved
OOB_RESERVED2 = 10; // Reserved
OOB_RESERVED3 = 11; // Reserved
} }
message PipelineAckProto { message PipelineAckProto {

View File

@ -24,8 +24,10 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException; import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.junit.Assert; import org.junit.Assert;
@ -159,4 +161,39 @@ public class TestClientProtocolForPipelineRecovery {
} }
} }
} }
/**
 * Test recovery on restart OOB message: a datanode is told to shut down
 * for upgrade while a file is open for append. The writer must still be
 * able to close the file, and the datanode must no longer answer a
 * getDatanodeInfo request afterwards.
 */
@Test
public void testPipelineRecoveryOnOOB() throws Exception {
  final Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    cluster.waitActive();
    final FileSystem fs = cluster.getFileSystem();

    // Create the file with replication 2, reopen it for append and flush
    // a single byte so that there is an open write when the shutdown hits.
    final Path testFile = new Path("dataprotocol2.dat");
    DFSTestUtil.createFile(fs, testFile, 10240L, (short)2, 0L);
    final DFSOutputStream appendStream =
        (DFSOutputStream)(fs.append(testFile).getWrappedStream());
    appendStream.write(1);
    appendStream.hflush();

    final DFSAdmin admin = new DFSAdmin(conf);
    final DataNode firstDn = cluster.getDataNodes().get(0);
    final String ipcAddr = firstDn.getDatanodeId().getIpcAddr(false);

    // issue shutdown to the datanode.
    final String[] shutdownArgs = {"-shutdownDatanode", ipcAddr, "upgrade" };
    Assert.assertEquals(0, admin.run(shutdownArgs));
    appendStream.close();

    // Give the datanode time to go down before probing it again.
    Thread.sleep(3000);
    final String[] infoArgs = {"-getDatanodeInfo", ipcAddr };
    Assert.assertEquals(-1, admin.run(infoArgs));
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
} }