diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt index e2975e85cf..414b28e908 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt @@ -29,3 +29,5 @@ HDFS-2582. Scope dfs.ha.namenodes config by nameservice (todd) HDFS-2591. MiniDFSCluster support to mix and match federation with HA (todd) HDFS-1975. Support for sharing the namenode state from active to standby. (jitendra, atm, todd) + +HDFS-1971. Send block report from datanode to both active and standby namenodes. (sanjay, todd via suresh) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java index 4d098ebec2..85807f6d5a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java @@ -17,62 +17,43 @@ */ package org.apache.hadoop.hdfs.server.datanode; -import static org.apache.hadoop.hdfs.server.common.Util.now; - import java.io.IOException; import java.net.InetSocketAddress; -import java.net.SocketTimeoutException; -import java.net.URI; -import java.util.Collection; -import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; import org.apache.commons.logging.Log; import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.protocol.Block; -import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; -import org.apache.hadoop.hdfs.protocol.HdfsConstants; -import org.apache.hadoop.hdfs.protocol.LocatedBlock; -import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; -import org.apache.hadoop.hdfs.server.common.IncorrectVersionException; -import org.apache.hadoop.hdfs.server.common.Storage; -import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand; import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; -import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException; import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand; import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand; import org.apache.hadoop.ipc.RPC; -import org.apache.hadoop.ipc.RemoteException; -import org.apache.hadoop.util.StringUtils; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; /** - * A thread per namenode to perform: - *
versionRequest
to determine the NN's
- * namespace and version info. It automatically retries until
- * the NN responds or the DN is shutting down.
- *
- * @return the NamespaceInfo
- * @throws IncorrectVersionException if the remote NN does not match
- * this DN's version
- */
- NamespaceInfo retrieveNamespaceInfo() throws IncorrectVersionException {
- NamespaceInfo nsInfo = null;
- while (shouldRun()) {
- try {
- nsInfo = bpNamenode.versionRequest();
- LOG.debug(this + " received versionRequest response: " + nsInfo);
- break;
- } catch(SocketTimeoutException e) { // namenode is busy
- LOG.warn("Problem connecting to server: " + nnAddr);
- } catch(IOException e ) { // namenode is not available
- LOG.warn("Problem connecting to server: " + nnAddr);
- }
-
- // try again in a second
- sleepAndLogInterrupts(5000, "requesting version info from NN");
- }
-
- if (nsInfo != null) {
- checkNNVersion(nsInfo);
- }
- return nsInfo;
- }
-
- private void checkNNVersion(NamespaceInfo nsInfo)
- throws IncorrectVersionException {
- // build and layout versions should match
- String nsBuildVer = nsInfo.getBuildVersion();
- String stBuildVer = Storage.getBuildVersion();
- if (!nsBuildVer.equals(stBuildVer)) {
- LOG.warn("Data-node and name-node Build versions must be the same. " +
- "Namenode build version: " + nsBuildVer + "Datanode " +
- "build version: " + stBuildVer);
- throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer);
- }
-
- if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) {
- LOG.warn("Data-node and name-node layout versions must be the same." +
- " Expected: "+ HdfsConstants.LAYOUT_VERSION +
- " actual "+ bpNSInfo.getLayoutVersion());
- throw new IncorrectVersionException(
- bpNSInfo.getLayoutVersion(), "namenode");
- }
- }
-
- private void connectToNNAndHandshake() throws IOException {
- // get NN proxy
- bpNamenode = (DatanodeProtocol)RPC.waitForProxy(DatanodeProtocol.class,
- DatanodeProtocol.versionID, nnAddr, dn.getConf());
-
- // First phase of the handshake with NN - get the namespace
- // info.
- bpNSInfo = retrieveNamespaceInfo();
-
- // Now that we know the namespace ID, etc, we can pass this to the DN.
- // The DN can now initialize its local storage if we are the
- // first BP to handshake, etc.
- dn.initBlockPool(this);
-
- // Second phase of the handshake with the NN.
- register();
- }
-
- /**
- * This methods arranges for the data node to send the block report at
- * the next heartbeat.
- */
- void scheduleBlockReport(long delay) {
- if (delay > 0) { // send BR after random delay
- lastBlockReport = System.currentTimeMillis()
- - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay)));
- } else { // send at next heartbeat
- lastBlockReport = lastHeartbeat - dnConf.blockReportInterval;
- }
- resetBlockReportTime = true; // reset future BRs for randomness
- }
-
void reportBadBlocks(ExtendedBlock block) {
- DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) };
- LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) };
-
- try {
- bpNamenode.reportBadBlocks(blocks);
- } catch (IOException e){
- /* One common reason is that NameNode could be in safe mode.
- * Should we keep on retrying in that case?
- */
- LOG.warn("Failed to report bad block " + block + " to namenode : "
- + " Exception", e);
+ checkBlock(block);
+ for (BPServiceActor actor : bpServices) {
+ actor.reportBadBlocks(block);
}
-
}
- /**
- * Report received blocks and delete hints to the Namenode
- *
- * @throws IOException
- */
- private void reportReceivedDeletedBlocks() throws IOException {
-
- // check if there are newly received blocks
- ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null;
- int currentReceivedRequestsCounter;
- synchronized (receivedAndDeletedBlockList) {
- currentReceivedRequestsCounter = pendingReceivedRequests;
- int numBlocks = receivedAndDeletedBlockList.size();
- if (numBlocks > 0) {
- //
- // Send newly-received and deleted blockids to namenode
- //
- receivedAndDeletedBlockArray = receivedAndDeletedBlockList
- .toArray(new ReceivedDeletedBlockInfo[numBlocks]);
- }
- }
- if (receivedAndDeletedBlockArray != null) {
- bpNamenode.blockReceivedAndDeleted(bpRegistration, getBlockPoolId(),
- receivedAndDeletedBlockArray);
- synchronized (receivedAndDeletedBlockList) {
- for (int i = 0; i < receivedAndDeletedBlockArray.length; i++) {
- receivedAndDeletedBlockList.remove(receivedAndDeletedBlockArray[i]);
- }
- pendingReceivedRequests -= currentReceivedRequestsCounter;
- }
- }
- }
-
/*
* Informing the name node could take a long long time! Should we wait
* till namenode is informed before responding with success to the
* client? For now we don't.
*/
void notifyNamenodeReceivedBlock(ExtendedBlock block, String delHint) {
- if (block == null || delHint == null) {
- throw new IllegalArgumentException(block == null ? "Block is null"
- : "delHint is null");
+ checkBlock(block);
+ checkDelHint(delHint);
+ ReceivedDeletedBlockInfo bInfo =
+ new ReceivedDeletedBlockInfo(block.getLocalBlock(), delHint);
+ for (BPServiceActor actor : bpServices) {
+ actor.notifyNamenodeReceivedBlock(bInfo);
}
+ }
- if (!block.getBlockPoolId().equals(getBlockPoolId())) {
- LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. "
- + getBlockPoolId());
- return;
- }
-
- synchronized (receivedAndDeletedBlockList) {
- receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block
- .getLocalBlock(), delHint));
- pendingReceivedRequests++;
- receivedAndDeletedBlockList.notifyAll();
- }
+ private void checkBlock(ExtendedBlock block) {
+ Preconditions.checkArgument(block != null,
+ "block is null");
+ Preconditions.checkArgument(block.getBlockPoolId().equals(getBlockPoolId()),
+ "block belongs to BP %s instead of BP %s",
+ block.getBlockPoolId(), getBlockPoolId());
+ }
+
+ private void checkDelHint(String delHint) {
+ Preconditions.checkArgument(delHint != null,
+ "delHint is null");
}
void notifyNamenodeDeletedBlock(ExtendedBlock block) {
- if (block == null) {
- throw new IllegalArgumentException("Block is null");
- }
-
- if (!block.getBlockPoolId().equals(getBlockPoolId())) {
- LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. "
- + getBlockPoolId());
- return;
- }
-
- synchronized (receivedAndDeletedBlockList) {
- receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block
- .getLocalBlock(), ReceivedDeletedBlockInfo.TODELETE_HINT));
+ checkBlock(block);
+ ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(block
+ .getLocalBlock(), ReceivedDeletedBlockInfo.TODELETE_HINT);
+
+ for (BPServiceActor actor : bpServices) {
+ actor.notifyNamenodeDeletedBlock(bInfo);
}
}
-
- /**
- * Report the list blocks to the Namenode
- * @throws IOException
- */
- DatanodeCommand blockReport() throws IOException {
- // send block report if timer has expired.
- DatanodeCommand cmd = null;
- long startTime = now();
- if (startTime - lastBlockReport > dnConf.blockReportInterval) {
-
- // Create block report
- long brCreateStartTime = now();
- BlockListAsLongs bReport = dn.data.getBlockReport(getBlockPoolId());
-
- // Send block report
- long brSendStartTime = now();
- cmd = bpNamenode.blockReport(bpRegistration, getBlockPoolId(), bReport
- .getBlockListAsLongs());
-
- // Log the block report processing stats from Datanode perspective
- long brSendCost = now() - brSendStartTime;
- long brCreateCost = brSendStartTime - brCreateStartTime;
- dn.metrics.addBlockReport(brSendCost);
- LOG.info("BlockReport of " + bReport.getNumberOfBlocks()
- + " blocks took " + brCreateCost + " msec to generate and "
- + brSendCost + " msecs for RPC and NN processing");
-
- // If we have sent the first block report, then wait a random
- // time before we start the periodic block reports.
- if (resetBlockReportTime) {
- lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval));
- resetBlockReportTime = false;
- } else {
- /* say the last block report was at 8:20:14. The current report
- * should have started around 9:20:14 (default 1 hour interval).
- * If current time is :
- * 1) normal like 9:20:18, next report should be at 10:20:14
- * 2) unexpected like 11:35:43, next report should be at 12:20:14
- */
- lastBlockReport += (now() - lastBlockReport) /
- dnConf.blockReportInterval * dnConf.blockReportInterval;
- }
- LOG.info("sent block report, processed command:" + cmd);
- }
- return cmd;
- }
-
-
- DatanodeCommand [] sendHeartBeat() throws IOException {
- return bpNamenode.sendHeartbeat(bpRegistration,
- dn.data.getCapacity(),
- dn.data.getDfsUsed(),
- dn.data.getRemaining(),
- dn.data.getBlockPoolUsed(getBlockPoolId()),
- dn.xmitsInProgress.get(),
- dn.getXceiverCount(), dn.data.getNumFailedVolumes());
- }
-
//This must be called only by blockPoolManager
void start() {
- if ((bpThread != null) && (bpThread.isAlive())) {
- //Thread is started already
- return;
+ for (BPServiceActor actor : bpServices) {
+ actor.start();
}
- bpThread = new Thread(this, formatThreadName());
- bpThread.setDaemon(true); // needed for JUnit testing
- bpThread.start();
- }
-
- private String formatThreadName() {
- Collection- * The bpDatanode needs to register with the namenode on startup in order - * 1) to report which storage it is serving now and - * 2) to receive a registrationID - * - * issued by the namenode to recognize registered datanodes. - * - * @see FSNamesystem#registerDatanode(DatanodeRegistration) - * @throws IOException - */ - void register() throws IOException { - Preconditions.checkState(bpNSInfo != null, - "register() should be called after handshake()"); + synchronized void startDistributedUpgradeIfNeeded() throws IOException { + UpgradeManagerDatanode um = getUpgradeManager(); - // The handshake() phase loaded the block pool storage - // off disk - so update the bpRegistration object from that info - bpRegistration = dn.createBPRegistration(bpNSInfo); - - LOG.info(this + " beginning handshake with NN"); - - while (shouldRun()) { - try { - // Use returned registration from namenode with updated machine name. - bpRegistration = bpNamenode.registerDatanode(bpRegistration); - break; - } catch(SocketTimeoutException e) { // namenode is busy - LOG.info("Problem connecting to server: " + nnAddr); - sleepAndLogInterrupts(1000, "connecting to server"); - } - } - - LOG.info("Block pool " + this + " successfully registered with NN"); - dn.bpRegistrationSucceeded(bpRegistration, getBlockPoolId()); - - // random short delay - helps scatter the BR from all DNs - scheduleBlockReport(dnConf.initialBlockReportDelay); + if(!um.getUpgradeState()) + return; + um.setUpgradeState(false, um.getUpgradeVersion()); + um.startUpgrade(); + return; } - - - private void sleepAndLogInterrupts(int millis, - String stateString) { - try { - Thread.sleep(millis); - } catch (InterruptedException ie) { - LOG.info("BPOfferService " + this + - " interrupted while " + stateString); - } + + DataNode getDataNode() { + return dn; } /** - * No matter what kind of exception we get, keep retrying to offerService(). - * That's the loop that connects to the NameNode and provides basic DataNode - * functionality. - * - * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can - * happen either at shutdown or due to refreshNamenodes. + * Called by the BPServiceActors when they handshake to a NN. + * If this is the first NN connection, this sets the namespace info + * for this BPOfferService. If it's a connection to a new NN, it + * verifies that this namespace matches (eg to prevent a misconfiguration + * where a StandbyNode from a different cluster is specified) */ - @Override - public void run() { - LOG.info(this + " starting to offer service"); - - try { - // init stuff - try { - // setup storage - connectToNNAndHandshake(); - } catch (IOException ioe) { - // Initial handshake, storage recovery or registration failed - // End BPOfferService thread - LOG.fatal("Initialization failed for block pool " + this, ioe); - return; - } - - initialized = true; // bp is initialized; + void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException { + if (this.bpNSInfo == null) { + this.bpNSInfo = nsInfo; - while (shouldRun()) { - try { - startDistributedUpgradeIfNeeded(); - offerService(); - } catch (Exception ex) { - LOG.error("Exception in BPOfferService for " + this, ex); - sleepAndLogInterrupts(5000, "offering service"); - } - } - } catch (Throwable ex) { - LOG.warn("Unexpected exception in block pool " + this, ex); - } finally { - LOG.warn("Ending block pool service for: " + this); - cleanUp(); + // Now that we know the namespace ID, etc, we can pass this to the DN. + // The DN can now initialize its local storage if we are the + // first BP to handshake, etc. + dn.initBlockPool(this); + return; + } else { + checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(), + "Blockpool ID"); + checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(), + "Namespace ID"); + checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(), + "Cluster ID"); } } - private boolean shouldRun() { - return shouldServiceRun && dn.shouldRun(); - } - /** - * Process an array of datanode commands - * - * @param cmds an array of datanode commands - * @return true if further processing may be required or false otherwise. + * After one of the BPServiceActors registers successfully with the + * NN, it calls this function to verify that the NN it connected to + * is consistent with other NNs serving the block-pool. */ - private boolean processCommand(DatanodeCommand[] cmds) { - if (cmds != null) { - for (DatanodeCommand cmd : cmds) { - try { - if (processCommand(cmd) == false) { - return false; - } - } catch (IOException ioe) { - LOG.warn("Error processing datanode Command", ioe); - } + void registrationSucceeded(BPServiceActor bpServiceActor, + DatanodeRegistration reg) throws IOException { + if (bpRegistration != null) { + checkNSEquality(bpRegistration.storageInfo.getNamespaceID(), + reg.storageInfo.getNamespaceID(), "namespace ID"); + checkNSEquality(bpRegistration.storageInfo.getClusterID(), + reg.storageInfo.getClusterID(), "cluster ID"); + } else { + bpRegistration = reg; + } + } + + /** + * Verify equality of two namespace-related fields, throwing + * an exception if they are unequal. + */ + private static void checkNSEquality( + Object ourID, Object theirID, + String idHelpText) throws IOException { + if (!ourID.equals(theirID)) { + throw new IOException(idHelpText + " mismatch: " + + "previously connected to " + idHelpText + " " + ourID + + " but now connected to " + idHelpText + " " + theirID); + } + } + + DatanodeRegistration createRegistration() { + Preconditions.checkState(bpNSInfo != null, + "getRegistration() can only be called after initial handshake"); + return dn.createBPRegistration(bpNSInfo); + } + + /** + * Called when an actor shuts down. If this is the last actor + * to shut down, shuts down the whole blockpool in the DN. + */ + void shutdownActor(BPServiceActor actor) { + if (bpServiceToActive == actor) { + bpServiceToActive = null; + } + + bpServices.remove(actor); + + // TODO: synchronization should be a little better here + if (bpServices.isEmpty()) { + dn.shutdownBlockPool(this); + + if(upgradeManager != null) + upgradeManager.shutdownUpgrade(); + } + } + + @Deprecated + InetSocketAddress getNNSocketAddress() { + // TODO(HA) this doesn't make sense anymore + return bpServiceToActive.getNNSocketAddress(); + } + + /** + * Called by the DN to report an error to the NNs. + */ + void trySendErrorReport(int errCode, String errMsg) { + for (BPServiceActor actor : bpServices) { + actor.trySendErrorReport(errCode, errMsg); + } + } + + /** + * Ask each of the actors to schedule a block report after + * the specified delay. + */ + void scheduleBlockReport(long delay) { + for (BPServiceActor actor : bpServices) { + actor.scheduleBlockReport(delay); + } + } + + /** + * Ask each of the actors to report a bad block hosted on another DN. + */ + void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block) { + for (BPServiceActor actor : bpServices) { + try { + actor.reportRemoteBadBlock(dnInfo, block); + } catch (IOException e) { + LOG.warn("Couldn't report bad block " + block + " to " + actor, + e); } } - return true; + } + + /** + * TODO: this is still used in a few places where we need to sort out + * what to do in HA! + * @return a proxy to the active NN + */ + @Deprecated + DatanodeProtocol getActiveNN() { + return bpServiceToActive.bpNamenode; + } + + /** + * @return true if the given NN address is one of the NNs for this + * block pool + */ + boolean containsNN(InetSocketAddress addr) { + for (BPServiceActor actor : bpServices) { + if (actor.getNNSocketAddress().equals(addr)) { + return true; + } + } + return false; + } + + @VisibleForTesting + int countNameNodes() { + return bpServices.size(); + } + + /** + * Run an immediate block report on this thread. Used by tests. + */ + @VisibleForTesting + void triggerBlockReportForTests() throws IOException { + for (BPServiceActor actor : bpServices) { + actor.triggerBlockReportForTests(); + } + } + + boolean processCommandFromActor(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { + assert bpServices.contains(actor); + if (actor == bpServiceToActive) { + return processCommandFromActive(cmd, actor); + } else { + return processCommandFromStandby(cmd, actor); + } } /** @@ -669,7 +412,8 @@ private boolean processCommand(DatanodeCommand[] cmds) { * @return true if further processing may be required or false otherwise. * @throws IOException */ - private boolean processCommand(DatanodeCommand cmd) throws IOException { + private boolean processCommandFromActive(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { if (cmd == null) return true; final BlockCommand bcmd = @@ -700,19 +444,12 @@ private boolean processCommand(DatanodeCommand cmd) throws IOException { dn.metrics.incrBlocksRemoved(toDelete.length); break; case DatanodeProtocol.DNA_SHUTDOWN: - // shut down the data node - shouldServiceRun = false; - return false; + // TODO: DNA_SHUTDOWN appears to be unused - the NN never sends this command + throw new UnsupportedOperationException("Received unimplemented DNA_SHUTDOWN"); case DatanodeProtocol.DNA_REGISTER: // namenode requested a registration - at start or if NN lost contact LOG.info("DatanodeCommand action: DNA_REGISTER"); - if (shouldRun()) { - // re-retrieve namespace info to make sure that, if the NN - // was restarted, we still match its version (HDFS-2120) - retrieveNamespaceInfo(); - // and re-register - register(); - } + actor.reRegister(); break; case DatanodeProtocol.DNA_FINALIZE: String bp = ((FinalizeCommand) cmd).getBlockPoolId(); @@ -732,7 +469,8 @@ assert getBlockPoolId().equals(bp) : case DatanodeProtocol.DNA_ACCESSKEYUPDATE: LOG.info("DatanodeCommand action: DNA_ACCESSKEYUPDATE"); if (dn.isBlockTokenEnabled) { - dn.blockPoolTokenSecretManager.setKeys(getBlockPoolId(), + dn.blockPoolTokenSecretManager.setKeys( + getBlockPoolId(), ((KeyUpdateCommand) cmd).getExportedKeys()); } break; @@ -751,32 +489,39 @@ assert getBlockPoolId().equals(bp) : } return true; } - - private void processDistributedUpgradeCommand(UpgradeCommand comm) - throws IOException { - UpgradeManagerDatanode upgradeManager = getUpgradeManager(); - upgradeManager.processUpgradeCommand(comm); + + private boolean processCommandFromStandby(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { + if (cmd == null) + return true; + switch(cmd.getAction()) { + case DatanodeProtocol.DNA_REGISTER: + // namenode requested a registration - at start or if NN lost contact + LOG.info("DatanodeCommand action: DNA_REGISTER"); + actor.reRegister(); + return true; + case DatanodeProtocol.DNA_TRANSFER: + case DatanodeProtocol.DNA_INVALIDATE: + case DatanodeProtocol.DNA_SHUTDOWN: + case DatanodeProtocol.DNA_RECOVERBLOCK: + case DatanodeProtocol.DNA_ACCESSKEYUPDATE: + case DatanodeProtocol.DNA_BALANCERBANDWIDTHUPDATE: + LOG.warn("Got a command from standby NN - ignoring command:" + cmd.getAction()); + return true; + default: + LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction()); + } + return true; } - synchronized UpgradeManagerDatanode getUpgradeManager() { - if(upgradeManager == null) - upgradeManager = - new UpgradeManagerDatanode(dn, getBlockPoolId()); - - return upgradeManager; - } - /** - * Start distributed upgrade if it should be initiated by the data-node. + * Connect to the NN at the given address. This is separated out for ease + * of testing. */ - private void startDistributedUpgradeIfNeeded() throws IOException { - UpgradeManagerDatanode um = getUpgradeManager(); - - if(!um.getUpgradeState()) - return; - um.setUpgradeState(false, um.getUpgradeVersion()); - um.startUpgrade(); - return; + DatanodeProtocol connectToNN(InetSocketAddress nnAddr) + throws IOException { + return (DatanodeProtocol)RPC.waitForProxy(DatanodeProtocol.class, + DatanodeProtocol.versionID, nnAddr, dn.getConf()); } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java new file mode 100644 index 0000000000..2c4a15bf81 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java @@ -0,0 +1,633 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import static org.apache.hadoop.hdfs.server.common.Util.now; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.SocketTimeoutException; +import java.net.URI; +import java.util.Collection; +import java.util.LinkedList; + +import org.apache.commons.logging.Log; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; +import org.apache.hadoop.hdfs.server.common.IncorrectVersionException; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; +import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException; +import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.util.StringUtils; + +import com.google.common.annotations.VisibleForTesting; + +/** + * A thread per active or standby namenode to perform: + *
versionRequest
to determine the NN's
+ * namespace and version info. It automatically retries until
+ * the NN responds or the DN is shutting down.
+ *
+ * @return the NamespaceInfo
+ */
+ @VisibleForTesting
+ NamespaceInfo retrieveNamespaceInfo() throws IOException {
+ NamespaceInfo nsInfo = null;
+ while (shouldRun()) {
+ try {
+ nsInfo = bpNamenode.versionRequest();
+ LOG.debug(this + " received versionRequest response: " + nsInfo);
+ break;
+ } catch(SocketTimeoutException e) { // namenode is busy
+ LOG.warn("Problem connecting to server: " + nnAddr);
+ } catch(IOException e ) { // namenode is not available
+ LOG.warn("Problem connecting to server: " + nnAddr);
+ }
+
+ // try again in a second
+ sleepAndLogInterrupts(5000, "requesting version info from NN");
+ }
+
+ if (nsInfo != null) {
+ checkNNVersion(nsInfo);
+ } else {
+ throw new IOException("DN shut down before block pool connected");
+ }
+ return nsInfo;
+ }
+
+ private void checkNNVersion(NamespaceInfo nsInfo)
+ throws IncorrectVersionException {
+ // build and layout versions should match
+ String nsBuildVer = nsInfo.getBuildVersion();
+ String stBuildVer = Storage.getBuildVersion();
+ if (!nsBuildVer.equals(stBuildVer)) {
+ LOG.warn("Data-node and name-node Build versions must be the same. " +
+ "Namenode build version: " + nsBuildVer + "Datanode " +
+ "build version: " + stBuildVer);
+ throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer);
+ }
+
+ if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) {
+ LOG.warn("Data-node and name-node layout versions must be the same." +
+ " Expected: "+ HdfsConstants.LAYOUT_VERSION +
+ " actual "+ nsInfo.getLayoutVersion());
+ throw new IncorrectVersionException(
+ nsInfo.getLayoutVersion(), "namenode");
+ }
+ }
+
+ private void connectToNNAndHandshake() throws IOException {
+ // get NN proxy
+ bpNamenode = bpos.connectToNN(nnAddr);
+
+ // First phase of the handshake with NN - get the namespace
+ // info.
+ NamespaceInfo nsInfo = retrieveNamespaceInfo();
+
+ // Verify that this matches the other NN in this HA pair.
+ // This also initializes our block pool in the DN if we are
+ // the first NN connection for this BP.
+ bpos.verifyAndSetNamespaceInfo(nsInfo);
+
+ // Second phase of the handshake with the NN.
+ register();
+ }
+
+ /**
+ * This methods arranges for the data node to send the block report at
+ * the next heartbeat.
+ */
+ void scheduleBlockReport(long delay) {
+ if (delay > 0) { // send BR after random delay
+ lastBlockReport = System.currentTimeMillis()
+ - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay)));
+ } else { // send at next heartbeat
+ lastBlockReport = lastHeartbeat - dnConf.blockReportInterval;
+ }
+ resetBlockReportTime = true; // reset future BRs for randomness
+ }
+
+ void reportBadBlocks(ExtendedBlock block) {
+ DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) };
+ LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) };
+
+ try {
+ bpNamenode.reportBadBlocks(blocks);
+ } catch (IOException e){
+ /* One common reason is that NameNode could be in safe mode.
+ * Should we keep on retrying in that case?
+ */
+ LOG.warn("Failed to report bad block " + block + " to namenode : "
+ + " Exception", e);
+ }
+ }
+
+ /**
+ * Report received blocks and delete hints to the Namenode
+ *
+ * @throws IOException
+ */
+ private void reportReceivedDeletedBlocks() throws IOException {
+
+ // check if there are newly received blocks
+ ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null;
+ int currentReceivedRequestsCounter;
+ synchronized (receivedAndDeletedBlockList) {
+ currentReceivedRequestsCounter = pendingReceivedRequests;
+ int numBlocks = receivedAndDeletedBlockList.size();
+ if (numBlocks > 0) {
+ //
+ // Send newly-received and deleted blockids to namenode
+ //
+ receivedAndDeletedBlockArray = receivedAndDeletedBlockList
+ .toArray(new ReceivedDeletedBlockInfo[numBlocks]);
+ }
+ }
+ if (receivedAndDeletedBlockArray != null) {
+ bpNamenode.blockReceivedAndDeleted(bpRegistration, bpos.getBlockPoolId(),
+ receivedAndDeletedBlockArray);
+ synchronized (receivedAndDeletedBlockList) {
+ for (int i = 0; i < receivedAndDeletedBlockArray.length; i++) {
+ receivedAndDeletedBlockList.remove(receivedAndDeletedBlockArray[i]);
+ }
+ pendingReceivedRequests -= currentReceivedRequestsCounter;
+ }
+ }
+ }
+
+ /*
+ * Informing the name node could take a long long time! Should we wait
+ * till namenode is informed before responding with success to the
+ * client? For now we don't.
+ */
+ void notifyNamenodeReceivedBlock(ReceivedDeletedBlockInfo bInfo) {
+ synchronized (receivedAndDeletedBlockList) {
+ receivedAndDeletedBlockList.add(bInfo);
+ pendingReceivedRequests++;
+ receivedAndDeletedBlockList.notifyAll();
+ }
+ }
+
+ void notifyNamenodeDeletedBlock(ReceivedDeletedBlockInfo bInfo) {
+ synchronized (receivedAndDeletedBlockList) {
+ receivedAndDeletedBlockList.add(bInfo);
+ }
+ }
+
+ /**
+ * Run an immediate block report on this thread. Used by tests.
+ */
+ @VisibleForTesting
+ void triggerBlockReportForTests() throws IOException {
+ lastBlockReport = 0;
+ blockReport();
+ }
+
+ /**
+ * Report the list blocks to the Namenode
+ * @throws IOException
+ */
+ DatanodeCommand blockReport() throws IOException {
+ // send block report if timer has expired.
+ DatanodeCommand cmd = null;
+ long startTime = now();
+ if (startTime - lastBlockReport > dnConf.blockReportInterval) {
+
+ // Create block report
+ long brCreateStartTime = now();
+ BlockListAsLongs bReport = dn.getFSDataset().getBlockReport(
+ bpos.getBlockPoolId());
+
+ // Send block report
+ long brSendStartTime = now();
+ cmd = bpNamenode.blockReport(bpRegistration, bpos.getBlockPoolId(), bReport
+ .getBlockListAsLongs());
+
+ // Log the block report processing stats from Datanode perspective
+ long brSendCost = now() - brSendStartTime;
+ long brCreateCost = brSendStartTime - brCreateStartTime;
+ dn.getMetrics().addBlockReport(brSendCost);
+ LOG.info("BlockReport of " + bReport.getNumberOfBlocks()
+ + " blocks took " + brCreateCost + " msec to generate and "
+ + brSendCost + " msecs for RPC and NN processing");
+
+ // If we have sent the first block report, then wait a random
+ // time before we start the periodic block reports.
+ if (resetBlockReportTime) {
+ lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval));
+ resetBlockReportTime = false;
+ } else {
+ /* say the last block report was at 8:20:14. The current report
+ * should have started around 9:20:14 (default 1 hour interval).
+ * If current time is :
+ * 1) normal like 9:20:18, next report should be at 10:20:14
+ * 2) unexpected like 11:35:43, next report should be at 12:20:14
+ */
+ lastBlockReport += (now() - lastBlockReport) /
+ dnConf.blockReportInterval * dnConf.blockReportInterval;
+ }
+ LOG.info("sent block report, processed command:" + cmd);
+ }
+ return cmd;
+ }
+
+
+ DatanodeCommand [] sendHeartBeat() throws IOException {
+ LOG.info("heartbeat: " + this);
+ // TODO: saw an NPE here - maybe if the two BPOS register at
+ // same time, this one won't block on the other one?
+ return bpNamenode.sendHeartbeat(bpRegistration,
+ dn.getFSDataset().getCapacity(),
+ dn.getFSDataset().getDfsUsed(),
+ dn.getFSDataset().getRemaining(),
+ dn.getFSDataset().getBlockPoolUsed(bpos.getBlockPoolId()),
+ dn.getXmitsInProgress(),
+ dn.getXceiverCount(), dn.getFSDataset().getNumFailedVolumes());
+ }
+
+ //This must be called only by BPOfferService
+ void start() {
+ if ((bpThread != null) && (bpThread.isAlive())) {
+ //Thread is started already
+ return;
+ }
+ bpThread = new Thread(this, formatThreadName());
+ bpThread.setDaemon(true); // needed for JUnit testing
+ bpThread.start();
+ }
+
+ private String formatThreadName() {
+ Collection
+ * The bpDatanode needs to register with the namenode on startup in order
+ * 1) to report which storage it is serving now and
+ * 2) to receive a registrationID
+ *
+ * issued by the namenode to recognize registered datanodes.
+ *
+ * @see FSNamesystem#registerDatanode(DatanodeRegistration)
+ * @throws IOException
+ */
+ void register() throws IOException {
+ // The handshake() phase loaded the block pool storage
+ // off disk - so update the bpRegistration object from that info
+ bpRegistration = bpos.createRegistration();
+
+ LOG.info(this + " beginning handshake with NN");
+
+ while (shouldRun()) {
+ try {
+ // Use returned registration from namenode with updated machine name.
+ bpRegistration = bpNamenode.registerDatanode(bpRegistration);
+ break;
+ } catch(SocketTimeoutException e) { // namenode is busy
+ LOG.info("Problem connecting to server: " + nnAddr);
+ sleepAndLogInterrupts(1000, "connecting to server");
+ }
+ }
+
+ LOG.info("Block pool " + this + " successfully registered with NN");
+ bpos.registrationSucceeded(this, bpRegistration);
+
+ // random short delay - helps scatter the BR from all DNs
+ scheduleBlockReport(dnConf.initialBlockReportDelay);
+ }
+
+
+ private void sleepAndLogInterrupts(int millis,
+ String stateString) {
+ try {
+ Thread.sleep(millis);
+ } catch (InterruptedException ie) {
+ LOG.info("BPOfferService " + this +
+ " interrupted while " + stateString);
+ }
+ }
+
+ /**
+ * No matter what kind of exception we get, keep retrying to offerService().
+ * That's the loop that connects to the NameNode and provides basic DataNode
+ * functionality.
+ *
+ * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can
+ * happen either at shutdown or due to refreshNamenodes.
+ */
+ @Override
+ public void run() {
+ LOG.info(this + " starting to offer service");
+
+ try {
+ // init stuff
+ try {
+ // setup storage
+ connectToNNAndHandshake();
+ } catch (IOException ioe) {
+ // Initial handshake, storage recovery or registration failed
+ // End BPOfferService thread
+ LOG.fatal("Initialization failed for block pool " + this, ioe);
+ return;
+ }
+
+ initialized = true; // bp is initialized;
+
+ while (shouldRun()) {
+ try {
+ bpos.startDistributedUpgradeIfNeeded();
+ offerService();
+ } catch (Exception ex) {
+ LOG.error("Exception in BPOfferService for " + this, ex);
+ sleepAndLogInterrupts(5000, "offering service");
+ }
+ }
+ } catch (Throwable ex) {
+ LOG.warn("Unexpected exception in block pool " + this, ex);
+ } finally {
+ LOG.warn("Ending block pool service for: " + this);
+ cleanUp();
+ }
+ }
+
+ private boolean shouldRun() {
+ return shouldServiceRun && dn.shouldRun();
+ }
+
+ /**
+ * Process an array of datanode commands
+ *
+ * @param cmds an array of datanode commands
+ * @return true if further processing may be required or false otherwise.
+ */
+ boolean processCommand(DatanodeCommand[] cmds) {
+ if (cmds != null) {
+ for (DatanodeCommand cmd : cmds) {
+ try {
+ if (bpos.processCommandFromActor(cmd, this) == false) {
+ return false;
+ }
+ } catch (IOException ioe) {
+ LOG.warn("Error processing datanode Command", ioe);
+ }
+ }
+ }
+ return true;
+ }
+
+ void trySendErrorReport(int errCode, String errMsg) {
+ try {
+ bpNamenode.errorReport(bpRegistration, errCode, errMsg);
+ } catch(IOException e) {
+ LOG.warn("Error reporting an error to NameNode " + nnAddr,
+ e);
+ }
+ }
+
+ /**
+ * Report a bad block from another DN in this cluster.
+ */
+ void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block)
+ throws IOException {
+ LocatedBlock lb = new LocatedBlock(block,
+ new DatanodeInfo[] {dnInfo});
+ bpNamenode.reportBadBlocks(new LocatedBlock[] {lb});
+ }
+
+ void reRegister() throws IOException {
+ if (shouldRun()) {
+ // re-retrieve namespace info to make sure that, if the NN
+ // was restarted, we still match its version (HDFS-2120)
+ retrieveNamespaceInfo();
+ // and re-register
+ register();
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
index 61bc29acf4..c8aac296a7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
@@ -320,7 +320,6 @@ private void handleMirrorOutError(IOException ioe) throws IOException {
private void verifyChunks( byte[] dataBuf, int dataOff, int len,
byte[] checksumBuf, int checksumOff )
throws IOException {
- DatanodeProtocol nn = datanode.getBPNamenode(block.getBlockPoolId());
while (len > 0) {
int chunkLen = Math.min(len, bytesPerChecksum);
@@ -331,9 +330,7 @@ private void verifyChunks( byte[] dataBuf, int dataOff, int len,
try {
LOG.info("report corrupt block " + block + " from datanode " +
srcDataNode + " to namenode");
- LocatedBlock lb = new LocatedBlock(block,
- new DatanodeInfo[] {srcDataNode});
- nn.reportBadBlocks(new LocatedBlock[] {lb});
+ datanode.reportRemoteBadBlock(srcDataNode, block);
} catch (IOException e) {
LOG.warn("Failed to report bad block " + block +
" from datanode " + srcDataNode + " to namenode");
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
index aba55f8c6a..dc3a18163b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
@@ -71,6 +71,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
@@ -243,7 +244,7 @@ public static InetSocketAddress createSocketAddr(String target) {
@InterfaceAudience.Private
class BlockPoolManager {
private final Map