diff --git a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt new file mode 100644 index 0000000000..748ff939ce --- /dev/null +++ b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt @@ -0,0 +1,53 @@ +Changes for HDFS-1623 branch. + +This change list will be merged into the trunk CHANGES.txt when the HDFS-1623 +branch is merged. +------------------------------ + +HADOOP-7455. HA: Introduce HA Service Protocol Interface. (suresh) + +HADOOP-7774. HA: Administrative CLI to control HA daemons. (todd) + +HADOOP-7896. HA: if both NNs are in Standby mode, client needs to try failing +back and forth several times with sleeps. (atm) + +HADOOP-7922. Improve some logging for client IPC failovers and +StandbyExceptions (todd) + +HADOOP-7921. StandbyException should extend IOException (todd) + +HADOOP-7928. HA: Client failover policy is incorrectly trying to fail over all +IOExceptions (atm) + +HADOOP-7925. Add interface and update CLI to query current state to +HAServiceProtocol (eli via todd) + +HADOOP-7932. Make client connection retries on socket time outs configurable. +(Uma Maheswara Rao G via todd) + +HADOOP-7924. FailoverController for client-based configuration (eli) + +HADOOP-7961. Move HA fencing to common. (eli) + +HADOOP-7970. HAServiceProtocol methods must throw IOException. +(Hari Mankude via suresh). + +HADOOP-7992. Add ZKClient library to facilitate leader election. +(Bikas Saha via suresh). + +HADOOP-7983. HA: failover should be able to pass args to fencers. (eli) + +HADOOP-7938. HA: the FailoverController should optionally fence the +active during failover. (eli) + +HADOOP-7991. HA: the FailoverController should check the standby is +ready before failing over. (eli) + +HADOOP-8038. Add 'ipc.client.connect.max.retries.on.timeouts' entry in +core-default.xml file. (Uma Maheswara Rao G via atm) + +HADOOP-8041. Log a warning when a failover is first attempted (todd) + +HADOOP-8068. void methods can swallow exceptions when going through failover path (todd) + +HADOOP-8116. RetriableCommand is using RetryPolicy incorrectly after HADOOP-7896. 
(atm) diff --git a/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml b/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml index 3624c99871..855b028453 100644 --- a/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml +++ b/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml @@ -278,8 +278,12 @@ - + + + + + diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml index 12d98c6e90..fd18b607a2 100644 --- a/hadoop-common-project/hadoop-common/pom.xml +++ b/hadoop-common-project/hadoop-common/pom.xml @@ -263,6 +263,38 @@ json-simple compile + + com.jcraft + jsch + + + + org.apache.zookeeper + zookeeper + 3.4.2 + + + + junit + junit + + + com.sun.jdmk + jmxtools + + + com.sun.jmx + jmxri + + + + + org.apache.zookeeper + zookeeper + 3.4.2 + test-jar + test + diff --git a/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml b/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml index b8f5f511d3..771ac052b3 100644 --- a/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml +++ b/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml @@ -138,6 +138,12 @@ dfsadmin and mradmin commands to refresh the security policy in-effect. + + security.ha.service.protocol.acl + ACL for HAService protocol used by HAAdmin to manage the + active and stand-by states of namenode. + + diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java index f0ca72b00e..c2a6479dd2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java @@ -114,11 +114,12 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic { public static final String HADOOP_SECURITY_SERVICE_AUTHORIZATION_REFRESH_USER_MAPPINGS = "security.refresh.user.mappings.protocol.acl"; + public static final String + SECURITY_HA_SERVICE_PROTOCOL_ACL = "security.ha.service.protocol.acl"; public static final String HADOOP_SECURITY_TOKEN_SERVICE_USE_IP = "hadoop.security.token.service.use_ip"; public static final boolean HADOOP_SECURITY_TOKEN_SERVICE_USE_IP_DEFAULT = true; - } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java index 401d07ab11..7953411b57 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java @@ -172,6 +172,11 @@ public class CommonConfigurationKeysPublic { /** Default value for IPC_CLIENT_CONNECT_MAX_RETRIES_KEY */ public static final int IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 10; /** See core-default.xml */ + public static final String IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY = + "ipc.client.connect.max.retries.on.timeouts"; + /** Default value for IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY */ + public static final int 
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT = 45; + /** See core-default.xml */ public static final String IPC_CLIENT_TCPNODELAY_KEY = "ipc.client.tcpnodelay"; /** Defalt value for IPC_CLIENT_TCPNODELAY_KEY */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java new file mode 100644 index 0000000000..7da2d3e1bf --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -0,0 +1,593 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.HadoopIllegalArgumentException; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.zookeeper.data.ACL; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.Watcher; +import org.apache.zookeeper.WatchedEvent; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.AsyncCallback.*; +import org.apache.zookeeper.data.Stat; +import org.apache.zookeeper.KeeperException.Code; + +import com.google.common.annotations.VisibleForTesting; + +/** + * + * This class implements a simple library to perform leader election on top of + * Apache Zookeeper. Using Zookeeper as a coordination service, leader election + * can be performed by atomically creating an ephemeral lock file (znode) on + * Zookeeper. The service instance that successfully creates the znode becomes + * active and the rest become standbys.
+ * This election mechanism is only efficient for a small number of election + * candidates (on the order of tens) because contention on a single znode by a + * large number of candidates can result in Zookeeper overload.
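A minimal sketch, not part of the patch, of how a service could drive this elector; the ZooKeeper quorum address, session timeout, znode path, payload and callback bodies are illustrative assumptions, while the classes and methods are the ones added in this file:

import java.util.List;

import org.apache.hadoop.ha.ActiveStandbyElector;
import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.data.ACL;

public class ElectionSketch {

  public static void main(String[] args) throws Exception {
    // Callback that only logs transitions; a real service would start or stop
    // its active-state machinery here and must handle fencing on its own.
    ActiveStandbyElectorCallback cb = new ActiveStandbyElectorCallback() {
      @Override public void becomeActive()     { System.out.println("became active"); }
      @Override public void becomeStandby()    { System.out.println("became standby"); }
      @Override public void enterNeutralMode() { System.out.println("ZK state unknown"); }
      @Override public void notifyFatalError(String msg) { System.err.println(msg); }
    };

    // Hypothetical quorum address, session timeout and parent znode.
    List<ACL> acls = ZooDefs.Ids.OPEN_ACL_UNSAFE;
    ActiveStandbyElector elector = new ActiveStandbyElector(
        "zk1:2181,zk2:2181,zk3:2181", 5000, "/ha-demo/service1", acls, cb);

    // The payload must be non-null; peers can read the winner's payload
    // through getActiveData().
    elector.joinElection("node1:8020".getBytes("UTF-8"));

    // ... later, drop out of rotation, e.g. before planned maintenance.
    elector.quitElection();
  }
}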
+ * The elector does not guarantee fencing (protection of shared resources) among + * service instances. After it has notified an instance that it has become the + * leader, that instance must ensure that it meets the service consistency + * requirements. If it cannot do so, it is recommended to quit the + * election. The application implements the {@link ActiveStandbyElectorCallback} + * to interact with the elector. + */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class ActiveStandbyElector implements Watcher, StringCallback, + StatCallback { + + /** + * Callback interface to interact with the ActiveStandbyElector object.
+ * The application will be notified with a callback only on state changes + * (i.e. there will never be successive calls to becomeActive without an + * intermediate call to enterNeutralMode).
+ * The callbacks will be running on Zookeeper client library threads. The + * application should return from these callbacks quickly so as not to impede + * Zookeeper client library performance and notifications. The app will + * typically remember the state change and return from the callback. It will + * then proceed with implementing actions around that state change. It is + * possible to be called back again while these actions are in flight and the + * app should handle this scenario. + */ + public interface ActiveStandbyElectorCallback { + /** + * This method is called when the app becomes the active leader + */ + void becomeActive(); + + /** + * This method is called when the app becomes a standby + */ + void becomeStandby(); + + /** + * If the elector gets disconnected from Zookeeper and does not know about + * the lock state, then it will notify the service via the enterNeutralMode + * interface. The service may choose to ignore this or stop doing state + * changing operations. Upon reconnection, the elector verifies the leader + * status and calls back on the becomeActive and becomeStandby app + * interfaces.
+ * Zookeeper disconnects can happen due to network issues or loss of + * Zookeeper quorum. Thus enterNeutralMode can be used to guard against + * split-brain issues. In such situations it might be prudent to call + * becomeStandby too. However, such state change operations might be + * expensive and enterNeutralMode can help guard against doing that for + * transient issues. + */ + void enterNeutralMode(); + + /** + * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper + * errors or Zookeeper persistent unavailability) then notifyFatalError is + * called to notify the app about it. + */ + void notifyFatalError(String errorMessage); + } + + /** + * Name of the lock znode used by the library. Protected for access in test + * classes + */ + @VisibleForTesting + protected static final String LOCKFILENAME = "ActiveStandbyElectorLock"; + + public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class); + + private static final int NUM_RETRIES = 3; + + private enum ConnectionState { + DISCONNECTED, CONNECTED, TERMINATED + }; + + private enum State { + INIT, ACTIVE, STANDBY, NEUTRAL + }; + + private State state = State.INIT; + private int createRetryCount = 0; + private int statRetryCount = 0; + private ZooKeeper zkClient; + private ConnectionState zkConnectionState = ConnectionState.TERMINATED; + + private final ActiveStandbyElectorCallback appClient; + private final String zkHostPort; + private final int zkSessionTimeout; + private final List zkAcl; + private byte[] appData; + private final String zkLockFilePath; + private final String znodeWorkingDir; + + /** + * Create a new ActiveStandbyElector object
+ * The elector is created by providing it the Zookeeper configuration, the + * parent znode under which to create the lock znode, and a reference to the + * callback interface.
+ * The parent znode name must be the same for all service instances and + * different across services.
+ * After the leader has been lost, a new leader will be elected after the + * session timeout expires. Hence, the app must set this parameter based on + * its needs for failure response time. The session timeout must be greater + * than the Zookeeper disconnect timeout and is recommended to be 3X that + * value to enable Zookeeper to retry transient disconnections. Setting a very + * short session timeout may result in frequent transitions between active and + * standby states during issues like network outages/GS pauses. + * + * @param zookeeperHostPorts + * ZooKeeper hostPort for all ZooKeeper servers + * @param zookeeperSessionTimeout + * ZooKeeper session timeout + * @param parentZnodeName + * znode under which to create the lock + * @param acl + * ZooKeeper ACL's + * @param app + * reference to callback interface object + * @throws IOException + * @throws HadoopIllegalArgumentException + */ + public ActiveStandbyElector(String zookeeperHostPorts, + int zookeeperSessionTimeout, String parentZnodeName, List acl, + ActiveStandbyElectorCallback app) throws IOException, + HadoopIllegalArgumentException { + if (app == null || acl == null || parentZnodeName == null + || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { + throw new HadoopIllegalArgumentException("Invalid argument"); + } + zkHostPort = zookeeperHostPorts; + zkSessionTimeout = zookeeperSessionTimeout; + zkAcl = acl; + appClient = app; + znodeWorkingDir = parentZnodeName; + zkLockFilePath = znodeWorkingDir + "/" + LOCKFILENAME; + + // createConnection for future API calls + createConnection(); + } + + /** + * To participate in election, the app will call joinElection. The result will + * be notified by a callback on either the becomeActive or becomeStandby app + * interfaces.
+ * After this, the elector will automatically monitor the leader status and + * perform re-election if necessary.
+ * The app could potentially start off in standby mode and ignore the + * becomeStandby call. + * + * @param data + * to be set by the app. non-null data must be set. + * @throws HadoopIllegalArgumentException + * if valid data is not supplied + */ + public synchronized void joinElection(byte[] data) + throws HadoopIllegalArgumentException { + LOG.debug("Attempting active election"); + + if (data == null) { + throw new HadoopIllegalArgumentException("data cannot be null"); + } + + appData = new byte[data.length]; + System.arraycopy(data, 0, appData, 0, data.length); + + joinElectionInternal(); + } + + /** + * Any service instance can drop out of the election by calling quitElection. + *
+ * This will relinquish any leader status, if held, and stop monitoring of the + * lock node.
+ * If the instance wants to participate in the election again, it needs to + * call joinElection().
+ * This allows service instances to take themselves out of rotation for known + * impending unavailable states (e.g. long GC pause or software upgrade). + */ + public synchronized void quitElection() { + LOG.debug("Yielding from election"); + reset(); + } + + /** + * Exception thrown when there is no active leader + */ + public static class ActiveNotFoundException extends Exception { + private static final long serialVersionUID = 3505396722342846462L; + } + + /** + * get data set by the active leader + * + * @return data set by the active instance + * @throws ActiveNotFoundException + * when there is no active leader + * @throws KeeperException + * other zookeeper operation errors + * @throws InterruptedException + * @throws IOException + * when ZooKeeper connection could not be established + */ + public synchronized byte[] getActiveData() throws ActiveNotFoundException, + KeeperException, InterruptedException, IOException { + try { + if (zkClient == null) { + createConnection(); + } + Stat stat = new Stat(); + return zkClient.getData(zkLockFilePath, false, stat); + } catch(KeeperException e) { + Code code = e.code(); + if (operationNodeDoesNotExist(code)) { + // handle the commonly expected cases that make sense for us + throw new ActiveNotFoundException(); + } else { + throw e; + } + } + } + + /** + * interface implementation of Zookeeper callback for create + */ + @Override + public synchronized void processResult(int rc, String path, Object ctx, + String name) { + LOG.debug("CreateNode result: " + rc + " for path: " + path + + " connectionState: " + zkConnectionState); + if (zkClient == null) { + // zkClient is nulled before closing the connection + // this is the callback with session expired after we closed the session + return; + } + + Code code = Code.get(rc); + if (operationSuccess(code)) { + // we successfully created the znode. we are the leader. start monitoring + becomeActive(); + monitorActiveStatus(); + return; + } + + if (operationNodeExists(code)) { + if (createRetryCount == 0) { + // znode exists and we did not retry the operation. so a different + // instance has created it. become standby and monitor lock. + becomeStandby(); + } + // if we had retried then the znode could have been created by our first + // attempt to the server (that we lost) and this node exists response is + // for the second attempt. verify this case via ephemeral node owner. this + // will happen on the callback for monitoring the lock. + monitorActiveStatus(); + return; + } + + String errorMessage = "Received create error from Zookeeper. code:" + + code.toString(); + LOG.debug(errorMessage); + + if (operationRetry(code)) { + if (createRetryCount < NUM_RETRIES) { + LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); + ++createRetryCount; + createNode(); + return; + } + errorMessage = errorMessage + + ". 
Not retrying further znode create connection errors."; + } + + fatalError(errorMessage); + } + + /** + * interface implementation of Zookeeper callback for monitor (exists) + */ + @Override + public synchronized void processResult(int rc, String path, Object ctx, + Stat stat) { + LOG.debug("StatNode result: " + rc + " for path: " + path + + " connectionState: " + zkConnectionState); + if (zkClient == null) { + // zkClient is nulled before closing the connection + // this is the callback with session expired after we closed the session + return; + } + + Code code = Code.get(rc); + if (operationSuccess(code)) { + // the following owner check completes verification in case the lock znode + // creation was retried + if (stat.getEphemeralOwner() == zkClient.getSessionId()) { + // we own the lock znode. so we are the leader + becomeActive(); + } else { + // we dont own the lock znode. so we are a standby. + becomeStandby(); + } + // the watch set by us will notify about changes + return; + } + + if (operationNodeDoesNotExist(code)) { + // the lock znode disappeared before we started monitoring it + enterNeutralMode(); + joinElectionInternal(); + return; + } + + String errorMessage = "Received stat error from Zookeeper. code:" + + code.toString(); + LOG.debug(errorMessage); + + if (operationRetry(code)) { + if (statRetryCount < NUM_RETRIES) { + ++statRetryCount; + monitorNode(); + return; + } + errorMessage = errorMessage + + ". Not retrying further znode monitoring connection errors."; + } + + fatalError(errorMessage); + } + + /** + * interface implementation of Zookeeper watch events (connection and node) + */ + @Override + public synchronized void process(WatchedEvent event) { + Event.EventType eventType = event.getType(); + LOG.debug("Watcher event type: " + eventType + " with state:" + + event.getState() + " for path:" + event.getPath() + + " connectionState: " + zkConnectionState); + if (zkClient == null) { + // zkClient is nulled before closing the connection + // this is the callback with session expired after we closed the session + return; + } + + if (eventType == Event.EventType.None) { + // the connection state has changed + switch (event.getState()) { + case SyncConnected: + // if the listener was asked to move to safe state then it needs to + // be undone + ConnectionState prevConnectionState = zkConnectionState; + zkConnectionState = ConnectionState.CONNECTED; + if (prevConnectionState == ConnectionState.DISCONNECTED) { + monitorActiveStatus(); + } + break; + case Disconnected: + // ask the app to move to safe state because zookeeper connection + // is not active and we dont know our state + zkConnectionState = ConnectionState.DISCONNECTED; + enterNeutralMode(); + break; + case Expired: + // the connection got terminated because of session timeout + // call listener to reconnect + enterNeutralMode(); + reJoinElection(); + break; + default: + fatalError("Unexpected Zookeeper watch event state: " + + event.getState()); + break; + } + + return; + } + + // a watch on lock path in zookeeper has fired. so something has changed on + // the lock. 
ideally we should check that the path is the same as the lock + // path but trusting zookeeper for now + String path = event.getPath(); + if (path != null) { + switch (eventType) { + case NodeDeleted: + if (state == State.ACTIVE) { + enterNeutralMode(); + } + joinElectionInternal(); + break; + case NodeDataChanged: + monitorActiveStatus(); + break; + default: + LOG.debug("Unexpected node event: " + eventType + " for path: " + path); + monitorActiveStatus(); + } + + return; + } + + // some unexpected error has occurred + fatalError("Unexpected watch error from Zookeeper"); + } + + /** + * Get a new zookeeper client instance. protected so that test class can + * inherit and pass in a mock object for zookeeper + * + * @return new zookeeper client instance + * @throws IOException + */ + protected synchronized ZooKeeper getNewZooKeeper() throws IOException { + return new ZooKeeper(zkHostPort, zkSessionTimeout, this); + } + + private void fatalError(String errorMessage) { + reset(); + appClient.notifyFatalError(errorMessage); + } + + private void monitorActiveStatus() { + LOG.debug("Monitoring active leader"); + statRetryCount = 0; + monitorNode(); + } + + private void joinElectionInternal() { + if (zkClient == null) { + if (!reEstablishSession()) { + fatalError("Failed to reEstablish connection with ZooKeeper"); + return; + } + } + + createRetryCount = 0; + createNode(); + } + + private void reJoinElection() { + LOG.debug("Trying to re-establish ZK session"); + terminateConnection(); + joinElectionInternal(); + } + + private boolean reEstablishSession() { + int connectionRetryCount = 0; + boolean success = false; + while(!success && connectionRetryCount < NUM_RETRIES) { + LOG.debug("Establishing zookeeper connection"); + try { + createConnection(); + success = true; + } catch(IOException e) { + LOG.warn(e); + try { + Thread.sleep(5000); + } catch(InterruptedException e1) { + LOG.warn(e1); + } + } + ++connectionRetryCount; + } + return success; + } + + private void createConnection() throws IOException { + zkClient = getNewZooKeeper(); + } + + private void terminateConnection() { + if (zkClient == null) { + return; + } + LOG.debug("Terminating ZK connection"); + ZooKeeper tempZk = zkClient; + zkClient = null; + try { + tempZk.close(); + } catch(InterruptedException e) { + LOG.warn(e); + } + zkConnectionState = ConnectionState.TERMINATED; + } + + private void reset() { + state = State.INIT; + terminateConnection(); + } + + private void becomeActive() { + if (state != State.ACTIVE) { + LOG.debug("Becoming active"); + state = State.ACTIVE; + appClient.becomeActive(); + } + } + + private void becomeStandby() { + if (state != State.STANDBY) { + LOG.debug("Becoming standby"); + state = State.STANDBY; + appClient.becomeStandby(); + } + } + + private void enterNeutralMode() { + if (state != State.NEUTRAL) { + LOG.debug("Entering neutral mode"); + state = State.NEUTRAL; + appClient.enterNeutralMode(); + } + } + + private void createNode() { + zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, this, + null); + } + + private void monitorNode() { + zkClient.exists(zkLockFilePath, true, this, null); + } + + private boolean operationSuccess(Code code) { + return (code == Code.OK); + } + + private boolean operationNodeExists(Code code) { + return (code == Code.NODEEXISTS); + } + + private boolean operationNodeDoesNotExist(Code code) { + return (code == Code.NONODE); + } + + private boolean operationRetry(Code code) { + switch (code) { + case CONNECTIONLOSS: + case OPERATIONTIMEOUT: + return 
true; + } + return false; + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java new file mode 100644 index 0000000000..3d3b1ba53c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; + +/** + * Indicates that the operator has specified an invalid configuration + * for fencing methods. + */ +class BadFencingConfigurationException extends IOException { + private static final long serialVersionUID = 1L; + + public BadFencingConfigurationException(String msg) { + super(msg); + } + + public BadFencingConfigurationException(String msg, Throwable cause) { + super(msg, cause); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java new file mode 100644 index 0000000000..0960fb7cbd --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java @@ -0,0 +1,184 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.net.InetSocketAddress; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; + +import com.google.common.base.Preconditions; + +/** + * The FailOverController is responsible for electing an active service + * on startup or when the current active is changing (eg due to failure), + * monitoring the health of a service, and performing a fail-over when a + * new active service is either manually selected by a user or elected. + */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class FailoverController { + + private static final Log LOG = LogFactory.getLog(FailoverController.class); + + /** + * Perform pre-failover checks on the given service we plan to + * failover to, eg to prevent failing over to a service (eg due + * to it being inaccessible, already active, not healthy, etc). + * + * An option to ignore toSvc if it claims it is not ready to + * become active is provided in case performing a failover will + * allow it to become active, eg because it triggers a log roll + * so the standby can learn about new blocks and leave safemode. + * + * @param toSvc service to make active + * @param toSvcName name of service to make active + * @param forceActive ignore toSvc if it reports that it is not ready + * @throws FailoverFailedException if we should avoid failover + */ + private static void preFailoverChecks(HAServiceProtocol toSvc, + InetSocketAddress toSvcAddr, + boolean forceActive) + throws FailoverFailedException { + HAServiceState toSvcState; + + try { + toSvcState = toSvc.getServiceState(); + } catch (IOException e) { + String msg = "Unable to get service state for " + toSvcAddr; + LOG.error(msg, e); + throw new FailoverFailedException(msg, e); + } + + if (!toSvcState.equals(HAServiceState.STANDBY)) { + throw new FailoverFailedException( + "Can't failover to an active service"); + } + + try { + HAServiceProtocolHelper.monitorHealth(toSvc); + } catch (HealthCheckFailedException hce) { + throw new FailoverFailedException( + "Can't failover to an unhealthy service", hce); + } catch (IOException e) { + throw new FailoverFailedException( + "Got an IO exception", e); + } + + try { + if (!toSvc.readyToBecomeActive()) { + if (!forceActive) { + throw new FailoverFailedException( + toSvcAddr + " is not ready to become active"); + } + } + } catch (IOException e) { + throw new FailoverFailedException( + "Got an IO exception", e); + } + } + + /** + * Failover from service 1 to service 2. If the failover fails + * then try to failback. 
+ * + * @param fromSvc currently active service + * @param fromSvcAddr addr of the currently active service + * @param toSvc service to make active + * @param toSvcAddr addr of the service to make active + * @param fencer for fencing fromSvc + * @param forceFence to fence fromSvc even if not strictly necessary + * @param forceActive try to make toSvc active even if it is not ready + * @throws FailoverFailedException if the failover fails + */ + public static void failover(HAServiceProtocol fromSvc, + InetSocketAddress fromSvcAddr, + HAServiceProtocol toSvc, + InetSocketAddress toSvcAddr, + NodeFencer fencer, + boolean forceFence, + boolean forceActive) + throws FailoverFailedException { + Preconditions.checkArgument(fencer != null, "failover requires a fencer"); + preFailoverChecks(toSvc, toSvcAddr, forceActive); + + // Try to make fromSvc standby + boolean tryFence = true; + try { + HAServiceProtocolHelper.transitionToStandby(fromSvc); + // We should try to fence if we failed or it was forced + tryFence = forceFence ? true : false; + } catch (ServiceFailedException sfe) { + LOG.warn("Unable to make " + fromSvcAddr + " standby (" + + sfe.getMessage() + ")"); + } catch (IOException ioe) { + LOG.warn("Unable to make " + fromSvcAddr + + " standby (unable to connect)", ioe); + } + + // Fence fromSvc if it's required or forced by the user + if (tryFence) { + if (!fencer.fence(fromSvcAddr)) { + throw new FailoverFailedException("Unable to fence " + + fromSvcAddr + ". Fencing failed."); + } + } + + // Try to make toSvc active + boolean failed = false; + Throwable cause = null; + try { + HAServiceProtocolHelper.transitionToActive(toSvc); + } catch (ServiceFailedException sfe) { + LOG.error("Unable to make " + toSvcAddr + " active (" + + sfe.getMessage() + "). Failing back."); + failed = true; + cause = sfe; + } catch (IOException ioe) { + LOG.error("Unable to make " + toSvcAddr + + " active (unable to connect). Failing back.", ioe); + failed = true; + cause = ioe; + } + + // We failed to make toSvc active + if (failed) { + String msg = "Unable to failover to " + toSvcAddr; + // Only try to failback if we didn't fence fromSvc + if (!tryFence) { + try { + // Unconditionally fence toSvc in case it is still trying to + // become active, eg we timed out waiting for its response. + // Unconditionally force fromSvc to become active since it + // was previously active when we initiated failover. + failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true); + } catch (FailoverFailedException ffe) { + msg += ". Failback to " + fromSvcAddr + + " failed (" + ffe.getMessage() + ")"; + LOG.fatal(msg); + } + } + throw new FailoverFailedException(msg, cause); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java new file mode 100644 index 0000000000..09982b4f7e --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Exception thrown to indicate service failover has failed. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class FailoverFailedException extends Exception { + private static final long serialVersionUID = 1L; + + public FailoverFailedException(final String message) { + super(message); + } + + public FailoverFailedException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java new file mode 100644 index 0000000000..d8bda1402f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.net.InetSocketAddress; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configurable; + +/** + * A fencing method is a method by which one node can forcibly prevent + * another node from making continued progress. This might be implemented + * by killing a process on the other node, by denying the other node's + * access to shared storage, or by accessing a PDU to cut the other node's + * power. + *

+ * Since these methods are often vendor- or device-specific, operators + * may implement this interface in order to achieve fencing. + *
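A sketch of that extension point, not part of the patch: the PDU idea, class name and argument format below are invented, while FenceMethod, Configured and BadFencingConfigurationException are the types introduced here (the class sits in org.apache.hadoop.ha because the exception added by this patch is package-private):

package org.apache.hadoop.ha;

import java.net.InetSocketAddress;

import org.apache.hadoop.conf.Configured;

/**
 * Hypothetical fencing method that would ask a power distribution unit
 * to cut power to the old active node. Extending Configured lets
 * NodeFencer hand the class a Configuration via setConf().
 */
public class PduFenceMethod extends Configured implements FenceMethod {

  @Override
  public void checkArgs(String args) throws BadFencingConfigurationException {
    // Validate the operator-supplied argument string at startup,
    // e.g. require "pdu-host:outlet".
    if (args == null || args.split(":").length != 2) {
      throw new BadFencingConfigurationException(
          "expected <pdu-host>:<outlet> but got: " + args);
    }
  }

  @Override
  public boolean tryFence(InetSocketAddress serviceAddr, String args) {
    // Contact the PDU named in args and cut power to the node running
    // serviceAddr; return true only if the node is known to be off.
    return false; // placeholder: indeterminate in this sketch
  }
}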

+ * Fencing is configured by the operator as an ordered list of methods to + * attempt. Each method will be tried in turn, and the next in the list + * will only be attempted if the previous one fails. See {@link NodeFencer} + * for more information. + *

+ * If an implementation also implements {@link Configurable} then its + * setConf method will be called upon instantiation. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface FenceMethod { + /** + * Verify that the given fencing method's arguments are valid. + * @param args the arguments provided in the configuration. This may + * be null if the operator did not configure any arguments. + * @throws BadFencingConfigurationException if the arguments are invalid + */ + public void checkArgs(String args) throws BadFencingConfigurationException; + + /** + * Attempt to fence the target node. + * @param serviceAddr the address (host:ipcport) of the service to fence + * @param args the configured arguments, which were checked at startup by + * {@link #checkArgs(String)} + * @return true if fencing was successful, false if unsuccessful or + * indeterminate + * @throws BadFencingConfigurationException if the configuration was + * determined to be invalid only at runtime + */ + public boolean tryFence(InetSocketAddress serviceAddr, String args) + throws BadFencingConfigurationException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java new file mode 100644 index 0000000000..3350692d68 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java @@ -0,0 +1,321 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.io.PrintStream; +import java.net.InetSocketAddress; +import java.util.Map; + +import org.apache.commons.cli.Options; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.ParseException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import com.google.common.collect.ImmutableMap; + +/** + * A command-line tool for making calls in the HAServiceProtocol. + * For example,. this can be used to force a service to standby or active + * mode, or to trigger a health-check. 
+ */ +@InterfaceAudience.Private + +public abstract class HAAdmin extends Configured implements Tool { + + private static final String FORCEFENCE = "forcefence"; + private static final String FORCEACTIVE = "forceactive"; + + private static Map USAGE = + ImmutableMap.builder() + .put("-transitionToActive", + new UsageInfo("", "Transitions the service into Active state")) + .put("-transitionToStandby", + new UsageInfo("", "Transitions the service into Standby state")) + .put("-failover", + new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] ", + "Failover from the first service to the second.\n" + + "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" + + "Try to failover to the target service even if it is not ready if the " + + FORCEACTIVE + " option is used.")) + .put("-getServiceState", + new UsageInfo("", "Returns the state of the service")) + .put("-checkHealth", + new UsageInfo("", + "Requests that the service perform a health check.\n" + + "The HAAdmin tool will exit with a non-zero exit code\n" + + "if the check fails.")) + .put("-help", + new UsageInfo("", "Displays help on the specified command")) + .build(); + + /** Output stream for errors, for use in tests */ + protected PrintStream errOut = System.err; + PrintStream out = System.out; + + protected String getUsageString() { + return "Usage: HAAdmin"; + } + + protected void printUsage(PrintStream errOut) { + errOut.println(getUsageString()); + for (Map.Entry e : USAGE.entrySet()) { + String cmd = e.getKey(); + UsageInfo usage = e.getValue(); + + errOut.println(" [" + cmd + " " + usage.args + "]"); + } + errOut.println(); + ToolRunner.printGenericCommandUsage(errOut); + } + + private static void printUsage(PrintStream errOut, String cmd) { + UsageInfo usage = USAGE.get(cmd); + if (usage == null) { + throw new RuntimeException("No usage for cmd " + cmd); + } + errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]"); + } + + private int transitionToActive(final String[] argv) + throws IOException, ServiceFailedException { + if (argv.length != 2) { + errOut.println("transitionToActive: incorrect number of arguments"); + printUsage(errOut, "-transitionToActive"); + return -1; + } + + HAServiceProtocol proto = getProtocol(argv[1]); + HAServiceProtocolHelper.transitionToActive(proto); + return 0; + } + + private int transitionToStandby(final String[] argv) + throws IOException, ServiceFailedException { + if (argv.length != 2) { + errOut.println("transitionToStandby: incorrect number of arguments"); + printUsage(errOut, "-transitionToStandby"); + return -1; + } + + HAServiceProtocol proto = getProtocol(argv[1]); + HAServiceProtocolHelper.transitionToStandby(proto); + return 0; + } + + private int failover(final String[] argv) + throws IOException, ServiceFailedException { + Configuration conf = getConf(); + boolean forceFence = false; + boolean forceActive = false; + + Options failoverOpts = new Options(); + // "-failover" isn't really an option but we need to add + // it to appease CommandLineParser + failoverOpts.addOption("failover", false, "failover"); + failoverOpts.addOption(FORCEFENCE, false, "force fencing"); + failoverOpts.addOption(FORCEACTIVE, false, "force failover"); + + CommandLineParser parser = new GnuParser(); + CommandLine cmd; + + try { + cmd = parser.parse(failoverOpts, argv); + forceFence = cmd.hasOption(FORCEFENCE); + forceActive = cmd.hasOption(FORCEACTIVE); + } catch (ParseException pe) { + errOut.println("failover: incorrect arguments"); + printUsage(errOut, "-failover"); + return 
-1; + } + + int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length; + final String[] args = cmd.getArgs(); + + if (numOpts > 2 || args.length != 2) { + errOut.println("failover: incorrect arguments"); + printUsage(errOut, "-failover"); + return -1; + } + + NodeFencer fencer; + try { + fencer = NodeFencer.create(conf); + } catch (BadFencingConfigurationException bfce) { + errOut.println("failover: incorrect fencing configuration: " + + bfce.getLocalizedMessage()); + return -1; + } + if (fencer == null) { + errOut.println("failover: no fencer configured"); + return -1; + } + + InetSocketAddress addr1 = + NetUtils.createSocketAddr(getServiceAddr(args[0])); + InetSocketAddress addr2 = + NetUtils.createSocketAddr(getServiceAddr(args[1])); + HAServiceProtocol proto1 = getProtocol(args[0]); + HAServiceProtocol proto2 = getProtocol(args[1]); + + try { + FailoverController.failover(proto1, addr1, proto2, addr2, + fencer, forceFence, forceActive); + out.println("Failover from "+args[0]+" to "+args[1]+" successful"); + } catch (FailoverFailedException ffe) { + errOut.println("Failover failed: " + ffe.getLocalizedMessage()); + return -1; + } + return 0; + } + + private int checkHealth(final String[] argv) + throws IOException, ServiceFailedException { + if (argv.length != 2) { + errOut.println("checkHealth: incorrect number of arguments"); + printUsage(errOut, "-checkHealth"); + return -1; + } + + HAServiceProtocol proto = getProtocol(argv[1]); + try { + HAServiceProtocolHelper.monitorHealth(proto); + } catch (HealthCheckFailedException e) { + errOut.println("Health check failed: " + e.getLocalizedMessage()); + return -1; + } + return 0; + } + + private int getServiceState(final String[] argv) + throws IOException, ServiceFailedException { + if (argv.length != 2) { + errOut.println("getServiceState: incorrect number of arguments"); + printUsage(errOut, "-getServiceState"); + return -1; + } + + HAServiceProtocol proto = getProtocol(argv[1]); + out.println(proto.getServiceState()); + return 0; + } + + /** + * Return the serviceId as is, we are assuming it was + * given as a service address of form . + */ + protected String getServiceAddr(String serviceId) { + return serviceId; + } + + /** + * Return a proxy to the specified target service. 
+ */ + protected HAServiceProtocol getProtocol(String serviceId) + throws IOException { + String serviceAddr = getServiceAddr(serviceId); + InetSocketAddress addr = NetUtils.createSocketAddr(serviceAddr); + return new HAServiceProtocolClientSideTranslatorPB(addr, getConf()); + } + + @Override + public int run(String[] argv) throws Exception { + try { + return runCmd(argv); + } catch (IllegalArgumentException iae) { + errOut.println("Illegal argument: " + iae.getLocalizedMessage()); + return -1; + } catch (IOException ioe) { + errOut.println("Operation failed: " + ioe.getLocalizedMessage()); + return -1; + } + } + + protected int runCmd(String[] argv) throws Exception { + if (argv.length < 1) { + printUsage(errOut); + return -1; + } + + String cmd = argv[0]; + + if (!cmd.startsWith("-")) { + errOut.println("Bad command '" + cmd + "': expected command starting with '-'"); + printUsage(errOut); + return -1; + } + + if ("-transitionToActive".equals(cmd)) { + return transitionToActive(argv); + } else if ("-transitionToStandby".equals(cmd)) { + return transitionToStandby(argv); + } else if ("-failover".equals(cmd)) { + return failover(argv); + } else if ("-getServiceState".equals(cmd)) { + return getServiceState(argv); + } else if ("-checkHealth".equals(cmd)) { + return checkHealth(argv); + } else if ("-help".equals(cmd)) { + return help(argv); + } else { + errOut.println(cmd.substring(1) + ": Unknown command"); + printUsage(errOut); + return -1; + } + } + + private int help(String[] argv) { + if (argv.length != 2) { + printUsage(errOut, "-help"); + return -1; + } + String cmd = argv[1]; + if (!cmd.startsWith("-")) { + cmd = "-" + cmd; + } + UsageInfo usageInfo = USAGE.get(cmd); + if (usageInfo == null) { + errOut.println(cmd + ": Unknown command"); + printUsage(errOut); + return -1; + } + + errOut.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help); + return 0; + } + + private static class UsageInfo { + private final String args; + private final String help; + + public UsageInfo(String args, String help) { + this.args = args; + this.help = help; + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java new file mode 100644 index 0000000000..18b10f99c6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.KerberosInfo; + +import java.io.IOException; + +/** + * Protocol interface that provides High Availability related primitives to + * monitor and fail-over the service. + * + * This interface could be used by HA frameworks to manage the service. + */ +@KerberosInfo( + serverPrincipal=CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY) +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface HAServiceProtocol { + /** + * Initial version of the protocol + */ + public static final long versionID = 1L; + + /** + * An HA service may be in active or standby state. During + * startup, it is in an unknown INITIALIZING state. + */ + public enum HAServiceState { + INITIALIZING("initializing"), + ACTIVE("active"), + STANDBY("standby"); + + private String name; + + HAServiceState(String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + + /** + * Monitor the health of service. This periodically called by the HA + * frameworks to monitor the health of the service. + * + * Service is expected to perform checks to ensure it is functional. + * If the service is not healthy due to failure or partial failure, + * it is expected to throw {@link HealthCheckFailedException}. + * The definition of service not healthy is left to the service. + * + * Note that when health check of an Active service fails, + * failover to standby may be done. + * + * @throws HealthCheckFailedException + * if the health check of a service fails. + * @throws AccessControlException + * if access is denied. + * @throws IOException + * if other errors happen + */ + public void monitorHealth() throws HealthCheckFailedException, + AccessControlException, + IOException; + + /** + * Request service to transition to active state. No operation, if the + * service is already in active state. + * + * @throws ServiceFailedException + * if transition from standby to active fails. + * @throws AccessControlException + * if access is denied. + * @throws IOException + * if other errors happen + */ + public void transitionToActive() throws ServiceFailedException, + AccessControlException, + IOException; + + /** + * Request service to transition to standby state. No operation, if the + * service is already in standby state. + * + * @throws ServiceFailedException + * if transition from active to standby fails. + * @throws AccessControlException + * if access is denied. + * @throws IOException + * if other errors happen + */ + public void transitionToStandby() throws ServiceFailedException, + AccessControlException, + IOException; + + /** + * Return the current state of the service. + * + * @throws AccessControlException + * if access is denied. + * @throws IOException + * if other errors happen + */ + public HAServiceState getServiceState() throws AccessControlException, + IOException; + + /** + * Return true if the service is capable and ready to transition + * from the standby state to the active state. + * + * @return true if the service is ready to become active, false otherwise. + * @throws AccessControlException + * if access is denied. 
+ * @throws IOException + * if other errors happen + */ + public boolean readyToBecomeActive() throws ServiceFailedException, + AccessControlException, + IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java new file mode 100644 index 0000000000..b8ee717951 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.ipc.RemoteException; + +/** + * Helper for making {@link HAServiceProtocol} RPC calls. This helper + * unwraps the {@link RemoteException} to specific exceptions. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class HAServiceProtocolHelper { + public static void monitorHealth(HAServiceProtocol svc) + throws IOException { + try { + svc.monitorHealth(); + } catch (RemoteException e) { + throw e.unwrapRemoteException(HealthCheckFailedException.class); + } + } + + public static void transitionToActive(HAServiceProtocol svc) + throws IOException { + try { + svc.transitionToActive(); + } catch (RemoteException e) { + throw e.unwrapRemoteException(ServiceFailedException.class); + } + } + + public static void transitionToStandby(HAServiceProtocol svc) + throws IOException { + try { + svc.transitionToStandby(); + } catch (RemoteException e) { + throw e.unwrapRemoteException(ServiceFailedException.class); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java new file mode 100644 index 0000000000..e636adff3e --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Exception thrown to indicate that health check of a service failed. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class HealthCheckFailedException extends IOException { + private static final long serialVersionUID = 1L; + + public HealthCheckFailedException(final String message) { + super(message); + } + + public HealthCheckFailedException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java new file mode 100644 index 0000000000..34a2c8b823 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java @@ -0,0 +1,195 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.net.InetSocketAddress; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.ReflectionUtils; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; + +/** + * This class parses the configured list of fencing methods, and + * is responsible for trying each one in turn while logging informative + * output.

+ * + * The fencing methods are configured as a carriage-return separated list. + * Each line in the list is of the form:

+ * com.example.foo.MyMethod(arg string) + * or + * com.example.foo.MyMethod + * The class provided must implement the {@link FenceMethod} interface. + * The fencing methods that ship with Hadoop may also be referred to + * by shortened names:

+ * <ul>
+ * <li><code>shell(/path/to/some/script.sh args...)</code>
+ * <li><code>sshfence(...)</code> (see {@link SshFenceByTcpPort})
+ * </ul>
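To make the shortened-name list above concrete, here is a minimal configuration sketch, assuming the usual org.apache.hadoop.conf.Configuration and java.net.InetSocketAddress imports. Only NodeFencer.CONF_METHODS_KEY ("dfs.ha.fencing.methods"), NodeFencer.create(Configuration) and NodeFencer.fence(InetSocketAddress) come from this patch; the user, SSH port, script path and target address are hypothetical.

    // Sketch only: user, SSH port, script path and host are hypothetical.
    // The newline-separated value mirrors the carriage-return separated
    // list format that NodeFencer parses, one fencing method per line.
    Configuration conf = new Configuration();
    conf.set(NodeFencer.CONF_METHODS_KEY,
        "sshfence(hdfs:22)\n" +
        "shell(/path/to/my/fence-script.sh --force)");
    NodeFencer fencer = NodeFencer.create(conf);   // null only when the key is unset
    boolean fenced = fencer.fence(new InetSocketAddress("nn1.example.com", 8020));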
+ */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class NodeFencer { + public static final String CONF_METHODS_KEY = + "dfs.ha.fencing.methods"; + + private static final String CLASS_RE = "([a-zA-Z0-9\\.\\$]+)"; + private static final Pattern CLASS_WITH_ARGUMENT = + Pattern.compile(CLASS_RE + "\\((.+?)\\)"); + private static final Pattern CLASS_WITHOUT_ARGUMENT = + Pattern.compile(CLASS_RE); + private static final Pattern HASH_COMMENT_RE = + Pattern.compile("#.*$"); + + private static final Log LOG = LogFactory.getLog(NodeFencer.class); + + /** + * Standard fencing methods included with Hadoop. + */ + private static final Map> STANDARD_METHODS = + ImmutableMap.>of( + "shell", ShellCommandFencer.class, + "sshfence", SshFenceByTcpPort.class); + + private final List methods; + + public NodeFencer(Configuration conf) + throws BadFencingConfigurationException { + this.methods = parseMethods(conf); + } + + public static NodeFencer create(Configuration conf) + throws BadFencingConfigurationException { + String confStr = conf.get(CONF_METHODS_KEY); + if (confStr == null) { + return null; + } + return new NodeFencer(conf); + } + + public boolean fence(InetSocketAddress serviceAddr) { + LOG.info("====== Beginning Service Fencing Process... ======"); + int i = 0; + for (FenceMethodWithArg method : methods) { + LOG.info("Trying method " + (++i) + "/" + methods.size() +": " + method); + + try { + if (method.method.tryFence(serviceAddr, method.arg)) { + LOG.info("====== Fencing successful by method " + method + " ======"); + return true; + } + } catch (BadFencingConfigurationException e) { + LOG.error("Fencing method " + method + " misconfigured", e); + continue; + } catch (Throwable t) { + LOG.error("Fencing method " + method + " failed with an unexpected error.", t); + continue; + } + LOG.warn("Fencing method " + method + " was unsuccessful."); + } + + LOG.error("Unable to fence service by any configured method."); + return false; + } + + private static List parseMethods(Configuration conf) + throws BadFencingConfigurationException { + String confStr = conf.get(CONF_METHODS_KEY); + String[] lines = confStr.split("\\s*\n\\s*"); + + List methods = Lists.newArrayList(); + for (String line : lines) { + line = HASH_COMMENT_RE.matcher(line).replaceAll(""); + line = line.trim(); + if (!line.isEmpty()) { + methods.add(parseMethod(conf, line)); + } + } + + return methods; + } + + private static FenceMethodWithArg parseMethod(Configuration conf, String line) + throws BadFencingConfigurationException { + Matcher m; + if ((m = CLASS_WITH_ARGUMENT.matcher(line)).matches()) { + String className = m.group(1); + String arg = m.group(2); + return createFenceMethod(conf, className, arg); + } else if ((m = CLASS_WITHOUT_ARGUMENT.matcher(line)).matches()) { + String className = m.group(1); + return createFenceMethod(conf, className, null); + } else { + throw new BadFencingConfigurationException( + "Unable to parse line: '" + line + "'"); + } + } + + private static FenceMethodWithArg createFenceMethod( + Configuration conf, String clazzName, String arg) + throws BadFencingConfigurationException { + + Class clazz; + try { + // See if it's a short name for one of the built-in methods + clazz = STANDARD_METHODS.get(clazzName); + if (clazz == null) { + // Try to instantiate the user's custom method + clazz = Class.forName(clazzName); + } + } catch (Exception e) { + throw new BadFencingConfigurationException( + "Could not find configured fencing method " + clazzName, + e); + } + + // Check that it 
implements the right interface + if (!FenceMethod.class.isAssignableFrom(clazz)) { + throw new BadFencingConfigurationException("Class " + clazzName + + " does not implement FenceMethod"); + } + + FenceMethod method = (FenceMethod)ReflectionUtils.newInstance( + clazz, conf); + method.checkArgs(arg); + return new FenceMethodWithArg(method, arg); + } + + private static class FenceMethodWithArg { + private final FenceMethod method; + private final String arg; + + private FenceMethodWithArg(FenceMethod method, String arg) { + this.method = method; + this.arg = arg; + } + + public String toString() { + return method.getClass().getCanonicalName() + "(" + arg + ")"; + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java new file mode 100644 index 0000000000..6f3e444b39 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + + +/** + * Exception thrown to indicate that an operation performed + * to modify the state of a service or application failed. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class ServiceFailedException extends IOException { + private static final long serialVersionUID = 1L; + + public ServiceFailedException(final String message) { + super(message); + } + + public ServiceFailedException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java new file mode 100644 index 0000000000..ca81f23a18 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java @@ -0,0 +1,187 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.StringUtils; + +import com.google.common.annotations.VisibleForTesting; + +/** + * Fencing method that runs a shell command. It should be specified + * in the fencing configuration like:
+ * + * shell(/path/to/my/script.sh arg1 arg2 ...) + *
+ * The string between '(' and ')' is passed directly to a bash shell and + * may not include any closing parentheses.

+ * + * The shell command will be run with an environment set up to contain + * all of the current Hadoop configuration variables, with the '_' character + * replacing any '.' characters in the configuration keys.

+ * + * If the shell command returns an exit code of 0, the fencing is + * determined to be successful. If it returns any other exit code, the + * fencing was not successful and the next fencing method in the list + * will be attempted.
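A minimal usage sketch of the exit-code contract just described; the script path and target address are hypothetical, and in practice NodeFencer instantiates the fencer via ReflectionUtils with the configuration already injected rather than by direct construction.

    // Sketch only: script path and target address are hypothetical.
    // tryFence() returns true iff the command exits with status 0; the
    // target is handed to the script as its first argument in host:port
    // form, and every config key appears in the environment with '.'
    // replaced by '_' (e.g. dfs_ha_fencing_methods).
    ShellCommandFencer fencer = new ShellCommandFencer();
    fencer.setConf(new Configuration());
    boolean ok = fencer.tryFence(
        new InetSocketAddress("nn1.example.com", 8020),
        "/path/to/my/fence-script.sh --force");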

+ * + * Note: this fencing method does not implement any timeout. + * If timeouts are necessary, they should be implemented in the shell + * script itself (eg by forking a subshell to kill its parent in + * some number of seconds). + */ +public class ShellCommandFencer + extends Configured implements FenceMethod { + + /** Length at which to abbreviate command in long messages */ + private static final int ABBREV_LENGTH = 20; + + @VisibleForTesting + static Log LOG = LogFactory.getLog( + ShellCommandFencer.class); + + @Override + public void checkArgs(String args) throws BadFencingConfigurationException { + if (args == null || args.isEmpty()) { + throw new BadFencingConfigurationException( + "No argument passed to 'shell' fencing method"); + } + // Nothing else we can really check without actually running the command + } + + @Override + public boolean tryFence(InetSocketAddress serviceAddr, String cmd) { + List cmdList = Arrays.asList(cmd.split("\\s+")); + + // Create arg list with service as the first argument + List argList = new ArrayList(); + argList.add(cmdList.get(0)); + argList.add(serviceAddr.getHostName() + ":" + serviceAddr.getPort()); + argList.addAll(cmdList.subList(1, cmdList.size())); + String cmdWithSvc = StringUtils.join(" ", argList); + + ProcessBuilder builder = new ProcessBuilder( + "bash", "-e", "-c", cmdWithSvc); + setConfAsEnvVars(builder.environment()); + + Process p; + try { + p = builder.start(); + p.getOutputStream().close(); + } catch (IOException e) { + LOG.warn("Unable to execute " + cmd, e); + return false; + } + + String pid = tryGetPid(p); + LOG.info("Launched fencing command '" + cmd + "' with " + + ((pid != null) ? ("pid " + pid) : "unknown pid")); + + String logPrefix = abbreviate(cmd, ABBREV_LENGTH); + if (pid != null) { + logPrefix = "[PID " + pid + "] " + logPrefix; + } + + // Pump logs to stderr + StreamPumper errPumper = new StreamPumper( + LOG, logPrefix, p.getErrorStream(), + StreamPumper.StreamType.STDERR); + errPumper.start(); + + StreamPumper outPumper = new StreamPumper( + LOG, logPrefix, p.getInputStream(), + StreamPumper.StreamType.STDOUT); + outPumper.start(); + + int rc; + try { + rc = p.waitFor(); + errPumper.join(); + outPumper.join(); + } catch (InterruptedException ie) { + LOG.warn("Interrupted while waiting for fencing command: " + cmd); + return false; + } + + return rc == 0; + } + + /** + * Abbreviate a string by putting '...' in the middle of it, + * in an attempt to keep logs from getting too messy. + * @param cmd the string to abbreviate + * @param len maximum length to abbreviate to + * @return abbreviated string + */ + static String abbreviate(String cmd, int len) { + if (cmd.length() > len && len >= 5) { + int firstHalf = (len - 3) / 2; + int rem = len - firstHalf - 3; + + return cmd.substring(0, firstHalf) + + "..." + cmd.substring(cmd.length() - rem); + } else { + return cmd; + } + } + + /** + * Attempt to use evil reflection tricks to determine the + * pid of a launched process. This is helpful to ops + * if debugging a fencing process that might have gone + * wrong. If running on a system or JVM where this doesn't + * work, it will simply return null. 
+ */ + private static String tryGetPid(Process p) { + try { + Class clazz = p.getClass(); + if (clazz.getName().equals("java.lang.UNIXProcess")) { + Field f = clazz.getDeclaredField("pid"); + f.setAccessible(true); + return String.valueOf(f.getInt(p)); + } else { + LOG.trace("Unable to determine pid for " + p + + " since it is not a UNIXProcess"); + return null; + } + } catch (Throwable t) { + LOG.trace("Unable to determine pid for " + p, t); + return null; + } + } + + /** + * Set the environment of the subprocess to be the Configuration, + * with '.'s replaced by '_'s. + */ + private void setConfAsEnvVars(Map env) { + for (Map.Entry pair : getConf()) { + env.put(pair.getKey().replace('.', '_'), pair.getValue()); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java new file mode 100644 index 0000000000..cec731cf20 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java @@ -0,0 +1,315 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.Collection; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configured; + +import com.google.common.annotations.VisibleForTesting; +import com.jcraft.jsch.ChannelExec; +import com.jcraft.jsch.JSch; +import com.jcraft.jsch.JSchException; +import com.jcraft.jsch.Session; + +/** + * This fencing implementation sshes to the target node and uses + * fuser to kill the process listening on the service's + * TCP port. This is more accurate than using "jps" since it doesn't + * require parsing, and will work even if there are multiple service + * processes running on the same machine.

+ * It returns a successful status code if: + *

+ * <ul>
+ * <li><code>fuser</code> indicates it successfully killed a process, or
+ * <li><code>nc -z</code> indicates that nothing is listening on the target port
+ * </ul>

+ * This fencing mechanism is configured as follows in the fencing method + * list: + * sshfence([[username][:ssh-port]]) + * where the optional argument specifies the username and port to use + * with ssh. + *
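For illustration, a corresponding configuration sketch; the username, SSH port, key path and timeout are hypothetical, while the property names are the ones defined by this class and by NodeFencer (CONF_METHODS_KEY).

    // Sketch only: username, SSH port, key path and timeout are hypothetical.
    Configuration conf = new Configuration();
    conf.set("dfs.ha.fencing.methods", "sshfence(hdfs:2222)");
    conf.set("dfs.ha.fencing.ssh.private-key-files", "/home/hdfs/.ssh/id_rsa");
    conf.setInt("dfs.ha.fencing.ssh.connect-timeout", 10000);  // milliseconds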

+ * In order to achieve passwordless SSH, the operator must also configure + * dfs.ha.fencing.ssh.private-key-files to point to an + * SSH key that has passphrase-less access to the given username and host. + */ +public class SshFenceByTcpPort extends Configured + implements FenceMethod { + + static final Log LOG = LogFactory.getLog( + SshFenceByTcpPort.class); + + static final String CONF_CONNECT_TIMEOUT_KEY = + "dfs.ha.fencing.ssh.connect-timeout"; + private static final int CONF_CONNECT_TIMEOUT_DEFAULT = + 30*1000; + static final String CONF_IDENTITIES_KEY = + "dfs.ha.fencing.ssh.private-key-files"; + + /** + * Verify that the argument, if given, in the conf is parseable. + */ + @Override + public void checkArgs(String argStr) throws BadFencingConfigurationException { + if (argStr != null) { + // Use a dummy service when checking the arguments defined + // in the configuration are parseable. + new Args(new InetSocketAddress("localhost", 8020), argStr); + } + } + + @Override + public boolean tryFence(InetSocketAddress serviceAddr, String argsStr) + throws BadFencingConfigurationException { + + Args args = new Args(serviceAddr, argsStr); + + Session session; + try { + session = createSession(args); + } catch (JSchException e) { + LOG.warn("Unable to create SSH session", e); + return false; + } + + LOG.info("Connecting to " + args.host + "..."); + + try { + session.connect(getSshConnectTimeout()); + } catch (JSchException e) { + LOG.warn("Unable to connect to " + args.host + + " as user " + args.user, e); + return false; + } + LOG.info("Connected to " + args.host); + + try { + return doFence(session, args.targetPort); + } catch (JSchException e) { + LOG.warn("Unable to achieve fencing on remote host", e); + return false; + } finally { + session.disconnect(); + } + } + + + private Session createSession(Args args) throws JSchException { + JSch jsch = new JSch(); + for (String keyFile : getKeyFiles()) { + jsch.addIdentity(keyFile); + } + JSch.setLogger(new LogAdapter()); + + Session session = jsch.getSession(args.user, args.host, args.sshPort); + session.setConfig("StrictHostKeyChecking", "no"); + return session; + } + + private boolean doFence(Session session, int port) throws JSchException { + try { + LOG.info("Looking for process running on port " + port); + int rc = execCommand(session, + "PATH=$PATH:/sbin:/usr/sbin fuser -v -k -n tcp " + port); + if (rc == 0) { + LOG.info("Successfully killed process that was " + + "listening on port " + port); + // exit code 0 indicates the process was successfully killed. + return true; + } else if (rc == 1) { + // exit code 1 indicates either that the process was not running + // or that fuser didn't have root privileges in order to find it + // (eg running as a different user) + LOG.info( + "Indeterminate response from trying to kill service. 
" + + "Verifying whether it is running using nc..."); + rc = execCommand(session, "nc -z localhost 8020"); + if (rc == 0) { + // the service is still listening - we are unable to fence + LOG.warn("Unable to fence - it is running but we cannot kill it"); + return false; + } else { + LOG.info("Verified that the service is down."); + return true; + } + } else { + // other + } + LOG.info("rc: " + rc); + return rc == 0; + } catch (InterruptedException e) { + LOG.warn("Interrupted while trying to fence via ssh", e); + return false; + } catch (IOException e) { + LOG.warn("Unknown failure while trying to fence via ssh", e); + return false; + } + } + + /** + * Execute a command through the ssh session, pumping its + * stderr and stdout to our own logs. + */ + private int execCommand(Session session, String cmd) + throws JSchException, InterruptedException, IOException { + LOG.debug("Running cmd: " + cmd); + ChannelExec exec = null; + try { + exec = (ChannelExec)session.openChannel("exec"); + exec.setCommand(cmd); + exec.setInputStream(null); + exec.connect(); + + // Pump stdout of the command to our WARN logs + StreamPumper outPumper = new StreamPumper(LOG, cmd + " via ssh", + exec.getInputStream(), StreamPumper.StreamType.STDOUT); + outPumper.start(); + + // Pump stderr of the command to our WARN logs + StreamPumper errPumper = new StreamPumper(LOG, cmd + " via ssh", + exec.getErrStream(), StreamPumper.StreamType.STDERR); + errPumper.start(); + + outPumper.join(); + errPumper.join(); + return exec.getExitStatus(); + } finally { + cleanup(exec); + } + } + + private static void cleanup(ChannelExec exec) { + if (exec != null) { + try { + exec.disconnect(); + } catch (Throwable t) { + LOG.warn("Couldn't disconnect ssh channel", t); + } + } + } + + private int getSshConnectTimeout() { + return getConf().getInt( + CONF_CONNECT_TIMEOUT_KEY, CONF_CONNECT_TIMEOUT_DEFAULT); + } + + private Collection getKeyFiles() { + return getConf().getTrimmedStringCollection(CONF_IDENTITIES_KEY); + } + + /** + * Container for the parsed arg line for this fencing method. 
+ */ + @VisibleForTesting + static class Args { + private static final Pattern USER_PORT_RE = Pattern.compile( + "([^:]+?)?(?:\\:(\\d+))?"); + + private static final int DEFAULT_SSH_PORT = 22; + + String host; + int targetPort; + String user; + int sshPort; + + public Args(InetSocketAddress serviceAddr, String arg) + throws BadFencingConfigurationException { + host = serviceAddr.getHostName(); + targetPort = serviceAddr.getPort(); + user = System.getProperty("user.name"); + sshPort = DEFAULT_SSH_PORT; + + // Parse optional user and ssh port + if (arg != null && !"".equals(arg)) { + Matcher m = USER_PORT_RE.matcher(arg); + if (!m.matches()) { + throw new BadFencingConfigurationException( + "Unable to parse user and SSH port: "+ arg); + } + if (m.group(1) != null) { + user = m.group(1); + } + if (m.group(2) != null) { + sshPort = parseConfiggedPort(m.group(2)); + } + } + } + + private Integer parseConfiggedPort(String portStr) + throws BadFencingConfigurationException { + try { + return Integer.valueOf(portStr); + } catch (NumberFormatException nfe) { + throw new BadFencingConfigurationException( + "Port number '" + portStr + "' invalid"); + } + } + } + + /** + * Adapter from JSch's logger interface to our log4j + */ + private static class LogAdapter implements com.jcraft.jsch.Logger { + static final Log LOG = LogFactory.getLog( + SshFenceByTcpPort.class.getName() + ".jsch"); + + public boolean isEnabled(int level) { + switch (level) { + case com.jcraft.jsch.Logger.DEBUG: + return LOG.isDebugEnabled(); + case com.jcraft.jsch.Logger.INFO: + return LOG.isInfoEnabled(); + case com.jcraft.jsch.Logger.WARN: + return LOG.isWarnEnabled(); + case com.jcraft.jsch.Logger.ERROR: + return LOG.isErrorEnabled(); + case com.jcraft.jsch.Logger.FATAL: + return LOG.isFatalEnabled(); + default: + return false; + } + } + + public void log(int level, String message) { + switch (level) { + case com.jcraft.jsch.Logger.DEBUG: + LOG.debug(message); + break; + case com.jcraft.jsch.Logger.INFO: + LOG.info(message); + break; + case com.jcraft.jsch.Logger.WARN: + LOG.warn(message); + break; + case com.jcraft.jsch.Logger.ERROR: + LOG.error(message); + break; + case com.jcraft.jsch.Logger.FATAL: + LOG.fatal(message); + break; + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java new file mode 100644 index 0000000000..8bc16af2af --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.apache.commons.logging.Log; + +/** + * Class responsible for pumping the streams of the subprocess + * out to log4j. stderr is pumped to WARN level and stdout is + * pumped to INFO level + */ +class StreamPumper { + enum StreamType { + STDOUT, STDERR; + } + + private final Log log; + + final Thread thread; + final String logPrefix; + final StreamPumper.StreamType type; + private final InputStream stream; + private boolean started = false; + + StreamPumper(final Log log, final String logPrefix, + final InputStream stream, final StreamType type) { + this.log = log; + this.logPrefix = logPrefix; + this.stream = stream; + this.type = type; + + thread = new Thread(new Runnable() { + @Override + public void run() { + try { + pump(); + } catch (Throwable t) { + ShellCommandFencer.LOG.warn(logPrefix + + ": Unable to pump output from " + type, + t); + } + } + }, logPrefix + ": StreamPumper for " + type); + thread.setDaemon(true); + } + + void join() throws InterruptedException { + assert started; + thread.join(); + } + + void start() { + assert !started; + thread.start(); + started = true; + } + + protected void pump() throws IOException { + InputStreamReader inputStreamReader = new InputStreamReader(stream); + BufferedReader br = new BufferedReader(inputStreamReader); + String line = null; + while ((line = br.readLine()) != null) { + if (type == StreamType.STDOUT) { + log.info(logPrefix + ": " + line); + } else { + log.warn(logPrefix + ": " + line); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java new file mode 100644 index 0000000000..3bf4f6f013 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha.protocolPB; + +import java.io.Closeable; +import java.io.IOException; +import java.net.InetSocketAddress; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceStateProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyRequestProto; +import org.apache.hadoop.ipc.ProtobufHelper; +import org.apache.hadoop.ipc.ProtobufRpcEngine; +import org.apache.hadoop.ipc.ProtocolSignature; +import org.apache.hadoop.ipc.RPC; + +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; + +/** + * This class is the client side translator to translate the requests made on + * {@link HAServiceProtocol} interfaces to the RPC server implementing + * {@link HAServiceProtocolPB}. + */ +@InterfaceAudience.Private +@InterfaceStability.Stable +public class HAServiceProtocolClientSideTranslatorPB implements + HAServiceProtocol, Closeable { + /** RpcController is not used and hence is set to null */ + private final static RpcController NULL_CONTROLLER = null; + private final static MonitorHealthRequestProto MONITOR_HEALTH_REQ = + MonitorHealthRequestProto.newBuilder().build(); + private final static TransitionToActiveRequestProto TRANSITION_TO_ACTIVE_REQ = + TransitionToActiveRequestProto.newBuilder().build(); + private final static TransitionToStandbyRequestProto TRANSITION_TO_STANDBY_REQ = + TransitionToStandbyRequestProto.newBuilder().build(); + private final static GetServiceStateRequestProto GET_SERVICE_STATE_REQ = + GetServiceStateRequestProto.newBuilder().build(); + private final static ReadyToBecomeActiveRequestProto ACTIVE_READY_REQ = + ReadyToBecomeActiveRequestProto.newBuilder().build(); + + private final HAServiceProtocolPB rpcProxy; + + public HAServiceProtocolClientSideTranslatorPB(InetSocketAddress addr, + Configuration conf) throws IOException { + RPC.setProtocolEngine(conf, HAServiceProtocolPB.class, + ProtobufRpcEngine.class); + rpcProxy = RPC.getProxy(HAServiceProtocolPB.class, + RPC.getProtocolVersion(HAServiceProtocolPB.class), addr, conf); + } + + @Override + public void monitorHealth() throws IOException { + try { + rpcProxy.monitorHealth(NULL_CONTROLLER, MONITOR_HEALTH_REQ); + } catch (ServiceException e) { + throw ProtobufHelper.getRemoteException(e); + } + } + + @Override + public void transitionToActive() throws IOException { + try { + rpcProxy.transitionToActive(NULL_CONTROLLER, TRANSITION_TO_ACTIVE_REQ); + } catch (ServiceException e) { + throw ProtobufHelper.getRemoteException(e); + } + } + + @Override + public void transitionToStandby() throws IOException { + try { + rpcProxy.transitionToStandby(NULL_CONTROLLER, TRANSITION_TO_STANDBY_REQ); + } catch (ServiceException e) { + throw ProtobufHelper.getRemoteException(e); + } + } + + @Override + public HAServiceState getServiceState() throws IOException { + HAServiceStateProto state; + try { + state = rpcProxy.getServiceState(NULL_CONTROLLER, + 
GET_SERVICE_STATE_REQ).getState(); + } catch (ServiceException e) { + throw ProtobufHelper.getRemoteException(e); + } + switch(state) { + case ACTIVE: + return HAServiceState.ACTIVE; + case STANDBY: + return HAServiceState.STANDBY; + case INITIALIZING: + default: + return HAServiceState.INITIALIZING; + } + } + + @Override + public void close() { + RPC.stopProxy(rpcProxy); + } + + @Override + public boolean readyToBecomeActive() throws IOException { + try { + return rpcProxy.readyToBecomeActive(NULL_CONTROLLER, ACTIVE_READY_REQ) + .getReadyToBecomeActive(); + } catch (ServiceException e) { + throw ProtobufHelper.getRemoteException(e); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java new file mode 100644 index 0000000000..57eefce54a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha.protocolPB; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceProtocolService; +import org.apache.hadoop.ipc.ProtocolInfo; +import org.apache.hadoop.ipc.VersionedProtocol; +import org.apache.hadoop.security.KerberosInfo; + +@KerberosInfo( + serverPrincipal=CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY) +@ProtocolInfo(protocolName = "org.apache.hadoop.ha.HAServiceProtocol", + protocolVersion = 1) +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface HAServiceProtocolPB extends + HAServiceProtocolService.BlockingInterface, VersionedProtocol { + /** + * If any methods need annotation, it can be added here + */ +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java new file mode 100644 index 0000000000..3655a4e712 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha.protocolPB; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateResponseProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceStateProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthResponseProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveResponseProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveResponseProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyRequestProto; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyResponseProto; +import org.apache.hadoop.ipc.ProtocolSignature; +import org.apache.hadoop.ipc.RPC; + +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; + +/** + * This class is used on the server side. Calls come across the wire for the + * for protocol {@link HAServiceProtocolPB}. + * This class translates the PB data types + * to the native data types used inside the NN as specified in the generic + * ClientProtocol. 
+ */ +@InterfaceAudience.Private +@InterfaceStability.Stable +public class HAServiceProtocolServerSideTranslatorPB implements + HAServiceProtocolPB { + private final HAServiceProtocol server; + private static final MonitorHealthResponseProto MONITOR_HEALTH_RESP = + MonitorHealthResponseProto.newBuilder().build(); + private static final TransitionToActiveResponseProto TRANSITION_TO_ACTIVE_RESP = + TransitionToActiveResponseProto.newBuilder().build(); + private static final TransitionToStandbyResponseProto TRANSITION_TO_STANDBY_RESP = + TransitionToStandbyResponseProto.newBuilder().build(); + + public HAServiceProtocolServerSideTranslatorPB(HAServiceProtocol server) { + this.server = server; + } + + @Override + public MonitorHealthResponseProto monitorHealth(RpcController controller, + MonitorHealthRequestProto request) throws ServiceException { + try { + server.monitorHealth(); + return MONITOR_HEALTH_RESP; + } catch(IOException e) { + throw new ServiceException(e); + } + } + + @Override + public TransitionToActiveResponseProto transitionToActive( + RpcController controller, TransitionToActiveRequestProto request) + throws ServiceException { + try { + server.transitionToActive(); + return TRANSITION_TO_ACTIVE_RESP; + } catch(IOException e) { + throw new ServiceException(e); + } + } + + @Override + public TransitionToStandbyResponseProto transitionToStandby( + RpcController controller, TransitionToStandbyRequestProto request) + throws ServiceException { + try { + server.transitionToStandby(); + return TRANSITION_TO_STANDBY_RESP; + } catch(IOException e) { + throw new ServiceException(e); + } + } + + @Override + public GetServiceStateResponseProto getServiceState(RpcController controller, + GetServiceStateRequestProto request) throws ServiceException { + HAServiceState s; + try { + s = server.getServiceState(); + } catch(IOException e) { + throw new ServiceException(e); + } + + HAServiceStateProto ret; + switch (s) { + case ACTIVE: + ret = HAServiceStateProto.ACTIVE; + break; + case STANDBY: + ret = HAServiceStateProto.STANDBY; + break; + case INITIALIZING: + default: + ret = HAServiceStateProto.INITIALIZING; + break; + } + return GetServiceStateResponseProto.newBuilder().setState(ret).build(); + } + + @Override + public long getProtocolVersion(String protocol, long clientVersion) + throws IOException { + return RPC.getProtocolVersion(HAServiceProtocolPB.class); + } + + @Override + public ProtocolSignature getProtocolSignature(String protocol, + long clientVersion, int clientMethodsHash) throws IOException { + if (!protocol.equals(RPC.getProtocolName(HAServiceProtocolPB.class))) { + throw new IOException("Serverside implements " + + RPC.getProtocolName(HAServiceProtocolPB.class) + + ". 
The following requested protocol is unknown: " + protocol); + } + + return ProtocolSignature.getProtocolSignature(clientMethodsHash, + RPC.getProtocolVersion(HAServiceProtocolPB.class), + HAServiceProtocolPB.class); + } + + @Override + public ReadyToBecomeActiveResponseProto readyToBecomeActive( + RpcController controller, ReadyToBecomeActiveRequestProto request) + throws ServiceException { + try { + return ReadyToBecomeActiveResponseProto.newBuilder() + .setReadyToBecomeActive(server.readyToBecomeActive()).build(); + } catch (IOException e) { + throw new ServiceException(e); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java index 812a46e02b..ae37d0bed4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java @@ -27,28 +27,28 @@ import org.apache.hadoop.ipc.RPC; * event of failover, and always returns the same proxy object. */ @InterfaceStability.Evolving -public class DefaultFailoverProxyProvider implements FailoverProxyProvider { +public class DefaultFailoverProxyProvider implements FailoverProxyProvider { - private Object proxy; - private Class iface; + private T proxy; + private Class iface; - public DefaultFailoverProxyProvider(Class iface, Object proxy) { + public DefaultFailoverProxyProvider(Class iface, T proxy) { this.proxy = proxy; this.iface = iface; } @Override - public Class getInterface() { + public Class getInterface() { return iface; } @Override - public Object getProxy() { + public T getProxy() { return proxy; } @Override - public void performFailover(Object currentProxy) { + public void performFailover(T currentProxy) { // Nothing to do. } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java index 707a40d888..ba7d29f0d5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java @@ -29,7 +29,7 @@ import org.apache.hadoop.classification.InterfaceStability; * {@link RetryPolicy}. 
*/ @InterfaceStability.Evolving -public interface FailoverProxyProvider extends Closeable { +public interface FailoverProxyProvider extends Closeable { /** * Get the proxy object which should be used until the next failover event @@ -37,7 +37,7 @@ public interface FailoverProxyProvider extends Closeable { * * @return the proxy object to invoke methods upon */ - public Object getProxy(); + public T getProxy(); /** * Called whenever the associated {@link RetryPolicy} determines that an error @@ -46,7 +46,7 @@ public interface FailoverProxyProvider extends Closeable { * @param currentProxy the proxy object which was being used before this * failover event */ - public void performFailover(Object currentProxy); + public void performFailover(T currentProxy); /** * Return a reference to the interface this provider's proxy objects actually @@ -58,5 +58,5 @@ public interface FailoverProxyProvider extends Closeable { * @return the interface implemented by the proxy objects returned by * {@link FailoverProxyProvider#getProxy()} */ - public Class getInterface(); + public Class getInterface(); } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java index 0dad53b59b..323542cbd3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java @@ -20,14 +20,15 @@ package org.apache.hadoop.io.retry; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -import java.lang.reflect.Proxy; import java.util.Collections; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.retry.RetryPolicy.RetryAction; +import org.apache.hadoop.util.ThreadUtil; import org.apache.hadoop.ipc.Client.ConnectionId; +import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcInvocationHandler; class RetryInvocationHandler implements RpcInvocationHandler { @@ -38,6 +39,7 @@ class RetryInvocationHandler implements RpcInvocationHandler { * The number of times the associated proxyProvider has ever been failed over. */ private long proxyProviderFailoverCount = 0; + private volatile boolean hasMadeASuccessfulCall = false; private RetryPolicy defaultPolicy; private Map methodNameToPolicyMap; @@ -78,47 +80,82 @@ class RetryInvocationHandler implements RpcInvocationHandler { invocationAttemptFailoverCount = proxyProviderFailoverCount; } try { - return invokeMethod(method, args); + Object ret = invokeMethod(method, args); + hasMadeASuccessfulCall = true; + return ret; } catch (Exception e) { boolean isMethodIdempotent = proxyProvider.getInterface() .getMethod(method.getName(), method.getParameterTypes()) .isAnnotationPresent(Idempotent.class); RetryAction action = policy.shouldRetry(e, retries++, invocationFailoverCount, isMethodIdempotent); - if (action == RetryAction.FAIL) { - LOG.warn("Exception while invoking " + method.getName() - + " of " + currentProxy.getClass() + ". 
Not retrying.", e); - if (!method.getReturnType().equals(Void.TYPE)) { - throw e; // non-void methods can't fail without an exception + if (action.action == RetryAction.RetryDecision.FAIL) { + if (action.reason != null) { + LOG.warn("Exception while invoking " + + currentProxy.getClass() + "." + method.getName() + + ". Not retrying because " + action.reason, e); } - return null; - } else if (action == RetryAction.FAILOVER_AND_RETRY) { - LOG.warn("Exception while invoking " + method.getName() - + " of " + currentProxy.getClass() - + " after " + invocationFailoverCount + " fail over attempts." - + " Trying to fail over.", e); - // Make sure that concurrent failed method invocations only cause a - // single actual fail over. - synchronized (proxyProvider) { - if (invocationAttemptFailoverCount == proxyProviderFailoverCount) { - proxyProvider.performFailover(currentProxy); - proxyProviderFailoverCount++; - currentProxy = proxyProvider.getProxy(); + throw e; + } else { // retry or failover + // avoid logging the failover if this is the first call on this + // proxy object, and we successfully achieve the failover without + // any flip-flopping + boolean worthLogging = + !(invocationFailoverCount == 0 && !hasMadeASuccessfulCall); + worthLogging |= LOG.isDebugEnabled(); + if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY && + worthLogging) { + String msg = "Exception while invoking " + method.getName() + + " of class " + currentProxy.getClass().getSimpleName(); + if (invocationFailoverCount > 0) { + msg += " after " + invocationFailoverCount + " fail over attempts"; + } + msg += ". Trying to fail over " + formatSleepMessage(action.delayMillis); + if (LOG.isDebugEnabled()) { + LOG.debug(msg, e); } else { - LOG.warn("A failover has occurred since the start of this method" - + " invocation attempt."); + LOG.warn(msg); + } + } else { + if(LOG.isDebugEnabled()) { + LOG.debug("Exception while invoking " + method.getName() + + " of class " + currentProxy.getClass().getSimpleName() + + ". Retrying " + formatSleepMessage(action.delayMillis), e); } } - invocationFailoverCount++; - } - if(LOG.isDebugEnabled()) { - LOG.debug("Exception while invoking " + method.getName() - + " of " + currentProxy.getClass() + ". Retrying.", e); + + if (action.delayMillis > 0) { + ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis); + } + + if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY) { + // Make sure that concurrent failed method invocations only cause a + // single actual fail over. 
+ synchronized (proxyProvider) { + if (invocationAttemptFailoverCount == proxyProviderFailoverCount) { + proxyProvider.performFailover(currentProxy); + proxyProviderFailoverCount++; + currentProxy = proxyProvider.getProxy(); + } else { + LOG.warn("A failover has occurred since the start of this method" + + " invocation attempt."); + } + } + invocationFailoverCount++; + } } } } } - + + private static String formatSleepMessage(long millis) { + if (millis > 0) { + return "after sleeping for " + millis + "ms."; + } else { + return "immediately."; + } + } + private Object invokeMethod(Method method, Object[] args) throws Throwable { try { if (!method.isAccessible()) { @@ -137,9 +174,7 @@ class RetryInvocationHandler implements RpcInvocationHandler { @Override //RpcInvocationHandler public ConnectionId getConnectionId() { - RpcInvocationHandler inv = (RpcInvocationHandler) Proxy - .getInvocationHandler(currentProxy); - return inv.getConnectionId(); + return RPC.getConnectionIdForProxy(currentProxy); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java index 3634e18673..2be8b75999 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java @@ -33,6 +33,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ipc.StandbyException; +import com.google.common.annotations.VisibleForTesting; + /** *

* A collection of useful implementations of {@link RetryPolicy}. @@ -42,6 +44,8 @@ public class RetryPolicies { public static final Log LOG = LogFactory.getLog(RetryPolicies.class); + private static final Random RAND = new Random(); + /** *

* Try once, and fail by re-throwing the exception. @@ -50,14 +54,6 @@ public class RetryPolicies { */ public static final RetryPolicy TRY_ONCE_THEN_FAIL = new TryOnceThenFail(); - /** - *

- * Try once, and fail silently for void methods, or by - * re-throwing the exception for non-void methods. - *

- */ - public static final RetryPolicy TRY_ONCE_DONT_FAIL = new TryOnceDontFail(); - /** *

* Keep trying forever. @@ -137,16 +133,17 @@ public class RetryPolicies { public static final RetryPolicy failoverOnNetworkException( RetryPolicy fallbackPolicy, int maxFailovers) { - return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers); + return failoverOnNetworkException(fallbackPolicy, maxFailovers, 0, 0); + } + + public static final RetryPolicy failoverOnNetworkException( + RetryPolicy fallbackPolicy, int maxFailovers, long delayMillis, + long maxDelayBase) { + return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers, + delayMillis, maxDelayBase); } static class TryOnceThenFail implements RetryPolicy { - public RetryAction shouldRetry(Exception e, int retries, int failovers, - boolean isMethodIdempotent) throws Exception { - throw e; - } - } - static class TryOnceDontFail implements RetryPolicy { public RetryAction shouldRetry(Exception e, int retries, int failovers, boolean isMethodIdempotent) throws Exception { return RetryAction.FAIL; @@ -174,14 +171,10 @@ public class RetryPolicies { public RetryAction shouldRetry(Exception e, int retries, int failovers, boolean isMethodIdempotent) throws Exception { if (retries >= maxRetries) { - throw e; + return RetryAction.FAIL; } - try { - timeUnit.sleep(calculateSleepTime(retries)); - } catch (InterruptedException ie) { - // retry - } - return RetryAction.RETRY; + return new RetryAction(RetryAction.RetryDecision.RETRY, + timeUnit.toMillis(calculateSleepTime(retries))); } protected abstract long calculateSleepTime(int retries); @@ -268,7 +261,7 @@ public class RetryPolicies { } static class ExponentialBackoffRetry extends RetryLimited { - private Random r = new Random(); + public ExponentialBackoffRetry( int maxRetries, long sleepTime, TimeUnit timeUnit) { super(maxRetries, sleepTime, timeUnit); @@ -276,16 +269,19 @@ public class RetryPolicies { @Override protected long calculateSleepTime(int retries) { - return sleepTime*r.nextInt(1<<(retries+1)); + return calculateExponentialTime(sleepTime, retries + 1); } } - /* + /** * Fail over and retry in the case of: * Remote StandbyException (server is up, but is not the active server) * Immediate socket exceptions (e.g. no route to host, econnrefused) * Socket exceptions after initial connection when operation is idempotent * + * The first failover is immediate, while all subsequent failovers wait an + * exponentially-increasing random amount of time. 
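A hedged sketch of the failover backoff just described, using the new four-argument failoverOnNetworkException overload; the delay values are illustrative only.

    // Sketch only: delay values are illustrative.
    // The first failover retries immediately; failover n >= 1 sleeps for
    // min(delayMillis * 2^n, maxDelayBase) scaled by a random factor in [0.5, 1.5).
    RetryPolicy policy = RetryPolicies.failoverOnNetworkException(
        RetryPolicies.TRY_ONCE_THEN_FAIL,  // fallback for other exceptions
        15,                                // maxFailovers
        500,                               // delayMillis
        15000);                            // maxDelayBase
    // e.g. at failovers == 3: base = min(500 * 8, 15000) = 4000 ms,
    // so the actual sleep is drawn uniformly from [2000, 6000) ms.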
+ * * Fail immediately in the case of: * Socket exceptions after initial connection when operation is not idempotent * @@ -295,33 +291,49 @@ public class RetryPolicies { private RetryPolicy fallbackPolicy; private int maxFailovers; + private long delayMillis; + private long maxDelayBase; public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy, int maxFailovers) { + this(fallbackPolicy, maxFailovers, 0, 0); + } + + public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy, + int maxFailovers, long delayMillis, long maxDelayBase) { this.fallbackPolicy = fallbackPolicy; this.maxFailovers = maxFailovers; + this.delayMillis = delayMillis; + this.maxDelayBase = maxDelayBase; } @Override public RetryAction shouldRetry(Exception e, int retries, int failovers, boolean isMethodIdempotent) throws Exception { if (failovers >= maxFailovers) { - LOG.info("Failovers (" + failovers + ") exceeded maximum allowed (" + return new RetryAction(RetryAction.RetryDecision.FAIL, 0, + "failovers (" + failovers + ") exceeded maximum allowed (" + maxFailovers + ")"); - return RetryAction.FAIL; } if (e instanceof ConnectException || e instanceof NoRouteToHostException || e instanceof UnknownHostException || - e instanceof StandbyException) { - return RetryAction.FAILOVER_AND_RETRY; + e instanceof StandbyException || + isWrappedStandbyException(e)) { + return new RetryAction( + RetryAction.RetryDecision.FAILOVER_AND_RETRY, + // retry immediately if this is our first failover, sleep otherwise + failovers == 0 ? 0 : + calculateExponentialTime(delayMillis, failovers, maxDelayBase)); } else if (e instanceof SocketException || - e instanceof IOException) { + (e instanceof IOException && !(e instanceof RemoteException))) { if (isMethodIdempotent) { return RetryAction.FAILOVER_AND_RETRY; } else { - return RetryAction.FAIL; + return new RetryAction(RetryAction.RetryDecision.FAIL, 0, + "the invoked method is not idempotent, and unable to determine " + + "whether it was invoked"); } } else { return fallbackPolicy.shouldRetry(e, retries, failovers, @@ -330,4 +342,34 @@ public class RetryPolicies { } } + + /** + * Return a value which is time increasing exponentially as a + * function of retries, +/- 0%-50% of that value, chosen + * randomly. 
+ * + * @param time the base amount of time to work with + * @param retries the number of retries that have so occurred so far + * @param cap value at which to cap the base sleep time + * @return an amount of time to sleep + */ + @VisibleForTesting + public static long calculateExponentialTime(long time, int retries, + long cap) { + long baseTime = Math.min(time * ((long)1 << retries), cap); + return (long) (baseTime * (RAND.nextFloat() + 0.5)); + } + + private static long calculateExponentialTime(long time, int retries) { + return calculateExponentialTime(time, retries, Long.MAX_VALUE); + } + + private static boolean isWrappedStandbyException(Exception e) { + if (!(e instanceof RemoteException)) { + return false; + } + Exception unwrapped = ((RemoteException)e).unwrapRemoteException( + StandbyException.class); + return unwrapped instanceof StandbyException; + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java index 4c4534ffb7..ed673e950f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java @@ -19,7 +19,6 @@ package org.apache.hadoop.io.retry; import org.apache.hadoop.classification.InterfaceStability; - /** *

* Specifies a policy for retrying method failures. @@ -33,10 +32,39 @@ public interface RetryPolicy { * Returned by {@link RetryPolicy#shouldRetry(Exception, int, int, boolean)}. */ @InterfaceStability.Evolving - public enum RetryAction { - FAIL, - RETRY, - FAILOVER_AND_RETRY + public static class RetryAction { + + // A few common retry policies, with no delays. + public static final RetryAction FAIL = + new RetryAction(RetryDecision.FAIL); + public static final RetryAction RETRY = + new RetryAction(RetryDecision.RETRY); + public static final RetryAction FAILOVER_AND_RETRY = + new RetryAction(RetryDecision.FAILOVER_AND_RETRY); + + public final RetryDecision action; + public final long delayMillis; + public final String reason; + + public RetryAction(RetryDecision action) { + this(action, 0, null); + } + + public RetryAction(RetryDecision action, long delayTime) { + this(action, delayTime, null); + } + + public RetryAction(RetryDecision action, long delayTime, String reason) { + this.action = action; + this.delayMillis = delayTime; + this.reason = reason; + } + + public enum RetryDecision { + FAIL, + RETRY, + FAILOVER_AND_RETRY + } } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java index f09600d4b3..e5a2d7f15a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java @@ -227,6 +227,8 @@ public class Client { private int maxIdleTime; //connections will be culled if it was idle for //maxIdleTime msecs private int maxRetries; //the max. no. of retries for socket connections + // the max. no. of retries for socket connections on time out exceptions + private int maxRetriesOnSocketTimeouts; private boolean tcpNoDelay; // if T then disable Nagle's Algorithm private boolean doPing; //do we need to send ping message private int pingInterval; // how often sends ping to the server in msecs @@ -250,6 +252,7 @@ public class Client { this.rpcTimeout = remoteId.getRpcTimeout(); this.maxIdleTime = remoteId.getMaxIdleTime(); this.maxRetries = remoteId.getMaxRetries(); + this.maxRetriesOnSocketTimeouts = remoteId.getMaxRetriesOnSocketTimeouts(); this.tcpNoDelay = remoteId.getTcpNoDelay(); this.doPing = remoteId.getDoPing(); this.pingInterval = remoteId.getPingInterval(); @@ -478,11 +481,8 @@ public class Client { if (updateAddress()) { timeoutFailures = ioFailures = 0; } - /* - * The max number of retries is 45, which amounts to 20s*45 = 15 - * minutes retries. - */ - handleConnectionFailure(timeoutFailures++, 45, toe); + handleConnectionFailure(timeoutFailures++, + maxRetriesOnSocketTimeouts, toe); } catch (IOException ie) { if (updateAddress()) { timeoutFailures = ioFailures = 0; @@ -1286,6 +1286,8 @@ public class Client { private final int maxIdleTime; //connections will be culled if it was idle for //maxIdleTime msecs private final int maxRetries; //the max. no. of retries for socket connections + // the max. no. 
of retries for socket connections on time out exceptions + private final int maxRetriesOnSocketTimeouts; private final boolean tcpNoDelay; // if T then disable Nagle's Algorithm private final boolean doPing; //do we need to send ping message private final int pingInterval; // how often sends ping to the server in msecs @@ -1293,8 +1295,8 @@ public class Client { ConnectionId(InetSocketAddress address, Class protocol, UserGroupInformation ticket, int rpcTimeout, String serverPrincipal, int maxIdleTime, - int maxRetries, boolean tcpNoDelay, - boolean doPing, int pingInterval) { + int maxRetries, int maxRetriesOnSocketTimeouts, + boolean tcpNoDelay, boolean doPing, int pingInterval) { this.protocol = protocol; this.address = address; this.ticket = ticket; @@ -1302,6 +1304,7 @@ public class Client { this.serverPrincipal = serverPrincipal; this.maxIdleTime = maxIdleTime; this.maxRetries = maxRetries; + this.maxRetriesOnSocketTimeouts = maxRetriesOnSocketTimeouts; this.tcpNoDelay = tcpNoDelay; this.doPing = doPing; this.pingInterval = pingInterval; @@ -1335,6 +1338,11 @@ public class Client { return maxRetries; } + /** max connection retries on socket time outs */ + public int getMaxRetriesOnSocketTimeouts() { + return maxRetriesOnSocketTimeouts; + } + boolean getTcpNoDelay() { return tcpNoDelay; } @@ -1369,6 +1377,9 @@ public class Client { CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_DEFAULT), conf.getInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT), + conf.getInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT), conf.getBoolean(CommonConfigurationKeysPublic.IPC_CLIENT_TCPNODELAY_KEY, CommonConfigurationKeysPublic.IPC_CLIENT_TCPNODELAY_DEFAULT), doPing, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java new file mode 100644 index 0000000000..5bf9dbaed1 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ipc; + +import org.apache.hadoop.classification.InterfaceAudience; + +/** + * An interface implemented by client-side protocol translators to get the + * underlying proxy object the translator is operating on. + */ +@InterfaceAudience.Private +public interface ProtocolTranslator { + + /** + * Return the proxy object underlying this protocol translator. 
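The hard-coded 45-retry limit on socket timeouts becomes configurable through ipc.client.connect.max.retries.on.timeouts (its default of 45 is added to core-default.xml later in this patch). A brief sketch of overriding it programmatically; the value chosen here is illustrative:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.CommonConfigurationKeysPublic;

    public class TimeoutRetrySketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Lower the limit for clients that should fail fast when the server
        // is unreachable, instead of retrying 45 times.
        conf.setInt(
            CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
            3);
        System.out.println(
            conf.getInt("ipc.client.connect.max.retries.on.timeouts", 45)); // prints 3
      }
    }
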
+ * @return the proxy object underlying this protocol translator. + */ + public Object getUnderlyingProxyObject(); + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java index 4f85e905cd..eee364ccde 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java @@ -40,6 +40,7 @@ import javax.net.SocketFactory; import org.apache.commons.logging.*; import org.apache.hadoop.io.*; +import org.apache.hadoop.ipc.Client.ConnectionId; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; import org.apache.hadoop.ipc.protobuf.ProtocolInfoProtos.ProtocolInfoService; import org.apache.hadoop.net.NetUtils; @@ -530,9 +531,24 @@ public class RPC { * Returns the server address for a given proxy. */ public static InetSocketAddress getServerAddress(Object proxy) { + return getConnectionIdForProxy(proxy).getAddress(); + } + + /** + * Return the connection ID of the given object. If the provided object is in + * fact a protocol translator, we'll get the connection ID of the underlying + * proxy object. + * + * @param proxy the proxy object to get the connection ID of. + * @return the connection ID for the provided proxy object. + */ + public static ConnectionId getConnectionIdForProxy(Object proxy) { + if (proxy instanceof ProtocolTranslator) { + proxy = ((ProtocolTranslator)proxy).getUnderlyingProxyObject(); + } RpcInvocationHandler inv = (RpcInvocationHandler) Proxy .getInvocationHandler(proxy); - return inv.getConnectionId().getAddress(); + return inv.getConnectionId(); } /** @@ -564,6 +580,12 @@ public class RPC { * @param proxy the RPC proxy object to be stopped */ public static void stopProxy(Object proxy) { + if (proxy instanceof ProtocolTranslator) { + RPC.stopProxy(((ProtocolTranslator)proxy) + .getUnderlyingProxyObject()); + return; + } + InvocationHandler invocationHandler = null; try { invocationHandler = Proxy.getInvocationHandler(proxy); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index 2b35598738..5f642c4f69 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -1671,6 +1671,10 @@ public abstract class Server { // on the server side, as opposed to just a normal exceptional // result. LOG.warn(logMsg, e); + } else if (e instanceof StandbyException) { + // Don't log the whole stack trace of these exceptions. + // Way too noisy! + LOG.info(logMsg); } else { LOG.info(logMsg, e); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java index 49f4fadfd5..7a168619af 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.ipc; +import java.io.IOException; + import org.apache.hadoop.classification.InterfaceStability; /** @@ -24,7 +26,7 @@ import org.apache.hadoop.classification.InterfaceStability; * set of servers in which only a subset may be active. 
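ProtocolTranslator lets RPC.stopProxy() and RPC.getConnectionIdForProxy() reach the dynamic proxy hidden behind a client-side translator. A hedged sketch of a translator that cooperates with that unwrapping (the translator class itself is hypothetical; the interface and RPC calls come from the patch):

    import org.apache.hadoop.ipc.ProtocolTranslator;
    import org.apache.hadoop.ipc.RPC;

    // A hypothetical client-side translator wrapping an RPC proxy object.
    class MyProtocolTranslator implements ProtocolTranslator {
      private final Object rpcProxy;

      MyProtocolTranslator(Object rpcProxy) {
        this.rpcProxy = rpcProxy;
      }

      @Override
      public Object getUnderlyingProxyObject() {
        // RPC.stopProxy() and RPC.getConnectionIdForProxy() call this to reach
        // the real dynamic proxy behind the translator.
        return rpcProxy;
      }
    }

    // Usage: stopping the translator stops the wrapped proxy.
    //   RPC.stopProxy(new MyProtocolTranslator(proxy));
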
*/ @InterfaceStability.Evolving -public class StandbyException extends Exception { +public class StandbyException extends IOException { static final long serialVersionUID = 0x12308AD010L; public StandbyException(String msg) { super(msg); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java index 2685887464..43132d263a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java @@ -23,6 +23,7 @@ import java.net.URI; import java.net.URL; import java.net.UnknownHostException; import java.security.AccessController; +import java.security.PrivilegedAction; import java.util.Arrays; import java.util.List; import java.util.ServiceLoader; @@ -448,6 +449,27 @@ public class SecurityUtil { return buildTokenService(NetUtils.createSocketAddr(uri.getAuthority())); } + /** + * Perform the given action as the daemon's login user. If the login + * user cannot be determined, this will log a FATAL error and exit + * the whole JVM. + */ + public static T doAsLoginUserOrFatal(PrivilegedAction action) { + if (UserGroupInformation.isSecurityEnabled()) { + UserGroupInformation ugi = null; + try { + ugi = UserGroupInformation.getLoginUser(); + } catch (IOException e) { + LOG.fatal("Exception while getting login user", e); + e.printStackTrace(); + Runtime.getRuntime().exit(-1); + } + return ugi.doAs(action); + } else { + return action.run(); + } + } + /** * Resolves a host subject to the security requirements determined by * hadoop.security.token.service.use_ip. @@ -597,5 +619,5 @@ public class SecurityUtil { void setSearchDomains(String ... domains) { searchDomains = Arrays.asList(domains); } - } + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java index 3c2e666a39..11df9811b2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java @@ -40,6 +40,8 @@ import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.SecretManager; import org.apache.hadoop.util.Daemon; +import com.google.common.base.Preconditions; + @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) @InterfaceStability.Evolving public abstract @@ -84,6 +86,12 @@ extends AbstractDelegationTokenIdentifier> private Thread tokenRemoverThread; protected volatile boolean running; + /** + * If the delegation token update thread holds this lock, it will + * not get interrupted. 
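doAsLoginUserOrFatal() runs an action as the daemon's login user when security is enabled and aborts the JVM if that user cannot be determined; with security off it simply runs the action inline. The generic type parameters appear to have been stripped in this diff rendering; assuming the declaration is <T> T doAsLoginUserOrFatal(PrivilegedAction<T> action), usage looks roughly like:

    import java.security.PrivilegedAction;
    import org.apache.hadoop.security.SecurityUtil;

    public class LoginUserSketch {
      public static void main(String[] args) {
        // With security enabled this runs as the daemon's login user (or exits
        // the JVM if that user cannot be obtained); otherwise it runs inline.
        String result = SecurityUtil.doAsLoginUserOrFatal(new PrivilegedAction<String>() {
          @Override
          public String run() {
            // ... work that must execute with the daemon's credentials ...
            return "done";
          }
        });
        System.out.println(result);
      }
    }
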
+ */ + protected Object noInterruptsLock = new Object(); + public AbstractDelegationTokenSecretManager(long delegationKeyUpdateInterval, long delegationTokenMaxLifetime, long delegationTokenRenewInterval, long delegationTokenRemoverScanInterval) { @@ -95,6 +103,7 @@ extends AbstractDelegationTokenIdentifier> /** should be called before this object is used */ public void startThreads() throws IOException { + Preconditions.checkState(!running); updateCurrentKey(); synchronized (this) { running = true; @@ -354,12 +363,21 @@ extends AbstractDelegationTokenIdentifier> } } - public synchronized void stopThreads() { + public void stopThreads() { if (LOG.isDebugEnabled()) LOG.debug("Stopping expired delegation token remover thread"); running = false; + if (tokenRemoverThread != null) { - tokenRemoverThread.interrupt(); + synchronized (noInterruptsLock) { + tokenRemoverThread.interrupt(); + } + try { + tokenRemoverThread.join(); + } catch (InterruptedException e) { + throw new RuntimeException( + "Unable to join on token removal thread", e); + } } } @@ -395,7 +413,7 @@ extends AbstractDelegationTokenIdentifier> lastTokenCacheCleanup = now; } try { - Thread.sleep(5000); // 5 seconds + Thread.sleep(Math.min(5000, keyUpdateInterval)); // 5 seconds } catch (InterruptedException ie) { LOG .error("InterruptedExcpetion recieved for ExpiredTokenRemover thread " diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java new file mode 100644 index 0000000000..6e4dfafdf7 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.classification.InterfaceStability; + +@InterfaceStability.Evolving +public class ThreadUtil { + + private static final Log LOG = LogFactory.getLog(ThreadUtil.class); + + /** + * Cause the current thread to sleep as close as possible to the provided + * number of milliseconds. This method will log and ignore any + * {@link InterruptedException} encountered. 
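The noInterruptsLock added above lets the token remover thread shield critical sections: stopThreads() now takes the same lock before calling interrupt() and then joins the thread, so the interrupt can only land between critical sections. A generic sketch of the pattern (all names here are illustrative, not the secret manager's actual code):

    public class NoInterruptsSketch {
      private final Object noInterruptsLock = new Object();
      private volatile boolean running = true;
      private Thread worker;

      void start() {
        worker = new Thread(new Runnable() {
          @Override
          public void run() {
            while (running) {
              synchronized (noInterruptsLock) {
                // critical work that must never observe an interrupt
              }
              try {
                Thread.sleep(1000); // interruptible point between critical sections
              } catch (InterruptedException ie) {
                // fall through and re-check 'running'
              }
            }
          }
        });
        worker.start();
      }

      void stop() throws InterruptedException {
        running = false;
        // Holding the same lock guarantees the interrupt is delivered only while
        // the worker is outside its critical section, mirroring stopThreads() above.
        synchronized (noInterruptsLock) {
          worker.interrupt();
        }
        worker.join();
      }
    }
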
+ * + * @param millis the number of milliseconds for the current thread to sleep + */ + public static void sleepAtLeastIgnoreInterrupts(long millis) { + long start = System.currentTimeMillis(); + while (System.currentTimeMillis() - start < millis) { + long timeToSleep = millis - + (System.currentTimeMillis() - start); + try { + Thread.sleep(timeToSleep); + } catch (InterruptedException ie) { + LOG.warn("interrupted while sleeping", ie); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml b/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml index b3e12d14e2..2fd9f8d2a9 100644 --- a/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml +++ b/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml @@ -216,6 +216,13 @@ group list is separated by a blank. For e.g. "alice,bob users,wheel". A special value of "*" means all users are allowed. + + + security.ha.service.protocol.acl + * + ACL for HAService protocol used by HAAdmin to manage the + active and stand-by states of namenode. + security.mrhs.client.protocol.acl diff --git a/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto b/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto new file mode 100644 index 0000000000..a3fd86c040 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +option java_package = "org.apache.hadoop.ha.proto"; +option java_outer_classname = "HAServiceProtocolProtos"; +option java_generic_services = true; +option java_generate_equals_and_hash = true; + +enum HAServiceStateProto { + INITIALIZING = 0; + ACTIVE = 1; + STANDBY = 2; +} + +/** + * void request + */ +message MonitorHealthRequestProto { +} + +/** + * void response + */ +message MonitorHealthResponseProto { +} + +/** + * void request + */ +message TransitionToActiveRequestProto { +} + +/** + * void response + */ +message TransitionToActiveResponseProto { +} + +/** + * void request + */ +message TransitionToStandbyRequestProto { +} + +/** + * void response + */ +message TransitionToStandbyResponseProto { +} + +/** + * void request + */ +message GetServiceStateRequestProto { +} + +/** + * Returns the state of the service + */ +message GetServiceStateResponseProto { + required HAServiceStateProto state = 1; +} + +/** + * void request + */ +message ReadyToBecomeActiveRequestProto { +} + +/** + * Returns true if service is ready to become active + */ +message ReadyToBecomeActiveResponseProto { + required bool readyToBecomeActive = 1; +} + +/** + * Protocol interface provides High availability related + * primitives to monitor and failover a service. + * + * For details see o.a.h.ha.HAServiceProtocol. + */ +service HAServiceProtocolService { + /** + * Monitor the health of a service. + */ + rpc monitorHealth(MonitorHealthRequestProto) + returns(MonitorHealthResponseProto); + + /** + * Request service to tranisition to active state. + */ + rpc transitionToActive(TransitionToActiveRequestProto) + returns(TransitionToActiveResponseProto); + + /** + * Request service to transition to standby state. + */ + rpc transitionToStandby(TransitionToStandbyRequestProto) + returns(TransitionToStandbyResponseProto); + + /** + * Get the current state of the service. + */ + rpc getServiceState(GetServiceStateRequestProto) + returns(GetServiceStateResponseProto); + + /** + * Check if the service is ready to become active + */ + rpc readyToBecomeActive(ReadyToBecomeActiveRequestProto) + returns(ReadyToBecomeActiveResponseProto); +} diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 8fc45c5efc..a9684000b6 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -487,6 +487,14 @@ + + ipc.client.connect.max.retries.on.timeouts + 45 + Indicates the number of retries a client will make on socket timeout + to establish a server connection. + + + ipc.server.listen.queue.size 128 @@ -849,4 +857,30 @@ + + dfs.ha.fencing.methods + + + List of fencing methods to use for service fencing. May contain + builtin methods (eg shell and sshfence) or user-defined method. + + + + + dfs.ha.fencing.ssh.connect-timeout + 30000 + + SSH connection timeout, in milliseconds, to use with the builtin + sshfence fencer. + + + + + dfs.ha.fencing.ssh.private-key-files + + + The SSH private key files to use with the builtin sshfence fencer. 
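The new core-default.xml entries configure service fencing for the failover controller. A small sketch of setting them programmatically with the builtin sshfence method (the key names come from the patch; the key path and timeout values are illustrative):

    import org.apache.hadoop.conf.Configuration;

    public class FencingConfSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Use the builtin ssh fencer described above.
        conf.set("dfs.ha.fencing.methods", "sshfence");
        // Private key and connect timeout used by sshfence; values are examples only.
        conf.set("dfs.ha.fencing.ssh.private-key-files", "/home/hdfs/.ssh/id_rsa");
        conf.setInt("dfs.ha.fencing.ssh.connect-timeout", 30000);
      }
    }
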
+ + + diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java new file mode 100644 index 0000000000..fec350d3bc --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java @@ -0,0 +1,527 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.util.List; + +import org.apache.zookeeper.AsyncCallback; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.KeeperException.Code; +import org.apache.zookeeper.WatchedEvent; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.Watcher.Event; +import org.apache.zookeeper.data.ACL; +import org.apache.zookeeper.data.Stat; +import org.apache.zookeeper.ZooDefs.Ids; +import org.junit.Before; +import org.junit.Test; +import org.junit.Assert; +import org.mockito.Mockito; + +import org.apache.hadoop.HadoopIllegalArgumentException; +import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback; +import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException; + +public class TestActiveStandbyElector { + + static ZooKeeper mockZK; + static int count; + static ActiveStandbyElectorCallback mockApp; + static final byte[] data = new byte[8]; + + ActiveStandbyElectorTester elector; + + class ActiveStandbyElectorTester extends ActiveStandbyElector { + ActiveStandbyElectorTester(String hostPort, int timeout, String parent, + List acl, ActiveStandbyElectorCallback app) throws IOException { + super(hostPort, timeout, parent, acl, app); + } + + @Override + public ZooKeeper getNewZooKeeper() { + ++TestActiveStandbyElector.count; + return TestActiveStandbyElector.mockZK; + } + + } + + private static final String zkParentName = "/zookeeper"; + private static final String zkLockPathName = "/zookeeper/" + + ActiveStandbyElector.LOCKFILENAME; + + @Before + public void init() throws IOException { + count = 0; + mockZK = Mockito.mock(ZooKeeper.class); + mockApp = Mockito.mock(ActiveStandbyElectorCallback.class); + elector = new ActiveStandbyElectorTester("hostPort", 1000, zkParentName, + Ids.OPEN_ACL_UNSAFE, mockApp); + } + + /** + * verify that joinElection checks for null data + */ + @Test(expected = HadoopIllegalArgumentException.class) + public void testJoinElectionException() { + elector.joinElection(null); + } + + /** + * verify that joinElection tries to create ephemeral lock znode + */ + @Test + public void testJoinElection() { + elector.joinElection(data); + Mockito.verify(mockZK, Mockito.times(1)).create(zkLockPathName, 
data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + } + + /** + * verify that successful znode create result becomes active and monitoring is + * started + */ + @Test + public void testCreateNodeResultBecomeActive() { + elector.joinElection(data); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeActive(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + // monitor callback verifies the leader is ephemeral owner of lock but does + // not call becomeActive since its already active + Stat stat = new Stat(); + stat.setEphemeralOwner(1L); + Mockito.when(mockZK.getSessionId()).thenReturn(1L); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat); + // should not call neutral mode/standby/active + Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); + Mockito.verify(mockApp, Mockito.times(0)).becomeStandby(); + Mockito.verify(mockApp, Mockito.times(1)).becomeActive(); + // another joinElection not called. + Mockito.verify(mockZK, Mockito.times(1)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + // no new monitor called + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + } + + /** + * verify that znode create for existing node and no retry becomes standby and + * monitoring is started + */ + @Test + public void testCreateNodeResultBecomeStandby() { + elector.joinElection(data); + + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + } + + /** + * verify that znode create error result in fatal error + */ + @Test + public void testCreateNodeResultError() { + elector.joinElection(data); + + elector.processResult(Code.APIERROR.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError( + "Received create error from Zookeeper. code:APIERROR"); + } + + /** + * verify that retry of network errors verifies master by session id and + * becomes active if they match. monitoring is started. + */ + @Test + public void testCreateNodeResultRetryBecomeActive() { + elector.joinElection(data); + + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + // 4 errors results in fatalError + Mockito + .verify(mockApp, Mockito.times(1)) + .notifyFatalError( + "Received create error from Zookeeper. code:CONNECTIONLOSS. 
"+ + "Not retrying further znode create connection errors."); + + elector.joinElection(data); + // recreate connection via getNewZooKeeper + Assert.assertEquals(2, TestActiveStandbyElector.count); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + Stat stat = new Stat(); + stat.setEphemeralOwner(1L); + Mockito.when(mockZK.getSessionId()).thenReturn(1L); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat); + Mockito.verify(mockApp, Mockito.times(1)).becomeActive(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + Mockito.verify(mockZK, Mockito.times(6)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + } + + /** + * verify that retry of network errors verifies active by session id and + * becomes standby if they dont match. monitoring is started. + */ + @Test + public void testCreateNodeResultRetryBecomeStandby() { + elector.joinElection(data); + + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + Stat stat = new Stat(); + stat.setEphemeralOwner(0); + Mockito.when(mockZK.getSessionId()).thenReturn(1L); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat); + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + } + + /** + * verify that if create znode results in nodeexists and that znode is deleted + * before exists() watch is set then the return of the exists() method results + * in attempt to re-create the znode and become active + */ + @Test + public void testCreateNodeResultRetryNoNode() { + elector.joinElection(data); + + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + zkLockPathName); + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + elector.processResult(Code.NONODE.intValue(), zkLockPathName, null, + (Stat) null); + Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode(); + Mockito.verify(mockZK, Mockito.times(4)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + } + + /** + * verify that more than 3 network error retries result fatalError + */ + @Test + public void testStatNodeRetry() { + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + (Stat) null); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + (Stat) null); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + (Stat) null); + elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null, + (Stat) null); + Mockito + .verify(mockApp, Mockito.times(1)) + .notifyFatalError( + "Received stat error from Zookeeper. code:CONNECTIONLOSS. 
"+ + "Not retrying further znode monitoring connection errors."); + } + + /** + * verify error in exists() callback results in fatal error + */ + @Test + public void testStatNodeError() { + elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), zkLockPathName, + null, (Stat) null); + Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); + Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError( + "Received stat error from Zookeeper. code:RUNTIMEINCONSISTENCY"); + } + + /** + * verify behavior of watcher.process callback with non-node event + */ + @Test + public void testProcessCallbackEventNone() { + elector.joinElection(data); + + WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class); + Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.None); + + // first SyncConnected should not do anything + Mockito.when(mockEvent.getState()).thenReturn( + Event.KeeperState.SyncConnected); + elector.process(mockEvent); + Mockito.verify(mockZK, Mockito.times(0)).exists(Mockito.anyString(), + Mockito.anyBoolean(), Mockito. anyObject(), + Mockito. anyObject()); + + // disconnection should enter safe mode + Mockito.when(mockEvent.getState()).thenReturn( + Event.KeeperState.Disconnected); + elector.process(mockEvent); + Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode(); + + // re-connection should monitor master status + Mockito.when(mockEvent.getState()).thenReturn( + Event.KeeperState.SyncConnected); + elector.process(mockEvent); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + // session expired should enter safe mode and initiate re-election + // re-election checked via checking re-creation of new zookeeper and + // call to create lock znode + Mockito.when(mockEvent.getState()).thenReturn(Event.KeeperState.Expired); + elector.process(mockEvent); + // already in safe mode above. should not enter safe mode again + Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode(); + // called getNewZooKeeper to create new session. first call was in + // constructor + Assert.assertEquals(2, TestActiveStandbyElector.count); + // once in initial joinElection and one now + Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + + // create znode success. 
become master and monitor + elector.processResult(Code.OK.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeActive(); + Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true, + elector, null); + + // error event results in fatal error + Mockito.when(mockEvent.getState()).thenReturn(Event.KeeperState.AuthFailed); + elector.process(mockEvent); + Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError( + "Unexpected Zookeeper watch event state: AuthFailed"); + // only 1 state change callback is called at a time + Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode(); + } + + /** + * verify behavior of watcher.process with node event + */ + @Test + public void testProcessCallbackEventNode() { + elector.joinElection(data); + + // make the object go into the monitoring state + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class); + Mockito.when(mockEvent.getPath()).thenReturn(zkLockPathName); + + // monitoring should be setup again after event is received + Mockito.when(mockEvent.getType()).thenReturn( + Event.EventType.NodeDataChanged); + elector.process(mockEvent); + Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true, + elector, null); + + // monitoring should be setup again after event is received + Mockito.when(mockEvent.getType()).thenReturn( + Event.EventType.NodeChildrenChanged); + elector.process(mockEvent); + Mockito.verify(mockZK, Mockito.times(3)).exists(zkLockPathName, true, + elector, null); + + // lock node deletion when in standby mode should create znode again + // successful znode creation enters active state and sets monitor + Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted); + elector.process(mockEvent); + // enterNeutralMode not called when app is standby and leader is lost + Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); + // once in initial joinElection() and one now + Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeActive(); + Mockito.verify(mockZK, Mockito.times(4)).exists(zkLockPathName, true, + elector, null); + + // lock node deletion in active mode should enter neutral mode and create + // znode again successful znode creation enters active state and sets + // monitor + Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted); + elector.process(mockEvent); + Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode(); + // another joinElection called + Mockito.verify(mockZK, Mockito.times(3)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + elector.processResult(Code.OK.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(2)).becomeActive(); + Mockito.verify(mockZK, Mockito.times(5)).exists(zkLockPathName, true, + elector, null); + + // bad path name results in fatal error + Mockito.when(mockEvent.getPath()).thenReturn(null); + elector.process(mockEvent); + Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError( + "Unexpected watch error from Zookeeper"); + // fatal error 
means no new connection other than one from constructor + Assert.assertEquals(1, TestActiveStandbyElector.count); + // no new watches after fatal error + Mockito.verify(mockZK, Mockito.times(5)).exists(zkLockPathName, true, + elector, null); + + } + + /** + * verify becomeStandby is not called if already in standby + */ + @Test + public void testSuccessiveStandbyCalls() { + elector.joinElection(data); + + // make the object go into the monitoring standby state + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class); + Mockito.when(mockEvent.getPath()).thenReturn(zkLockPathName); + + // notify node deletion + // monitoring should be setup again after event is received + Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted); + elector.process(mockEvent); + // is standby. no need to notify anything now + Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); + // another joinElection called. + Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data, + Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null); + // lost election + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + // still standby. so no need to notify again + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + // monitor is set again + Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true, + elector, null); + } + + /** + * verify quit election terminates connection and there are no new watches. + * next call to joinElection creates new connection and performs election + */ + @Test + public void testQuitElection() throws InterruptedException { + elector.quitElection(); + Mockito.verify(mockZK, Mockito.times(1)).close(); + // no watches added + Mockito.verify(mockZK, Mockito.times(0)).exists(zkLockPathName, true, + elector, null); + + byte[] data = new byte[8]; + elector.joinElection(data); + // getNewZooKeeper called 2 times. once in constructor and once now + Assert.assertEquals(2, TestActiveStandbyElector.count); + elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null, + zkLockPathName); + Mockito.verify(mockApp, Mockito.times(1)).becomeStandby(); + Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true, + elector, null); + + } + + /** + * verify that receiveActiveData gives data when active exists, tells that + * active does not exist and reports error in getting active information + * + * @throws IOException + * @throws InterruptedException + * @throws KeeperException + * @throws ActiveNotFoundException + */ + @Test + public void testGetActiveData() throws ActiveNotFoundException, + KeeperException, InterruptedException, IOException { + // get valid active data + byte[] data = new byte[8]; + Mockito.when( + mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. anyObject())).thenReturn(data); + Assert.assertEquals(data, elector.getActiveData()); + Mockito.verify(mockZK, Mockito.times(1)).getData( + Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. anyObject()); + + // active does not exist + Mockito.when( + mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. 
anyObject())).thenThrow( + new KeeperException.NoNodeException()); + try { + elector.getActiveData(); + Assert.fail("ActiveNotFoundException expected"); + } catch(ActiveNotFoundException e) { + Mockito.verify(mockZK, Mockito.times(2)).getData( + Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. anyObject()); + } + + // error getting active data rethrows keeperexception + try { + Mockito.when( + mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. anyObject())).thenThrow( + new KeeperException.AuthFailedException()); + elector.getActiveData(); + Assert.fail("KeeperException.AuthFailedException expected"); + } catch(KeeperException.AuthFailedException ke) { + Mockito.verify(mockZK, Mockito.times(3)).getData( + Mockito.eq(zkLockPathName), Mockito.eq(false), + Mockito. anyObject()); + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java new file mode 100644 index 0000000000..672e8d30d1 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java @@ -0,0 +1,231 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ha; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.ZooDefs.Ids; +import org.apache.zookeeper.data.ACL; +import org.apache.zookeeper.test.ClientBase; + +/** + * Test for {@link ActiveStandbyElector} using real zookeeper. 
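Taken together, the mock-based tests above pin down the elector contract: a caller supplies an ActiveStandbyElectorCallback, joins the election with some identifying data, and reacts to becomeActive/becomeStandby/enterNeutralMode/notifyFatalError callbacks. A hedged sketch of a minimal client, assuming the five-argument constructor exercised via the test subclass is accessible to callers and that the ACL parameter is a List<ACL> (type arguments appear stripped in this rendering); host, timeout, and znode values are illustrative:

    import java.util.List;
    import org.apache.hadoop.ha.ActiveStandbyElector;
    import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
    import org.apache.zookeeper.ZooDefs.Ids;
    import org.apache.zookeeper.data.ACL;

    public class ElectorSketch implements ActiveStandbyElectorCallback {
      @Override public void becomeActive()      { System.out.println("now active");  }
      @Override public void becomeStandby()     { System.out.println("now standby"); }
      @Override public void enterNeutralMode()  { System.out.println("neutral");     }
      @Override public void notifyFatalError(String msg) { System.err.println(msg);  }

      public static void main(String[] args) throws Exception {
        List<ACL> acl = Ids.OPEN_ACL_UNSAFE;
        // host:port, session timeout, parent znode, ACLs, callback, as in the tests above.
        ActiveStandbyElector elector = new ActiveStandbyElector(
            "localhost:2181", 5000, "/my-ha-service", acl, new ElectorSketch());
        elector.joinElection("node-1".getBytes()); // data identifying this candidate
        // ... later, to give up leadership:
        // elector.quitElection();
      }
    }
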
+ */ +public class TestActiveStandbyElectorRealZK extends ClientBase { + static final int NUM_ELECTORS = 2; + static ZooKeeper[] zkClient = new ZooKeeper[NUM_ELECTORS]; + static int currentClientIndex = 0; + + @Override + public void setUp() throws Exception { + // build.test.dir is used by zookeeper + new File(System.getProperty("build.test.dir", "build")).mkdirs(); + super.setUp(); + } + + class ActiveStandbyElectorTesterRealZK extends ActiveStandbyElector { + ActiveStandbyElectorTesterRealZK(String hostPort, int timeout, + String parent, List acl, ActiveStandbyElectorCallback app) + throws IOException { + super(hostPort, timeout, parent, acl, app); + } + + @Override + public ZooKeeper getNewZooKeeper() { + return TestActiveStandbyElectorRealZK.zkClient[ + TestActiveStandbyElectorRealZK.currentClientIndex]; + } + } + + /** + * The class object runs on a thread and waits for a signal to start from the + * test object. On getting the signal it joins the election and thus by doing + * this on multiple threads we can test simultaneous attempts at leader lock + * creation. after joining the election, the object waits on a signal to exit. + * this signal comes when the object's elector has become a leader or there is + * an unexpected fatal error. this lets another thread object to become a + * leader. + */ + class ThreadRunner implements Runnable, ActiveStandbyElectorCallback { + int index; + TestActiveStandbyElectorRealZK test; + boolean wait = true; + + ThreadRunner(int i, TestActiveStandbyElectorRealZK s) { + index = i; + test = s; + } + + @Override + public void run() { + LOG.info("starting " + index); + while(true) { + synchronized (test) { + // wait for test start signal to come + if (!test.start) { + try { + test.wait(); + } catch(InterruptedException e) { + Assert.fail(e.getMessage()); + } + } else { + break; + } + } + } + // join election + byte[] data = new byte[8]; + ActiveStandbyElector elector = test.elector[index]; + LOG.info("joining " + index); + elector.joinElection(data); + try { + while(true) { + synchronized (this) { + // wait for elector to become active/fatal error + if (wait) { + // wait to become active + // wait capped at 30s to prevent hung test + wait(30000); + } else { + break; + } + } + } + Thread.sleep(1000); + // quit election to allow other elector to become active + elector.quitElection(); + } catch(InterruptedException e) { + Assert.fail(e.getMessage()); + } + LOG.info("ending " + index); + } + + @Override + public synchronized void becomeActive() { + test.reportActive(index); + LOG.info("active " + index); + wait = false; + notifyAll(); + } + + @Override + public synchronized void becomeStandby() { + test.reportStandby(index); + LOG.info("standby " + index); + } + + @Override + public synchronized void enterNeutralMode() { + LOG.info("neutral " + index); + } + + @Override + public synchronized void notifyFatalError(String errorMessage) { + LOG.info("fatal " + index + " .Error message:" + errorMessage); + wait = false; + notifyAll(); + } + } + + boolean start = false; + int activeIndex = -1; + int standbyIndex = -1; + String parentDir = "/" + java.util.UUID.randomUUID().toString(); + + ActiveStandbyElector[] elector = new ActiveStandbyElector[NUM_ELECTORS]; + ThreadRunner[] threadRunner = new ThreadRunner[NUM_ELECTORS]; + Thread[] thread = new Thread[NUM_ELECTORS]; + + synchronized void reportActive(int index) { + if (activeIndex == -1) { + activeIndex = index; + } else { + // standby should become active + Assert.assertEquals(standbyIndex, index); + // old 
active should not become active + Assert.assertFalse(activeIndex == index); + } + activeIndex = index; + } + + synchronized void reportStandby(int index) { + // only 1 standby should be reported and it should not be the same as active + Assert.assertEquals(-1, standbyIndex); + standbyIndex = index; + Assert.assertFalse(activeIndex == standbyIndex); + } + + /** + * the test creates 2 electors which try to become active using a real + * zookeeper server. It verifies that 1 becomes active and 1 becomes standby. + * Upon becoming active the leader quits election and the test verifies that + * the standby now becomes active. these electors run on different threads and + * callback to the test class to report active and standby where the outcome + * is verified + * + * @throws IOException + * @throws InterruptedException + * @throws KeeperException + */ + @Test + public void testActiveStandbyTransition() throws IOException, + InterruptedException, KeeperException { + LOG.info("starting test with parentDir:" + parentDir); + start = false; + byte[] data = new byte[8]; + // create random working directory + createClient().create(parentDir, data, Ids.OPEN_ACL_UNSAFE, + CreateMode.PERSISTENT); + + for(currentClientIndex = 0; + currentClientIndex < NUM_ELECTORS; + ++currentClientIndex) { + LOG.info("creating " + currentClientIndex); + zkClient[currentClientIndex] = createClient(); + threadRunner[currentClientIndex] = new ThreadRunner(currentClientIndex, + this); + elector[currentClientIndex] = new ActiveStandbyElectorTesterRealZK( + "hostPort", 1000, parentDir, Ids.OPEN_ACL_UNSAFE, + threadRunner[currentClientIndex]); + zkClient[currentClientIndex].register(elector[currentClientIndex]); + thread[currentClientIndex] = new Thread(threadRunner[currentClientIndex]); + thread[currentClientIndex].start(); + } + + synchronized (this) { + // signal threads to start + LOG.info("signaling threads"); + start = true; + notifyAll(); + } + + for(int i = 0; i < thread.length; i++) { + thread[i].join(); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java new file mode 100644 index 0000000000..9e2cc75e9d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java @@ -0,0 +1,441 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import java.io.IOException; +import java.net.InetSocketAddress; + +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB; +import org.apache.hadoop.ha.TestNodeFencer.AlwaysSucceedFencer; +import org.apache.hadoop.ha.TestNodeFencer.AlwaysFailFencer; +import static org.apache.hadoop.ha.TestNodeFencer.setupFencer; +import org.apache.hadoop.ipc.ProtocolSignature; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.AccessControlException; + +import org.junit.Test; +import static org.junit.Assert.*; + +public class TestFailoverController { + + private InetSocketAddress svc1Addr = new InetSocketAddress("svc1", 1234); + private InetSocketAddress svc2Addr = new InetSocketAddress("svc2", 5678); + + private class DummyService implements HAServiceProtocol { + HAServiceState state; + + DummyService(HAServiceState state) { + this.state = state; + } + + @Override + public void monitorHealth() throws HealthCheckFailedException, IOException { + // Do nothing + } + + @Override + public void transitionToActive() throws ServiceFailedException, IOException { + state = HAServiceState.ACTIVE; + } + + @Override + public void transitionToStandby() throws ServiceFailedException, IOException { + state = HAServiceState.STANDBY; + } + + @Override + public HAServiceState getServiceState() throws IOException { + return state; + } + + @Override + public boolean readyToBecomeActive() throws ServiceFailedException, IOException { + return true; + } + } + + @Test + public void testFailoverAndFailback() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + AlwaysSucceedFencer.fenceCalled = 0; + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + + AlwaysSucceedFencer.fenceCalled = 0; + FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false); + assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + @Test + public void testFailoverFromStandbyToStandby() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.STANDBY); + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + + @Test + public void testFailoverFromActiveToActive() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.ACTIVE); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, 
svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Can't failover to an already active service"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + + @Test + public void testFailoverWithoutPermission() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE) { + @Override + public HAServiceState getServiceState() throws IOException { + throw new AccessControlException("Access denied"); + } + }; + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public HAServiceState getServiceState() throws IOException { + throw new AccessControlException("Access denied"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Can't failover when access is denied"); + } catch (FailoverFailedException ffe) { + assertTrue(ffe.getCause().getMessage().contains("Access denied")); + } + } + + + @Test + public void testFailoverToUnreadyService() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public boolean readyToBecomeActive() throws ServiceFailedException, IOException { + return false; + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Can't failover to a service that's not ready"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + + // Forcing it means we ignore readyToBecomeActive + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, true); + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + + @Test + public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void monitorHealth() throws HealthCheckFailedException { + throw new HealthCheckFailedException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failover to unhealthy service"); + } catch (FailoverFailedException ffe) { + // Expected + } + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + @Test + public void testFailoverFromFaultyServiceSucceeds() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE) { + @Override + public void transitionToStandby() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + AlwaysSucceedFencer.fenceCalled = 0; + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + } catch (FailoverFailedException ffe) { + fail("Faulty active prevented failover"); + } + + // svc1 still thinks it's active, 
that's OK, it was fenced + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("svc1:1234", AlwaysSucceedFencer.fencedSvc); + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + + @Test + public void testFailoverFromFaultyServiceFencingFailure() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE) { + @Override + public void transitionToStandby() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName()); + + AlwaysFailFencer.fenceCalled = 0; + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failed over even though fencing failed"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(1, AlwaysFailFencer.fenceCalled); + assertEquals("svc1:1234", AlwaysFailFencer.fencedSvc); + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + @Test + public void testFencingFailureDuringFailover() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName()); + + AlwaysFailFencer.fenceCalled = 0; + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false); + fail("Failed over even though fencing requested and failed"); + } catch (FailoverFailedException ffe) { + // Expected + } + + // If fencing was requested and it failed we don't try to make + // svc2 active anyway, and we don't failback to svc1. + assertEquals(1, AlwaysFailFencer.fenceCalled); + assertEquals("svc1:1234", AlwaysFailFencer.fencedSvc); + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + private HAServiceProtocol getProtocol(String target) + throws IOException { + InetSocketAddress addr = NetUtils.createSocketAddr(target); + Configuration conf = new Configuration(); + // Lower the timeout so we quickly fail to connect + conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 1); + return new HAServiceProtocolClientSideTranslatorPB(addr, conf); + } + + @Test + public void testFailoverFromNonExistantServiceWithFencer() throws Exception { + HAServiceProtocol svc1 = getProtocol("localhost:1234"); + DummyService svc2 = new DummyService(HAServiceState.STANDBY); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + } catch (FailoverFailedException ffe) { + fail("Non-existant active prevented failover"); + } + + // Don't check svc1 because we can't reach it, but that's OK, it's been fenced. 
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + + @Test + public void testFailoverToNonExistantServiceFails() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + HAServiceProtocol svc2 = getProtocol("localhost:1234"); + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failed over to a non-existant standby"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + } + + @Test + public void testFailoverToFaultyServiceFailsbackOK() throws Exception { + DummyService svc1 = spy(new DummyService(HAServiceState.ACTIVE)); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void transitionToActive() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failover to already active service"); + } catch (FailoverFailedException ffe) { + // Expected + } + + // svc1 went standby then back to active + verify(svc1).transitionToStandby(); + verify(svc1).transitionToActive(); + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + @Test + public void testWeDontFailbackIfActiveWasFenced() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void transitionToActive() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false); + fail("Failed over to service that won't transition to active"); + } catch (FailoverFailedException ffe) { + // Expected + } + + // We failed to failover and did not failback because we fenced + // svc1 (we forced it), therefore svc1 and svc2 should be standby. + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } + + @Test + public void testWeFenceOnFailbackIfTransitionToActiveFails() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void transitionToActive() throws ServiceFailedException, IOException { + throw new IOException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + AlwaysSucceedFencer.fenceCalled = 0; + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failed over to service that won't transition to active"); + } catch (FailoverFailedException ffe) { + // Expected + } + + // We failed to failover. We did not fence svc1 because it cooperated + // and we didn't force it, so we failed back to svc1 and fenced svc2. + // Note svc2 still thinks it's active, that's OK, we fenced it. 
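+ // Concretely: svc1 should end up active again, and the single fence call should have targeted svc2, as asserted below.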
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("svc2:5678", AlwaysSucceedFencer.fencedSvc); + } + + @Test + public void testFailureToFenceOnFailbackFailsTheFailback() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void transitionToActive() throws ServiceFailedException, IOException { + throw new IOException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName()); + AlwaysFailFencer.fenceCalled = 0; + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failed over to service that won't transition to active"); + } catch (FailoverFailedException ffe) { + // Expected + } + + // We did not fence svc1 because it cooperated and we didn't force it, + // we failed to failover so we fenced svc2, we failed to fence svc2 + // so we did not failback to svc1, ie it's still standby. + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(1, AlwaysFailFencer.fenceCalled); + assertEquals("svc2:5678", AlwaysFailFencer.fencedSvc); + } + + @Test + public void testFailbackToFaultyServiceFails() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE) { + @Override + public void transitionToActive() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public void transitionToActive() throws ServiceFailedException { + throw new ServiceFailedException("Failed!"); + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Failover to already active service"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java new file mode 100644 index 0000000000..f22056a1f6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import static org.junit.Assert.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; + +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.Log; +import org.apache.hadoop.conf.Configuration; + +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; +import static org.mockito.Mockito.when; + +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; + +public class TestHAAdmin { + private static final Log LOG = LogFactory.getLog(TestHAAdmin.class); + + private HAAdmin tool; + private ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream(); + private String errOutput; + private HAServiceProtocol mockProtocol; + + @Before + public void setup() throws IOException { + mockProtocol = Mockito.mock(HAServiceProtocol.class); + when(mockProtocol.readyToBecomeActive()).thenReturn(true); + tool = new HAAdmin() { + @Override + protected HAServiceProtocol getProtocol(String target) throws IOException { + return mockProtocol; + } + }; + tool.setConf(new Configuration()); + tool.errOut = new PrintStream(errOutBytes); + } + + private void assertOutputContains(String string) { + if (!errOutput.contains(string)) { + fail("Expected output to contain '" + string + "' but was:\n" + + errOutput); + } + } + + @Test + public void testAdminUsage() throws Exception { + assertEquals(-1, runTool()); + assertOutputContains("Usage:"); + assertOutputContains("-transitionToActive"); + + assertEquals(-1, runTool("badCommand")); + assertOutputContains("Bad command 'badCommand'"); + + assertEquals(-1, runTool("-badCommand")); + assertOutputContains("badCommand: Unknown"); + + // valid command but not enough arguments + assertEquals(-1, runTool("-transitionToActive")); + assertOutputContains("transitionToActive: incorrect number of arguments"); + assertEquals(-1, runTool("-transitionToActive", "x", "y")); + assertOutputContains("transitionToActive: incorrect number of arguments"); + assertEquals(-1, runTool("-failover")); + assertOutputContains("failover: incorrect arguments"); + assertOutputContains("failover: incorrect arguments"); + assertEquals(-1, runTool("-failover", "foo:1234")); + assertOutputContains("failover: incorrect arguments"); + } + + @Test + public void testHelp() throws Exception { + assertEquals(-1, runTool("-help")); + assertEquals(0, runTool("-help", "transitionToActive")); + assertOutputContains("Transitions the service into Active"); + } + + private Object runTool(String ... args) throws Exception { + errOutBytes.reset(); + LOG.info("Running: HAAdmin " + Joiner.on(" ").join(args)); + int ret = tool.run(args); + errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8); + LOG.info("Output:\n" + errOutput); + return ret; + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java new file mode 100644 index 0000000000..5508547c0a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java @@ -0,0 +1,173 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import static org.junit.Assert.*; + +import java.net.InetSocketAddress; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.collect.Lists; + +public class TestNodeFencer { + + @Before + public void clearMockState() { + AlwaysSucceedFencer.fenceCalled = 0; + AlwaysSucceedFencer.callArgs.clear(); + AlwaysFailFencer.fenceCalled = 0; + AlwaysFailFencer.callArgs.clear(); + } + + @Test + public void testSingleFencer() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer( + AlwaysSucceedFencer.class.getName() + "(foo)"); + assertTrue(fencer.fence(new InetSocketAddress("host", 1234))); + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc); + assertEquals("foo", AlwaysSucceedFencer.callArgs.get(0)); + } + + @Test + public void testMultipleFencers() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer( + AlwaysSucceedFencer.class.getName() + "(foo)\n" + + AlwaysSucceedFencer.class.getName() + "(bar)\n"); + assertTrue(fencer.fence(new InetSocketAddress("host", 1234))); + // Only one call, since the first fencer succeeds + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("foo", AlwaysSucceedFencer.callArgs.get(0)); + } + + @Test + public void testWhitespaceAndCommentsInConfig() + throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer( + "\n" + + " # the next one will always fail\n" + + " " + AlwaysFailFencer.class.getName() + "(foo) # <- fails\n" + + AlwaysSucceedFencer.class.getName() + "(bar) \n"); + assertTrue(fencer.fence(new InetSocketAddress("host", 1234))); + // One call to each, since top fencer fails + assertEquals(1, AlwaysFailFencer.fenceCalled); + assertEquals("host:1234", AlwaysFailFencer.fencedSvc); + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc); + assertEquals("foo", AlwaysFailFencer.callArgs.get(0)); + assertEquals("bar", AlwaysSucceedFencer.callArgs.get(0)); + } + + @Test + public void testArglessFencer() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer( + AlwaysSucceedFencer.class.getName()); + assertTrue(fencer.fence(new InetSocketAddress("host", 1234))); + // Only one fencer configured, so there is exactly one (successful) call + assertEquals(1, AlwaysSucceedFencer.fenceCalled); + assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc); + assertEquals(null, AlwaysSucceedFencer.callArgs.get(0)); + } + + @Test + public void testShortNameShell() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer("shell(true)"); + assertTrue(fencer.fence(new InetSocketAddress("host", 1234))); + } + + @Test + public void testShortNameSsh() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer("sshfence");
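+ // Illustrative note (example values assumed, not asserted by this test): 'sshfence' with no + // args defaults to the current user on port 22, and with no reachable sshd here the fence + // attempt below is expected to return false. A realistic value for the NodeFencer methods + // setting might list several methods, one per line, e.g. + // sshfence(hdfs:22) + // shell(/path/to/fence-script.sh --force) + // with '(args)' optional and blank lines and '#' comments ignored, exactly the format + // exercised by the tests above.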
+ assertFalse(fencer.fence(new InetSocketAddress("host", 1234))); + } + + @Test + public void testShortNameSshWithUser() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer("sshfence(user)"); + assertFalse(fencer.fence(new InetSocketAddress("host", 1234))); + } + + @Test + public void testShortNameSshWithPort() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer("sshfence(:123)"); + assertFalse(fencer.fence(new InetSocketAddress("host", 1234))); + } + + @Test + public void testShortNameSshWithUserPort() throws BadFencingConfigurationException { + NodeFencer fencer = setupFencer("sshfence(user:123)"); + assertFalse(fencer.fence(new InetSocketAddress("host", 1234))); + } + + public static NodeFencer setupFencer(String confStr) + throws BadFencingConfigurationException { + System.err.println("Testing configuration:\n" + confStr); + Configuration conf = new Configuration(); + conf.set(NodeFencer.CONF_METHODS_KEY, confStr); + return new NodeFencer(conf); + } + + /** + * Mock fencing method that always returns true + */ + public static class AlwaysSucceedFencer extends Configured + implements FenceMethod { + static int fenceCalled = 0; + static String fencedSvc; + static List callArgs = Lists.newArrayList(); + + @Override + public boolean tryFence(InetSocketAddress serviceAddr, String args) { + fencedSvc = serviceAddr.getHostName() + ":" + serviceAddr.getPort(); + callArgs.add(args); + fenceCalled++; + return true; + } + + @Override + public void checkArgs(String args) { + } + } + + /** + * Identical mock to above, except always returns false + */ + public static class AlwaysFailFencer extends Configured + implements FenceMethod { + static int fenceCalled = 0; + static String fencedSvc; + static List callArgs = Lists.newArrayList(); + + @Override + public boolean tryFence(InetSocketAddress serviceAddr, String args) { + fencedSvc = serviceAddr.getHostName() + ":" + serviceAddr.getPort(); + callArgs.add(args); + fenceCalled++; + return false; + } + + @Override + public void checkArgs(String args) { + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java new file mode 100644 index 0000000000..49bae039ec --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java @@ -0,0 +1,154 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.ha; + +import static org.junit.Assert.*; + +import java.net.InetSocketAddress; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.mockito.Mockito; + +import static org.mockito.Mockito.spy; + +public class TestShellCommandFencer { + private ShellCommandFencer fencer = createFencer(); + + @BeforeClass + public static void setupLogSpy() { + ShellCommandFencer.LOG = spy(ShellCommandFencer.LOG); + } + + @Before + public void resetLogSpy() { + Mockito.reset(ShellCommandFencer.LOG); + } + + private static ShellCommandFencer createFencer() { + Configuration conf = new Configuration(); + conf.set("in.fencing.tests", "yessir"); + ShellCommandFencer fencer = new ShellCommandFencer(); + fencer.setConf(conf); + return fencer; + } + + /** + * Test that the exit code of the script determines + * whether the fencer succeeded or failed + */ + @Test + public void testBasicSuccessFailure() { + InetSocketAddress addr = new InetSocketAddress("host", 1234); + assertTrue(fencer.tryFence(addr, "echo")); + assertFalse(fencer.tryFence(addr, "exit 1")); + // bad path should also fail + assertFalse(fencer.tryFence(addr, "xxxxxxxxxxxx")); + } + + @Test + public void testCheckNoArgs() { + try { + Configuration conf = new Configuration(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell"); + new NodeFencer(conf); + fail("Didn't throw when passing no args to shell"); + } catch (BadFencingConfigurationException confe) { + assertTrue( + "Unexpected exception:" + StringUtils.stringifyException(confe), + confe.getMessage().contains("No argument passed")); + } + } + + @Test + public void testCheckParensNoArgs() { + try { + Configuration conf = new Configuration(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell()"); + new NodeFencer(conf); + fail("Didn't throw when passing no args to shell"); + } catch (BadFencingConfigurationException confe) { + assertTrue( + "Unexpected exception:" + StringUtils.stringifyException(confe), + confe.getMessage().contains("Unable to parse line: 'shell()'")); + } + } + + /** + * Test that lines on stdout get passed as INFO + * level messages + */ + @Test + public void testStdoutLogging() { + InetSocketAddress addr = new InetSocketAddress("host", 1234); + assertTrue(fencer.tryFence(addr, "echo hello")); + Mockito.verify(ShellCommandFencer.LOG).info( + Mockito.endsWith("echo hello: host:1234 hello")); + } + + /** + * Test that lines on stderr get passed as + * WARN level log messages + */ + @Test + public void testStderrLogging() { + InetSocketAddress addr = new InetSocketAddress("host", 1234); + assertTrue(fencer.tryFence(addr, "echo hello >&2")); + Mockito.verify(ShellCommandFencer.LOG).warn( + Mockito.endsWith("echo hello >&2: host:1234 hello")); + } + + /** + * Verify that the Configuration gets passed as + * environment variables to the fencer. + */ + @Test + public void testConfAsEnvironment() { + InetSocketAddress addr = new InetSocketAddress("host", 1234); + fencer.tryFence(addr, "echo $in_fencing_tests"); + Mockito.verify(ShellCommandFencer.LOG).info( + Mockito.endsWith("echo $in...ing_tests: host:1234 yessir")); + } + + /** + * Test that we properly close off our input to the subprocess + * such that it knows there's no tty connected. This is important + * so that, if we use 'ssh', it won't try to prompt for a password + * and block forever, for example. 
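+ * The command used below is 'read', which should see EOF on its closed stdin and + * exit non-zero almost immediately, so the fence attempt fails fast instead of hanging.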
+ */ + @Test(timeout=10000) + public void testSubprocessInputIsClosed() { + InetSocketAddress addr = new InetSocketAddress("host", 1234); + assertFalse(fencer.tryFence(addr, "read")); + } + + @Test + public void testCommandAbbreviation() { + assertEquals("a...f", ShellCommandFencer.abbreviate("abcdef", 5)); + assertEquals("abcdef", ShellCommandFencer.abbreviate("abcdef", 6)); + assertEquals("abcdef", ShellCommandFencer.abbreviate("abcdef", 7)); + + assertEquals("a...g", ShellCommandFencer.abbreviate("abcdefg", 5)); + assertEquals("a...h", ShellCommandFencer.abbreviate("abcdefgh", 5)); + assertEquals("a...gh", ShellCommandFencer.abbreviate("abcdefgh", 6)); + assertEquals("ab...gh", ShellCommandFencer.abbreviate("abcdefgh", 7)); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java new file mode 100644 index 0000000000..f89df6a21f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ha; + +import static org.junit.Assert.*; + +import java.net.InetSocketAddress; + +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.SshFenceByTcpPort.Args; +import org.apache.log4j.Level; +import org.junit.Assume; +import org.junit.Test; + +public class TestSshFenceByTcpPort { + + static { + ((Log4JLogger)SshFenceByTcpPort.LOG).getLogger().setLevel(Level.ALL); + } + + private String TEST_FENCING_HOST = System.getProperty( + "test.TestSshFenceByTcpPort.host", "localhost"); + private String TEST_FENCING_PORT = System.getProperty( + "test.TestSshFenceByTcpPort.port", "8020"); + private final String TEST_KEYFILE = System.getProperty( + "test.TestSshFenceByTcpPort.key"); + + @Test(timeout=20000) + public void testFence() throws BadFencingConfigurationException { + Assume.assumeTrue(isConfigured()); + Configuration conf = new Configuration(); + conf.set(SshFenceByTcpPort.CONF_IDENTITIES_KEY, TEST_KEYFILE); + SshFenceByTcpPort fence = new SshFenceByTcpPort(); + fence.setConf(conf); + assertTrue(fence.tryFence( + new InetSocketAddress(TEST_FENCING_HOST, + Integer.valueOf(TEST_FENCING_PORT)), + null)); + } + + /** + * Test connecting to a host which definitely won't respond. 
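+ * (The address used below is 8.8.8.8 port 1234, which is not running an SSH server.)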
+ * Make sure that it times out and returns false, but doesn't throw + * any exception + */ + @Test(timeout=20000) + public void testConnectTimeout() throws BadFencingConfigurationException { + Configuration conf = new Configuration(); + conf.setInt(SshFenceByTcpPort.CONF_CONNECT_TIMEOUT_KEY, 3000); + SshFenceByTcpPort fence = new SshFenceByTcpPort(); + fence.setConf(conf); + // Connect to Google's DNS server - not running ssh! + assertFalse(fence.tryFence(new InetSocketAddress("8.8.8.8", 1234), "")); + } + + @Test + public void testArgsParsing() throws BadFencingConfigurationException { + InetSocketAddress addr = new InetSocketAddress("bar.com", 1234); + + Args args = new SshFenceByTcpPort.Args(addr, null); + assertEquals("bar.com", args.host); + assertEquals(1234, args.targetPort); + assertEquals(System.getProperty("user.name"), args.user); + assertEquals(22, args.sshPort); + + args = new SshFenceByTcpPort.Args(addr, ""); + assertEquals("bar.com", args.host); + assertEquals(1234, args.targetPort); + assertEquals(System.getProperty("user.name"), args.user); + assertEquals(22, args.sshPort); + + args = new SshFenceByTcpPort.Args(addr, "12345"); + assertEquals("bar.com", args.host); + assertEquals(1234, args.targetPort); + assertEquals("12345", args.user); + assertEquals(22, args.sshPort); + + args = new SshFenceByTcpPort.Args(addr, ":12345"); + assertEquals("bar.com", args.host); + assertEquals(1234, args.targetPort); + assertEquals(System.getProperty("user.name"), args.user); + assertEquals(12345, args.sshPort); + + args = new SshFenceByTcpPort.Args(addr, "foo:8020"); + assertEquals("bar.com", args.host); + assertEquals(1234, args.targetPort); + assertEquals("foo", args.user); + assertEquals(8020, args.sshPort); + } + + @Test + public void testBadArgsParsing() throws BadFencingConfigurationException { + assertBadArgs(":"); // No port specified + assertBadArgs("bar.com:"); // " + assertBadArgs(":xx"); // Port does not parse + assertBadArgs("bar.com:xx"); // " + } + + private void assertBadArgs(String argStr) { + InetSocketAddress addr = new InetSocketAddress("bar.com", 1234); + try { + new Args(addr, argStr); + fail("Did not fail on bad args: " + argStr); + } catch (BadFencingConfigurationException e) { + // Expected + } + } + + private boolean isConfigured() { + return (TEST_FENCING_HOST != null && !TEST_FENCING_HOST.isEmpty()) && + (TEST_FENCING_PORT != null && !TEST_FENCING_PORT.isEmpty()) && + (TEST_KEYFILE != null && !TEST_KEYFILE.isEmpty()); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java index eec4797ab3..4949ef3140 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java @@ -25,21 +25,23 @@ import java.util.concurrent.CountDownLatch; import org.apache.hadoop.io.retry.UnreliableImplementation.TypeOfExceptionToFailWith; import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException; import org.apache.hadoop.ipc.StandbyException; +import org.apache.hadoop.util.ThreadUtil; import org.junit.Test; +@SuppressWarnings("unchecked") public class TestFailoverProxy { - public static class FlipFlopProxyProvider implements FailoverProxyProvider { + public static class FlipFlopProxyProvider implements FailoverProxyProvider { - private Class iface; - 
private Object currentlyActive; - private Object impl1; - private Object impl2; + private Class iface; + private T currentlyActive; + private T impl1; + private T impl2; private int failoversOccurred = 0; - public FlipFlopProxyProvider(Class iface, Object activeImpl, - Object standbyImpl) { + public FlipFlopProxyProvider(Class iface, T activeImpl, + T standbyImpl) { this.iface = iface; this.impl1 = activeImpl; this.impl2 = standbyImpl; @@ -47,7 +49,7 @@ public class TestFailoverProxy { } @Override - public Object getProxy() { + public T getProxy() { return currentlyActive; } @@ -58,7 +60,7 @@ public class TestFailoverProxy { } @Override - public Class getInterface() { + public Class getInterface() { return iface; } @@ -126,7 +128,7 @@ public class TestFailoverProxy { new FlipFlopProxyProvider(UnreliableInterface.class, new UnreliableImplementation("impl1"), new UnreliableImplementation("impl2")), - RetryPolicies.TRY_ONCE_DONT_FAIL); + RetryPolicies.TRY_ONCE_THEN_FAIL); unreliable.succeedsOnceThenFailsReturningString(); try { @@ -180,7 +182,7 @@ public class TestFailoverProxy { assertEquals("impl1", unreliable.succeedsOnceThenFailsReturningString()); try { - assertEquals("impl2", unreliable.succeedsOnceThenFailsReturningString()); + unreliable.succeedsOnceThenFailsReturningString(); fail("should not have succeeded twice"); } catch (IOException e) { // Make sure we *don't* fail over since the first implementation threw an @@ -194,6 +196,27 @@ public class TestFailoverProxy { assertEquals("impl2", unreliable.succeedsOnceThenFailsReturningStringIdempotent()); } + /** + * Test that if a non-idempotent void function is called, and there is an exception, + * the exception is properly propagated + */ + @Test + public void testExceptionPropagatedForNonIdempotentVoid() throws Exception { + UnreliableInterface unreliable = (UnreliableInterface)RetryProxy + .create(UnreliableInterface.class, + new FlipFlopProxyProvider(UnreliableInterface.class, + new UnreliableImplementation("impl1", TypeOfExceptionToFailWith.IO_EXCEPTION), + new UnreliableImplementation("impl2", TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION)), + RetryPolicies.failoverOnNetworkException(1)); + + try { + unreliable.nonIdempotentVoidFailsIfIdentifierDoesntMatch("impl2"); + fail("did not throw an exception"); + } catch (Exception e) { + } + + } + private static class SynchronizedUnreliableImplementation extends UnreliableImplementation { private CountDownLatch methodLatch; @@ -267,4 +290,62 @@ public class TestFailoverProxy { assertEquals("impl2", t2.result); assertEquals(1, proxyProvider.getFailoversOccurred()); } + + /** + * Ensure that when all configured services are throwing StandbyException + * that we fail over back and forth between them until one is no longer + * throwing StandbyException. 
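+ * In this test both implementations initially throw StandbyException; a background thread + * renames impl1 after roughly ten seconds, at which point a retried call against impl1 succeeds.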
+ */ + @Test + public void testFailoverBetweenMultipleStandbys() + throws UnreliableException, StandbyException, IOException { + + final long millisToSleep = 10000; + + final UnreliableImplementation impl1 = new UnreliableImplementation("impl1", + TypeOfExceptionToFailWith.STANDBY_EXCEPTION); + FlipFlopProxyProvider proxyProvider = new FlipFlopProxyProvider( + UnreliableInterface.class, + impl1, + new UnreliableImplementation("impl2", + TypeOfExceptionToFailWith.STANDBY_EXCEPTION)); + + final UnreliableInterface unreliable = (UnreliableInterface)RetryProxy + .create(UnreliableInterface.class, proxyProvider, + RetryPolicies.failoverOnNetworkException( + RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000)); + + new Thread() { + @Override + public void run() { + ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep); + impl1.setIdentifier("renamed-impl1"); + } + }.start(); + + String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1"); + assertEquals("renamed-impl1", result); + } + + /** + * Ensure that normal IO exceptions don't result in a failover. + */ + @Test + public void testExpectedIOException() { + UnreliableInterface unreliable = (UnreliableInterface)RetryProxy + .create(UnreliableInterface.class, + new FlipFlopProxyProvider(UnreliableInterface.class, + new UnreliableImplementation("impl1", TypeOfExceptionToFailWith.REMOTE_EXCEPTION), + new UnreliableImplementation("impl2", TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION)), + RetryPolicies.failoverOnNetworkException( + RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000)); + + try { + unreliable.failsIfIdentifierDoesntMatch("no-such-identifier"); + fail("Should have thrown *some* exception"); + } catch (Exception e) { + assertTrue("Expected IOE but got " + e.getClass(), + e instanceof IOException); + } + } } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java index c48e87b7dd..696f40d837 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java @@ -19,7 +19,6 @@ package org.apache.hadoop.io.retry; import static org.apache.hadoop.io.retry.RetryPolicies.RETRY_FOREVER; -import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_DONT_FAIL; import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_THEN_FAIL; import static org.apache.hadoop.io.retry.RetryPolicies.retryByException; import static org.apache.hadoop.io.retry.RetryPolicies.retryByRemoteException; @@ -59,19 +58,6 @@ public class TestRetryProxy extends TestCase { } } - public void testTryOnceDontFail() throws UnreliableException { - UnreliableInterface unreliable = (UnreliableInterface) - RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_DONT_FAIL); - unreliable.alwaysSucceeds(); - unreliable.failsOnceThenSucceeds(); - try { - unreliable.failsOnceThenSucceedsWithReturnValue(); - fail("Should fail"); - } catch (UnreliableException e) { - // expected - } - } - public void testRetryForever() throws UnreliableException { UnreliableInterface unreliable = (UnreliableInterface) RetryProxy.create(UnreliableInterface.class, unreliableImpl, RETRY_FOREVER); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java index 7fa88b3b08..54fe677844 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java @@ -19,6 +19,7 @@ package org.apache.hadoop.io.retry; import java.io.IOException; +import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ipc.StandbyException; @@ -37,7 +38,8 @@ public class UnreliableImplementation implements UnreliableInterface { public static enum TypeOfExceptionToFailWith { UNRELIABLE_EXCEPTION, STANDBY_EXCEPTION, - IO_EXCEPTION + IO_EXCEPTION, + REMOTE_EXCEPTION } public UnreliableImplementation() { @@ -48,6 +50,10 @@ public class UnreliableImplementation implements UnreliableInterface { this(identifier, TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION); } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + public UnreliableImplementation(String identifier, TypeOfExceptionToFailWith exceptionToFailWith) { this.identifier = identifier; @@ -91,14 +97,7 @@ public class UnreliableImplementation implements UnreliableInterface { if (succeedsOnceThenFailsCount++ < 1) { return identifier; } else { - switch (exceptionToFailWith) { - case STANDBY_EXCEPTION: - throw new StandbyException(identifier); - case UNRELIABLE_EXCEPTION: - throw new UnreliableException(identifier); - case IO_EXCEPTION: - throw new IOException(identifier); - } + throwAppropriateException(exceptionToFailWith, identifier); return null; } } @@ -109,16 +108,8 @@ public class UnreliableImplementation implements UnreliableInterface { if (succeedsTenTimesThenFailsCount++ < 10) { return identifier; } else { - switch (exceptionToFailWith) { - case STANDBY_EXCEPTION: - throw new StandbyException(identifier); - case UNRELIABLE_EXCEPTION: - throw new UnreliableException(identifier); - case IO_EXCEPTION: - throw new IOException(identifier); - default: - throw new RuntimeException(identifier); - } + throwAppropriateException(exceptionToFailWith, identifier); + return null; } } @@ -128,16 +119,8 @@ public class UnreliableImplementation implements UnreliableInterface { if (succeedsOnceThenFailsIdempotentCount++ < 1) { return identifier; } else { - switch (exceptionToFailWith) { - case STANDBY_EXCEPTION: - throw new StandbyException(identifier); - case UNRELIABLE_EXCEPTION: - throw new UnreliableException(identifier); - case IO_EXCEPTION: - throw new IOException(identifier); - default: - throw new RuntimeException(identifier); - } + throwAppropriateException(exceptionToFailWith, identifier); + return null; } } @@ -147,17 +130,38 @@ public class UnreliableImplementation implements UnreliableInterface { if (this.identifier.equals(identifier)) { return identifier; } else { - switch (exceptionToFailWith) { - case STANDBY_EXCEPTION: - throw new StandbyException(identifier); - case UNRELIABLE_EXCEPTION: - throw new UnreliableException(identifier); - case IO_EXCEPTION: - throw new IOException(identifier); - default: - throw new RuntimeException(identifier); - } + String message = "expected '" + this.identifier + "' but received '" + + identifier + "'"; + throwAppropriateException(exceptionToFailWith, message); + return null; + } + } + + @Override + public void nonIdempotentVoidFailsIfIdentifierDoesntMatch(String identifier) + throws 
UnreliableException, StandbyException, IOException { + if (this.identifier.equals(identifier)) { + return; + } else { + String message = "expected '" + this.identifier + "' but received '" + + identifier + "'"; + throwAppropriateException(exceptionToFailWith, message); } } + private static void throwAppropriateException(TypeOfExceptionToFailWith eType, + String message) throws UnreliableException, StandbyException, IOException { + switch (eType) { + case STANDBY_EXCEPTION: + throw new StandbyException(message); + case UNRELIABLE_EXCEPTION: + throw new UnreliableException(message); + case IO_EXCEPTION: + throw new IOException(message); + case REMOTE_EXCEPTION: + throw new RemoteException(IOException.class.getName(), message); + default: + throw new RuntimeException(message); + } + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java index e794c1686c..66a8b85360 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java @@ -67,4 +67,7 @@ public interface UnreliableInterface { @Idempotent public String failsIfIdentifierDoesntMatch(String identifier) throws UnreliableException, StandbyException, IOException; + + void nonIdempotentVoidFailsIfIdentifierDoesntMatch(String identifier) + throws UnreliableException, StandbyException, IOException; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java index 1f3e67a4f9..efb2dc1126 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java @@ -20,7 +20,9 @@ package org.apache.hadoop.ipc; import org.apache.commons.logging.*; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; @@ -590,6 +592,38 @@ public class TestIPC { Server.RECEIVED_HTTP_REQ_RESPONSE.getBytes()); } + @Test + public void testConnectionRetriesOnSocketTimeoutExceptions() throws Exception { + Configuration conf = new Configuration(); + // set max retries to 0 + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + 0); + assertRetriesOnSocketTimeouts(conf, 1); + + // set max retries to 3 + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + 3); + assertRetriesOnSocketTimeouts(conf, 4); + } + + private void assertRetriesOnSocketTimeouts(Configuration conf, + int maxTimeoutRetries) throws IOException, InterruptedException { + SocketFactory mockFactory = Mockito.mock(SocketFactory.class); + doThrow(new SocketTimeoutException()).when(mockFactory).createSocket(); + Client client = new Client(IntWritable.class, conf, mockFactory); + InetSocketAddress address = new InetSocketAddress("127.0.0.1", 9090); + try { + client.call(new IntWritable(RANDOM.nextInt()), address, null, null, 0, + conf); + fail("Not throwing the SocketTimeoutException"); + } catch (SocketTimeoutException e) { + 
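+ // With the retry count set to N the client should have attempted socket creation N + 1 times + // in total, which is what the maxTimeoutRetries parameter captures for the verification below.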
Mockito.verify(mockFactory, Mockito.times(maxTimeoutRetries)) + .createSocket(); + } + } + private void doIpcVersionTest( byte[] requestData, byte[] expectedResponse) throws Exception { diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt new file mode 100644 index 0000000000..3e59df7433 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt @@ -0,0 +1,252 @@ +Changes for HDFS-1623 branch. + +This change list will be merged into the trunk CHANGES.txt when the HDFS-1623 +branch is merged. +------------------------------ + +HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd) + +HDFS-1974. Introduce active and standy states to the namenode. (suresh) + +HDFS-2407. getServerDefaults and getStats don't check operation category (atm) + +HDFS-1973. HA: HDFS clients must handle namenode failover and switch over to the new active namenode. (atm) + +HDFS-2301. Start/stop appropriate namenode services when transition to active and standby states. (suresh) + +HDFS-2231. Configuration changes for HA namenode. (suresh) + +HDFS-2418. Change ConfiguredFailoverProxyProvider to take advantage of HDFS-2231. (atm) + +HDFS-2393. Mark appropriate methods of ClientProtocol with the idempotent annotation. (atm) + +HDFS-2523. Small NN fixes to include HAServiceProtocol and prevent NPE on shutdown. (todd) + +HDFS-2577. NN fails to start since it tries to start secret manager in safemode. (todd) + +HDFS-2582. Scope dfs.ha.namenodes config by nameservice (todd) + +HDFS-2591. MiniDFSCluster support to mix and match federation with HA (todd) + +HDFS-1975. Support for sharing the namenode state from active to standby. (jitendra, atm, todd) + +HDFS-1971. Send block report from datanode to both active and standby namenodes. (sanjay, todd via suresh) + +HDFS-2616. Change DatanodeProtocol#sendHeartbeat() to return HeartbeatResponse. (suresh) + +HDFS-2622. Fix TestDFSUpgrade in HA branch. (todd) + +HDFS-2612. Handle refreshNameNodes in federated HA clusters (todd) + +HDFS-2623. Add test case for hot standby capability (todd) + +HDFS-2626. BPOfferService.verifyAndSetNamespaceInfo needs to be synchronized (todd) + +HDFS-2624. ConfiguredFailoverProxyProvider doesn't correctly stop ProtocolTranslators (todd) + +HDFS-2625. TestDfsOverAvroRpc failing after introduction of HeartbeatResponse type (todd) + +HDFS-2627. Determine DN's view of which NN is active based on heartbeat responses (todd) + +HDFS-2634. Standby needs to ingest latest edit logs before transitioning to active (todd) + +HDFS-2671. NN should throw StandbyException in response to RPCs in STANDBY state (todd) + +HDFS-2680. DFSClient should construct failover proxy with exponential backoff (todd) + +HDFS-2683. Authority-based lookup of proxy provider fails if path becomes canonicalized (todd) + +HDFS-2689. HA: BookKeeperEditLogInputStream doesn't implement isInProgress() (atm) + +HDFS-2602. NN should log newly-allocated blocks without losing BlockInfo (atm) + +HDFS-2667. Fix transition from active to standby (todd) + +HDFS-2684. Fix up some failing unit tests on HA branch (todd) + +HDFS-2679. Add interface to query current state to HAServiceProtocol (eli via todd) + +HDFS-2677. Web UI should indicate the NN state. (eli via todd) + +HDFS-2678. When a FailoverProxyProvider is used, DFSClient should not retry connection ten times before failing over (atm via todd) + +HDFS-2682. 
When a FailoverProxyProvider is used, Client should not retry for 45 times if it is timing out to connect to server. (Uma Maheswara Rao G via todd) + +HDFS-2693. Fix synchronization issues around state transition (todd) + +HDFS-1972. Fencing mechanism for block invalidations and replications (todd) + +HDFS-2714. Fix test cases which use standalone FSNamesystems (todd) + +HDFS-2692. Fix bugs related to failover from/into safe mode. (todd) + +HDFS-2716. Configuration needs to allow different dfs.http.addresses for each HA NN (todd) + +HDFS-2720. Fix MiniDFSCluster HA support to work properly on Windows. (Uma Maheswara Rao G via todd) + +HDFS-2291. Allow the StandbyNode to make checkpoints in an HA setup. (todd) + +HDFS-2709. Appropriately handle error conditions in EditLogTailer (atm via todd) + +HDFS-2730. Refactor shared HA-related test code into HATestUtil class (todd) + +HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd) + +HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd) + +HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. (Hari Mankude and todd via todd) + +HDFS-2773. Reading edit logs from an earlier version should not leave blocks in under-construction state. (todd) + +HDFS-2775. Fix TestStandbyCheckpoints.testBothNodesInStandbyState failing intermittently. (todd) + +HDFS-2766. Test for case where standby partially reads log and then performs checkpoint. (atm) + +HDFS-2738. FSEditLog.selectinputStreams is reading through in-progress streams even when non-in-progress are requested. (atm) + +HDFS-2789. TestHAAdmin.testFailover is failing (eli) + +HDFS-2747. Entering safe mode after starting SBN can NPE. (Uma Maheswara Rao G via todd) + +HDFS-2772. On transition to active, standby should not swallow ELIE. (atm) + +HDFS-2767. ConfiguredFailoverProxyProvider should support NameNodeProtocol. (Uma Maheswara Rao G via todd) + +HDFS-2795. Standby NN takes a long time to recover from a dead DN starting up. (todd) + +HDFS-2592. Balancer support for HA namenodes. (Uma Maheswara Rao G via todd) + +HDFS-2367. Enable the configuration of multiple HA cluster addresses. (atm) + +HDFS-2812. When becoming active, the NN should treat all leases as freshly renewed. (todd) + +HDFS-2737. Automatically trigger log rolls periodically on the active NN. (todd and atm) + +HDFS-2820. Add a simple sanity check for HA config (todd) + +HDFS-2688. Add tests for quota tracking in an HA cluster. (todd) + +HDFS-2804. Should not mark blocks under-replicated when exiting safemode (todd) + +HDFS-2807. Service level authorizartion for HAServiceProtocol. (jitendra) + +HDFS-2809. Add test to verify that delegation tokens are honored after failover. (jitendra and atm) + +HDFS-2838. NPE in FSNamesystem when in safe mode. (Gregory Chanan via eli) + +HDFS-2805. Add a test for a federated cluster with HA NNs. (Brandon Li via jitendra) + +HDFS-2841. HAAdmin does not work if security is enabled. (atm) + +HDFS-2691. Fixes for pipeline recovery in an HA cluster: report RBW replicas immediately upon pipeline creation. (todd) + +HDFS-2824. Fix failover when prior NN died just after creating an edit log segment. (atm via todd) + +HDFS-2853. HA: NN fails to start if the shared edits dir is marked required (atm via eli) + +HDFS-2845. SBN should not allow browsing of the file system via web UI. (Bikas Saha via atm) + +HDFS-2742. HA: observed dataloss in replication stress test. (todd via eli) + +HDFS-2870. 
Fix log level for block debug info in processMisReplicatedBlocks (todd) + +HDFS-2859. LOCAL_ADDRESS_MATCHER.match has NPE when called from DFSUtil.getSuffixIDs when the host is incorrect (Bikas Saha via todd) + +HDFS-2861. checkpointing should verify that the dfs.http.address has been configured to a non-loopback for peer NN (todd) + +HDFS-2860. TestDFSRollback#testRollback is failing. (atm) + +HDFS-2769. HA: When HA is enabled with a shared edits dir, that dir should be +marked required. (atm via eli) + +HDFS-2863. Failures observed if dfs.edits.dir and shared.edits.dir have same directories. (Bikas Saha via atm) + +HDFS-2874. Edit log should log to shared dirs before local dirs. (todd) + +HDFS-2890. DFSUtil#getSuffixIDs should skip unset configurations. (atm) + +HDFS-2792. Make fsck work. (atm) + +HDFS-2808. HA: haadmin should use namenode ids. (eli) + +HDFS-2819. Document new HA-related configs in hdfs-default.xml. (eli) + +HDFS-2752. HA: exit if multiple shared dirs are configured. (eli) + +HDFS-2894. HA: automatically determine the nameservice Id if only one nameservice is configured. (eli) + +HDFS-2733. Document HA configuration and CLI. (atm) + +HDFS-2794. Active NN may purge edit log files before standby NN has a chance to read them (todd) + +HDFS-2901. Improvements for SBN web UI - not show under-replicated/missing blocks. (Brandon Li via jitendra) + +HDFS-2905. HA: Standby NN NPE when shared edits dir is deleted. (Bikas Saha via jitendra) + +HDFS-2579. Starting delegation token manager during safemode fails. (todd) + +HDFS-2510. Add HA-related metrics. (atm) + +HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd) + +HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra) + +HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm) + +HDFS-2917. HA: haadmin should not work if run by regular user (eli) + +HDFS-2939. TestHAStateTransitions fails on Windows. (Uma Maheswara Rao G via atm) + +HDFS-2947. On startup NN throws an NPE in the metrics system. (atm) + +HDFS-2942. TestActiveStandbyElectorRealZK fails if build dir does not exist. (atm) + +HDFS-2948. NN throws NPE during shutdown if it fails to startup (todd) + +HDFS-2909. HA: Inaccessible shared edits dir not getting removed from FSImage storage dirs upon error. (Bikas Saha via jitendra) + +HDFS-2934. Allow configs to be scoped to all NNs in the nameservice. (todd) + +HDFS-2935. Shared edits dir property should be suffixed with nameservice and namenodeID (todd) + +HDFS-2928. ConfiguredFailoverProxyProvider should not create a NameNode proxy with an underlying retry proxy. (Uma Maheswara Rao G via atm) + +HDFS-2955. IllegalStateException during standby startup in getCurSegmentTxId. (Hari Mankude via atm) + +HDFS-2937. TestDFSHAAdmin needs tests with MiniDFSCluster. (Brandon Li via suresh) + +HDFS-2586. Add protobuf service and implementation for HAServiceProtocol. (suresh via atm) + +HDFS-2952. NN should not start with upgrade option or with a pending an unfinalized upgrade. (atm) + +HDFS-2974. MiniDFSCluster does not delete standby NN name dirs during format. (atm) + +HDFS-2929. Stress test and fixes for block synchronization (todd) + +HDFS-2972. Small optimization building incremental block report (todd) + +HDFS-2973. Re-enable NO_ACK optimization for block deletion. (todd) + +HDFS-2922. HA: close out operation categories (eli) + +HDFS-2993. 
HA: BackupNode#checkOperation should permit CHECKPOINT operations (eli) + +HDFS-2904. Client support for getting delegation tokens. (todd) + +HDFS-3013. HA: NameNode format doesn't pick up dfs.namenode.name.dir.NameServiceId configuration (Mingjie Lai via todd) + +HDFS-3019. Fix silent failure of TestEditLogJournalFailures (todd) + +HDFS-2958. Sweep for remaining proxy construction which doesn't go through failover path. (atm) + +HDFS-2920. fix remaining TODO items. (atm and todd) + +HDFS-3027. Implement a simple NN health check. (atm) + +HDFS-3023. Optimize entries in edits log for persistBlocks call. (todd) + +HDFS-2979. Balancer should use logical uri for creating failover proxy with HA enabled. (atm) + +HDFS-3035. Fix failure of TestFileAppendRestart due to OP_UPDATE_BLOCKS (todd) + +HDFS-3039. Address findbugs and javadoc warnings on branch. (todd via atm) diff --git a/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml b/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml index 709e52fb46..301d302825 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml @@ -256,4 +256,12 @@ + + + + + + diff --git a/hadoop-hdfs-project/hadoop-hdfs/pom.xml b/hadoop-hdfs-project/hadoop-hdfs/pom.xml index 0b4da80e8a..3f85de096b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/pom.xml @@ -387,6 +387,7 @@ CHANGES.txt + CHANGES.HDFS-1623.txt .idea/** src/main/conf/* src/main/docs/** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java index 707182ec5c..636471a450 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java @@ -129,6 +129,12 @@ class BookKeeperEditLogInputStream extends EditLogInputStream { return null; } + // TODO(HA): Test this. 
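+ // For now every BookKeeper stream is unconditionally reported as in-progress.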
+ @Override + public boolean isInProgress() { + return true; + } + /** * Input stream implementation which can be used by * FSEditLogOp.Reader diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java index 7fa90269ec..047efd51f4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java @@ -312,8 +312,10 @@ public class BookKeeperJournalManager implements JournalManager { } } + // TODO(HA): Handle inProgressOk @Override - public EditLogInputStream getInputStream(long fromTxnId) throws IOException { + public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk) + throws IOException { for (EditLogLedgerMetadata l : getLedgerList()) { if (l.getFirstTxId() == fromTxnId) { try { @@ -329,8 +331,10 @@ public class BookKeeperJournalManager implements JournalManager { throw new IOException("No ledger for fromTxnId " + fromTxnId + " found."); } + // TODO(HA): Handle inProgressOk @Override - public long getNumberOfTransactions(long fromTxnId) throws IOException { + public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk) + throws IOException { long count = 0; long expectedStart = 0; for (EditLogLedgerMetadata l : getLedgerList()) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java index b949bc200e..5937fa8295 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java @@ -195,7 +195,7 @@ public class TestBookKeeperJournalManager { out.close(); bkjm.finalizeLogSegment(1, 100); - long numTrans = bkjm.getNumberOfTransactions(1); + long numTrans = bkjm.getNumberOfTransactions(1, true); assertEquals(100, numTrans); } @@ -218,17 +218,17 @@ public class TestBookKeeperJournalManager { } zkc.delete(bkjm.finalizedLedgerZNode(DEFAULT_SEGMENT_SIZE+1, DEFAULT_SEGMENT_SIZE*2), -1); - long numTrans = bkjm.getNumberOfTransactions(1); + long numTrans = bkjm.getNumberOfTransactions(1, true); assertEquals(DEFAULT_SEGMENT_SIZE, numTrans); try { - numTrans = bkjm.getNumberOfTransactions(DEFAULT_SEGMENT_SIZE+1); + numTrans = bkjm.getNumberOfTransactions(DEFAULT_SEGMENT_SIZE+1, true); fail("Should have thrown corruption exception by this point"); } catch (JournalManager.CorruptionException ce) { // if we get here, everything is going good } - numTrans = bkjm.getNumberOfTransactions((DEFAULT_SEGMENT_SIZE*2)+1); + numTrans = bkjm.getNumberOfTransactions((DEFAULT_SEGMENT_SIZE*2)+1, true); assertEquals(DEFAULT_SEGMENT_SIZE, numTrans); } @@ -262,7 +262,7 @@ public class TestBookKeeperJournalManager { out.abort(); out.close(); - long numTrans = bkjm.getNumberOfTransactions(1); + long numTrans = bkjm.getNumberOfTransactions(1, true); assertEquals((txid-1), numTrans); } @@ -357,7 
+357,7 @@ public class TestBookKeeperJournalManager { bkjm.finalizeLogSegment(1, numTransactions); - EditLogInputStream in = bkjm.getInputStream(1); + EditLogInputStream in = bkjm.getInputStream(1, true); try { assertEquals(numTransactions, FSEditLogTestUtil.countTransactionsInStream(in)); @@ -392,4 +392,4 @@ public class TestBookKeeperJournalManager { assertNotNull(zkc.exists(bkjm.finalizedLedgerZNode(1, 100), false)); assertNull(zkc.exists(bkjm.inprogressZNode(), false)); } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs b/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs index 4c56bb3f14..a01c939e42 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs @@ -32,6 +32,7 @@ function print_usage(){ echo " namenode run the DFS namenode" echo " datanode run a DFS datanode" echo " dfsadmin run a DFS admin client" + echo " haadmin run a DFS HA admin client" echo " fsck run a DFS filesystem checking utility" echo " balancer run a cluster balancing utility" echo " jmxget get JMX exported values from NameNode or DataNode." @@ -86,6 +87,10 @@ elif [ "$COMMAND" = "dfs" ] ; then elif [ "$COMMAND" = "dfsadmin" ] ; then CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" +elif [ "$COMMAND" = "haadmin" ] ; then + CLASS=org.apache.hadoop.hdfs.tools.DFSHAAdmin + CLASSPATH=${CLASSPATH}:${TOOL_PATH} + HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" elif [ "$COMMAND" = "fsck" ] ; then CLASS=org.apache.hadoop.hdfs.tools.DFSck HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java index 5a45f51ee5..82d0c3663c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java @@ -80,8 +80,7 @@ public class Hdfs extends AbstractFileSystem { throw new IOException("Incomplete HDFS URI, no host: " + theUri); } - InetSocketAddress namenode = NameNode.getAddress(theUri.getAuthority()); - this.dfs = new DFSClient(namenode, conf, getStatistics()); + this.dfs = new DFSClient(theUri, conf, getStatistics()); } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java index 359fd47a6c..88b36b73b9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java @@ -1,4 +1,3 @@ - /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file @@ -30,6 +29,8 @@ import java.net.InetSocketAddress; import java.net.NetworkInterface; import java.net.Socket; import java.net.SocketException; +import java.net.URI; +import java.net.URISyntaxException; import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; @@ -60,6 +61,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.fs.permission.FsPermission; import static org.apache.hadoop.hdfs.DFSConfigKeys.*; + import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.CorruptFileBlocks; import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException; @@ -83,6 +85,7 @@ import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseP import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpBlockChecksumResponseProto; import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; +import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport; @@ -105,7 +108,8 @@ import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenRenewer; import org.apache.hadoop.util.DataChecksum; import org.apache.hadoop.util.Progressable; -import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException; + +import com.google.common.base.Preconditions; /******************************************************** * DFSClient can connect to a Hadoop Filesystem and @@ -124,7 +128,9 @@ public class DFSClient implements java.io.Closeable { public static final long SERVER_DEFAULTS_VALIDITY_PERIOD = 60 * 60 * 1000L; // 1 hour static final int TCP_WINDOW_SIZE = 128 * 1024; // 128 KB final ClientProtocol namenode; - private final InetSocketAddress nnAddress; + /* The service used for delegation tokens */ + private Text dtService; + final UserGroupInformation ugi; volatile boolean clientRunning = true; private volatile FsServerDefaults serverDefaults; @@ -143,6 +149,9 @@ public class DFSClient implements java.io.Closeable { * DFSClient configuration */ static class Conf { + final int maxFailoverAttempts; + final int failoverSleepBaseMillis; + final int failoverSleepMaxMillis; final int maxBlockAcquireFailures; final int confTime; final int ioBufferSize; @@ -164,6 +173,16 @@ public class DFSClient implements java.io.Closeable { final boolean useLegacyBlockReader; Conf(Configuration conf) { + maxFailoverAttempts = conf.getInt( + DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY, + DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT); + failoverSleepBaseMillis = conf.getInt( + DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_KEY, + DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_DEFAULT); + failoverSleepMaxMillis = conf.getInt( + DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY, + DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_DEFAULT); + maxBlockAcquireFailures = conf.getInt( DFS_CLIENT_MAX_BLOCK_ACQUIRE_FAILURES_KEY, DFS_CLIENT_MAX_BLOCK_ACQUIRE_FAILURES_DEFAULT); @@ -236,6 +255,7 @@ public class DFSClient implements java.io.Closeable { */ private final Map filesBeingWritten = new HashMap(); + private boolean shortCircuitLocalReads; /** @@ -247,59 +267,69 @@ public class DFSClient implements java.io.Closeable { public DFSClient(Configuration conf) throws IOException { 
this(NameNode.getAddress(conf), conf); } + + public DFSClient(InetSocketAddress address, Configuration conf) throws IOException { + this(NameNode.getUri(address), conf); + } /** - * Same as this(nameNodeAddr, conf, null); + * Same as this(nameNodeUri, conf, null); * @see #DFSClient(InetSocketAddress, Configuration, org.apache.hadoop.fs.FileSystem.Statistics) */ - public DFSClient(InetSocketAddress nameNodeAddr, Configuration conf + public DFSClient(URI nameNodeUri, Configuration conf ) throws IOException { - this(nameNodeAddr, conf, null); + this(nameNodeUri, conf, null); } /** - * Same as this(nameNodeAddr, null, conf, stats); + * Same as this(nameNodeUri, null, conf, stats); * @see #DFSClient(InetSocketAddress, ClientProtocol, Configuration, org.apache.hadoop.fs.FileSystem.Statistics) */ - public DFSClient(InetSocketAddress nameNodeAddr, Configuration conf, + public DFSClient(URI nameNodeUri, Configuration conf, FileSystem.Statistics stats) throws IOException { - this(nameNodeAddr, null, conf, stats); + this(nameNodeUri, null, conf, stats); } - + /** - * Create a new DFSClient connected to the given nameNodeAddr or rpcNamenode. - * Exactly one of nameNodeAddr or rpcNamenode must be null. + * Create a new DFSClient connected to the given nameNodeUri or rpcNamenode. + * Exactly one of nameNodeUri or rpcNamenode must be null. */ - DFSClient(InetSocketAddress nameNodeAddr, ClientProtocol rpcNamenode, + DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode, Configuration conf, FileSystem.Statistics stats) throws IOException { // Copy only the required DFSClient configuration this.dfsClientConf = new Conf(conf); this.conf = conf; this.stats = stats; - this.nnAddress = nameNodeAddr; this.socketFactory = NetUtils.getSocketFactory(conf, ClientProtocol.class); this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf); // The hdfsTimeout is currently the same as the ipc timeout this.hdfsTimeout = Client.getTimeout(conf); this.ugi = UserGroupInformation.getCurrentUser(); - final String authority = nameNodeAddr == null? "null": - nameNodeAddr.getHostName() + ":" + nameNodeAddr.getPort(); + + final String authority = nameNodeUri == null? "null": nameNodeUri.getAuthority(); this.leaserenewer = LeaseRenewer.getInstance(authority, ugi, this); this.clientName = leaserenewer.getClientName(dfsClientConf.taskId); + this.socketCache = new SocketCache(dfsClientConf.socketCacheCapacity); - if (nameNodeAddr != null && rpcNamenode == null) { - this.namenode = DFSUtil.createNamenode(nameNodeAddr, conf, ugi); - } else if (nameNodeAddr == null && rpcNamenode != null) { - //This case is used for testing. + + + if (rpcNamenode != null) { + // This case is used for testing. + Preconditions.checkArgument(nameNodeUri == null); this.namenode = rpcNamenode; + dtService = null; } else { - throw new IllegalArgumentException( - "Expecting exactly one of nameNodeAddr and rpcNamenode being null: " - + "nameNodeAddr=" + nameNodeAddr + ", rpcNamenode=" + rpcNamenode); + Preconditions.checkArgument(nameNodeUri != null, + "null URI"); + NameNodeProxies.ProxyAndInfo proxyInfo = + NameNodeProxies.createProxy(conf, nameNodeUri, ClientProtocol.class); + this.dtService = proxyInfo.getDelegationTokenService(); + this.namenode = proxyInfo.getProxy(); } + // read directly from the block file if configured. this.shortCircuitLocalReads = conf.getBoolean( DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, @@ -388,20 +418,8 @@ public class DFSClient implements java.io.Closeable { /** * Close connections the Namenode. 
- * The namenode variable is either a rpcProxy passed by a test or - * created using the protocolTranslator which is closeable. - * If closeable then call close, else close using RPC.stopProxy(). */ void closeConnectionToNamenode() { - if (namenode instanceof Closeable) { - try { - ((Closeable) namenode).close(); - return; - } catch (IOException e) { - // fall through - lets try the stopProxy - LOG.warn("Exception closing namenode, stopping the proxy"); - } - } RPC.stopProxy(namenode); } @@ -491,11 +509,13 @@ public class DFSClient implements java.io.Closeable { */ public Token getDelegationToken(Text renewer) throws IOException { - Token result = + assert dtService != null; + Token token = namenode.getDelegationToken(renewer); - SecurityUtil.setTokenService(result, nnAddress); - LOG.info("Created " + DelegationTokenIdentifier.stringifyToken(result)); - return result; + token.setService(this.dtService); + + LOG.info("Created " + DelegationTokenIdentifier.stringifyToken(token)); + return token; } /** @@ -625,13 +645,8 @@ public class DFSClient implements java.io.Closeable { @Override public long renew(Token token, Configuration conf) throws IOException { Token delToken = - (Token) token; - LOG.info("Renewing " + - DelegationTokenIdentifier.stringifyToken(delToken)); - ClientProtocol nn = - DFSUtil.createNamenode - (SecurityUtil.getTokenServiceAddr(delToken), - conf, UserGroupInformation.getCurrentUser()); + (Token) token; + ClientProtocol nn = getNNProxy(delToken, conf); try { return nn.renewDelegationToken(delToken); } catch (RemoteException re) { @@ -647,9 +662,7 @@ public class DFSClient implements java.io.Closeable { (Token) token; LOG.info("Cancelling " + DelegationTokenIdentifier.stringifyToken(delToken)); - ClientProtocol nn = DFSUtil.createNamenode( - SecurityUtil.getTokenServiceAddr(delToken), conf, - UserGroupInformation.getCurrentUser()); + ClientProtocol nn = getNNProxy(delToken, conf); try { nn.cancelDelegationToken(delToken); } catch (RemoteException re) { @@ -657,6 +670,31 @@ public class DFSClient implements java.io.Closeable { AccessControlException.class); } } + + private static ClientProtocol getNNProxy( + Token token, Configuration conf) + throws IOException { + URI uri = HAUtil.getServiceUriFromToken(token); + if (HAUtil.isTokenForLogicalUri(token) && + !HAUtil.isLogicalUri(conf, uri)) { + // If the token is for a logical nameservice, but the configuration + // we have disagrees about that, we can't actually renew it. + // This can be the case in MR, for example, if the RM doesn't + // have all of the HA clusters configured in its configuration. + throw new IOException("Unable to map logical nameservice URI '" + + uri + "' to a NameNode. 
Local configuration does not have " + + "a failover proxy provider configured."); + } + + NameNodeProxies.ProxyAndInfo info = + NameNodeProxies.createProxy(conf, uri, ClientProtocol.class); + assert info.getDelegationTokenService().equals(token.getService()) : + "Returned service '" + info.getDelegationTokenService().toString() + + "' doesn't match expected service '" + + token.getService().toString() + "'"; + + return info.getProxy(); + } @Override public boolean isManaged(Token token) throws IOException { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index fef6d8b9ca..4187f1c5c7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -48,6 +48,19 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final String DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_POLICY_DEFAULT = "DEFAULT"; public static final String DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY = "dfs.client.socketcache.capacity"; public static final int DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT = 16; + + // HA related configuration + public static final String DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX = "dfs.client.failover.proxy.provider"; + public static final String DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY = "dfs.client.failover.max.attempts"; + public static final int DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT = 15; + public static final String DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_KEY = "dfs.client.failover.sleep.base.millis"; + public static final int DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_DEFAULT = 500; + public static final String DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY = "dfs.client.failover.sleep.max.millis"; + public static final int DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_DEFAULT = 15000; + public static final String DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_KEY = "dfs.client.failover.connection.retries"; + public static final int DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_DEFAULT = 0; + public static final String DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_KEY = "dfs.client.failover.connection.retries.on.timeouts"; + public static final int DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT = 0; public static final String DFS_NAMENODE_BACKUP_ADDRESS_KEY = "dfs.namenode.backup.address"; public static final String DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT = "localhost:50100"; @@ -120,6 +133,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final boolean DFS_WEBHDFS_ENABLED_DEFAULT = false; public static final String DFS_PERMISSIONS_ENABLED_KEY = "dfs.permissions.enabled"; public static final boolean DFS_PERMISSIONS_ENABLED_DEFAULT = true; + public static final String DFS_PERSIST_BLOCKS_KEY = "dfs.persist.blocks"; + public static final boolean DFS_PERSIST_BLOCKS_DEFAULT = false; public static final String DFS_PERMISSIONS_SUPERUSERGROUP_KEY = "dfs.permissions.superusergroup"; public static final String DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT = "supergroup"; public static final String DFS_ADMIN = "dfs.cluster.administrators"; @@ -131,6 +146,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final boolean DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT = true; public static final String DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY = "dfs.namenode.num.checkpoints.retained"; 
public static final int DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT = 2; + public static final String DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY = "dfs.namenode.num.extra.edits.retained"; + public static final int DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_DEFAULT = 1000000; //1M + public static final String DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY = "dfs.namenode.edits.dir.minimum"; public static final int DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT = 1; @@ -150,6 +168,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final long DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT = 24*60*60*1000; // 1 day public static final String DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY = "dfs.namenode.delegation.token.max-lifetime"; public static final long DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT = 7*24*60*60*1000; // 7 days + public static final String DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY = "dfs.namenode.delegation.token.always-use"; // for tests + public static final boolean DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT = false; //Filesystem limit keys public static final String DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY = "dfs.namenode.fs-limits.max-component-length"; @@ -165,6 +185,7 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final String DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_NAMENODE_HTTPS_PORT_DEFAULT; public static final String DFS_NAMENODE_NAME_DIR_KEY = "dfs.namenode.name.dir"; public static final String DFS_NAMENODE_EDITS_DIR_KEY = "dfs.namenode.edits.dir"; + public static final String DFS_NAMENODE_SHARED_EDITS_DIR_KEY = "dfs.namenode.shared.edits.dir"; public static final String DFS_NAMENODE_EDITS_PLUGIN_PREFIX = "dfs.namenode.edits.journal-plugin"; public static final String DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY = "dfs.namenode.edits.dir.required"; public static final String DFS_CLIENT_READ_PREFETCH_SIZE_KEY = "dfs.client.read.prefetch.size"; @@ -298,8 +319,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final String DFS_NAMENODE_NAME_CACHE_THRESHOLD_KEY = "dfs.namenode.name.cache.threshold"; public static final int DFS_NAMENODE_NAME_CACHE_THRESHOLD_DEFAULT = 10; - public static final String DFS_FEDERATION_NAMESERVICES = "dfs.federation.nameservices"; - public static final String DFS_FEDERATION_NAMESERVICE_ID = "dfs.federation.nameservice.id"; + public static final String DFS_FEDERATION_NAMESERVICES = "dfs.federation.nameservices"; + public static final String DFS_FEDERATION_NAMESERVICE_ID = "dfs.federation.nameservice.id"; public static final String DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY = "dfs.namenode.resource.check.interval"; public static final int DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT = 5000; public static final String DFS_NAMENODE_DU_RESERVED_KEY = "dfs.namenode.resource.du.reserved"; @@ -309,5 +330,16 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final int DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT = 1; public static final String DFS_WEB_AUTHENTICATION_KERBEROS_PRINCIPAL_KEY = "dfs.web.authentication.kerberos.principal"; public static final String DFS_WEB_AUTHENTICATION_KERBEROS_KEYTAB_KEY = "dfs.web.authentication.kerberos.keytab"; + public static final String DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY = "dfs.block.local-path-access.user"; + + // HA related configuration + public static final String DFS_HA_NAMENODES_KEY_PREFIX = "dfs.ha.namenodes"; + public static final String DFS_HA_NAMENODE_ID_KEY = "dfs.ha.namenode.id"; + 
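As a rough sketch of how the client-side HA keys introduced above fit together for one logical nameservice: the nameservice ID "mycluster", the NameNode IDs and host names, the dfs.namenode.rpc-address key string, and the ConfiguredFailoverProxyProvider class name below are illustrative assumptions, not values defined in this patch.

import org.apache.hadoop.conf.Configuration;

public class HaClientConfSketch {
  public static Configuration buildConf() {
    Configuration conf = new Configuration();
    // One logical nameservice and the two NameNodes that belong to it
    // (IDs and hosts are hypothetical).
    conf.set("dfs.federation.nameservices", "mycluster");
    conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
    conf.set("dfs.namenode.rpc-address.mycluster.nn1", "nn1.example.com:8020");
    conf.set("dfs.namenode.rpc-address.mycluster.nn2", "nn2.example.com:8020");
    // Failover proxy provider consulted for the logical URI hdfs://mycluster;
    // the provider class name is assumed here.
    conf.set("dfs.client.failover.proxy.provider.mycluster",
        "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    // Retry and backoff knobs read by DFSClient.Conf in this patch
    // (values shown match the defaults declared above).
    conf.setInt("dfs.client.failover.max.attempts", 15);
    conf.setInt("dfs.client.failover.sleep.base.millis", 500);
    conf.setInt("dfs.client.failover.sleep.max.millis", 15000);
    return conf;
  }
}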
public static final String DFS_HA_STANDBY_CHECKPOINTS_KEY = "dfs.ha.standby.checkpoints"; + public static final boolean DFS_HA_STANDBY_CHECKPOINTS_DEFAULT = true; + public static final String DFS_HA_LOGROLL_PERIOD_KEY = "dfs.ha.log-roll.period"; + public static final int DFS_HA_LOGROLL_PERIOD_DEFAULT = 2 * 60; // 2m + public static final String DFS_HA_TAILEDITS_PERIOD_KEY = "dfs.ha.tail-edits.period"; + public static final int DFS_HA_TAILEDITS_PERIOD_DEFAULT = 60; // 1m } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java index 7064616780..cbc0f0ea23 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java @@ -18,24 +18,21 @@ package org.apache.hadoop.hdfs; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICES; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY; - +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.InetSocketAddress; import java.net.URI; import java.net.URISyntaxException; import java.security.SecureRandom; -import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.StringTokenizer; import javax.net.SocketFactory; @@ -46,9 +43,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; -import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolTranslatorPB; @@ -59,11 +56,19 @@ import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import com.google.protobuf.BlockingService; @InterfaceAudience.Private public class DFSUtil { + public static final Log LOG = LogFactory.getLog(DFSUtil.class.getName()); + + private DFSUtil() { /* Hidden constructor */ } private static final ThreadLocal RANDOM = new ThreadLocal() { @Override protected Random initialValue() { @@ -101,13 +106,20 @@ public class DFSUtil { a.isDecommissioned() ? 
1 : -1; } }; + /** + * Address matcher for matching an address to local address + */ + static final AddressMatcher LOCAL_ADDRESS_MATCHER = new AddressMatcher() { + public boolean match(InetSocketAddress s) { + return NetUtils.isLocalAddress(s.getAddress()); + }; + }; /** * Whether the pathname is valid. Currently prohibits relative paths, * and names which contain a ":" or "/" */ public static boolean isValidName(String src) { - // Path must be absolute. if (!src.startsWith(Path.SEPARATOR)) { return false; @@ -304,12 +316,38 @@ public class DFSUtil { /** * Returns collection of nameservice Ids from the configuration. * @param conf configuration - * @return collection of nameservice Ids + * @return collection of nameservice Ids, or null if not specified */ public static Collection getNameServiceIds(Configuration conf) { - return conf.getStringCollection(DFS_FEDERATION_NAMESERVICES); + return conf.getTrimmedStringCollection(DFS_FEDERATION_NAMESERVICES); } + /** + * @return coll if it is non-null and non-empty. Otherwise, + * returns a list with a single null value. + */ + private static Collection emptyAsSingletonNull(Collection coll) { + if (coll == null || coll.isEmpty()) { + return Collections.singletonList(null); + } else { + return coll; + } + } + + /** + * Namenode HighAvailability related configuration. + * Returns collection of namenode Ids from the configuration. One logical id + * for each namenode in the in the HA setup. + * + * @param conf configuration + * @param nsId the nameservice ID to look at, or null for non-federated + * @return collection of namenode Ids + */ + public static Collection getNameNodeIds(Configuration conf, String nsId) { + String key = addSuffix(DFS_HA_NAMENODES_KEY_PREFIX, nsId); + return conf.getTrimmedStringCollection(key); + } + /** * Given a list of keys in the order of preference, returns a value * for the key in the given order from the configuration. @@ -323,9 +361,7 @@ public class DFSUtil { Configuration conf, String... keys) { String value = null; for (String key : keys) { - if (keySuffix != null) { - key += "." + keySuffix; - } + key = addSuffix(key, keySuffix); value = conf.get(key); if (value != null) { break; @@ -337,36 +373,84 @@ public class DFSUtil { return value; } + /** Add non empty and non null suffix to a key */ + private static String addSuffix(String key, String suffix) { + if (suffix == null || suffix.isEmpty()) { + return key; + } + assert !suffix.startsWith(".") : + "suffix '" + suffix + "' should not already have '.' prepended."; + return key + "." + suffix; + } + + /** Concatenate list of suffix strings '.' separated */ + private static String concatSuffixes(String... suffixes) { + if (suffixes == null) { + return null; + } + return Joiner.on(".").skipNulls().join(suffixes); + } + /** - * Returns list of InetSocketAddress for a given set of keys. - * @param conf configuration - * @param defaultAddress default address to return in case key is not found - * @param keys Set of keys to look for in the order of preference - * @return list of InetSocketAddress corresponding to the key + * Return configuration key of format key.suffix1.suffix2...suffixN */ - private static List getAddresses(Configuration conf, + public static String addKeySuffixes(String key, String... suffixes) { + String keySuffix = concatSuffixes(suffixes); + return addSuffix(key, keySuffix); + } + + /** + * Returns the configured address for all NameNodes in the cluster. 
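A minimal illustration of the key-suffix scheme implemented by addSuffix/concatSuffixes/addKeySuffixes above; the IDs "ns1" and "nn1" are hypothetical.

import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;

public class KeySuffixSketch {
  public static void main(String[] args) {
    // Builds a key of the form <rpc-address-key>.ns1.nn1, which is the shape
    // the getAddresses() helpers resolve for each configured NameNode.
    String key = DFSUtil.addKeySuffixes(
        DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn1");
    System.out.println(key);
  }
}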
+ * @param conf configuration + * @param defaultAddress default address to return in case key is not found. + * @param keys Set of keys to look for in the order of preference + * @return a map(nameserviceId to map(namenodeId to InetSocketAddress)) + */ + private static Map> + getAddresses(Configuration conf, String defaultAddress, String... keys) { Collection nameserviceIds = getNameServiceIds(conf); - List isas = new ArrayList(); - - // Configuration with a single namenode - if (nameserviceIds == null || nameserviceIds.isEmpty()) { - String address = getConfValue(defaultAddress, null, conf, keys); - if (address == null) { - return null; - } - isas.add(NetUtils.createSocketAddr(address)); - } else { - // Get the namenodes for all the configured nameServiceIds - for (String nameserviceId : nameserviceIds) { - String address = getConfValue(null, nameserviceId, conf, keys); - if (address == null) { - return null; - } - isas.add(NetUtils.createSocketAddr(address)); + + // Look for configurations of the form [.][.] + // across all of the configured nameservices and namenodes. + Map> ret = Maps.newHashMap(); + for (String nsId : emptyAsSingletonNull(nameserviceIds)) { + Map isas = + getAddressesForNameserviceId(conf, nsId, defaultAddress, keys); + if (!isas.isEmpty()) { + ret.put(nsId, isas); } } - return isas; + return ret; + } + + private static Map getAddressesForNameserviceId( + Configuration conf, String nsId, String defaultValue, + String[] keys) { + Collection nnIds = getNameNodeIds(conf, nsId); + Map ret = Maps.newHashMap(); + for (String nnId : emptyAsSingletonNull(nnIds)) { + String suffix = concatSuffixes(nsId, nnId); + String address = getConfValue(defaultValue, suffix, conf, keys); + if (address != null) { + InetSocketAddress isa = NetUtils.createSocketAddr(address); + ret.put(nnId, isa); + } + } + return ret; + } + + /** + * Returns list of InetSocketAddress corresponding to HA NN RPC addresses from + * the configuration. 
+ * + * @param conf configuration + * @return list of InetSocketAddresses + * @throws IOException if no addresses are configured + */ + public static Map> getHaNnRpcAddresses( + Configuration conf) { + return getAddresses(conf, null, DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY); } /** @@ -377,11 +461,11 @@ public class DFSUtil { * @return list of InetSocketAddresses * @throws IOException on error */ - public static List getBackupNodeAddresses( + public static Map> getBackupNodeAddresses( Configuration conf) throws IOException { - List addressList = getAddresses(conf, + Map> addressList = getAddresses(conf, null, DFS_NAMENODE_BACKUP_ADDRESS_KEY); - if (addressList == null) { + if (addressList.isEmpty()) { throw new IOException("Incorrect configuration: backup node address " + DFS_NAMENODE_BACKUP_ADDRESS_KEY + " is not configured."); } @@ -396,11 +480,11 @@ public class DFSUtil { * @return list of InetSocketAddresses * @throws IOException on error */ - public static List getSecondaryNameNodeAddresses( + public static Map> getSecondaryNameNodeAddresses( Configuration conf) throws IOException { - List addressList = getAddresses(conf, null, + Map> addressList = getAddresses(conf, null, DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY); - if (addressList == null) { + if (addressList.isEmpty()) { throw new IOException("Incorrect configuration: secondary namenode address " + DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY + " is not configured."); } @@ -420,7 +504,7 @@ public class DFSUtil { * @return list of InetSocketAddress * @throws IOException on error */ - public static List getNNServiceRpcAddresses( + public static Map> getNNServiceRpcAddresses( Configuration conf) throws IOException { // Use default address as fall back String defaultAddress; @@ -430,9 +514,10 @@ public class DFSUtil { defaultAddress = null; } - List addressList = getAddresses(conf, defaultAddress, + Map> addressList = + getAddresses(conf, defaultAddress, DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, DFS_NAMENODE_RPC_ADDRESS_KEY); - if (addressList == null) { + if (addressList.isEmpty()) { throw new IOException("Incorrect configuration: namenode address " + DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY + " or " + DFS_NAMENODE_RPC_ADDRESS_KEY @@ -442,10 +527,154 @@ public class DFSUtil { } /** - * Given the InetSocketAddress for any configured communication with a - * namenode, this method returns the corresponding nameservice ID, - * by doing a reverse lookup on the list of nameservices until it - * finds a match. + * Flatten the given map, as returned by other functions in this class, + * into a flat list of {@link ConfiguredNNAddress} instances. + */ + public static List flattenAddressMap( + Map> map) { + List ret = Lists.newArrayList(); + + for (Map.Entry> entry : + map.entrySet()) { + String nsId = entry.getKey(); + Map nnMap = entry.getValue(); + for (Map.Entry e2 : nnMap.entrySet()) { + String nnId = e2.getKey(); + InetSocketAddress addr = e2.getValue(); + + ret.add(new ConfiguredNNAddress(nsId, nnId, addr)); + } + } + return ret; + } + + /** + * Format the given map, as returned by other functions in this class, + * into a string suitable for debugging display. The format of this string + * should not be considered an interface, and is liable to change. 
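A hedged usage sketch of the address-map helpers (getHaNnRpcAddresses, flattenAddressMap, and the addressMapToString/ConfiguredNNAddress pieces that follow); generic type parameters are written out here even though they are elided in this rendering of the patch.

import java.net.InetSocketAddress;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress;

public class ListNameNodesSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // nameserviceId -> (namenodeId -> RPC address), as parsed from the conf.
    Map<String, Map<String, InetSocketAddress>> map =
        DFSUtil.getHaNnRpcAddresses(conf);
    // Flatten the nested map for simple iteration.
    List<ConfiguredNNAddress> nns = DFSUtil.flattenAddressMap(map);
    for (ConfiguredNNAddress nn : nns) {
      System.out.println(nn.getNameserviceId() + "/" + nn.getNamenodeId()
          + " -> " + nn.getAddress());
    }
    // Or dump the whole map for debugging (format is not a stable interface).
    System.out.println(DFSUtil.addressMapToString(map));
  }
}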
+ */ + public static String addressMapToString( + Map> map) { + StringBuilder b = new StringBuilder(); + for (Map.Entry> entry : + map.entrySet()) { + String nsId = entry.getKey(); + Map nnMap = entry.getValue(); + b.append("Nameservice <").append(nsId).append(">:").append("\n"); + for (Map.Entry e2 : nnMap.entrySet()) { + b.append(" NN ID ").append(e2.getKey()) + .append(" => ").append(e2.getValue()).append("\n"); + } + } + return b.toString(); + } + + public static String nnAddressesAsString(Configuration conf) { + Map> addresses = + getHaNnRpcAddresses(conf); + return addressMapToString(addresses); + } + + /** + * Represent one of the NameNodes configured in the cluster. + */ + public static class ConfiguredNNAddress { + private final String nameserviceId; + private final String namenodeId; + private final InetSocketAddress addr; + + private ConfiguredNNAddress(String nameserviceId, String namenodeId, + InetSocketAddress addr) { + this.nameserviceId = nameserviceId; + this.namenodeId = namenodeId; + this.addr = addr; + } + + public String getNameserviceId() { + return nameserviceId; + } + + public String getNamenodeId() { + return namenodeId; + } + + public InetSocketAddress getAddress() { + return addr; + } + + @Override + public String toString() { + return "ConfiguredNNAddress[nsId=" + nameserviceId + ";" + + "nnId=" + namenodeId + ";addr=" + addr + "]"; + } + } + + /** + * Get a URI for each configured nameservice. If a nameservice is + * HA-enabled, then the logical URI of the nameservice is returned. If the + * nameservice is not HA-enabled, then a URI corresponding to an RPC address + * of the single NN for that nameservice is returned, preferring the service + * RPC address over the client RPC address. + * + * @param conf configuration + * @return a collection of all configured NN URIs, preferring service + * addresses + */ + public static Collection getNsServiceRpcUris(Configuration conf) { + return getNameServiceUris(conf, + DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY); + } + + /** + * Get a URI for each configured nameservice. If a nameservice is + * HA-enabled, then the logical URI of the nameservice is returned. If the + * nameservice is not HA-enabled, then a URI corresponding to the address of + * the single NN for that nameservice is returned. + * + * @param conf configuration + * @param keys configuration keys to try in order to get the URI for non-HA + * nameservices + * @return a collection of all configured NN URIs + */ + public static Collection getNameServiceUris(Configuration conf, + String... keys) { + Set ret = new HashSet(); + for (String nsId : getNameServiceIds(conf)) { + if (HAUtil.isHAEnabled(conf, nsId)) { + // Add the logical URI of the nameservice. + try { + ret.add(new URI(HdfsConstants.HDFS_URI_SCHEME + "://" + nsId)); + } catch (URISyntaxException ue) { + throw new IllegalArgumentException(ue); + } + } else { + // Add the URI corresponding to the address of the NN. + for (String key : keys) { + String addr = conf.get(concatSuffixes(key, nsId)); + if (addr != null) { + ret.add(createUri(HdfsConstants.HDFS_URI_SCHEME, + NetUtils.createSocketAddr(addr))); + break; + } + } + } + } + // Add the generic configuration keys. 
+ for (String key : keys) { + String addr = conf.get(key); + if (addr != null) { + ret.add(createUri("hdfs", NetUtils.createSocketAddr(addr))); + break; + } + } + return ret; + } + + /** + * Given the InetSocketAddress this method returns the nameservice Id + * corresponding to the key with matching address, by doing a reverse + * lookup on the list of nameservices until it finds a match. * * Since the process of resolving URIs to Addresses is slightly expensive, * this utility method should not be used in performance-critical routines. @@ -463,91 +692,109 @@ public class DFSUtil { * not the NameServiceId-suffixed keys. * @return nameserviceId, or null if no match found */ - public static String getNameServiceIdFromAddress(Configuration conf, - InetSocketAddress address, String... keys) { - Collection nameserviceIds = getNameServiceIds(conf); - + public static String getNameServiceIdFromAddress(final Configuration conf, + final InetSocketAddress address, String... keys) { // Configuration with a single namenode and no nameserviceId - if (nameserviceIds == null || nameserviceIds.isEmpty()) { - return null; - } - // Get the candidateAddresses for all the configured nameServiceIds - for (String nameserviceId : nameserviceIds) { - for (String key : keys) { - String candidateAddress = conf.get( - getNameServiceIdKey(key, nameserviceId)); - if (candidateAddress != null - && address.equals(NetUtils.createSocketAddr(candidateAddress))) - return nameserviceId; - } - } - // didn't find a match - return null; + String[] ids = getSuffixIDs(conf, address, keys); + return (ids != null) ? ids[0] : null; } - + /** - * return server http or https address from the configuration + * return server http or https address from the configuration for a + * given namenode rpc address. * @param conf - * @param namenode - namenode address + * @param namenodeAddr - namenode RPC address * @param httpsAddress -If true, and if security is enabled, returns server * https address. If false, returns server http address. * @return server http or https address */ public static String getInfoServer( - InetSocketAddress namenode, Configuration conf, boolean httpsAddress) { - String httpAddress = null; - - String httpAddressKey = (UserGroupInformation.isSecurityEnabled() - && httpsAddress) ? DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY - : DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY; - String httpAddressDefault = (UserGroupInformation.isSecurityEnabled() - && httpsAddress) ? DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT - : DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_DEFAULT; - if(namenode != null) { + InetSocketAddress namenodeAddr, Configuration conf, boolean httpsAddress) { + boolean securityOn = UserGroupInformation.isSecurityEnabled(); + String httpAddressKey = (securityOn && httpsAddress) ? + DFS_NAMENODE_HTTPS_ADDRESS_KEY : DFS_NAMENODE_HTTP_ADDRESS_KEY; + String httpAddressDefault = (securityOn && httpsAddress) ? 
+ DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT : DFS_NAMENODE_HTTP_ADDRESS_DEFAULT; + + String suffixes[]; + if (namenodeAddr != null) { // if non-default namenode, try reverse look up // the nameServiceID if it is available - String nameServiceId = DFSUtil.getNameServiceIdFromAddress( - conf, namenode, + suffixes = getSuffixIDs(conf, namenodeAddr, DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY); - - if (nameServiceId != null) { - httpAddress = conf.get(DFSUtil.getNameServiceIdKey( - httpAddressKey, nameServiceId)); - } - } - // else - Use non-federation style configuration - if (httpAddress == null) { - httpAddress = conf.get(httpAddressKey, httpAddressDefault); + } else { + suffixes = new String[2]; } - return httpAddress; + return getSuffixedConf(conf, httpAddressKey, httpAddressDefault, suffixes); } + /** - * @return key specific to a nameserviceId from a generic key + * Substitute a default host in the case that an address has been configured + * with a wildcard. This is used, for example, when determining the HTTP + * address of the NN -- if it's configured to bind to 0.0.0.0, we want to + * substitute the hostname from the filesystem URI rather than trying to + * connect to 0.0.0.0. + * @param configuredAddress the address found in the configuration + * @param defaultHost the host to substitute with, if configuredAddress + * is a local/wildcard address. + * @return the substituted address + * @throws IOException if it is a wildcard address and security is enabled */ - public static String getNameServiceIdKey(String key, String nameserviceId) { - return key + "." + nameserviceId; + public static String substituteForWildcardAddress(String configuredAddress, + String defaultHost) throws IOException { + InetSocketAddress sockAddr = NetUtils.createSocketAddr(configuredAddress); + if (sockAddr.getAddress().isAnyLocalAddress()) { + if(UserGroupInformation.isSecurityEnabled()) { + throw new IOException("Cannot use a wildcard address with security. " + + "Must explicitly set bind address for Kerberos"); + } + return defaultHost + ":" + sockAddr.getPort(); + } else { + return configuredAddress; + } + } + + private static String getSuffixedConf(Configuration conf, + String key, String defaultVal, String[] suffixes) { + String ret = conf.get(DFSUtil.addKeySuffixes(key, suffixes)); + if (ret != null) { + return ret; + } + return conf.get(key, defaultVal); } /** * Sets the node specific setting into generic configuration key. Looks up - * value of "key.nameserviceId" and if found sets that value into generic key - * in the conf. Note that this only modifies the runtime conf. + * value of "key.nameserviceId.namenodeId" and if found sets that value into + * generic key in the conf. If this is not found, falls back to + * "key.nameserviceId" and then the unmodified key. + * + * Note that this only modifies the runtime conf. * * @param conf * Configuration object to lookup specific key and to set the value * to the key passed. Note the conf object is modified. * @param nameserviceId - * nameservice Id to construct the node specific key. + * nameservice Id to construct the node specific key. Pass null if + * federation is not configuration. + * @param nnId + * namenode Id to construct the node specific key. Pass null if + * HA is not configured. * @param keys * The key for which node specific value is looked up */ public static void setGenericConf(Configuration conf, - String nameserviceId, String... keys) { + String nameserviceId, String nnId, String... 
keys) { for (String key : keys) { - String value = conf.get(getNameServiceIdKey(key, nameserviceId)); + String value = conf.get(addKeySuffixes(key, nameserviceId, nnId)); + if (value != null) { + conf.set(key, value); + continue; + } + value = conf.get(addKeySuffixes(key, nameserviceId)); if (value != null) { conf.set(key, value); } @@ -572,34 +819,7 @@ public class DFSUtil { public static int roundBytesToGB(long bytes) { return Math.round((float)bytes/ 1024 / 1024 / 1024); } - - - /** Create a {@link NameNode} proxy */ - public static ClientProtocol createNamenode(Configuration conf) - throws IOException { - return createNamenode(NameNode.getAddress(conf), conf); - } - - /** Create a {@link NameNode} proxy */ - public static ClientProtocol createNamenode( InetSocketAddress nameNodeAddr, - Configuration conf) throws IOException { - return createNamenode(nameNodeAddr, conf, - UserGroupInformation.getCurrentUser()); - } - - /** Create a {@link NameNode} proxy */ - public static ClientProtocol createNamenode( InetSocketAddress nameNodeAddr, - Configuration conf, UserGroupInformation ugi) throws IOException { - /** - * Currently we have simply burnt-in support for a SINGLE - * protocol - protocolPB. This will be replaced - * by a way to pick the right protocol based on the - * version of the target server. - */ - return new org.apache.hadoop.hdfs.protocolPB. - ClientNamenodeProtocolTranslatorPB(nameNodeAddr, conf, ugi); - } - + /** Create a {@link ClientDatanodeProtocol} proxy */ public static ClientDatanodeProtocol createClientDatanodeProtocolProxy( DatanodeID datanodeid, Configuration conf, int socketTimeout, @@ -622,9 +842,9 @@ public class DFSUtil { SocketFactory factory) throws IOException { return new ClientDatanodeProtocolTranslatorPB(addr, ticket, conf, factory); } - + /** - * Get name service Id for the {@link NameNode} based on namenode RPC address + * Get nameservice Id for the {@link NameNode} based on namenode RPC address * matching the local node address. */ public static String getNamenodeNameServiceId(Configuration conf) { @@ -632,7 +852,7 @@ public class DFSUtil { } /** - * Get name service Id for the BackupNode based on backup node RPC address + * Get nameservice Id for the BackupNode based on backup node RPC address * matching the local node address. */ public static String getBackupNameServiceId(Configuration conf) { @@ -640,7 +860,7 @@ public class DFSUtil { } /** - * Get name service Id for the secondary node based on secondary http address + * Get nameservice Id for the secondary node based on secondary http address * matching the local node address. */ public static String getSecondaryNameServiceId(Configuration conf) { @@ -652,13 +872,14 @@ public class DFSUtil { * the address of the local node. * * If {@link DFSConfigKeys#DFS_FEDERATION_NAMESERVICE_ID} is not specifically - * configured, this method determines the nameservice Id by matching the local - * nodes address with the configured addresses. When a match is found, it - * returns the nameservice Id from the corresponding configuration key. + * configured, and more than one nameservice Id is configured, this method + * determines the nameservice Id by matching the local node's address with the + * configured addresses. When a match is found, it returns the nameservice Id + * from the corresponding configuration key. * * @param conf Configuration * @param addressKey configuration key to get the address. - * @return name service Id on success, null on failure. 
+ * @return nameservice Id on success, null if federation is not configured. * @throws HadoopIllegalArgumentException on error */ private static String getNameServiceId(Configuration conf, String addressKey) { @@ -666,34 +887,106 @@ public class DFSUtil { if (nameserviceId != null) { return nameserviceId; } - - Collection ids = getNameServiceIds(conf); - if (ids == null || ids.size() == 0) { - // Not federation configuration, hence no nameservice Id - return null; + Collection nsIds = getNameServiceIds(conf); + if (1 == nsIds.size()) { + return nsIds.toArray(new String[1])[0]; } + String nnId = conf.get(DFS_HA_NAMENODE_ID_KEY); - // Match the rpc address with that of local address + return getSuffixIDs(conf, addressKey, null, nnId, LOCAL_ADDRESS_MATCHER)[0]; + } + + /** + * Returns nameservice Id and namenode Id when the local host matches the + * configuration parameter {@code addressKey}.. + * + * @param conf Configuration + * @param addressKey configuration key corresponding to the address. + * @param knownNsId only look at configs for the given nameservice, if not-null + * @param knownNNId only look at configs for the given namenode, if not null + * @param matcher matching criteria for matching the address + * @return Array with nameservice Id and namenode Id on success. First element + * in the array is nameservice Id and second element is namenode Id. + * Null value indicates that the configuration does not have the the + * Id. + * @throws HadoopIllegalArgumentException on error + */ + static String[] getSuffixIDs(final Configuration conf, final String addressKey, + String knownNsId, String knownNNId, + final AddressMatcher matcher) { + String nameserviceId = null; + String namenodeId = null; int found = 0; - for (String id : ids) { - String addr = conf.get(getNameServiceIdKey(addressKey, id)); - InetSocketAddress s = NetUtils.createSocketAddr(addr); - if (NetUtils.isLocalAddress(s.getAddress())) { - nameserviceId = id; - found++; + + Collection nsIds = getNameServiceIds(conf); + for (String nsId : emptyAsSingletonNull(nsIds)) { + if (knownNsId != null && !knownNsId.equals(nsId)) { + continue; + } + + Collection nnIds = getNameNodeIds(conf, nsId); + for (String nnId : emptyAsSingletonNull(nnIds)) { + if (LOG.isTraceEnabled()) { + LOG.trace(String.format("addressKey: %s nsId: %s nnId: %s", + addressKey, nsId, nnId)); + } + if (knownNNId != null && !knownNNId.equals(nnId)) { + continue; + } + String key = addKeySuffixes(addressKey, nsId, nnId); + String addr = conf.get(key); + if (addr == null) { + continue; + } + InetSocketAddress s = null; + try { + s = NetUtils.createSocketAddr(addr); + } catch (Exception e) { + LOG.warn("Exception in creating socket address " + addr, e); + continue; + } + if (!s.isUnresolved() && matcher.match(s)) { + nameserviceId = nsId; + namenodeId = nnId; + found++; + } } } if (found > 1) { // Only one address must match the local address - throw new HadoopIllegalArgumentException( - "Configuration has multiple RPC addresses that matches " - + "the local node's address. Please configure the system with " - + "the parameter " + DFS_FEDERATION_NAMESERVICE_ID); + String msg = "Configuration has multiple addresses that match " + + "local node's address. 
Please configure the system with " + + DFS_FEDERATION_NAMESERVICE_ID + " and " + + DFS_HA_NAMENODE_ID_KEY; + throw new HadoopIllegalArgumentException(msg); } - if (found == 0) { - throw new HadoopIllegalArgumentException("Configuration address " - + addressKey + " is missing in configuration with name service Id"); + return new String[] { nameserviceId, namenodeId }; + } + + /** + * For given set of {@code keys} adds nameservice Id and or namenode Id + * and returns {nameserviceId, namenodeId} when address match is found. + * @see #getSuffixIDs(Configuration, String, AddressMatcher) + */ + static String[] getSuffixIDs(final Configuration conf, + final InetSocketAddress address, final String... keys) { + AddressMatcher matcher = new AddressMatcher() { + @Override + public boolean match(InetSocketAddress s) { + return address.equals(s); + } + }; + + for (String key : keys) { + String[] ids = getSuffixIDs(conf, key, null, null, matcher); + if (ids != null && (ids [0] != null || ids[1] != null)) { + return ids; + } } - return nameserviceId; + return null; + } + + private interface AddressMatcher { + public boolean match(InetSocketAddress s); } /** Create a URI from the scheme and address */ @@ -719,4 +1012,39 @@ public class DFSUtil { RPC.setProtocolEngine(conf, protocol, ProtobufRpcEngine.class); server.addProtocol(RpcKind.RPC_PROTOCOL_BUFFER, protocol, service); } + + /** + * Map a logical namenode ID to its service address. Use the given + * nameservice if specified, or the configured one if none is given. + * + * @param conf Configuration + * @param nsId which nameservice nnId is a part of, optional + * @param nnId the namenode ID to get the service addr for + * @return the service addr, null if it could not be determined + */ + public static String getNamenodeServiceAddr(final Configuration conf, + String nsId, String nnId) { + + if (nsId == null) { + Collection nsIds = getNameServiceIds(conf); + if (1 == nsIds.size()) { + nsId = nsIds.toArray(new String[1])[0]; + } else { + // No nameservice ID was given and more than one is configured + return null; + } + } + + String serviceAddrKey = concatSuffixes( + DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nsId, nnId); + + String addrKey = concatSuffixes( + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, nnId); + + String serviceRpcAddr = conf.get(serviceAddrKey); + if (serviceRpcAddr == null) { + serviceRpcAddr = conf.get(addrKey); + } + return serviceRpcAddr; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java index 119bca9b55..8dfced350a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java @@ -106,8 +106,7 @@ public class DistributedFileSystem extends FileSystem { throw new IOException("Incomplete HDFS URI, no host: "+ uri); } - InetSocketAddress namenode = NameNode.getAddress(uri.getAuthority()); - this.dfs = new DFSClient(namenode, conf, statistics); + this.dfs = new DFSClient(uri, conf, statistics); this.uri = URI.create(uri.getScheme()+"://"+uri.getAuthority()); this.workingDir = getHomeDirectory(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java new file mode 100644 index 
0000000000..34e9d2e9dd --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java @@ -0,0 +1,261 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.HadoopIllegalArgumentException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSelector; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import static org.apache.hadoop.hdfs.protocol.HdfsConstants.HA_DT_SERVICE_PREFIX; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +public class HAUtil { + + private static final Log LOG = + LogFactory.getLog(HAUtil.class); + + private HAUtil() { /* Hidden constructor */ } + + /** + * Returns true if HA for namenode is configured for the given nameservice + * + * @param conf Configuration + * @param nsId nameservice, or null if no federated NS is configured + * @return true if HA is configured in the configuration; else false. + */ + public static boolean isHAEnabled(Configuration conf, String nsId) { + Map> addresses = + DFSUtil.getHaNnRpcAddresses(conf); + if (addresses == null) return false; + Map nnMap = addresses.get(nsId); + return nnMap != null && nnMap.size() > 1; + } + + /** + * Returns true if HA is using a shared edits directory. + * + * @param conf Configuration + * @return true if HA config is using a shared edits dir, false otherwise. + */ + public static boolean usesSharedEditsDir(Configuration conf) { + return null != conf.get(DFS_NAMENODE_SHARED_EDITS_DIR_KEY); + } + + /** + * Get the namenode Id by matching the {@code addressKey} + * with the the address of the local node. + * + * If {@link DFSConfigKeys#DFS_HA_NAMENODE_ID_KEY} is not specifically + * configured, this method determines the namenode Id by matching the local + * node's address with the configured addresses. When a match is found, it + * returns the namenode Id from the corresponding configuration key. 
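A small hedged sketch of the two HAUtil checks defined above; "mycluster" is a hypothetical nameservice ID.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HAUtil;

public class HaCheckSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // True only if more than one NameNode is configured for the nameservice.
    boolean ha = HAUtil.isHAEnabled(conf, "mycluster");
    // True when dfs.namenode.shared.edits.dir is set.
    boolean sharedEdits = HAUtil.usesSharedEditsDir(conf);
    System.out.println("HA enabled: " + ha + ", shared edits: " + sharedEdits);
  }
}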
+ * + * @param conf Configuration + * @return namenode Id on success, null on failure. + * @throws HadoopIllegalArgumentException on error + */ + public static String getNameNodeId(Configuration conf, String nsId) { + String namenodeId = conf.getTrimmed(DFS_HA_NAMENODE_ID_KEY); + if (namenodeId != null) { + return namenodeId; + } + + String suffixes[] = DFSUtil.getSuffixIDs(conf, DFS_NAMENODE_RPC_ADDRESS_KEY, + nsId, null, DFSUtil.LOCAL_ADDRESS_MATCHER); + if (suffixes == null) { + String msg = "Configuration " + DFS_NAMENODE_RPC_ADDRESS_KEY + + " must be suffixed with nameservice and namenode ID for HA " + + "configuration."; + throw new HadoopIllegalArgumentException(msg); + } + + return suffixes[1]; + } + + /** + * Similar to + * {@link DFSUtil#getNameServiceIdFromAddress(Configuration, + * InetSocketAddress, String...)} + */ + public static String getNameNodeIdFromAddress(final Configuration conf, + final InetSocketAddress address, String... keys) { + // Configuration with a single namenode and no nameserviceId + String[] ids = DFSUtil.getSuffixIDs(conf, address, keys); + if (ids != null && ids.length > 1) { + return ids[1]; + } + return null; + } + + /** + * Given the configuration for this node, return a Configuration object for + * the other node in an HA setup. + * + * @param myConf the configuration of this node + * @return the configuration of the other node in an HA setup + */ + public static Configuration getConfForOtherNode( + Configuration myConf) { + + String nsId = DFSUtil.getNamenodeNameServiceId(myConf); + Preconditions.checkArgument(nsId != null, + "Could not determine namespace id. Please ensure that this " + + "machine is one of the machines listed as a NN RPC address, " + + "or configure " + DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID); + + Collection nnIds = DFSUtil.getNameNodeIds(myConf, nsId); + String myNNId = myConf.get(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY); + Preconditions.checkArgument(nnIds != null, + "Could not determine namenode ids in namespace '%s'. " + + "Please configure " + + DFSUtil.addKeySuffixes(DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX, + nsId), + nsId); + Preconditions.checkArgument(nnIds.size() == 2, + "Expected exactly 2 NameNodes in namespace '%s'. " + + "Instead, got only %s (NN ids were '%s'", + nsId, nnIds.size(), Joiner.on("','").join(nnIds)); + Preconditions.checkState(myNNId != null && !myNNId.isEmpty(), + "Could not determine own NN ID in namespace '%s'. Please " + + "ensure that this node is one of the machines listed as an " + + "NN RPC address, or configure " + DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, + nsId); + + ArrayList nnSet = Lists.newArrayList(nnIds); + nnSet.remove(myNNId); + assert nnSet.size() == 1; + String activeNN = nnSet.get(0); + + // Look up the address of the active NN. + Configuration confForOtherNode = new Configuration(myConf); + NameNode.initializeGenericKeys(confForOtherNode, nsId, activeNN); + return confForOtherNode; + } + + /** + * This is used only by tests at the moment. + * @return true if the NN should allow read operations while in standby mode. + */ + public static boolean shouldAllowStandbyReads(Configuration conf) { + return conf.getBoolean("dfs.ha.allow.stale.reads", false); + } + + public static void setAllowStandbyReads(Configuration conf, boolean val) { + conf.setBoolean("dfs.ha.allow.stale.reads", val); + } + + /** + * @return true if the given nameNodeUri appears to be a logical URI. + * This is the case if there is a failover proxy provider configured + * for it in the given configuration. 
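The token helpers that follow (buildTokenServiceForLogicalUri, isTokenForLogicalUri, getServiceUriFromToken) can be exercised roughly as below; "mycluster" is a hypothetical logical nameservice and the empty Token is only a stand-in for one issued by a NameNode.

import java.net.URI;

import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.token.Token;

public class LogicalUriTokenSketch {
  public static void main(String[] args) throws Exception {
    URI logicalUri = new URI("hdfs://mycluster");
    // Service name stored in tokens for the logical URI: the HA prefix plus
    // the nameservice, rather than a host:port pair.
    Text service = HAUtil.buildTokenServiceForLogicalUri(logicalUri);

    Token<DelegationTokenIdentifier> token =
        new Token<DelegationTokenIdentifier>();
    token.setService(service);

    // Round-trip: recognize the token as logical and recover its URI.
    System.out.println(HAUtil.isTokenForLogicalUri(token));   // true
    System.out.println(HAUtil.getServiceUriFromToken(token)); // hdfs://mycluster
  }
}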
+ */ + public static boolean isLogicalUri( + Configuration conf, URI nameNodeUri) { + String host = nameNodeUri.getHost(); + String configKey = DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." + + host; + return conf.get(configKey) != null; + } + + /** + * Parse the HDFS URI out of the provided token. + * @throws IOException if the token is invalid + */ + public static URI getServiceUriFromToken( + Token token) + throws IOException { + String tokStr = token.getService().toString(); + + if (tokStr.startsWith(HA_DT_SERVICE_PREFIX)) { + tokStr = tokStr.replaceFirst(HA_DT_SERVICE_PREFIX, ""); + } + + try { + return new URI(HdfsConstants.HDFS_URI_SCHEME + "://" + + tokStr); + } catch (URISyntaxException e) { + throw new IOException("Invalid token contents: '" + + tokStr + "'"); + } + } + + /** + * Get the service name used in the delegation token for the given logical + * HA service. + * @param uri the logical URI of the cluster + * @return the service name + */ + public static Text buildTokenServiceForLogicalUri(URI uri) { + return new Text(HA_DT_SERVICE_PREFIX + uri.getHost()); + } + + /** + * @return true if this token corresponds to a logical nameservice + * rather than a specific namenode. + */ + public static boolean isTokenForLogicalUri( + Token token) { + return token.getService().toString().startsWith(HA_DT_SERVICE_PREFIX); + } + + /** + * Locate a delegation token associated with the given HA cluster URI, and if + * one is found, clone it to also represent the underlying namenode address. + * @param ugi the UGI to modify + * @param haUri the logical URI for the cluster + * @param singleNNAddr one of the NNs in the cluster to which the token + * applies + */ + public static void cloneDelegationTokenForLogicalUri( + UserGroupInformation ugi, URI haUri, + InetSocketAddress singleNNAddr) { + Text haService = buildTokenServiceForLogicalUri(haUri); + Token haToken = + DelegationTokenSelector.selectHdfsDelegationToken(haService, ugi); + if (haToken == null) { + // no token + return; + } + Token specificToken = + new Token(haToken); + specificToken.setService(SecurityUtil.buildTokenService(singleNNAddr)); + ugi.addToken(specificToken); + LOG.debug("Mapped HA service delegation token for logical URI " + + haUri + " to namenode " + singleNNAddr); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java index 27702b5795..6e212458d0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.ha.HAServiceProtocol; import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; @@ -44,6 +45,8 @@ public class HDFSPolicyProvider extends PolicyProvider { new Service("security.inter.datanode.protocol.acl", InterDatanodeProtocol.class), new Service("security.namenode.protocol.acl", NamenodeProtocol.class), + new Service(CommonConfigurationKeys.SECURITY_HA_SERVICE_PROTOCOL_ACL, + HAServiceProtocol.class), new Service( CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_AUTHORIZATION_REFRESH_POLICY, 
RefreshAuthorizationPolicyProtocol.class), diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java new file mode 100644 index 0000000000..650c313c0a --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java @@ -0,0 +1,333 @@ +package org.apache.hadoop.hdfs; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX; + +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.net.InetSocketAddress; +import java.net.URI; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSClient.Conf; +import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; +import org.apache.hadoop.hdfs.protocol.ClientProtocol; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.JournalProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.JournalProtocolTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolPB; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.FailoverProxyProvider; +import org.apache.hadoop.io.retry.RetryPolicies; +import org.apache.hadoop.io.retry.RetryPolicy; +import org.apache.hadoop.io.retry.RetryProxy; +import org.apache.hadoop.ipc.ProtobufRpcEngine; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.RefreshUserMappingsProtocol; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; +import org.apache.hadoop.tools.GetUserMappingsProtocol; + +import com.google.common.base.Preconditions; + +/** + * Create proxy objects to communicate with a remote NN. All remote access to an + * NN should be funneled through this class. Most of the time you'll want to use + * {@link NameNodeProxies#createProxy(Configuration, URI, Class)}, which will + * create either an HA- or non-HA-enabled client proxy as appropriate. 
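A usage sketch of the createProxy entry point described here; the logical URI "hdfs://mycluster" is illustrative, and the HA branch is taken only when a failover proxy provider is configured for that authority (otherwise a plain single-NN proxy is returned):

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.NameNodeProxies;
    import org.apache.hadoop.hdfs.NameNodeProxies.ProxyAndInfo;
    import org.apache.hadoop.hdfs.protocol.ClientProtocol;
    import org.apache.hadoop.io.Text;

    public class CreateProxySketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        ProxyAndInfo<ClientProtocol> info = NameNodeProxies.createProxy(
            conf, URI.create("hdfs://mycluster"), ClientProtocol.class);

        ClientProtocol namenode = info.getProxy();
        // "ha-hdfs:mycluster" for a logical URI, host:port for a direct one.
        Text dtService = info.getDelegationTokenService();

        System.out.println(dtService + ": " + namenode.getServerDefaults());
      }
    }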
+ */ +public class NameNodeProxies { + + private static final Log LOG = LogFactory.getLog(NameNodeProxies.class); + + /** + * Wrapper for a client proxy as well as its associated service ID. + * This is simply used as a tuple-like return type for + * {@link NameNodeProxies#createProxy} and + * {@link NameNodeProxies#createNonHAProxy}. + */ + public static class ProxyAndInfo { + private final PROXYTYPE proxy; + private final Text dtService; + + public ProxyAndInfo(PROXYTYPE proxy, Text dtService) { + this.proxy = proxy; + this.dtService = dtService; + } + + public PROXYTYPE getProxy() { + return proxy; + } + + public Text getDelegationTokenService() { + return dtService; + } + } + + /** + * Creates the namenode proxy with the passed protocol. This will handle + * creation of either HA- or non-HA-enabled proxy objects, depending upon + * if the provided URI is a configured logical URI. + * + * @param conf the configuration containing the required IPC + * properties, client failover configurations, etc. + * @param nameNodeUri the URI pointing either to a specific NameNode + * or to a logical nameservice. + * @param xface the IPC interface which should be created + * @return an object containing both the proxy and the associated + * delegation token service it corresponds to + * @throws IOException if there is an error creating the proxy + **/ + @SuppressWarnings("unchecked") + public static ProxyAndInfo createProxy(Configuration conf, + URI nameNodeUri, Class xface) throws IOException { + Class> failoverProxyProviderClass = + getFailoverProxyProviderClass(conf, nameNodeUri, xface); + + if (failoverProxyProviderClass == null) { + // Non-HA case + return createNonHAProxy(conf, NameNode.getAddress(nameNodeUri), xface, + UserGroupInformation.getCurrentUser(), true); + } else { + // HA case + FailoverProxyProvider failoverProxyProvider = NameNodeProxies + .createFailoverProxyProvider(conf, failoverProxyProviderClass, xface, + nameNodeUri); + Conf config = new Conf(conf); + T proxy = (T) RetryProxy.create(xface, failoverProxyProvider, RetryPolicies + .failoverOnNetworkException(RetryPolicies.TRY_ONCE_THEN_FAIL, + config.maxFailoverAttempts, config.failoverSleepBaseMillis, + config.failoverSleepMaxMillis)); + + Text dtService = HAUtil.buildTokenServiceForLogicalUri(nameNodeUri); + return new ProxyAndInfo(proxy, dtService); + } + } + + /** + * Creates an explicitly non-HA-enabled proxy object. Most of the time you + * don't want to use this, and should instead use {@link NameNodeProxies#createProxy}. 
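For the rarer direct path mentioned here, a sketch of requesting an explicitly non-HA NamenodeProtocol proxy against one concrete NameNode (the address and the withRetries flag are illustrative; with retries enabled, getBlocks and getAccessKeys get the backoff policy set up in createNNProxyWithNamenodeProtocol below):

    import java.net.InetSocketAddress;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.NameNodeProxies;
    import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
    import org.apache.hadoop.security.UserGroupInformation;

    public class NonHaProxySketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Talk to one concrete NameNode, bypassing any HA/failover wrapping.
        InetSocketAddress nnAddr = new InetSocketAddress("nn1.example.com", 8020);

        NamenodeProtocol nn = NameNodeProxies
            .createNonHAProxy(conf, nnAddr, NamenodeProtocol.class,
                UserGroupInformation.getCurrentUser(), true /* withRetries */)
            .getProxy();

        System.out.println(nn.versionRequest());
      }
    }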
+ * + * @param conf the configuration object + * @param nnAddr address of the remote NN to connect to + * @param xface the IPC interface which should be created + * @param ugi the user who is making the calls on the proxy object + * @param withRetries certain interfaces have a non-standard retry policy + * @return an object containing both the proxy and the associated + * delegation token service it corresponds to + * @throws IOException + */ + @SuppressWarnings("unchecked") + public static ProxyAndInfo createNonHAProxy( + Configuration conf, InetSocketAddress nnAddr, Class xface, + UserGroupInformation ugi, boolean withRetries) throws IOException { + Text dtService = SecurityUtil.buildTokenService(nnAddr); + + T proxy; + if (xface == ClientProtocol.class) { + proxy = (T) createNNProxyWithClientProtocol(nnAddr, conf, ugi, + withRetries); + } else if (xface == JournalProtocol.class) { + proxy = (T) createNNProxyWithJournalProtocol(nnAddr, conf, ugi); + } else if (xface == NamenodeProtocol.class) { + proxy = (T) createNNProxyWithNamenodeProtocol(nnAddr, conf, ugi, + withRetries); + } else if (xface == GetUserMappingsProtocol.class) { + proxy = (T) createNNProxyWithGetUserMappingsProtocol(nnAddr, conf, ugi); + } else if (xface == RefreshUserMappingsProtocol.class) { + proxy = (T) createNNProxyWithRefreshUserMappingsProtocol(nnAddr, conf, ugi); + } else if (xface == RefreshAuthorizationPolicyProtocol.class) { + proxy = (T) createNNProxyWithRefreshAuthorizationPolicyProtocol(nnAddr, + conf, ugi); + } else { + String message = "Upsupported protocol found when creating the proxy " + + "connection to NameNode: " + + ((xface != null) ? xface.getClass().getName() : "null"); + LOG.error(message); + throw new IllegalStateException(message); + } + return new ProxyAndInfo(proxy, dtService); + } + + private static JournalProtocol createNNProxyWithJournalProtocol( + InetSocketAddress address, Configuration conf, UserGroupInformation ugi) + throws IOException { + JournalProtocolPB proxy = (JournalProtocolPB) createNameNodeProxy(address, + conf, ugi, JournalProtocolPB.class); + return new JournalProtocolTranslatorPB(proxy); + } + + private static RefreshAuthorizationPolicyProtocol + createNNProxyWithRefreshAuthorizationPolicyProtocol(InetSocketAddress address, + Configuration conf, UserGroupInformation ugi) throws IOException { + RefreshAuthorizationPolicyProtocolPB proxy = (RefreshAuthorizationPolicyProtocolPB) + createNameNodeProxy(address, conf, ugi, RefreshAuthorizationPolicyProtocolPB.class); + return new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB(proxy); + } + + private static RefreshUserMappingsProtocol + createNNProxyWithRefreshUserMappingsProtocol(InetSocketAddress address, + Configuration conf, UserGroupInformation ugi) throws IOException { + RefreshUserMappingsProtocolPB proxy = (RefreshUserMappingsProtocolPB) + createNameNodeProxy(address, conf, ugi, RefreshUserMappingsProtocolPB.class); + return new RefreshUserMappingsProtocolClientSideTranslatorPB(proxy); + } + + private static GetUserMappingsProtocol createNNProxyWithGetUserMappingsProtocol( + InetSocketAddress address, Configuration conf, UserGroupInformation ugi) + throws IOException { + GetUserMappingsProtocolPB proxy = (GetUserMappingsProtocolPB) + createNameNodeProxy(address, conf, ugi, GetUserMappingsProtocolPB.class); + return new GetUserMappingsProtocolClientSideTranslatorPB(proxy); + } + + private static NamenodeProtocol createNNProxyWithNamenodeProtocol( + InetSocketAddress address, Configuration conf, 
UserGroupInformation ugi, + boolean withRetries) throws IOException { + NamenodeProtocolPB proxy = (NamenodeProtocolPB) createNameNodeProxy( + address, conf, ugi, NamenodeProtocolPB.class); + if (withRetries) { // create the proxy with retries + RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry(5, 200, + TimeUnit.MILLISECONDS); + Map, RetryPolicy> exceptionToPolicyMap + = new HashMap, RetryPolicy>(); + RetryPolicy methodPolicy = RetryPolicies.retryByException(timeoutPolicy, + exceptionToPolicyMap); + Map methodNameToPolicyMap + = new HashMap(); + methodNameToPolicyMap.put("getBlocks", methodPolicy); + methodNameToPolicyMap.put("getAccessKeys", methodPolicy); + proxy = (NamenodeProtocolPB) RetryProxy.create(NamenodeProtocolPB.class, + proxy, methodNameToPolicyMap); + } + return new NamenodeProtocolTranslatorPB(proxy); + } + + private static ClientProtocol createNNProxyWithClientProtocol( + InetSocketAddress address, Configuration conf, UserGroupInformation ugi, + boolean withRetries) throws IOException { + ClientNamenodeProtocolPB proxy = (ClientNamenodeProtocolPB) NameNodeProxies + .createNameNodeProxy(address, conf, ugi, ClientNamenodeProtocolPB.class); + if (withRetries) { // create the proxy with retries + RetryPolicy createPolicy = RetryPolicies + .retryUpToMaximumCountWithFixedSleep(5, + HdfsConstants.LEASE_SOFTLIMIT_PERIOD, TimeUnit.MILLISECONDS); + + Map, RetryPolicy> remoteExceptionToPolicyMap + = new HashMap, RetryPolicy>(); + remoteExceptionToPolicyMap.put(AlreadyBeingCreatedException.class, + createPolicy); + + Map, RetryPolicy> exceptionToPolicyMap + = new HashMap, RetryPolicy>(); + exceptionToPolicyMap.put(RemoteException.class, RetryPolicies + .retryByRemoteException(RetryPolicies.TRY_ONCE_THEN_FAIL, + remoteExceptionToPolicyMap)); + RetryPolicy methodPolicy = RetryPolicies.retryByException( + RetryPolicies.TRY_ONCE_THEN_FAIL, exceptionToPolicyMap); + Map methodNameToPolicyMap + = new HashMap(); + + methodNameToPolicyMap.put("create", methodPolicy); + + proxy = (ClientNamenodeProtocolPB) RetryProxy + .create(ClientNamenodeProtocolPB.class, proxy, methodNameToPolicyMap); + } + return new ClientNamenodeProtocolTranslatorPB(proxy); + } + + @SuppressWarnings("unchecked") + private static Object createNameNodeProxy(InetSocketAddress address, + Configuration conf, UserGroupInformation ugi, Class xface) + throws IOException { + RPC.setProtocolEngine(conf, xface, ProtobufRpcEngine.class); + Object proxy = RPC.getProxy(xface, RPC.getProtocolVersion(xface), address, + ugi, conf, NetUtils.getDefaultSocketFactory(conf)); + return proxy; + } + + /** Gets the configured Failover proxy provider's class */ + private static Class> getFailoverProxyProviderClass( + Configuration conf, URI nameNodeUri, Class xface) throws IOException { + if (nameNodeUri == null) { + return null; + } + String host = nameNodeUri.getHost(); + + String configKey = DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." + + host; + try { + @SuppressWarnings("unchecked") + Class> ret = (Class>) conf + .getClass(configKey, null, FailoverProxyProvider.class); + if (ret != null) { + // If we found a proxy provider, then this URI should be a logical NN. + // Given that, it shouldn't have a non-default port number. 
+ int port = nameNodeUri.getPort(); + if (port > 0 && port != NameNode.DEFAULT_PORT) { + throw new IOException("Port " + port + " specified in URI " + + nameNodeUri + " but host '" + host + + "' is a logical (HA) namenode" + + " and does not use port information."); + } + } + return ret; + } catch (RuntimeException e) { + if (e.getCause() instanceof ClassNotFoundException) { + throw new IOException("Could not load failover proxy provider class " + + conf.get(configKey) + " which is configured for authority " + + nameNodeUri, e); + } else { + throw e; + } + } + } + + /** Creates the Failover proxy provider instance*/ + @SuppressWarnings("unchecked") + private static FailoverProxyProvider createFailoverProxyProvider( + Configuration conf, Class> failoverProxyProviderClass, + Class xface, URI nameNodeUri) throws IOException { + Preconditions.checkArgument( + xface.isAssignableFrom(NamenodeProtocols.class), + "Interface %s is not a NameNode protocol", xface); + try { + Constructor> ctor = failoverProxyProviderClass + .getConstructor(Configuration.class, URI.class, Class.class); + FailoverProxyProvider provider = ctor.newInstance(conf, nameNodeUri, + xface); + return (FailoverProxyProvider) provider; + } catch (Exception e) { + String message = "Couldn't create proxy provider " + failoverProxyProviderClass; + if (LOG.isDebugEnabled()) { + LOG.debug(message, e); + } + if (e.getCause() instanceof IOException) { + throw (IOException) e.getCause(); + } else { + throw new IOException(message, e); + } + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java index e1006a65d4..58af5fd50c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java @@ -40,7 +40,7 @@ import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; * - followed by the invalid replica represented with three -1s; * - followed by the under-construction replica list where each replica is * represented by 4 longs: three for the block id, length, generation - * stamp, and the forth for the replica state. + * stamp, and the fourth for the replica state. 
*/ @InterfaceAudience.Private @InterfaceStability.Evolving @@ -304,4 +304,16 @@ public class BlockListAsLongs implements Iterable { blockList[idx+1] = -1; blockList[idx+2] = -1; } + + public long getMaxGsInBlockList() { + long maxGs = -1; + Iterator iter = getBlockReportIterator(); + while (iter.hasNext()) { + Block b = iter.next(); + if (b.getGenerationStamp() > maxGs) { + maxGs = b.getGenerationStamp(); + } + } + return maxGs; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java index ab6babcce3..099fd284ff 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException; import org.apache.hadoop.hdfs.server.namenode.SafeModeException; import org.apache.hadoop.io.EnumSetWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.Idempotent; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.KerberosInfo; import org.apache.hadoop.security.token.Token; @@ -114,6 +115,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException If src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public LocatedBlocks getBlockLocations(String src, long offset, long length) @@ -125,6 +127,7 @@ public interface ClientProtocol { * @return a set of server default configuration values * @throws IOException */ + @Idempotent public FsServerDefaults getServerDefaults() throws IOException; /** @@ -228,6 +231,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public boolean setReplication(String src, short replication) throws AccessControlException, DSQuotaExceededException, FileNotFoundException, SafeModeException, UnresolvedLinkException, @@ -242,6 +246,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException If src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public void setPermission(String src, FsPermission permission) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException; @@ -259,12 +264,13 @@ public interface ClientProtocol { * @throws UnresolvedLinkException If src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public void setOwner(String src, String username, String groupname) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException; /** - * The client can give up on a blcok by calling abandonBlock(). + * The client can give up on a block by calling abandonBlock(). * The client can then * either obtain a new block, or complete or abandon the file. * Any partial writes to the block will be discarded. 
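The @Idempotent tags being added throughout this interface feed the client failover machinery: when a call fails with a network error it is generally unknown whether the NameNode executed it, so only methods marked idempotent are safe to transparently retry against the other NameNode. A minimal sketch of the annotation's intent (the interface and method names are made up for illustration):

    import java.io.IOException;
    import org.apache.hadoop.io.retry.Idempotent;

    public interface ExampleProtocol {
      // Safe to re-invoke after a failover: repeating it cannot change state.
      @Idempotent
      long getThingSize(String src) throws IOException;

      // Left unannotated: re-running it after an ambiguous failure could
      // apply the mutation twice, so the retry logic must not replay it blindly.
      void createThing(String src) throws IOException;
    }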
@@ -331,6 +337,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException If src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk, final DatanodeInfo[] existings, final DatanodeInfo[] excludes, final int numAdditionalNodes, final String clientName @@ -368,6 +375,7 @@ public interface ClientProtocol { * locations on datanodes). * @param blocks Array of located blocks to report */ + @Idempotent public void reportBadBlocks(LocatedBlock[] blocks) throws IOException; /////////////////////////////////////// @@ -482,6 +490,7 @@ public interface ClientProtocol { * RunTimeExceptions: * @throws InvalidPathException If src is invalid */ + @Idempotent public boolean mkdirs(String src, FsPermission masked, boolean createParent) throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, NSQuotaExceededException, @@ -502,6 +511,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException If src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public DirectoryListing getListing(String src, byte[] startAfter, boolean needLocation) @@ -531,6 +541,7 @@ public interface ClientProtocol { * @throws AccessControlException permission denied * @throws IOException If an I/O error occurred */ + @Idempotent public void renewLease(String clientName) throws AccessControlException, IOException; @@ -543,6 +554,7 @@ public interface ClientProtocol { * @return true if the file is already closed * @throws IOException */ + @Idempotent public boolean recoverLease(String src, String clientName) throws IOException; public int GET_STATS_CAPACITY_IDX = 0; @@ -554,7 +566,7 @@ public interface ClientProtocol { /** * Get a set of statistics about the filesystem. - * Right now, only three values are returned. + * Right now, only seven values are returned. *
   * <ul>
   * <li> [0] contains the total storage capacity of the system, in bytes.</li>
   * <li> [1] contains the total used space of the system, in bytes.</li>
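A small sketch of consuming the getStats() array documented above; index 0 has a named constant in this interface (GET_STATS_CAPACITY_IDX), the other indices are shown as plain numbers here for brevity:

    import org.apache.hadoop.hdfs.protocol.ClientProtocol;

    public class StatsSketch {
      static void printStats(ClientProtocol namenode) throws Exception {
        long[] stats = namenode.getStats();
        long capacity = stats[ClientProtocol.GET_STATS_CAPACITY_IDX]; // [0]
        long used = stats[1]; // total used space, per the javadoc above
        System.out.println("capacity=" + capacity + ", used=" + used);
      }
    }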
  • @@ -567,6 +579,7 @@ public interface ClientProtocol { * Use public constants like {@link #GET_STATS_CAPACITY_IDX} in place of * actual numbers to index into the array. */ + @Idempotent public long[] getStats() throws IOException; /** @@ -575,6 +588,7 @@ public interface ClientProtocol { * Return live datanodes if type is LIVE; dead datanodes if type is DEAD; * otherwise all datanodes if type is ALL. */ + @Idempotent public DatanodeInfo[] getDatanodeReport(HdfsConstants.DatanodeReportType type) throws IOException; @@ -585,6 +599,7 @@ public interface ClientProtocol { * @throws IOException * @throws UnresolvedLinkException if the path contains a symlink. */ + @Idempotent public long getPreferredBlockSize(String filename) throws IOException, UnresolvedLinkException; @@ -700,9 +715,9 @@ public interface ClientProtocol { * all corrupt files, call this method repeatedly and each time pass in the * cookie returned from the previous call. */ - public CorruptFileBlocks - listCorruptFileBlocks(String path, String cookie) - throws IOException; + @Idempotent + public CorruptFileBlocks listCorruptFileBlocks(String path, String cookie) + throws IOException; /** * Dumps namenode data structures into specified file. If the file @@ -719,6 +734,7 @@ public interface ClientProtocol { * @param bandwidth Blanacer bandwidth in bytes per second for this datanode. * @throws IOException */ + @Idempotent public void setBalancerBandwidth(long bandwidth) throws IOException; /** @@ -732,6 +748,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if the path contains a symlink. * @throws IOException If an I/O error occurred */ + @Idempotent public HdfsFileStatus getFileInfo(String src) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; @@ -747,6 +764,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if src contains a symlink * @throws IOException If an I/O error occurred */ + @Idempotent public HdfsFileStatus getFileLinkInfo(String src) throws AccessControlException, UnresolvedLinkException, IOException; @@ -759,6 +777,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if path contains a symlink. * @throws IOException If an I/O error occurred */ + @Idempotent public ContentSummary getContentSummary(String path) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; @@ -784,6 +803,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if the path contains a symlink. * @throws IOException If an I/O error occurred */ + @Idempotent public void setQuota(String path, long namespaceQuota, long diskspaceQuota) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; @@ -799,6 +819,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if src contains a symlink. * @throws IOException If an I/O error occurred */ + @Idempotent public void fsync(String src, String client) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; @@ -818,6 +839,7 @@ public interface ClientProtocol { * @throws UnresolvedLinkException if src contains a symlink. 
* @throws IOException If an I/O error occurred */ + @Idempotent public void setTimes(String src, long mtime, long atime) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; @@ -858,6 +880,7 @@ public interface ClientProtocol { * @throws IOException If the given path does not refer to a symlink * or an I/O error occurred */ + @Idempotent public String getLinkTarget(String path) throws AccessControlException, FileNotFoundException, IOException; @@ -873,6 +896,7 @@ public interface ClientProtocol { * @return a located block with a new generation stamp and an access token * @throws IOException if any error occurs */ + @Idempotent public LocatedBlock updateBlockForPipeline(ExtendedBlock block, String clientName) throws IOException; @@ -896,6 +920,7 @@ public interface ClientProtocol { * @return Token * @throws IOException */ + @Idempotent public Token getDelegationToken(Text renewer) throws IOException; @@ -906,6 +931,7 @@ public interface ClientProtocol { * @return the new expiration time * @throws IOException */ + @Idempotent public long renewDelegationToken(Token token) throws IOException; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java index 6b4835facc..da64b9e764 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java @@ -99,6 +99,14 @@ public class HdfsConstants { */ public static final String HDFS_URI_SCHEME = "hdfs"; + /** + * A prefix put before the namenode URI inside the "service" field + * of a delgation token, indicating that the URI is a logical (HA) + * URI. + */ + public static final String HA_DT_SERVICE_PREFIX = "ha-hdfs:"; + + /** * Please see {@link LayoutVersion} on adding new layout version. 
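The new HA_DT_SERVICE_PREFIX marks delegation token "service" strings that name a logical nameservice rather than a concrete namenode host:port; HAUtil.buildTokenServiceForLogicalUri and isTokenForLogicalUri (earlier in this patch) produce and test such strings. A sketch using an illustrative "mycluster" nameservice:

    import java.net.URI;
    import org.apache.hadoop.hdfs.HAUtil;
    import org.apache.hadoop.io.Text;

    public class HaTokenServiceSketch {
      public static void main(String[] args) {
        // Produces "ha-hdfs:mycluster" -- HA_DT_SERVICE_PREFIX plus the logical host.
        Text service = HAUtil.buildTokenServiceForLogicalUri(URI.create("hdfs://mycluster"));
        System.out.println(service);
      }
    }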
*/ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java index 729748f302..3680ee54aa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java @@ -91,7 +91,10 @@ public class LayoutVersion { STORED_TXIDS(-37, "Transaction IDs are stored in edits log and image files"), TXID_BASED_LAYOUT(-38, "File names in NN Storage are based on transaction IDs"), EDITLOG_OP_OPTIMIZATION(-39, - "Use LongWritable and ShortWritable directly instead of ArrayWritable of UTF8"); + "Use LongWritable and ShortWritable directly instead of ArrayWritable of UTF8"), + OPTIMIZE_PERSIST_BLOCKS(-40, + "Serialize block lists with delta-encoded variable length ints, " + + "add OP_UPDATE_BLOCKS"); final int lv; final int ancestorLV; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java index f6a63fba8e..7382543397 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java @@ -45,6 +45,7 @@ import org.apache.hadoop.ipc.ProtobufHelper; import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; +import org.apache.hadoop.ipc.ProtocolTranslator; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcClientUtil; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; @@ -63,7 +64,8 @@ import com.google.protobuf.ServiceException; @InterfaceAudience.Private @InterfaceStability.Stable public class ClientDatanodeProtocolTranslatorPB implements - ProtocolMetaInterface, ClientDatanodeProtocol, Closeable { + ProtocolMetaInterface, ClientDatanodeProtocol, + ProtocolTranslator, Closeable { public static final Log LOG = LogFactory .getLog(ClientDatanodeProtocolTranslatorPB.class); @@ -198,4 +200,9 @@ public class ClientDatanodeProtocolTranslatorPB implements ClientDatanodeProtocolPB.class, RpcKind.RPC_PROTOCOL_BUFFER, RPC.getProtocolVersion(ClientDatanodeProtocolPB.class), methodName); } + + @Override + public Object getUnderlyingProxyObject() { + return rpcProxy; + } } \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java index 369158439f..46a3c825cd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java @@ -20,15 +20,10 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; -import java.net.InetSocketAddress; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.TimeUnit; import 
org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FileAlreadyExistsException; @@ -49,6 +44,7 @@ import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.HdfsConstants.UpgradeAction; +import org.apache.hadoop.ipc.ProtocolTranslator; import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; @@ -137,52 +133,14 @@ import com.google.protobuf.ServiceException; @InterfaceAudience.Private @InterfaceStability.Stable public class ClientNamenodeProtocolTranslatorPB implements - ProtocolMetaInterface, ClientProtocol, Closeable { + ProtocolMetaInterface, ClientProtocol, Closeable, ProtocolTranslator { final private ClientNamenodeProtocolPB rpcProxy; - private static ClientNamenodeProtocolPB createNamenode( - InetSocketAddress nameNodeAddr, Configuration conf, - UserGroupInformation ugi) throws IOException { - RPC.setProtocolEngine(conf, ClientNamenodeProtocolPB.class, - ProtobufRpcEngine.class); - return RPC.getProxy(ClientNamenodeProtocolPB.class, - RPC.getProtocolVersion(ClientNamenodeProtocolPB.class), nameNodeAddr, ugi, conf, - NetUtils.getSocketFactory(conf, ClientNamenodeProtocolPB.class)); + public ClientNamenodeProtocolTranslatorPB(ClientNamenodeProtocolPB proxy) + throws IOException { + rpcProxy = proxy; } - - /** Create a {@link NameNode} proxy */ - static ClientNamenodeProtocolPB createNamenodeWithRetry( - ClientNamenodeProtocolPB rpcNamenode) { - RetryPolicy createPolicy = RetryPolicies - .retryUpToMaximumCountWithFixedSleep(5, - HdfsConstants.LEASE_SOFTLIMIT_PERIOD, TimeUnit.MILLISECONDS); - - Map, RetryPolicy> remoteExceptionToPolicyMap - = new HashMap, RetryPolicy>(); - remoteExceptionToPolicyMap.put(AlreadyBeingCreatedException.class, - createPolicy); - - Map, RetryPolicy> exceptionToPolicyMap = - new HashMap, RetryPolicy>(); - exceptionToPolicyMap.put(RemoteException.class, RetryPolicies - .retryByRemoteException(RetryPolicies.TRY_ONCE_THEN_FAIL, - remoteExceptionToPolicyMap)); - RetryPolicy methodPolicy = RetryPolicies.retryByException( - RetryPolicies.TRY_ONCE_THEN_FAIL, exceptionToPolicyMap); - Map methodNameToPolicyMap = new HashMap(); - - methodNameToPolicyMap.put("create", methodPolicy); - - return (ClientNamenodeProtocolPB) RetryProxy.create( - ClientNamenodeProtocolPB.class, rpcNamenode, methodNameToPolicyMap); - } - - public ClientNamenodeProtocolTranslatorPB(InetSocketAddress nameNodeAddr, - Configuration conf, UserGroupInformation ugi) throws IOException { - - rpcProxy = createNamenodeWithRetry(createNamenode(nameNodeAddr, conf, ugi)); - } - + public void close() { RPC.stopProxy(rpcProxy); } @@ -866,4 +824,9 @@ public class ClientNamenodeProtocolTranslatorPB implements ClientNamenodeProtocolPB.class, RpcKind.RPC_PROTOCOL_BUFFER, RPC.getProtocolVersion(ClientNamenodeProtocolPB.class), methodName); } + + @Override + public Object getUnderlyingProxyObject() { + return rpcProxy; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java index d47eac2295..2a661c0fc5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java @@ -41,6 +41,7 @@ import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.DatanodeComm import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ErrorReportRequestProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatRequestProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatResponseProto; +import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.NNHAStatusHeartbeatProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ProcessUpgradeRequestProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ProcessUpgradeResponseProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.RegisterDatanodeRequestProto; @@ -55,6 +56,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; @@ -161,7 +163,7 @@ public class DatanodeProtocolClientSideTranslatorPB implements } @Override - public DatanodeCommand[] sendHeartbeat(DatanodeRegistration registration, + public HeartbeatResponse sendHeartbeat(DatanodeRegistration registration, StorageReport[] reports, int xmitsInProgress, int xceiverCount, int failedVolumes) throws IOException { HeartbeatRequestProto.Builder builder = HeartbeatRequestProto.newBuilder() @@ -184,7 +186,7 @@ public class DatanodeProtocolClientSideTranslatorPB implements cmds[index] = PBHelper.convert(p); index++; } - return cmds; + return new HeartbeatResponse(cmds, PBHelper.convert(resp.getHaStatus())); } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java index 413bd3aabf..c653daa1ee 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java @@ -51,6 +51,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; @@ -103,7 +104,7 @@ public class DatanodeProtocolServerSideTranslatorPB 
implements @Override public HeartbeatResponseProto sendHeartbeat(RpcController controller, HeartbeatRequestProto request) throws ServiceException { - DatanodeCommand[] cmds = null; + HeartbeatResponse response; try { List list = request.getReportsList(); StorageReport[] report = new StorageReport[list.size()]; @@ -113,7 +114,7 @@ public class DatanodeProtocolServerSideTranslatorPB implements p.getCapacity(), p.getDfsUsed(), p.getRemaining(), p.getBlockPoolUsed()); } - cmds = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()), + response = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()), report, request.getXmitsInProgress(), request.getXceiverCount(), request.getFailedVolumes()); } catch (IOException e) { @@ -121,6 +122,7 @@ public class DatanodeProtocolServerSideTranslatorPB implements } HeartbeatResponseProto.Builder builder = HeartbeatResponseProto .newBuilder(); + DatanodeCommand[] cmds = response.getCommands(); if (cmds != null) { for (int i = 0; i < cmds.length; i++) { if (cmds[i] != null) { @@ -128,6 +130,7 @@ public class DatanodeProtocolServerSideTranslatorPB implements } } } + builder.setHaStatus(PBHelper.convert(response.getNameNodeHaState())); return builder.build(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java index c29595e159..01bd88ebbf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java @@ -20,22 +20,15 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.IOException; -import java.net.InetSocketAddress; - -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.proto.GetUserMappingsProtocolProtos.GetGroupsForUserRequestProto; import org.apache.hadoop.hdfs.protocol.proto.GetUserMappingsProtocolProtos.GetGroupsForUserResponseProto; import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable; -import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.ipc.ProtobufHelper; -import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcClientUtil; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; -import org.apache.hadoop.net.NetUtils; -import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.tools.GetUserMappingsProtocol; import com.google.protobuf.RpcController; @@ -47,16 +40,10 @@ public class GetUserMappingsProtocolClientSideTranslatorPB implements /** RpcController is not used and hence is set to null */ private final static RpcController NULL_CONTROLLER = null; private final GetUserMappingsProtocolPB rpcProxy; - + public GetUserMappingsProtocolClientSideTranslatorPB( - InetSocketAddress nameNodeAddr, UserGroupInformation ugi, - Configuration conf) throws IOException { - RPC.setProtocolEngine(conf, GetUserMappingsProtocolPB.class, - ProtobufRpcEngine.class); - rpcProxy = RPC.getProxy(GetUserMappingsProtocolPB.class, - RPC.getProtocolVersion(GetUserMappingsProtocolPB.class), - NameNode.getAddress(conf), ugi, conf, - 
NetUtils.getSocketFactory(conf, GetUserMappingsProtocol.class)); + GetUserMappingsProtocolPB rpcProxy) { + this.rpcProxy = rpcProxy; } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java index 0735cfdbb5..76ca46f4e4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java @@ -19,17 +19,14 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.IOException; -import java.net.InetSocketAddress; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.JournalRequestProto; import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.StartLogSegmentRequestProto; import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable; import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; -import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtobufHelper; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; @@ -52,12 +49,9 @@ public class JournalProtocolTranslatorPB implements ProtocolMetaInterface, /** RpcController is not used and hence is set to null */ private final static RpcController NULL_CONTROLLER = null; private final JournalProtocolPB rpcProxy; - - public JournalProtocolTranslatorPB(InetSocketAddress nameNodeAddr, - Configuration conf) throws IOException { - RPC.setProtocolEngine(conf, JournalProtocolPB.class, ProtobufRpcEngine.class); - rpcProxy = RPC.getProxy(JournalProtocolPB.class, - RPC.getProtocolVersion(JournalProtocolPB.class), nameNodeAddr, conf); + + public JournalProtocolTranslatorPB(JournalProtocolPB rpcProxy) { + this.rpcProxy = rpcProxy; } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java index f2ec7ba2a9..7de2c0e461 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java @@ -19,11 +19,9 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.IOException; -import java.net.InetSocketAddress; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.NamenodeCommandProto; @@ -47,14 +45,11 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.ipc.ProtobufHelper; -import 
org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcClientUtil; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; -import org.apache.hadoop.net.NetUtils; -import org.apache.hadoop.security.UserGroupInformation; import com.google.protobuf.RpcController; import com.google.protobuf.ServiceException; @@ -84,15 +79,6 @@ public class NamenodeProtocolTranslatorPB implements NamenodeProtocol, VersionRequestProto.newBuilder().build(); final private NamenodeProtocolPB rpcProxy; - - public NamenodeProtocolTranslatorPB(InetSocketAddress nameNodeAddr, - Configuration conf, UserGroupInformation ugi) throws IOException { - RPC.setProtocolEngine(conf, NamenodeProtocolPB.class, - ProtobufRpcEngine.class); - rpcProxy = RPC.getProxy(NamenodeProtocolPB.class, - RPC.getProtocolVersion(NamenodeProtocolPB.class), nameNodeAddr, ugi, - conf, NetUtils.getSocketFactory(conf, NamenodeProtocolPB.class)); - } public NamenodeProtocolTranslatorPB(NamenodeProtocolPB rpcProxy) { this.rpcProxy = rpcProxy; @@ -137,7 +123,6 @@ public class NamenodeProtocolTranslatorPB implements NamenodeProtocol, } @Override - @SuppressWarnings("deprecation") public CheckpointSignature rollEditLog() throws IOException { try { return PBHelper.convert(rpcProxy.rollEditLog(NULL_CONTROLLER, diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java index fab9f1f1c9..b1e7be0a0e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java @@ -57,6 +57,7 @@ import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.DatanodeStor import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.DatanodeStorageProto.StorageState; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.FinalizeCommandProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.KeyUpdateCommandProto; +import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.NNHAStatusHeartbeatProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReceivedDeletedBlockInfoProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.RegisterCommandProto; import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageReportProto; @@ -119,7 +120,9 @@ import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; +import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus; import org.apache.hadoop.hdfs.server.protocol.RegisterCommand; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; @@ -819,6 +822,23 @@ public class PBHelper { ReceivedDeletedBlockInfoProto.Builder builder = ReceivedDeletedBlockInfoProto.newBuilder(); + ReceivedDeletedBlockInfoProto.BlockStatus status; + switch (receivedDeletedBlockInfo.getStatus()) { + case 
RECEIVING_BLOCK: + status = ReceivedDeletedBlockInfoProto.BlockStatus.RECEIVING; + break; + case RECEIVED_BLOCK: + status = ReceivedDeletedBlockInfoProto.BlockStatus.RECEIVED; + break; + case DELETED_BLOCK: + status = ReceivedDeletedBlockInfoProto.BlockStatus.DELETED; + break; + default: + throw new IllegalArgumentException("Bad status: " + + receivedDeletedBlockInfo.getStatus()); + } + builder.setStatus(status); + if (receivedDeletedBlockInfo.getDelHints() != null) { builder.setDeleteHint(receivedDeletedBlockInfo.getDelHints()); } @@ -850,7 +870,21 @@ public class PBHelper { public static ReceivedDeletedBlockInfo convert( ReceivedDeletedBlockInfoProto proto) { - return new ReceivedDeletedBlockInfo(PBHelper.convert(proto.getBlock()), + ReceivedDeletedBlockInfo.BlockStatus status = null; + switch (proto.getStatus()) { + case RECEIVING: + status = BlockStatus.RECEIVING_BLOCK; + break; + case RECEIVED: + status = BlockStatus.RECEIVED_BLOCK; + break; + case DELETED: + status = BlockStatus.DELETED_BLOCK; + break; + } + return new ReceivedDeletedBlockInfo( + PBHelper.convert(proto.getBlock()), + status, proto.hasDeleteHint() ? proto.getDeleteHint() : null); } @@ -1245,6 +1279,37 @@ public class PBHelper { build(); } + public static NNHAStatusHeartbeat convert(NNHAStatusHeartbeatProto s) { + if (s == null) return null; + switch (s.getState()) { + case ACTIVE: + return new NNHAStatusHeartbeat(NNHAStatusHeartbeat.State.ACTIVE, s.getTxid()); + case STANDBY: + return new NNHAStatusHeartbeat(NNHAStatusHeartbeat.State.STANDBY, s.getTxid()); + default: + throw new IllegalArgumentException("Unexpected NNHAStatusHeartbeat.State:" + s.getState()); + } + } + + public static NNHAStatusHeartbeatProto convert(NNHAStatusHeartbeat hb) { + if (hb == null) return null; + NNHAStatusHeartbeatProto.Builder builder = + NNHAStatusHeartbeatProto.newBuilder(); + switch (hb.getState()) { + case ACTIVE: + builder.setState(NNHAStatusHeartbeatProto.State.ACTIVE); + break; + case STANDBY: + builder.setState(NNHAStatusHeartbeatProto.State.STANDBY); + break; + default: + throw new IllegalArgumentException("Unexpected NNHAStatusHeartbeat.State:" + + hb.getState()); + } + builder.setTxid(hb.getTxId()); + return builder.build(); + } + public static DatanodeStorageProto convert(DatanodeStorage s) { return DatanodeStorageProto.newBuilder() .setState(PBHelper.convert(s.getState())) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java index 0fcf424497..96ba2cf7a0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java @@ -20,21 +20,15 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.IOException; -import java.net.InetSocketAddress; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.proto.RefreshAuthorizationPolicyProtocolProtos.RefreshServiceAclRequestProto; import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable; -import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.ipc.ProtobufHelper; -import 
org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcClientUtil; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; -import org.apache.hadoop.net.NetUtils; -import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; import com.google.protobuf.RpcController; @@ -46,16 +40,10 @@ public class RefreshAuthorizationPolicyProtocolClientSideTranslatorPB implements /** RpcController is not used and hence is set to null */ private final static RpcController NULL_CONTROLLER = null; private final RefreshAuthorizationPolicyProtocolPB rpcProxy; - + public RefreshAuthorizationPolicyProtocolClientSideTranslatorPB( - InetSocketAddress nameNodeAddr, UserGroupInformation ugi, - Configuration conf) throws IOException { - RPC.setProtocolEngine(conf, RefreshAuthorizationPolicyProtocolPB.class, - ProtobufRpcEngine.class); - rpcProxy = RPC.getProxy(RefreshAuthorizationPolicyProtocolPB.class, - RPC.getProtocolVersion(RefreshAuthorizationPolicyProtocolPB.class), - NameNode.getAddress(conf), ugi, conf, - NetUtils.getSocketFactory(conf, RefreshAuthorizationPolicyProtocol.class)); + RefreshAuthorizationPolicyProtocolPB rpcProxy) { + this.rpcProxy = rpcProxy; } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java index eb8e059e4a..6f07617bab 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java @@ -20,23 +20,17 @@ package org.apache.hadoop.hdfs.protocolPB; import java.io.Closeable; import java.io.IOException; -import java.net.InetSocketAddress; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.proto.RefreshUserMappingsProtocolProtos.RefreshSuperUserGroupsConfigurationRequestProto; import org.apache.hadoop.hdfs.protocol.proto.RefreshUserMappingsProtocolProtos.RefreshUserToGroupsMappingsRequestProto; import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable; -import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.ipc.ProtobufHelper; -import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.ProtocolMetaInterface; import org.apache.hadoop.ipc.ProtocolSignature; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RpcClientUtil; import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; -import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.RefreshUserMappingsProtocol; -import org.apache.hadoop.security.UserGroupInformation; import com.google.protobuf.RpcController; import com.google.protobuf.ServiceException; @@ -47,16 +41,10 @@ public class RefreshUserMappingsProtocolClientSideTranslatorPB implements /** RpcController is not used and hence is set to null */ private final static RpcController NULL_CONTROLLER = null; private final RefreshUserMappingsProtocolPB rpcProxy; - + public RefreshUserMappingsProtocolClientSideTranslatorPB( - InetSocketAddress nameNodeAddr, UserGroupInformation ugi, - 
Configuration conf) throws IOException { - RPC.setProtocolEngine(conf, RefreshUserMappingsProtocolPB.class, - ProtobufRpcEngine.class); - rpcProxy = RPC.getProxy(RefreshUserMappingsProtocolPB.class, - RPC.getProtocolVersion(RefreshUserMappingsProtocolPB.class), - NameNode.getAddress(conf), ugi, conf, - NetUtils.getSocketFactory(conf, RefreshUserMappingsProtocol.class)); + RefreshUserMappingsProtocolPB rpcProxy) { + this.rpcProxy = rpcProxy; } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java index b5f24d1855..ba62a2c225 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java @@ -21,6 +21,7 @@ package org.apache.hadoop.hdfs.security.token.delegation; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.io.InterruptedIOException; import java.net.InetSocketAddress; import java.util.Iterator; @@ -283,7 +284,18 @@ public class DelegationTokenSecretManager @Override //AbstractDelegationTokenManager protected void logUpdateMasterKey(DelegationKey key) throws IOException { - namesystem.logUpdateMasterKey(key); + synchronized (noInterruptsLock) { + // The edit logging code will fail catastrophically if it + // is interrupted during a logSync, since the interrupt + // closes the edit log files. Doing this inside the + // above lock and then checking interruption status + // prevents this bug. + if (Thread.interrupted()) { + throw new InterruptedIOException( + "Interrupted before updating master key"); + } + namesystem.logUpdateMasterKey(key); + } } /** A utility method for creating credentials. 
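The DelegationTokenSecretManager hunk above serializes the master-key update under a lock and rejects already-interrupted threads before touching the edit log, because an interrupt delivered during a logSync would close the edit log files. A minimal, self-contained sketch of that guard pattern follows; the lock field and logEdit() are stand-ins for the real FSNamesystem machinery and are illustrative only.

import java.io.IOException;
import java.io.InterruptedIOException;

class InterruptGuardSketch {
    private final Object noInterruptsLock = new Object();

    // Stand-in for an operation that must never run on an interrupted thread,
    // e.g. writing and syncing an edit-log record.
    private void logEdit(String record) throws IOException {
        System.out.println("logged: " + record);
    }

    void logUpdateMasterKey(String key) throws IOException {
        synchronized (noInterruptsLock) {
            // Thread.interrupted() also clears the flag, so the write below
            // cannot observe a pending interrupt and fail mid-sync.
            if (Thread.interrupted()) {
                throw new InterruptedIOException(
                    "Interrupted before updating master key");
            }
            logEdit(key);
        }
    }

    public static void main(String[] args) throws IOException {
        new InterruptGuardSketch().logUpdateMasterKey("key-42");
    }
}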
*/ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java index 1822b27a1c..4f73b85164 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java @@ -59,6 +59,11 @@ public class DelegationTokenSelector new InetSocketAddress(nnAddr.getHostName(), nnRpcPort)); return INSTANCE.selectToken(serviceName, ugi.getTokens()); } + + public static Token selectHdfsDelegationToken( + Text serviceName, UserGroupInformation ugi) { + return INSTANCE.selectToken(serviceName, ugi.getTokens()); + } public DelegationTokenSelector() { super(DelegationTokenIdentifier.HDFS_DELEGATION_KIND); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java index a0146e75a8..e808af623c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java @@ -24,8 +24,8 @@ import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; -import java.net.InetSocketAddress; import java.net.Socket; +import java.net.URI; import java.text.DateFormat; import java.util.ArrayList; import java.util.Arrays; @@ -1379,7 +1379,7 @@ public class Balancer { * for each namenode, * execute a {@link Balancer} to work through all datanodes once. */ - static int run(List namenodes, final Parameters p, + static int run(Collection namenodes, final Parameters p, Configuration conf) throws IOException, InterruptedException { final long sleeptime = 2000*conf.getLong( DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, @@ -1393,8 +1393,8 @@ public class Balancer { final List connectors = new ArrayList(namenodes.size()); try { - for(InetSocketAddress isa : namenodes) { - connectors.add(new NameNodeConnector(isa, conf)); + for (URI uri : namenodes) { + connectors.add(new NameNodeConnector(uri, conf)); } boolean done = false; @@ -1476,7 +1476,7 @@ public class Balancer { try { checkReplicationPolicyCompatibility(conf); - final List namenodes = DFSUtil.getNNServiceRpcAddresses(conf); + final Collection namenodes = DFSUtil.getNsServiceRpcUris(conf); return Balancer.run(namenodes, parse(args), conf); } catch (IOException e) { System.out.println(e + ". 
Exiting ..."); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java index 83822e4c31..c4208b7951 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java @@ -21,38 +21,25 @@ import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; import java.net.InetAddress; -import java.net.InetSocketAddress; +import java.net.URI; import java.util.EnumSet; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; -import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB; -import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys; -import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.io.IOUtils; -import org.apache.hadoop.io.retry.RetryPolicies; -import org.apache.hadoop.io.retry.RetryPolicy; -import org.apache.hadoop.io.retry.RetryProxy; -import org.apache.hadoop.ipc.ProtobufRpcEngine; -import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RemoteException; -import org.apache.hadoop.net.NetUtils; -import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Daemon; @@ -64,7 +51,7 @@ class NameNodeConnector { private static final Log LOG = Balancer.LOG; private static final Path BALANCER_ID_PATH = new Path("/system/balancer.id"); - final InetSocketAddress namenodeAddress; + final URI nameNodeUri; final String blockpoolID; final NamenodeProtocol namenode; @@ -78,12 +65,17 @@ class NameNodeConnector { private BlockTokenSecretManager blockTokenSecretManager; private Daemon keyupdaterthread; // AccessKeyUpdater thread - NameNodeConnector(InetSocketAddress namenodeAddress, Configuration conf - ) throws IOException { - this.namenodeAddress = namenodeAddress; - this.namenode = createNamenode(namenodeAddress, conf); - this.client = DFSUtil.createNamenode(conf); - this.fs = FileSystem.get(NameNode.getUri(namenodeAddress), conf); + NameNodeConnector(URI nameNodeUri, + Configuration conf) throws IOException { + this.nameNodeUri = nameNodeUri; + + this.namenode = + NameNodeProxies.createProxy(conf, nameNodeUri, NamenodeProtocol.class) + .getProxy(); + this.client = + NameNodeProxies.createProxy(conf, nameNodeUri, ClientProtocol.class) + .getProxy(); + this.fs = FileSystem.get(nameNodeUri, conf); final NamespaceInfo namespaceinfo = namenode.versionRequest(); this.blockpoolID 
= namespaceinfo.getBlockPoolID(); @@ -188,38 +180,11 @@ class NameNodeConnector { @Override public String toString() { - return getClass().getSimpleName() + "[namenodeAddress=" + namenodeAddress + return getClass().getSimpleName() + "[namenodeUri=" + nameNodeUri + ", id=" + blockpoolID + "]"; } - /** Build a NamenodeProtocol connection to the namenode and - * set up the retry policy - */ - private static NamenodeProtocol createNamenode(InetSocketAddress address, - Configuration conf) throws IOException { - RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry( - 5, 200, TimeUnit.MILLISECONDS); - Map,RetryPolicy> exceptionToPolicyMap = - new HashMap, RetryPolicy>(); - RetryPolicy methodPolicy = RetryPolicies.retryByException( - timeoutPolicy, exceptionToPolicyMap); - Map methodNameToPolicyMap = - new HashMap(); - methodNameToPolicyMap.put("getBlocks", methodPolicy); - methodNameToPolicyMap.put("getAccessKeys", methodPolicy); - - RPC.setProtocolEngine(conf, NamenodeProtocolPB.class, - ProtobufRpcEngine.class); - NamenodeProtocolPB proxy = RPC.getProxy(NamenodeProtocolPB.class, - RPC.getProtocolVersion(NamenodeProtocolPB.class), address, - UserGroupInformation.getCurrentUser(), conf, - NetUtils.getDefaultSocketFactory(conf)); - NamenodeProtocolPB retryProxy = (NamenodeProtocolPB) RetryProxy.create( - NamenodeProtocolPB.class, proxy, methodNameToPolicyMap); - return new NamenodeProtocolTranslatorPB(retryProxy); - } - /** * Periodically updates access keys. */ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java index 58725a6b32..ce3ff8b3ed 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java @@ -183,7 +183,7 @@ public class BlockInfo extends Block implements /** * Count the number of data-nodes the block belongs to. 
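With the constructor change above, the balancer's NameNodeConnector resolves both its NamenodeProtocol and ClientProtocol proxies from a logical nameservice URI through NameNodeProxies, instead of hand-building an RPC proxy for a single socket address. The sketch below uses the same calls the patch itself makes; it assumes the hadoop-hdfs classes from this branch on the classpath and a configuration that can resolve the URI, and it only does real work against a reachable NameNode.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;

class ProxyLookupSketch {
    static void connect(URI nameNodeUri, Configuration conf) throws Exception {
        // Both proxies are derived from the same logical URI, so the code path is
        // identical for a single NameNode and for an HA pair behind a failover
        // proxy provider.
        NamenodeProtocol namenode = NameNodeProxies
            .createProxy(conf, nameNodeUri, NamenodeProtocol.class).getProxy();
        ClientProtocol client = NameNodeProxies
            .createProxy(conf, nameNodeUri, ClientProtocol.class).getProxy();
        System.out.println("block pool: "
            + namenode.versionRequest().getBlockPoolID());
        // 'client' would carry the file-system level calls (e.g. the balancer's
        // id file); it is created here only to show the symmetric lookup.
    }

    public static void main(String[] args) throws Exception {
        connect(URI.create(args[0]), new Configuration());
    }
}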
*/ - int numNodes() { + public int numNodes() { assert this.triplets != null : "BlockInfo is not initialized"; assert triplets.length % 3 == 0 : "Malformed BlockInfo"; for(int idx = getCapacity()-1; idx >= 0; idx--) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index c8f36a01d9..1c9b2aad4f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -28,6 +28,8 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Queue; +import java.util.Set; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -48,6 +50,7 @@ import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode; import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys; +import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.common.Util; @@ -68,6 +71,7 @@ import org.apache.hadoop.net.Node; import org.apache.hadoop.util.Daemon; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Sets; /** * Keeps information related to the blocks stored in the Hadoop cluster. @@ -80,17 +84,27 @@ public class BlockManager { /** Default load factor of map */ public static final float DEFAULT_MAP_LOAD_FACTOR = 0.75f; + private static final String QUEUE_REASON_CORRUPT_STATE = + "it has the wrong state or generation stamp"; + + private static final String QUEUE_REASON_FUTURE_GENSTAMP = + "generation stamp is in the future"; + private final Namesystem namesystem; private final DatanodeManager datanodeManager; private final HeartbeatManager heartbeatManager; private final BlockTokenSecretManager blockTokenSecretManager; + + private final PendingDataNodeMessages pendingDNMessages = + new PendingDataNodeMessages(); private volatile long pendingReplicationBlocksCount = 0L; private volatile long corruptReplicaBlocksCount = 0L; private volatile long underReplicatedBlocksCount = 0L; private volatile long scheduledReplicationBlocksCount = 0L; private volatile long excessBlocksCount = 0L; + private volatile long postponedMisreplicatedBlocksCount = 0L; /** Used by metrics */ public long getPendingReplicationBlocksCount() { @@ -116,6 +130,14 @@ public class BlockManager { public long getExcessBlocksCount() { return excessBlocksCount; } + /** Used by metrics */ + public long getPostponedMisreplicatedBlocksCount() { + return postponedMisreplicatedBlocksCount; + } + /** Used by metrics */ + public int getPendingDataNodeMessageCount() { + return pendingDNMessages.count(); + } /**replicationRecheckInterval is how often namenode checks for new replication work*/ private final long replicationRecheckInterval; @@ -134,6 +156,15 @@ public class BlockManager { /** Blocks to be invalidated. 
*/ private final InvalidateBlocks invalidateBlocks; + + /** + * After a failover, over-replicated blocks may not be handled + * until all of the replicas have done a block report to the + * new active. This is to make sure that this NameNode has been + * notified of all block deletions that might have been pending + * when the failover happened. + */ + private final Set postponedMisreplicatedBlocks = Sets.newHashSet(); // // Keeps a TreeSet for every named node. Each treeset contains @@ -316,49 +347,15 @@ public class BlockManager { out.println("Metasave: Blocks waiting for replication: " + neededReplications.size()); for (Block block : neededReplications) { - List containingNodes = - new ArrayList(); - List containingLiveReplicasNodes = - new ArrayList(); - - NumberReplicas numReplicas = new NumberReplicas(); - // source node returned is not used - chooseSourceDatanode(block, containingNodes, - containingLiveReplicasNodes, numReplicas); - assert containingLiveReplicasNodes.size() == numReplicas.liveReplicas(); - int usableReplicas = numReplicas.liveReplicas() + - numReplicas.decommissionedReplicas(); - - if (block instanceof BlockInfo) { - String fileName = ((BlockInfo)block).getINode().getFullPathName(); - out.print(fileName + ": "); - } - // l: == live:, d: == decommissioned c: == corrupt e: == excess - out.print(block + ((usableReplicas > 0)? "" : " MISSING") + - " (replicas:" + - " l: " + numReplicas.liveReplicas() + - " d: " + numReplicas.decommissionedReplicas() + - " c: " + numReplicas.corruptReplicas() + - " e: " + numReplicas.excessReplicas() + ") "); - - Collection corruptNodes = - corruptReplicas.getNodes(block); - - for (Iterator jt = blocksMap.nodeIterator(block); - jt.hasNext();) { - DatanodeDescriptor node = jt.next(); - String state = ""; - if (corruptNodes != null && corruptNodes.contains(node)) { - state = "(corrupt)"; - } else if (node.isDecommissioned() || - node.isDecommissionInProgress()) { - state = "(decommissioned)"; - } - out.print(" " + node + state + " : "); - } - out.println(""); + dumpBlockMeta(block, out); } } + + // Dump any postponed over-replicated blocks + out.println("Mis-replicated blocks that have been postponed:"); + for (Block block : postponedMisreplicatedBlocks) { + dumpBlockMeta(block, out); + } // Dump blocks from pendingReplication pendingReplications.metaSave(out); @@ -369,6 +366,58 @@ public class BlockManager { // Dump all datanodes getDatanodeManager().datanodeDump(out); } + + /** + * Dump the metadata for the given block in a human-readable + * form. + */ + private void dumpBlockMeta(Block block, PrintWriter out) { + List containingNodes = + new ArrayList(); + List containingLiveReplicasNodes = + new ArrayList(); + + NumberReplicas numReplicas = new NumberReplicas(); + // source node returned is not used + chooseSourceDatanode(block, containingNodes, + containingLiveReplicasNodes, numReplicas); + assert containingLiveReplicasNodes.size() == numReplicas.liveReplicas(); + int usableReplicas = numReplicas.liveReplicas() + + numReplicas.decommissionedReplicas(); + + if (block instanceof BlockInfo) { + String fileName = ((BlockInfo)block).getINode().getFullPathName(); + out.print(fileName + ": "); + } + // l: == live:, d: == decommissioned c: == corrupt e: == excess + out.print(block + ((usableReplicas > 0)? 
"" : " MISSING") + + " (replicas:" + + " l: " + numReplicas.liveReplicas() + + " d: " + numReplicas.decommissionedReplicas() + + " c: " + numReplicas.corruptReplicas() + + " e: " + numReplicas.excessReplicas() + ") "); + + Collection corruptNodes = + corruptReplicas.getNodes(block); + + for (Iterator jt = blocksMap.nodeIterator(block); + jt.hasNext();) { + DatanodeDescriptor node = jt.next(); + String state = ""; + if (corruptNodes != null && corruptNodes.contains(node)) { + state = "(corrupt)"; + } else if (node.isDecommissioned() || + node.isDecommissionInProgress()) { + state = "(decommissioned)"; + } + + if (node.areBlockContentsStale()) { + state += " (block deletions maybe out of date)"; + } + out.print(" " + node + state + " : "); + } + out.println(""); + } /** @return maxReplicationStreams */ public int getMaxReplicationStreams() { @@ -425,7 +474,7 @@ public class BlockManager { final boolean b = commitBlock((BlockInfoUnderConstruction)lastBlock, commitBlock); if(countNodes(lastBlock).liveReplicas() >= minReplication) - completeBlock(fileINode,fileINode.numBlocks()-1); + completeBlock(fileINode,fileINode.numBlocks()-1, false); return b; } @@ -437,19 +486,15 @@ public class BlockManager { * of replicas reported from data-nodes. */ private BlockInfo completeBlock(final INodeFile fileINode, - final int blkIndex) throws IOException { - return completeBlock(fileINode, blkIndex, false); - } - - public BlockInfo completeBlock(final INodeFile fileINode, - final int blkIndex, final boolean force) throws IOException { + final int blkIndex, boolean force) throws IOException { if(blkIndex < 0) return null; BlockInfo curBlock = fileINode.getBlocks()[blkIndex]; if(curBlock.isComplete()) return curBlock; BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock; - if(!force && ucBlock.numNodes() < minReplication) + int numNodes = ucBlock.numNodes(); + if (!force && numNodes < minReplication) throw new IOException("Cannot complete block: " + "block does not satisfy minimal replication requirement."); if(!force && ucBlock.getBlockUCState() != BlockUCState.COMMITTED) @@ -458,20 +503,43 @@ public class BlockManager { BlockInfo completeBlock = ucBlock.convertToCompleteBlock(); // replace penultimate block in file fileINode.setBlock(blkIndex, completeBlock); + + // Since safe-mode only counts complete blocks, and we now have + // one more complete block, we need to adjust the total up, and + // also count it as safe, if we have at least the minimum replica + // count. (We may not have the minimum replica count yet if this is + // a "forced" completion when a file is getting closed by an + // OP_CLOSE edit on the standby). + namesystem.adjustSafeModeBlockTotals(0, 1); + namesystem.incrementSafeBlockCount( + Math.min(numNodes, minReplication)); + // replace block in the blocksMap return blocksMap.replaceBlock(completeBlock); } private BlockInfo completeBlock(final INodeFile fileINode, - final BlockInfo block) throws IOException { + final BlockInfo block, boolean force) throws IOException { BlockInfo[] fileBlocks = fileINode.getBlocks(); for(int idx = 0; idx < fileBlocks.length; idx++) if(fileBlocks[idx] == block) { - return completeBlock(fileINode, idx); + return completeBlock(fileINode, idx, force); } return block; } + + /** + * Force the given block in the given file to be marked as complete, + * regardless of whether enough replicas are present. This is necessary + * when tailing edit logs as a Standby. 
+ */ + public BlockInfo forceCompleteBlock(final INodeFile fileINode, + final BlockInfoUnderConstruction block) throws IOException { + block.commitBlock(block); + return completeBlock(fileINode, block, true); + } + /** * Convert the last block of the file to an under construction block.
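The completeBlock() and convertLastBlockToUnderConstruction() changes above keep the safe-mode counters consistent now that blocks can also be completed by edit-log replay on the standby. Below is a self-contained model of just that bookkeeping; the class, field and method names are illustrative, not the FSNamesystem API.

class SafeModeAccountingSketch {
    private final int minReplication = 1;
    private long blockTotal;   // complete blocks known to the namespace
    private long blockSafe;    // complete blocks with >= minReplication reported replicas

    // Called when a block transitions from under-construction to complete.
    void onBlockCompleted(int reportedReplicas) {
        blockTotal++;                                             // one more complete block to track
        blockSafe += Math.min(reportedReplicas, minReplication);  // safe only if replicated enough
    }

    // Called when the last block of a file is converted back to under-construction
    // (e.g. append): under-construction blocks do not count toward safe mode.
    void onBlockConvertedToUnderConstruction(int reportedReplicas) {
        blockTotal--;
        if (reportedReplicas >= minReplication) {
            blockSafe--;
        }
    }

    public static void main(String[] args) {
        SafeModeAccountingSketch s = new SafeModeAccountingSketch();
        s.onBlockCompleted(0); // forced completion on the standby: total grows, safe does not
        s.onBlockCompleted(3); // normal completion: both grow
        System.out.println(s.blockSafe + "/" + s.blockTotal + " blocks safe"); // 1/2 blocks safe
    }
}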
    * The block is converted only if the file has blocks and the last one @@ -508,6 +576,14 @@ public class BlockManager { String datanodeId = dd.getStorageID(); invalidateBlocks.remove(datanodeId, oldBlock); } + + // Adjust safe-mode totals, since under-construction blocks don't + // count in safe-mode. + namesystem.adjustSafeModeBlockTotals( + // decrement safe if we had enough + targets.length >= minReplication ? -1 : 0, + // always decrement total blocks + -1); final long fileLength = fileINode.computeContentSummary().getLength(); final long pos = fileLength - ucBlock.getNumBytes(); @@ -598,8 +674,8 @@ public class BlockManager { final boolean isCorrupt = numCorruptNodes == numNodes; final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes; final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines]; + int j = 0; if (numMachines > 0) { - int j = 0; for(Iterator it = blocksMap.nodeIterator(blk); it.hasNext();) { final DatanodeDescriptor d = it.next(); @@ -608,6 +684,12 @@ public class BlockManager { machines[j++] = d; } } + assert j == machines.length : + "isCorrupt: " + isCorrupt + + " numMachines: " + numMachines + + " numNodes: " + numNodes + + " numCorrupt: " + numCorruptNodes + + " numCorruptRepls: " + numCorruptReplicas; final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk); return new LocatedBlock(eb, machines, pos, isCorrupt); } @@ -772,6 +854,14 @@ public class BlockManager { node.resetBlocks(); invalidateBlocks.remove(node.getStorageID()); + + // If the DN hasn't block-reported since the most recent + // failover, then we may have been holding up on processing + // over-replicated blocks because of it. But we can now + // process those blocks. + if (node.areBlockContentsStale()) { + rescanPostponedMisreplicatedBlocks(); + } } /** @@ -809,22 +899,18 @@ public class BlockManager { */ public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk, final DatanodeInfo dn, String reason) throws IOException { - namesystem.writeLock(); - try { - final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock()); - if (storedBlock == null) { - // Check if the replica is in the blockMap, if not - // ignore the request for now. This could happen when BlockScanner - // thread of Datanode reports bad block before Block reports are sent - // by the Datanode on startup - NameNode.stateChangeLog.info("BLOCK* findAndMarkBlockAsCorrupt: " - + blk + " not found."); - return; - } - markBlockAsCorrupt(storedBlock, dn, reason); - } finally { - namesystem.writeUnlock(); + assert namesystem.hasWriteLock(); + final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock()); + if (storedBlock == null) { + // Check if the replica is in the blockMap, if not + // ignore the request for now. This could happen when BlockScanner + // thread of Datanode reports bad block before Block reports are sent + // by the Datanode on startup + NameNode.stateChangeLog.info("BLOCK* findAndMarkBlockAsCorrupt: " + + blk + " not found."); + return; } + markBlockAsCorrupt(storedBlock, dn, reason); } private void markBlockAsCorrupt(BlockInfo storedBlock, @@ -876,10 +962,17 @@ public class BlockManager { + " because datanode " + dn.getName() + " does not exist."); } - // Check how many copies we have of the block. If we have at least one - // copy on a live node, then we can delete it. 
- int count = countNodes(blk).liveReplicas(); - if (count >= 1) { + // Check how many copies we have of the block + NumberReplicas nr = countNodes(blk); + if (nr.replicasOnStaleNodes() > 0) { + NameNode.stateChangeLog.info("BLOCK* invalidateBlocks: postponing " + + "invalidation of block " + blk + " on " + dn + " because " + + nr.replicasOnStaleNodes() + " replica(s) are located on nodes " + + "with potentially out-of-date block reports."); + postponeBlock(blk); + + } else if (nr.liveReplicas() >= 1) { + // If we have at least one copy on a live node, then we can delete it. addToInvalidates(blk, dn); removeStoredBlock(blk, node); if(NameNode.stateChangeLog.isDebugEnabled()) { @@ -892,6 +985,13 @@ public class BlockManager { } } + private void postponeBlock(Block blk) { + if (postponedMisreplicatedBlocks.add(blk)) { + postponedMisreplicatedBlocksCount++; + } + } + + void updateState() { pendingReplicationBlocksCount = pendingReplications.size(); underReplicatedBlocksCount = neededReplications.size(); @@ -930,7 +1030,7 @@ public class BlockManager { * * @return number of blocks scheduled for replication during this iteration. */ - private int computeReplicationWork(int blocksToProcess) throws IOException { + int computeReplicationWork(int blocksToProcess) throws IOException { List> blocksToReplicate = null; namesystem.writeLock(); try { @@ -981,8 +1081,10 @@ public class BlockManager { NumberReplicas numReplicas = new NumberReplicas(); srcNode = chooseSourceDatanode( block, containingNodes, liveReplicaNodes, numReplicas); - if(srcNode == null) // block can not be replicated from any node + if(srcNode == null) { // block can not be replicated from any node + LOG.debug("Block " + block + " cannot be repl from any node"); continue; + } assert liveReplicaNodes.size() == numReplicas.liveReplicas(); // do not schedule more if enough replicas is already pending @@ -1232,7 +1334,7 @@ public class BlockManager { srcNode = node; } if(numReplicas != null) - numReplicas.initialize(live, decommissioned, corrupt, excess); + numReplicas.initialize(live, decommissioned, corrupt, excess, 0); return srcNode; } @@ -1314,7 +1416,7 @@ public class BlockManager { // To minimize startup time, we discard any second (or later) block reports // that we receive while still in startup phase. - if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) { + if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) { NameNode.stateChangeLog.info("BLOCK* processReport: " + "discarded non-initial block report from " + nodeID.getName() + " because namenode still in startup phase"); @@ -1328,6 +1430,19 @@ public class BlockManager { } else { processReport(node, newReport); } + + // Now that we have an up-to-date block report, we know that any + // deletions from a previous NN iteration have been accounted for. + boolean staleBefore = node.areBlockContentsStale(); + node.receivedBlockReport(); + if (staleBefore && !node.areBlockContentsStale()) { + LOG.info("BLOCK* processReport: " + + "Received first block report from " + node + + " after becoming active. Its block contents are no longer" + + " considered stale."); + rescanPostponedMisreplicatedBlocks(); + } + } finally { endTime = Util.now(); namesystem.writeUnlock(); @@ -1340,6 +1455,37 @@ public class BlockManager { + ", processing time: " + (endTime - startTime) + " msecs"); } + /** + * Rescan the list of blocks which were previously postponed. 
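The invalidateBlock() rewrite above refuses to schedule a deletion while any replica lives on a datanode whose block report has not been seen since the failover. Reduced to a stand-alone decision function (the names and the third branch are illustrative simplifications):

class InvalidateDecisionSketch {
    enum Action { POSTPONE, INVALIDATE, KEEP }

    // Mirrors the decision the patch adds to invalidateBlock(): replicas on
    // "stale" datanodes (no block report since the failover) block invalidation.
    static Action decide(int liveReplicas, int replicasOnStaleNodes) {
        if (replicasOnStaleNodes > 0) {
            return Action.POSTPONE;   // re-examined once the stale nodes block-report
        } else if (liveReplicas >= 1) {
            return Action.INVALIDATE; // safe: at least one up-to-date live copy remains
        }
        return Action.KEEP;           // deleting the only known copy would lose data
    }

    public static void main(String[] args) {
        System.out.println(decide(2, 1)); // POSTPONE
        System.out.println(decide(2, 0)); // INVALIDATE
        System.out.println(decide(0, 0)); // KEEP
    }
}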
+ */ + private void rescanPostponedMisreplicatedBlocks() { + for (Iterator it = postponedMisreplicatedBlocks.iterator(); + it.hasNext();) { + Block b = it.next(); + + BlockInfo bi = blocksMap.getStoredBlock(b); + if (bi == null) { + if (LOG.isDebugEnabled()) { + LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + + "Postponed mis-replicated block " + b + " no longer found " + + "in block map."); + } + it.remove(); + postponedMisreplicatedBlocksCount--; + continue; + } + MisReplicationResult res = processMisReplicatedBlock(bi); + if (LOG.isDebugEnabled()) { + LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + + "Re-scanned block " + b + ", result is " + res); + } + if (res != MisReplicationResult.POSTPONE) { + it.remove(); + postponedMisreplicatedBlocksCount--; + } + } + } + private void processReport(final DatanodeDescriptor node, final BlockListAsLongs report) throws IOException { // Normal case: @@ -1392,9 +1538,19 @@ public class BlockManager { assert (node.numBlocks() == 0); BlockReportIterator itBR = report.getBlockReportIterator(); + boolean isStandby = namesystem.isInStandbyState(); + while(itBR.hasNext()) { Block iblk = itBR.next(); ReplicaState reportedState = itBR.getCurrentReplicaState(); + + if (isStandby && + namesystem.isGenStampInFuture(iblk.getGenerationStamp())) { + queueReportedBlock(node, iblk, reportedState, + QUEUE_REASON_FUTURE_GENSTAMP); + continue; + } + BlockInfo storedBlock = blocksMap.getStoredBlock(iblk); // If block does not belong to any file, we are done. if (storedBlock == null) continue; @@ -1404,7 +1560,14 @@ public class BlockManager { BlockToMarkCorrupt c = checkReplicaCorrupt( iblk, reportedState, storedBlock, ucState, node); if (c != null) { - markBlockAsCorrupt(c.blockInfo, node, c.reason); + if (namesystem.isInStandbyState()) { + // In the Standby, we may receive a block report for a file that we + // just have an out-of-date gen-stamp or state for, for example. + queueReportedBlock(node, iblk, reportedState, + QUEUE_REASON_CORRUPT_STATE); + } else { + markBlockAsCorrupt(c.blockInfo, node, c.reason); + } continue; } @@ -1487,7 +1650,8 @@ public class BlockManager { * @param toCorrupt replicas with unexpected length or generation stamp; * add to corrupt replicas * @param toUC replicas of blocks currently under construction - * @return + * @return the up-to-date stored block, if it should be kept. + * Otherwise, null. 
*/ private BlockInfo processReportedBlock(final DatanodeDescriptor dn, final Block block, final ReplicaState reportedState, @@ -1502,6 +1666,13 @@ public class BlockManager { + " replicaState = " + reportedState); } + if (namesystem.isInStandbyState() && + namesystem.isGenStampInFuture(block.getGenerationStamp())) { + queueReportedBlock(dn, block, reportedState, + QUEUE_REASON_FUTURE_GENSTAMP); + return null; + } + // find block by blockId BlockInfo storedBlock = blocksMap.getStoredBlock(block); if(storedBlock == null) { @@ -1519,15 +1690,24 @@ public class BlockManager { // Ignore replicas already scheduled to be removed from the DN if(invalidateBlocks.contains(dn.getStorageID(), block)) { - assert storedBlock.findDatanode(dn) < 0 : "Block " + block - + " in invalidated blocks set should not appear in DN " + dn; +/* TODO: following assertion is incorrect, see HDFS-2668 +assert storedBlock.findDatanode(dn) < 0 : "Block " + block + + " in recentInvalidatesSet should not appear in DN " + dn; */ return storedBlock; } BlockToMarkCorrupt c = checkReplicaCorrupt( block, reportedState, storedBlock, ucState, dn); if (c != null) { - toCorrupt.add(c); + if (namesystem.isInStandbyState()) { + // If the block is an out-of-date generation stamp or state, + // but we're the standby, we shouldn't treat it as corrupt, + // but instead just queue it for later processing. + queueReportedBlock(dn, storedBlock, reportedState, + QUEUE_REASON_CORRUPT_STATE); + } else { + toCorrupt.add(c); + } return storedBlock; } @@ -1545,6 +1725,68 @@ public class BlockManager { return storedBlock; } + /** + * Queue the given reported block for later processing in the + * standby node. {@see PendingDataNodeMessages}. + * @param reason a textual reason to report in the debug logs + */ + private void queueReportedBlock(DatanodeDescriptor dn, Block block, + ReplicaState reportedState, String reason) { + assert namesystem.isInStandbyState(); + + if (LOG.isDebugEnabled()) { + LOG.debug("Queueing reported block " + block + + " in state " + reportedState + + " from datanode " + dn + " for later processing " + + "because " + reason + "."); + } + pendingDNMessages.enqueueReportedBlock(dn, block, reportedState); + } + + /** + * Try to process any messages that were previously queued for the given + * block. This is called from FSEditLogLoader whenever a block's state + * in the namespace has changed or a new block has been created. + */ + public void processQueuedMessagesForBlock(Block b) throws IOException { + Queue queue = pendingDNMessages.takeBlockQueue(b); + if (queue == null) { + // Nothing to re-process + return; + } + processQueuedMessages(queue); + } + + private void processQueuedMessages(Iterable rbis) + throws IOException { + for (ReportedBlockInfo rbi : rbis) { + if (LOG.isDebugEnabled()) { + LOG.debug("Processing previouly queued message " + rbi); + } + processAndHandleReportedBlock( + rbi.getNode(), rbi.getBlock(), rbi.getReportedState(), null); + } + } + + /** + * Process any remaining queued datanode messages after entering + * active state. At this point they will not be re-queued since + * we are the definitive master node and thus should be up-to-date + * with the namespace information. 
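processReportedBlock() and queueReportedBlock() above give the standby a third option besides "accept" and "mark corrupt": defer. Roughly the condition under which a report is deferred, expressed as a stand-alone predicate; the flags stand in for isInStandbyState(), isGenStampInFuture() and the corrupt-replica check.

class StandbyQueueSketch {
    static boolean shouldQueue(boolean isStandby, boolean genStampInFuture,
                               boolean looksCorruptOrOutOfDate) {
        if (!isStandby) {
            return false;                        // the active processes reports immediately
        }
        // The standby may simply not have replayed the edits that explain this
        // replica yet, so corrupt-looking reports are deferred rather than acted on.
        return genStampInFuture || looksCorruptOrOutOfDate;
    }

    public static void main(String[] args) {
        System.out.println(shouldQueue(true, true, false));  // queue: future generation stamp
        System.out.println(shouldQueue(true, false, true));  // queue: out-of-date state
        System.out.println(shouldQueue(false, false, true)); // active: mark corrupt right away
    }
}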
+ */ + public void processAllPendingDNMessages() throws IOException { + assert !namesystem.isInStandbyState() : + "processAllPendingDNMessages() should be called after exiting " + + "standby state!"; + int count = pendingDNMessages.count(); + if (count > 0) { + LOG.info("Processing " + count + " messages from DataNodes " + + "that were previously queued during standby state."); + } + processQueuedMessages(pendingDNMessages.takeAll()); + assert pendingDNMessages.count() == 0; + } + /* * The next two methods test the various cases under which we must conclude * the replica is corrupt, or under construction. These are laid out @@ -1675,13 +1917,15 @@ public class BlockManager { // Now check for completion of blocks and safe block count int numCurrentReplica = countLiveNodes(storedBlock); if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED - && numCurrentReplica >= minReplication) - storedBlock = completeBlock(storedBlock.getINode(), storedBlock); - - // check whether safe replication is reached for the block - // only complete blocks are counted towards that - if(storedBlock.isComplete()) + && numCurrentReplica >= minReplication) { + completeBlock(storedBlock.getINode(), storedBlock, false); + } else if (storedBlock.isComplete()) { + // check whether safe replication is reached for the block + // only complete blocks are counted towards that. + // In the case that the block just became complete above, completeBlock() + // handles the safe block count maintenance. namesystem.incrementSafeBlockCount(numCurrentReplica); + } } /** @@ -1738,15 +1982,17 @@ public class BlockManager { + pendingReplications.getNumReplicas(storedBlock); if(storedBlock.getBlockUCState() == BlockUCState.COMMITTED && - numLiveReplicas >= minReplication) - storedBlock = completeBlock(fileINode, storedBlock); - - // check whether safe replication is reached for the block - // only complete blocks are counted towards that - // Is no-op if not in safe mode. - if(storedBlock.isComplete()) + numLiveReplicas >= minReplication) { + storedBlock = completeBlock(fileINode, storedBlock, false); + } else if (storedBlock.isComplete()) { + // check whether safe replication is reached for the block + // only complete blocks are counted towards that + // Is no-op if not in safe mode. + // In the case that the block just became complete above, completeBlock() + // handles the safe block count maintenance. namesystem.incrementSafeBlockCount(numCurrentReplica); - + } + // if file is under construction, then done for now if (fileINode.isUnderConstruction()) { return storedBlock; @@ -1839,49 +2085,93 @@ public class BlockManager { public void processMisReplicatedBlocks() { assert namesystem.hasWriteLock(); - long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0, + long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0, nrPostponed = 0, nrUnderConstruction = 0; neededReplications.clear(); for (BlockInfo block : blocksMap.getBlocks()) { - INodeFile fileINode = block.getINode(); - if (fileINode == null) { - // block does not belong to any file - nrInvalid++; - addToInvalidates(block); - continue; + MisReplicationResult res = processMisReplicatedBlock(block); + if (LOG.isTraceEnabled()) { + LOG.trace("block " + block + ": " + res); } - if (!block.isComplete()) { - // Incomplete blocks are never considered mis-replicated -- - // they'll be reached when they are completed or recovered. 
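processMisReplicatedBlocks() now delegates the per-block decision to processMisReplicatedBlock(), which reports one of six outcomes. A condensed, self-contained version of that classification follows; the parameters stand in for the real block and replica state, and the under-replication test is simplified relative to isNeededReplication().

class MisReplicationSketch {
    enum Result { INVALID, UNDER_CONSTRUCTION, UNDER_REPLICATED, POSTPONE, OVER_REPLICATED, OK }

    static Result classify(boolean belongsToFile, boolean isComplete,
                           int liveReplicas, int staleReplicas, int expectedReplication) {
        if (!belongsToFile) {
            return Result.INVALID;            // block of a deleted file: invalidate it
        }
        if (!isComplete) {
            return Result.UNDER_CONSTRUCTION; // never treated as mis-replicated
        }
        if (liveReplicas < expectedReplication) {
            return Result.UNDER_REPLICATED;
        }
        if (liveReplicas > expectedReplication) {
            // Replicas on stale nodes may already have been deleted, so the
            // over-replication decision is postponed until they block-report.
            return staleReplicas > 0 ? Result.POSTPONE : Result.OVER_REPLICATED;
        }
        return Result.OK;
    }

    public static void main(String[] args) {
        System.out.println(classify(true, true, 4, 1, 3));  // POSTPONE
        System.out.println(classify(true, true, 2, 0, 3));  // UNDER_REPLICATED
        System.out.println(classify(false, true, 3, 0, 3)); // INVALID
    }
}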
- nrUnderConstruction++; - continue; - } - // calculate current replication - short expectedReplication = fileINode.getReplication(); - NumberReplicas num = countNodes(block); - int numCurrentReplica = num.liveReplicas(); - // add to under-replicated queue if need to be - if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { - if (neededReplications.add(block, numCurrentReplica, num - .decommissionedReplicas(), expectedReplication)) { - nrUnderReplicated++; - } - } - - if (numCurrentReplica > expectedReplication) { - // over-replicated block + switch (res) { + case UNDER_REPLICATED: + nrUnderReplicated++; + break; + case OVER_REPLICATED: nrOverReplicated++; - processOverReplicatedBlock(block, expectedReplication, null, null); + break; + case INVALID: + nrInvalid++; + break; + case POSTPONE: + nrPostponed++; + postponeBlock(block); + break; + case UNDER_CONSTRUCTION: + nrUnderConstruction++; + break; + case OK: + break; + default: + throw new AssertionError("Invalid enum value: " + res); } } - + LOG.info("Total number of blocks = " + blocksMap.size()); LOG.info("Number of invalid blocks = " + nrInvalid); LOG.info("Number of under-replicated blocks = " + nrUnderReplicated); - LOG.info("Number of over-replicated blocks = " + nrOverReplicated); + LOG.info("Number of over-replicated blocks = " + nrOverReplicated + + ((nrPostponed > 0) ? ( " (" + nrPostponed + " postponed)") : "")); LOG.info("Number of blocks being written = " + nrUnderConstruction); } + /** + * Process a single possibly misreplicated block. This adds it to the + * appropriate queues if necessary, and returns a result code indicating + * what happened with it. + */ + private MisReplicationResult processMisReplicatedBlock(BlockInfo block) { + INodeFile fileINode = block.getINode(); + if (fileINode == null) { + // block does not belong to any file + addToInvalidates(block); + return MisReplicationResult.INVALID; + } + if (!block.isComplete()) { + // Incomplete blocks are never considered mis-replicated -- + // they'll be reached when they are completed or recovered. + return MisReplicationResult.UNDER_CONSTRUCTION; + } + // calculate current replication + short expectedReplication = fileINode.getReplication(); + NumberReplicas num = countNodes(block); + int numCurrentReplica = num.liveReplicas(); + // add to under-replicated queue if need to be + if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { + if (neededReplications.add(block, numCurrentReplica, num + .decommissionedReplicas(), expectedReplication)) { + return MisReplicationResult.UNDER_REPLICATED; + } + } + + if (numCurrentReplica > expectedReplication) { + if (num.replicasOnStaleNodes() > 0) { + // If any of the replicas of this block are on nodes that are + // considered "stale", then these replicas may in fact have + // already been deleted. So, we cannot safely act on the + // over-replication until a later point in time, when + // the "stale" nodes have block reported. + return MisReplicationResult.POSTPONE; + } + + // over-replicated block + processOverReplicatedBlock(block, expectedReplication, null, null); + return MisReplicationResult.OVER_REPLICATED; + } + + return MisReplicationResult.OK; + } + /** Set replication for the blocks. */ public void setReplication(final short oldRepl, final short newRepl, final String src, final Block... 
blocks) throws IOException { @@ -1925,6 +2215,14 @@ public class BlockManager { for (Iterator it = blocksMap.nodeIterator(block); it.hasNext();) { DatanodeDescriptor cur = it.next(); + if (cur.areBlockContentsStale()) { + LOG.info("BLOCK* processOverReplicatedBlock: " + + "Postponing processing of over-replicated block " + + block + " since datanode " + cur + " does not yet have up-to-date " + + "block information."); + postponeBlock(block); + return; + } LightWeightLinkedSet excessBlocks = excessReplicateMap.get(cur .getStorageID()); if (excessBlocks == null || !excessBlocks.contains(block)) { @@ -2151,13 +2449,19 @@ public class BlockManager { // Modify the blocks->datanode map and node's map. // pendingReplications.remove(block); - + processAndHandleReportedBlock(node, block, ReplicaState.FINALIZED, + delHintNode); + } + + private void processAndHandleReportedBlock(DatanodeDescriptor node, Block block, + ReplicaState reportedState, DatanodeDescriptor delHintNode) + throws IOException { // blockReceived reports a finalized block Collection toAdd = new LinkedList(); Collection toInvalidate = new LinkedList(); Collection toCorrupt = new LinkedList(); Collection toUC = new LinkedList(); - processReportedBlock(node, block, ReplicaState.FINALIZED, + processReportedBlock(node, block, reportedState, toAdd, toInvalidate, toCorrupt, toUC); // the block is only in one of the to-do lists // if it is in none then data-node already has it @@ -2181,59 +2485,80 @@ public class BlockManager { } } - /** The given node is reporting that it received/deleted certain blocks. */ - public void blockReceivedAndDeleted(final DatanodeID nodeID, + /** + * The given node is reporting incremental information about some blocks. + * This includes blocks that are starting to be received, completed being + * received, or deleted. + */ + public void processIncrementalBlockReport(final DatanodeID nodeID, final String poolId, - final ReceivedDeletedBlockInfo receivedAndDeletedBlocks[] + final ReceivedDeletedBlockInfo blockInfos[] ) throws IOException { namesystem.writeLock(); int received = 0; int deleted = 0; + int receiving = 0; try { final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { NameNode.stateChangeLog - .warn("BLOCK* blockReceivedDeleted" + .warn("BLOCK* processIncrementalBlockReport" + " is received from dead or unregistered node " + nodeID.getName()); throw new IOException( - "Got blockReceivedDeleted message from unregistered or dead node"); + "Got incremental block report from unregistered or dead node"); } - for (int i = 0; i < receivedAndDeletedBlocks.length; i++) { - if (receivedAndDeletedBlocks[i].isDeletedBlock()) { - removeStoredBlock( - receivedAndDeletedBlocks[i].getBlock(), node); + for (ReceivedDeletedBlockInfo rdbi : blockInfos) { + switch (rdbi.getStatus()) { + case DELETED_BLOCK: + removeStoredBlock(rdbi.getBlock(), node); deleted++; - } else { - addBlock(node, receivedAndDeletedBlocks[i].getBlock(), - receivedAndDeletedBlocks[i].getDelHints()); + break; + case RECEIVED_BLOCK: + addBlock(node, rdbi.getBlock(), rdbi.getDelHints()); received++; + break; + case RECEIVING_BLOCK: + receiving++; + processAndHandleReportedBlock(node, rdbi.getBlock(), + ReplicaState.RBW, null); + break; + default: + String msg = + "Unknown block status code reported by " + nodeID.getName() + + ": " + rdbi; + NameNode.stateChangeLog.warn(msg); + assert false : msg; // if assertions are enabled, throw. 
+ break; } if (NameNode.stateChangeLog.isDebugEnabled()) { - NameNode.stateChangeLog.debug("BLOCK* block" - + (receivedAndDeletedBlocks[i].isDeletedBlock() ? "Deleted" - : "Received") + ": " + receivedAndDeletedBlocks[i].getBlock() + NameNode.stateChangeLog.debug("BLOCK* block " + + (rdbi.getStatus()) + ": " + rdbi.getBlock() + " is received from " + nodeID.getName()); } } } finally { namesystem.writeUnlock(); NameNode.stateChangeLog - .debug("*BLOCK* NameNode.blockReceivedAndDeleted: " + "from " - + nodeID.getName() + " received: " + received + ", " + .debug("*BLOCK* NameNode.processIncrementalBlockReport: " + "from " + + nodeID.getName() + + " receiving: " + receiving + ", " + + " received: " + received + ", " + " deleted: " + deleted); } } /** - * Return the number of nodes that are live and decommissioned. + * Return the number of nodes hosting a given block, grouped + * by the state of those replicas. */ public NumberReplicas countNodes(Block b) { - int count = 0; + int decommissioned = 0; int live = 0; int corrupt = 0; int excess = 0; + int stale = 0; Iterator nodeIter = blocksMap.nodeIterator(b); Collection nodesCorrupt = corruptReplicas.getNodes(b); while (nodeIter.hasNext()) { @@ -2241,7 +2566,7 @@ public class BlockManager { if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt++; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { - count++; + decommissioned++; } else { LightWeightLinkedSet blocksExcess = excessReplicateMap.get(node .getStorageID()); @@ -2251,8 +2576,11 @@ public class BlockManager { live++; } } + if (node.areBlockContentsStale()) { + stale++; + } } - return new NumberReplicas(live, count, corrupt, excess); + return new NumberReplicas(live, decommissioned, corrupt, excess, stale); } /** @@ -2379,7 +2707,7 @@ public class BlockManager { } public int getActiveBlockCount() { - return blocksMap.size() - (int)invalidateBlocks.numBlocks(); + return blocksMap.size(); } public DatanodeDescriptor[] getNodes(BlockInfo block) { @@ -2397,10 +2725,17 @@ public class BlockManager { } public void removeBlock(Block block) { + assert namesystem.hasWriteLock(); + // No need to ACK blocks that are being removed entirely + // from the namespace, since the removal of the associated + // file already removes them from the block map below. block.setNumBytes(BlockCommand.NO_ACK); addToInvalidates(block); corruptReplicas.removeFromCorruptReplicasMap(block); blocksMap.removeBlock(block); + if (postponedMisreplicatedBlocks.remove(block)) { + postponedMisreplicatedBlocksCount--; + } } public BlockInfo getStoredBlock(Block block) { @@ -2412,6 +2747,9 @@ public class BlockManager { final int curReplicasDelta, int expectedReplicasDelta) { namesystem.writeLock(); try { + if (!namesystem.isPopulatingReplQueues()) { + return; + } NumberReplicas repl = countNodes(block); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) { @@ -2461,8 +2799,10 @@ public class BlockManager { namesystem.writeLock(); try { // blocks should not be replicated or removed if safe mode is on - if (namesystem.isInSafeMode()) + if (namesystem.isInSafeMode()) { + LOG.debug("In safemode, not computing replication work"); return 0; + } // get blocks to invalidate for the nodeId assert nodeId != null; return invalidateBlocks.invalidateWork(nodeId); @@ -2645,6 +2985,19 @@ public class BlockManager { return workFound; } + /** + * Clear all queues that hold decisions previously made by + * this NameNode. 
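countNodes() above now returns a fifth counter: replicas that sit on stale datanodes, counted in addition to (not instead of) the live/decommissioned/corrupt/excess buckets. A small stand-alone tally mirroring that grouping; the Replica class is a stand-in for the per-DatanodeDescriptor state the real loop inspects.

import java.util.Arrays;
import java.util.List;

class CountNodesSketch {
    static class Replica {
        boolean corrupt, decommissioned, excess, stale;
        Replica(boolean corrupt, boolean decommissioned, boolean excess, boolean stale) {
            this.corrupt = corrupt; this.decommissioned = decommissioned;
            this.excess = excess; this.stale = stale;
        }
    }

    // Each replica lands in exactly one of live/decommissioned/corrupt/excess,
    // while "stale" is counted on top of whichever bucket it fell into.
    static int[] count(List<Replica> replicas) {
        int live = 0, decom = 0, corrupt = 0, excess = 0, stale = 0;
        for (Replica r : replicas) {
            if (r.corrupt) corrupt++;
            else if (r.decommissioned) decom++;
            else if (r.excess) excess++;
            else live++;
            if (r.stale) stale++; // not mutually exclusive with the buckets above
        }
        return new int[] { live, decom, corrupt, excess, stale };
    }

    public static void main(String[] args) {
        List<Replica> replicas = Arrays.asList(
            new Replica(false, false, false, true),
            new Replica(false, false, false, false));
        System.out.println(Arrays.toString(count(replicas))); // [2, 0, 0, 0, 1]
    }
}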
+ */ + public void clearQueues() { + neededReplications.clear(); + pendingReplications.clear(); + excessReplicateMap.clear(); + invalidateBlocks.clear(); + datanodeManager.clearPendingQueues(); + }; + + private static class ReplicationWork { private Block block; @@ -2675,4 +3028,24 @@ public class BlockManager { this.targets = null; } } + + /** + * A simple result enum for the result of + * {@link BlockManager#processMisReplicatedBlock(BlockInfo)}. + */ + enum MisReplicationResult { + /** The block should be invalidated since it belongs to a deleted file. */ + INVALID, + /** The block is currently under-replicated. */ + UNDER_REPLICATED, + /** The block is currently over-replicated. */ + OVER_REPLICATED, + /** A decision can't currently be made about this block. */ + POSTPONE, + /** The block is under construction, so should be ignored */ + UNDER_CONSTRUCTION, + /** The block is properly replicated */ + OK + } + } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java index ac1a7e68e0..058d2e37aa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java @@ -63,7 +63,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { initialize(conf, stats, clusterMap); } - BlockPlacementPolicyDefault() { + protected BlockPlacementPolicyDefault() { } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java index d927f05297..984456f142 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java @@ -94,6 +94,10 @@ public class DatanodeDescriptor extends DatanodeInfo { boolean contains(E e) { return blockq.contains(e); } + + synchronized void clear() { + blockq.clear(); + } } private volatile BlockInfo blockList = null; @@ -103,6 +107,24 @@ public class DatanodeDescriptor extends DatanodeInfo { public boolean isAlive = false; public boolean needKeyUpdate = false; + /** + * Set to false on any NN failover, and reset to true + * whenever a block report is received. + */ + private boolean heartbeatedSinceFailover = false; + + /** + * At startup or at any failover, the DNs in the cluster may + * have pending block deletions from a previous incarnation + * of the NameNode. Thus, we consider their block contents + * stale until we have received a block report. When a DN + * is considered stale, any replicas on it are transitively + * considered stale. If any block has at least one stale replica, + * then no invalidations will be processed for this block. + * See HDFS-1972. 
+ */ + private boolean blockContentsStale = true; + // A system administrator can tune the balancer bandwidth parameter // (dfs.balance.bandwidthPerSec) dynamically by calling // "dfsadmin -setBalanacerBandwidth ", at which point the @@ -129,6 +151,10 @@ public class DatanodeDescriptor extends DatanodeInfo { private long lastBlocksScheduledRollTime = 0; private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min private int volumeFailures = 0; + + /** Set to false after processing first block report */ + private boolean firstBlockReport = true; + /** * When set to true, the node is not in include list and is not allowed * to communicate with the namenode @@ -281,6 +307,14 @@ public class DatanodeDescriptor extends DatanodeInfo { this.invalidateBlocks.clear(); this.volumeFailures = 0; } + + public void clearBlockQueues() { + synchronized (invalidateBlocks) { + this.invalidateBlocks.clear(); + this.recoverBlocks.clear(); + this.replicateBlocks.clear(); + } + } public int numBlocks() { return numBlocks; @@ -298,6 +332,7 @@ public class DatanodeDescriptor extends DatanodeInfo { this.lastUpdate = System.currentTimeMillis(); this.xceiverCount = xceiverCount; this.volumeFailures = volFailures; + this.heartbeatedSinceFailover = true; rollBlocksScheduled(lastUpdate); } @@ -564,5 +599,41 @@ public class DatanodeDescriptor extends DatanodeInfo { this.bandwidth = bandwidth; } + public boolean areBlockContentsStale() { + return blockContentsStale; + } + public void markStaleAfterFailover() { + heartbeatedSinceFailover = false; + blockContentsStale = true; + } + + public void receivedBlockReport() { + if (heartbeatedSinceFailover) { + blockContentsStale = false; + } + firstBlockReport = false; + } + + boolean isFirstBlockReport() { + return firstBlockReport; + } + + @Override + public String dumpDatanode() { + StringBuilder sb = new StringBuilder(super.dumpDatanode()); + int repl = replicateBlocks.size(); + if (repl > 0) { + sb.append(" ").append(repl).append(" blocks to be replicated;"); + } + int inval = invalidateBlocks.size(); + if (inval > 0) { + sb.append(" ").append(inval).append(" blocks to be invalidated;"); + } + int recover = recoverBlocks.size(); + if (recover > 0) { + sb.append(" ").append(recover).append(" blocks to be recovered;"); + } + return sb.toString(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index 5d795e7445..8c59ccba5f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -923,7 +923,7 @@ public class DatanodeManager { } } - return null; + return new DatanodeCommand[0]; } /** @@ -947,4 +947,27 @@ public class DatanodeManager { } } } + + public void markAllDatanodesStale() { + LOG.info("Marking all datandoes as stale"); + synchronized (datanodeMap) { + for (DatanodeDescriptor dn : datanodeMap.values()) { + dn.markStaleAfterFailover(); + } + } + } + + /** + * Clear any actions that are queued up to be sent to the DNs + * on their next heartbeats. This includes block invalidations, + * recoveries, and replication requests. 
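The DatanodeDescriptor and DatanodeManager hunks above implement the staleness protocol: on failover every datanode is marked stale, and a node only stops being stale after it has both heartbeated and block-reported to the new active. A simplified model of that state machine follows; the method names follow the patch, but this class is only an illustration, not the real descriptor.

class DatanodeStalenessSketch {
    private boolean heartbeatedSinceFailover = false;
    private boolean blockContentsStale = true; // stale until a post-failover block report

    void markStaleAfterFailover() {       // called for every DN on an NN failover
        heartbeatedSinceFailover = false;
        blockContentsStale = true;
    }

    void updateHeartbeat() {              // any heartbeat against this NN incarnation
        heartbeatedSinceFailover = true;
    }

    void receivedBlockReport() {          // a full block report clears the stale flag,
        if (heartbeatedSinceFailover) {   // but only once the DN has also heartbeated
            blockContentsStale = false;
        }
    }

    boolean areBlockContentsStale() {
        return blockContentsStale;
    }

    public static void main(String[] args) {
        DatanodeStalenessSketch dn = new DatanodeStalenessSketch();
        dn.markStaleAfterFailover();
        dn.receivedBlockReport();                       // report without a prior heartbeat
        System.out.println(dn.areBlockContentsStale()); // true: still stale
        dn.updateHeartbeat();
        dn.receivedBlockReport();
        System.out.println(dn.areBlockContentsStale()); // false
    }
}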
+ */ + public void clearPendingQueues() { + synchronized (datanodeMap) { + for (DatanodeDescriptor dn : datanodeMap.values()) { + dn.clearBlockQueues(); + } + } + } + } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java index 2c6b46f050..5c7e0bdca1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java @@ -160,4 +160,9 @@ class InvalidateBlocks { numBlocks -= toInvalidate.size(); return toInvalidate; } + + synchronized void clear() { + node2blocks.clear(); + numBlocks = 0; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java index 52f62587b1..9e5c8dfd5e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java @@ -26,20 +26,22 @@ public class NumberReplicas { private int decommissionedReplicas; private int corruptReplicas; private int excessReplicas; + private int replicasOnStaleNodes; NumberReplicas() { - initialize(0, 0, 0, 0); + initialize(0, 0, 0, 0, 0); } - NumberReplicas(int live, int decommissioned, int corrupt, int excess) { - initialize(live, decommissioned, corrupt, excess); + NumberReplicas(int live, int decommissioned, int corrupt, int excess, int stale) { + initialize(live, decommissioned, corrupt, excess, stale); } - void initialize(int live, int decommissioned, int corrupt, int excess) { + void initialize(int live, int decommissioned, int corrupt, int excess, int stale) { liveReplicas = live; decommissionedReplicas = decommissioned; corruptReplicas = corrupt; excessReplicas = excess; + replicasOnStaleNodes = stale; } public int liveReplicas() { @@ -54,4 +56,13 @@ public class NumberReplicas { public int excessReplicas() { return excessReplicas; } + + /** + * @return the number of replicas which are on stale nodes. + * This is not mutually exclusive with the other counts -- ie a + * replica may count as both "live" and "stale". + */ + public int replicasOnStaleNodes() { + return replicasOnStaleNodes; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java new file mode 100644 index 0000000000..b7da116048 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.blockmanagement; + +import java.util.List; +import java.util.Map; +import java.util.Queue; + +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; + +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +/** + * In the Standby Node, we can receive messages about blocks + * before they are actually available in the namespace, or while + * they have an outdated state in the namespace. In those cases, + * we queue those block-related messages in this structure. + * */ +class PendingDataNodeMessages { + + Map> queueByBlockId = + Maps.newHashMap(); + private int count = 0; + + + static class ReportedBlockInfo { + private final Block block; + private final DatanodeDescriptor dn; + private final ReplicaState reportedState; + + ReportedBlockInfo(DatanodeDescriptor dn, Block block, + ReplicaState reportedState) { + this.dn = dn; + this.block = block; + this.reportedState = reportedState; + } + + Block getBlock() { + return block; + } + + DatanodeDescriptor getNode() { + return dn; + } + + ReplicaState getReportedState() { + return reportedState; + } + + @Override + public String toString() { + return "ReportedBlockInfo [block=" + block + ", dn=" + dn + + ", reportedState=" + reportedState + "]"; + } + } + + void enqueueReportedBlock(DatanodeDescriptor dn, Block block, + ReplicaState reportedState) { + block = new Block(block); + getBlockQueue(block).add( + new ReportedBlockInfo(dn, block, reportedState)); + count++; + } + + /** + * @return any messages that were previously queued for the given block, + * or null if no messages were queued. 
+ */ + Queue takeBlockQueue(Block block) { + Queue queue = queueByBlockId.remove(block); + if (queue != null) { + count -= queue.size(); + } + return queue; + } + + + private Queue getBlockQueue(Block block) { + Queue queue = queueByBlockId.get(block); + if (queue == null) { + queue = Lists.newLinkedList(); + queueByBlockId.put(block, queue); + } + return queue; + } + + public int count() { + return count ; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Map.Entry> entry : + queueByBlockId.entrySet()) { + sb.append("Block " + entry.getKey() + ":\n"); + for (ReportedBlockInfo rbi : entry.getValue()) { + sb.append(" ").append(rbi).append("\n"); + } + } + return sb.toString(); + } + + public Iterable takeAll() { + List rbis = Lists.newArrayListWithCapacity( + count); + for (Queue q : queueByBlockId.values()) { + rbis.addAll(q); + } + queueByBlockId.clear(); + count = 0; + return rbis; + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java index e07cf9bb2a..e200ed0dea 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java @@ -104,6 +104,14 @@ class PendingReplicationBlocks { } } + + public void clear() { + synchronized (pendingReplications) { + pendingReplications.clear(); + timedOutItems.clear(); + } + } + /** * The total number of blocks that are undergoing replication */ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java index 3de906701f..c76d24c6a7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java @@ -568,7 +568,7 @@ public abstract class Storage extends StorageInfo { *

<p> Locking is not supported by all file systems.
 * E.g., NFS does not consistently support exclusive locks.
 * 
 - * <p> If locking is supported we guarantee exculsive access to the
 + * <p>
    If locking is supported we guarantee exclusive access to the * storage directory. Otherwise, no guarantee is given. * * @throws IOException if locking fails diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java index 642551e379..1f4e974166 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java @@ -23,6 +23,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collection; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -97,9 +98,9 @@ public final class Util { * @param names collection of strings to convert to URIs * @return collection of URIs */ - public static Collection stringCollectionAsURIs( + public static List stringCollectionAsURIs( Collection names) { - Collection uris = new ArrayList(names.size()); + List uris = new ArrayList(names.size()); for(String name : names) { try { uris.add(stringAsURI(name)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java index 5b1ed7c5a5..27567b543f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java @@ -17,20 +17,16 @@ */ package org.apache.hadoop.hdfs.server.datanode; -import static org.apache.hadoop.hdfs.server.common.Util.now; - import java.io.IOException; import java.net.InetSocketAddress; -import java.net.SocketTimeoutException; -import java.net.URI; -import java.util.Collection; -import java.util.LinkedList; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; import org.apache.commons.logging.Log; import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.protocol.Block; -import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.HdfsConstants; @@ -50,8 +46,11 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException; import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand; import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; +import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus; +import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand; import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; import org.apache.hadoop.hdfs.server.protocol.StorageReport; @@ -61,23 +60,22 @@ import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.util.StringUtils; import 
com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; /** - * A thread per namenode to perform: - *

<ul>
 - * <li> Pre-registration handshake with namenode</li>
 - * <li> Registration with namenode</li>
 - * <li> Send periodic heartbeats to the namenode</li>
 - * <li> Handle commands received from the namenode</li>
 - * </ul>
    + * One instance per block-pool/namespace on the DN, which handles the + * heartbeats to the active and standby NNs for that namespace. + * This class manages an instance of {@link BPServiceActor} for each NN, + * and delegates calls to both NNs. + * It also maintains the state about which of the NNs is considered active. */ @InterfaceAudience.Private -class BPOfferService implements Runnable { +class BPOfferService { static final Log LOG = DataNode.LOG; - final InetSocketAddress nnAddr; - /** * Information about the namespace that this service * is registering with. This is assigned after @@ -92,43 +90,80 @@ class BPOfferService implements Runnable { */ DatanodeRegistration bpRegistration; - long lastBlockReport = 0; - long lastDeletedReport = 0; - - boolean resetBlockReportTime = true; - - Thread bpThread; - DatanodeProtocolClientSideTranslatorPB bpNamenode; - private long lastHeartbeat = 0; - private volatile boolean initialized = false; - private final LinkedList receivedAndDeletedBlockList - = new LinkedList(); - private volatile int pendingReceivedRequests = 0; - private volatile boolean shouldServiceRun = true; UpgradeManagerDatanode upgradeManager = null; private final DataNode dn; - private final DNConf dnConf; - BPOfferService(InetSocketAddress nnAddr, DataNode dn) { + /** + * A reference to the BPServiceActor associated with the currently + * ACTIVE NN. In the case that all NameNodes are in STANDBY mode, + * this can be null. If non-null, this must always refer to a member + * of the {@link #bpServices} list. + */ + private BPServiceActor bpServiceToActive = null; + + /** + * The list of all actors for namenodes in this nameservice, regardless + * of their active or standby states. + */ + private List bpServices = + new CopyOnWriteArrayList(); + + /** + * Each time we receive a heartbeat from a NN claiming to be ACTIVE, + * we record that NN's most recent transaction ID here, so long as it + * is more recent than the previous value. This allows us to detect + * split-brain scenarios in which a prior NN is still asserting its + * ACTIVE state but with a too-low transaction ID. See HDFS-2627 + * for details. + */ + private long lastActiveClaimTxId = -1; + + BPOfferService(List nnAddrs, DataNode dn) { + Preconditions.checkArgument(!nnAddrs.isEmpty(), + "Must pass at least one NN."); this.dn = dn; - this.nnAddr = nnAddr; - this.dnConf = dn.getDnConf(); + + for (InetSocketAddress addr : nnAddrs) { + this.bpServices.add(new BPServiceActor(addr, this)); + } + } + + void refreshNNList(ArrayList addrs) throws IOException { + Set oldAddrs = Sets.newHashSet(); + for (BPServiceActor actor : bpServices) { + oldAddrs.add(actor.getNNSocketAddress()); + } + Set newAddrs = Sets.newHashSet(addrs); + + if (!Sets.symmetricDifference(oldAddrs, newAddrs).isEmpty()) { + // Keep things simple for now -- we can implement this at a later date. + throw new IOException( + "HA does not currently support adding a new standby to a running DN. " + + "Please do a rolling restart of DNs to reconfigure the list of NNs."); + } } /** - * returns true if BP thread has completed initialization of storage - * and has registered with the corresponding namenode - * @return true if initialized + * @return true if the service has registered with at least one NameNode. 
*/ - public boolean isInitialized() { - return initialized; + boolean isInitialized() { + return bpRegistration != null; } - public boolean isAlive() { - return shouldServiceRun && bpThread.isAlive(); + /** + * @return true if there is at least one actor thread running which is + * talking to a NameNode. + */ + boolean isAlive() { + for (BPServiceActor actor : bpServices) { + if (actor.isAlive()) { + return true; + } + } + return false; } - public String getBlockPoolId() { + String getBlockPoolId() { if (bpNSInfo != null) { return bpNSInfo.getBlockPoolID(); } else { @@ -138,10 +173,11 @@ class BPOfferService implements Runnable { } } - public NamespaceInfo getNamespaceInfo() { + synchronized NamespaceInfo getNamespaceInfo() { return bpNSInfo; } + @Override public String toString() { if (bpNSInfo == null) { // If we haven't yet connected to our NN, we don't yet know our @@ -153,522 +189,363 @@ class BPOfferService implements Runnable { storageId = "unknown"; } return "Block pool (storage id " + storageId + - ") connecting to " + nnAddr; + ")"; } else { return "Block pool " + getBlockPoolId() + " (storage id " + dn.getStorageId() + - ") registered with " + nnAddr; + ")"; } } - InetSocketAddress getNNSocketAddress() { - return nnAddr; - } - - /** - * Used to inject a spy NN in the unit tests. - */ - @VisibleForTesting - void setNameNode(DatanodeProtocolClientSideTranslatorPB dnProtocol) { - bpNamenode = dnProtocol; - } - - /** - * Perform the first part of the handshake with the NameNode. - * This calls versionRequest to determine the NN's - * namespace and version info. It automatically retries until - * the NN responds or the DN is shutting down. - * - * @return the NamespaceInfo - * @throws IncorrectVersionException if the remote NN does not match - * this DN's version - */ - NamespaceInfo retrieveNamespaceInfo() throws IncorrectVersionException { - NamespaceInfo nsInfo = null; - while (shouldRun()) { - try { - nsInfo = bpNamenode.versionRequest(); - LOG.debug(this + " received versionRequest response: " + nsInfo); - break; - } catch(SocketTimeoutException e) { // namenode is busy - LOG.warn("Problem connecting to server: " + nnAddr); - } catch(IOException e ) { // namenode is not available - LOG.warn("Problem connecting to server: " + nnAddr); - } - - // try again in a second - sleepAndLogInterrupts(5000, "requesting version info from NN"); - } - - if (nsInfo != null) { - checkNNVersion(nsInfo); - } - return nsInfo; - } - - private void checkNNVersion(NamespaceInfo nsInfo) - throws IncorrectVersionException { - // build and layout versions should match - String nsBuildVer = nsInfo.getBuildVersion(); - String stBuildVer = Storage.getBuildVersion(); - if (!nsBuildVer.equals(stBuildVer)) { - LOG.warn("Data-node and name-node Build versions must be the same. " + - "Namenode build version: " + nsBuildVer + "Datanode " + - "build version: " + stBuildVer); - throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer); - } - - if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) { - LOG.warn("Data-node and name-node layout versions must be the same." + - " Expected: "+ HdfsConstants.LAYOUT_VERSION + - " actual "+ bpNSInfo.getLayoutVersion()); - throw new IncorrectVersionException( - bpNSInfo.getLayoutVersion(), "namenode"); - } - } - - private void connectToNNAndHandshake() throws IOException { - // get NN proxy - bpNamenode = dn.connectToNN(nnAddr); - - // First phase of the handshake with NN - get the namespace - // info. 
- bpNSInfo = retrieveNamespaceInfo(); - - // Now that we know the namespace ID, etc, we can pass this to the DN. - // The DN can now initialize its local storage if we are the - // first BP to handshake, etc. - dn.initBlockPool(this); - - // Second phase of the handshake with the NN. - register(); - } - - /** - * This methods arranges for the data node to send the block report at - * the next heartbeat. - */ - void scheduleBlockReport(long delay) { - if (delay > 0) { // send BR after random delay - lastBlockReport = System.currentTimeMillis() - - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay))); - } else { // send at next heartbeat - lastBlockReport = lastHeartbeat - dnConf.blockReportInterval; - } - resetBlockReportTime = true; // reset future BRs for randomness - } - void reportBadBlocks(ExtendedBlock block) { - DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) }; - LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) }; - - try { - bpNamenode.reportBadBlocks(blocks); - } catch (IOException e){ - /* One common reason is that NameNode could be in safe mode. - * Should we keep on retrying in that case? - */ - LOG.warn("Failed to report bad block " + block + " to namenode : " - + " Exception", e); + checkBlock(block); + for (BPServiceActor actor : bpServices) { + actor.reportBadBlocks(block); } - } - /** - * Report received blocks and delete hints to the Namenode - * - * @throws IOException - */ - private void reportReceivedDeletedBlocks() throws IOException { - - // check if there are newly received blocks - ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null; - int currentReceivedRequestsCounter; - synchronized (receivedAndDeletedBlockList) { - currentReceivedRequestsCounter = pendingReceivedRequests; - int numBlocks = receivedAndDeletedBlockList.size(); - if (numBlocks > 0) { - // - // Send newly-received and deleted blockids to namenode - // - receivedAndDeletedBlockArray = receivedAndDeletedBlockList - .toArray(new ReceivedDeletedBlockInfo[numBlocks]); - } - } - if (receivedAndDeletedBlockArray != null) { - StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( - bpRegistration.getStorageID(), receivedAndDeletedBlockArray) }; - bpNamenode.blockReceivedAndDeleted(bpRegistration, getBlockPoolId(), - report); - synchronized (receivedAndDeletedBlockList) { - for (int i = 0; i < receivedAndDeletedBlockArray.length; i++) { - receivedAndDeletedBlockList.remove(receivedAndDeletedBlockArray[i]); - } - pendingReceivedRequests -= currentReceivedRequestsCounter; - } - } - } - /* * Informing the name node could take a long long time! Should we wait * till namenode is informed before responding with success to the * client? For now we don't. */ void notifyNamenodeReceivedBlock(ExtendedBlock block, String delHint) { - if (block == null || delHint == null) { - throw new IllegalArgumentException(block == null ? "Block is null" - : "delHint is null"); - } + checkBlock(block); + checkDelHint(delHint); + ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo( + block.getLocalBlock(), + ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, + delHint); - if (!block.getBlockPoolId().equals(getBlockPoolId())) { - LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. 
" - + getBlockPoolId()); - return; + for (BPServiceActor actor : bpServices) { + actor.notifyNamenodeBlockImmediately(bInfo); } + } - synchronized (receivedAndDeletedBlockList) { - receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block - .getLocalBlock(), delHint)); - pendingReceivedRequests++; - receivedAndDeletedBlockList.notifyAll(); - } + private void checkBlock(ExtendedBlock block) { + Preconditions.checkArgument(block != null, + "block is null"); + Preconditions.checkArgument(block.getBlockPoolId().equals(getBlockPoolId()), + "block belongs to BP %s instead of BP %s", + block.getBlockPoolId(), getBlockPoolId()); + } + + private void checkDelHint(String delHint) { + Preconditions.checkArgument(delHint != null, + "delHint is null"); } void notifyNamenodeDeletedBlock(ExtendedBlock block) { - if (block == null) { - throw new IllegalArgumentException("Block is null"); + checkBlock(block); + ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo( + block.getLocalBlock(), BlockStatus.DELETED_BLOCK, null); + + for (BPServiceActor actor : bpServices) { + actor.notifyNamenodeDeletedBlock(bInfo); } - - if (!block.getBlockPoolId().equals(getBlockPoolId())) { - LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. " - + getBlockPoolId()); - return; - } - - synchronized (receivedAndDeletedBlockList) { - receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block - .getLocalBlock(), ReceivedDeletedBlockInfo.TODELETE_HINT)); + } + + void notifyNamenodeReceivingBlock(ExtendedBlock block) { + checkBlock(block); + ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo( + block.getLocalBlock(), BlockStatus.RECEIVING_BLOCK, null); + + for (BPServiceActor actor : bpServices) { + actor.notifyNamenodeBlockImmediately(bInfo); } } - - /** - * Report the list blocks to the Namenode - * @throws IOException - */ - DatanodeCommand blockReport() throws IOException { - // send block report if timer has expired. - DatanodeCommand cmd = null; - long startTime = now(); - if (startTime - lastBlockReport > dnConf.blockReportInterval) { - - // Create block report - long brCreateStartTime = now(); - BlockListAsLongs bReport = dn.data.getBlockReport(getBlockPoolId()); - - // Send block report - long brSendStartTime = now(); - StorageBlockReport[] report = { new StorageBlockReport( - bpRegistration.getStorageID(), bReport.getBlockListAsLongs()) }; - cmd = bpNamenode.blockReport(bpRegistration, getBlockPoolId(), report); - - // Log the block report processing stats from Datanode perspective - long brSendCost = now() - brSendStartTime; - long brCreateCost = brSendStartTime - brCreateStartTime; - dn.metrics.addBlockReport(brSendCost); - LOG.info("BlockReport of " + bReport.getNumberOfBlocks() - + " blocks took " + brCreateCost + " msec to generate and " - + brSendCost + " msecs for RPC and NN processing"); - - // If we have sent the first block report, then wait a random - // time before we start the periodic block reports. - if (resetBlockReportTime) { - lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval)); - resetBlockReportTime = false; - } else { - /* say the last block report was at 8:20:14. The current report - * should have started around 9:20:14 (default 1 hour interval). 
- * If current time is : - * 1) normal like 9:20:18, next report should be at 10:20:14 - * 2) unexpected like 11:35:43, next report should be at 12:20:14 - */ - lastBlockReport += (now() - lastBlockReport) / - dnConf.blockReportInterval * dnConf.blockReportInterval; - } - LOG.info("sent block report, processed command:" + cmd); - } - return cmd; - } - - - DatanodeCommand [] sendHeartBeat() throws IOException { - // reports number of failed volumes - StorageReport[] report = { new StorageReport(bpRegistration.getStorageID(), - false, dn.data.getCapacity(), dn.data.getDfsUsed(), - dn.data.getRemaining(), dn.data.getBlockPoolUsed(getBlockPoolId())) }; - return bpNamenode.sendHeartbeat(bpRegistration, report, - dn.xmitsInProgress.get(), - dn.getXceiverCount(), dn.data.getNumFailedVolumes()); - } - //This must be called only by blockPoolManager void start() { - if ((bpThread != null) && (bpThread.isAlive())) { - //Thread is started already - return; + for (BPServiceActor actor : bpServices) { + actor.start(); } - bpThread = new Thread(this, formatThreadName()); - bpThread.setDaemon(true); // needed for JUnit testing - bpThread.start(); - } - - private String formatThreadName() { - Collection dataDirs = DataNode.getStorageDirs(dn.getConf()); - return "DataNode: [" + - StringUtils.uriToString(dataDirs.toArray(new URI[0])) + "] " + - " heartbeating to " + nnAddr; } //This must be called only by blockPoolManager. void stop() { - shouldServiceRun = false; - if (bpThread != null) { - bpThread.interrupt(); + for (BPServiceActor actor : bpServices) { + actor.stop(); } } //This must be called only by blockPoolManager void join() { - try { - if (bpThread != null) { - bpThread.join(); - } - } catch (InterruptedException ie) { } + for (BPServiceActor actor : bpServices) { + actor.join(); + } + } + + synchronized UpgradeManagerDatanode getUpgradeManager() { + if(upgradeManager == null) + upgradeManager = + new UpgradeManagerDatanode(dn, getBlockPoolId()); + + return upgradeManager; } - //Cleanup method to be called by current thread before exiting. - private synchronized void cleanUp() { - - if(upgradeManager != null) - upgradeManager.shutdownUpgrade(); - shouldServiceRun = false; - IOUtils.cleanup(LOG, bpNamenode); - dn.shutdownBlockPool(this); + void processDistributedUpgradeCommand(UpgradeCommand comm) + throws IOException { + UpgradeManagerDatanode upgradeManager = getUpgradeManager(); + upgradeManager.processUpgradeCommand(comm); } /** - * Main loop for each BP thread. Run until shutdown, - * forever calling remote NameNode functions. + * Start distributed upgrade if it should be initiated by the data-node. */ - private void offerService() throws Exception { - LOG.info("For namenode " + nnAddr + " using DELETEREPORT_INTERVAL of " - + dnConf.deleteReportInterval + " msec " + " BLOCKREPORT_INTERVAL of " - + dnConf.blockReportInterval + "msec" + " Initial delay: " - + dnConf.initialBlockReportDelay + "msec" + "; heartBeatInterval=" - + dnConf.heartBeatInterval); - - // - // Now loop for a long time.... 
- // - while (shouldRun()) { - try { - long startTime = now(); - - // - // Every so often, send heartbeat or block-report - // - if (startTime - lastHeartbeat > dnConf.heartBeatInterval) { - // - // All heartbeat messages include following info: - // -- Datanode name - // -- data transfer port - // -- Total capacity - // -- Bytes remaining - // - lastHeartbeat = startTime; - if (!dn.areHeartbeatsDisabledForTests()) { - DatanodeCommand[] cmds = sendHeartBeat(); - dn.metrics.addHeartbeat(now() - startTime); - - long startProcessCommands = now(); - if (!processCommand(cmds)) - continue; - long endProcessCommands = now(); - if (endProcessCommands - startProcessCommands > 2000) { - LOG.info("Took " + (endProcessCommands - startProcessCommands) + - "ms to process " + cmds.length + " commands from NN"); - } - } - } - if (pendingReceivedRequests > 0 - || (startTime - lastDeletedReport > dnConf.deleteReportInterval)) { - reportReceivedDeletedBlocks(); - lastDeletedReport = startTime; - } - - DatanodeCommand cmd = blockReport(); - processCommand(cmd); - - // Now safe to start scanning the block pool - if (dn.blockScanner != null) { - dn.blockScanner.addBlockPool(this.getBlockPoolId()); - } - - // - // There is no work to do; sleep until hearbeat timer elapses, - // or work arrives, and then iterate again. - // - long waitTime = dnConf.heartBeatInterval - - (System.currentTimeMillis() - lastHeartbeat); - synchronized(receivedAndDeletedBlockList) { - if (waitTime > 0 && pendingReceivedRequests == 0) { - try { - receivedAndDeletedBlockList.wait(waitTime); - } catch (InterruptedException ie) { - LOG.warn("BPOfferService for " + this + " interrupted"); - } - } - } // synchronized - } catch(RemoteException re) { - String reClass = re.getClassName(); - if (UnregisteredNodeException.class.getName().equals(reClass) || - DisallowedDatanodeException.class.getName().equals(reClass) || - IncorrectVersionException.class.getName().equals(reClass)) { - LOG.warn(this + " is shutting down", re); - shouldServiceRun = false; - return; - } - LOG.warn("RemoteException in offerService", re); - try { - long sleepTime = Math.min(1000, dnConf.heartBeatInterval); - Thread.sleep(sleepTime); - } catch (InterruptedException ie) { - Thread.currentThread().interrupt(); - } - } catch (IOException e) { - LOG.warn("IOException in offerService", e); - } - } // while (shouldRun()) - } // offerService - - /** - * Register one bp with the corresponding NameNode - *

    - * The bpDatanode needs to register with the namenode on startup in order - * 1) to report which storage it is serving now and - * 2) to receive a registrationID - * - * issued by the namenode to recognize registered datanodes. - * - * @see FSNamesystem#registerDatanode(DatanodeRegistration) - * @throws IOException - */ - void register() throws IOException { - Preconditions.checkState(bpNSInfo != null, - "register() should be called after handshake()"); + synchronized void startDistributedUpgradeIfNeeded() throws IOException { + UpgradeManagerDatanode um = getUpgradeManager(); - // The handshake() phase loaded the block pool storage - // off disk - so update the bpRegistration object from that info - bpRegistration = dn.createBPRegistration(bpNSInfo); - - LOG.info(this + " beginning handshake with NN"); - - while (shouldRun()) { - try { - // Use returned registration from namenode with updated machine name. - bpRegistration = bpNamenode.registerDatanode(bpRegistration, - new DatanodeStorage[0]); - break; - } catch(SocketTimeoutException e) { // namenode is busy - LOG.info("Problem connecting to server: " + nnAddr); - sleepAndLogInterrupts(1000, "connecting to server"); - } - } - - LOG.info("Block pool " + this + " successfully registered with NN"); - dn.bpRegistrationSucceeded(bpRegistration, getBlockPoolId()); - - // random short delay - helps scatter the BR from all DNs - scheduleBlockReport(dnConf.initialBlockReportDelay); + if(!um.getUpgradeState()) + return; + um.setUpgradeState(false, um.getUpgradeVersion()); + um.startUpgrade(); + return; } - - - private void sleepAndLogInterrupts(int millis, - String stateString) { - try { - Thread.sleep(millis); - } catch (InterruptedException ie) { - LOG.info("BPOfferService " + this + - " interrupted while " + stateString); - } + + DataNode getDataNode() { + return dn; } /** - * No matter what kind of exception we get, keep retrying to offerService(). - * That's the loop that connects to the NameNode and provides basic DataNode - * functionality. - * - * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can - * happen either at shutdown or due to refreshNamenodes. + * Called by the BPServiceActors when they handshake to a NN. + * If this is the first NN connection, this sets the namespace info + * for this BPOfferService. If it's a connection to a new NN, it + * verifies that this namespace matches (eg to prevent a misconfiguration + * where a StandbyNode from a different cluster is specified) */ - @Override - public void run() { - LOG.info(this + " starting to offer service"); - - try { - // init stuff - try { - // setup storage - connectToNNAndHandshake(); - } catch (IOException ioe) { - // Initial handshake, storage recovery or registration failed - // End BPOfferService thread - LOG.fatal("Initialization failed for block pool " + this, ioe); - return; - } - - initialized = true; // bp is initialized; + synchronized void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException { + if (this.bpNSInfo == null) { + this.bpNSInfo = nsInfo; - while (shouldRun()) { - try { - startDistributedUpgradeIfNeeded(); - offerService(); - } catch (Exception ex) { - LOG.error("Exception in BPOfferService for " + this, ex); - sleepAndLogInterrupts(5000, "offering service"); - } - } - } catch (Throwable ex) { - LOG.warn("Unexpected exception in block pool " + this, ex); - } finally { - LOG.warn("Ending block pool service for: " + this); - cleanUp(); + // Now that we know the namespace ID, etc, we can pass this to the DN. 
+ // The DN can now initialize its local storage if we are the + // first BP to handshake, etc. + dn.initBlockPool(this); + return; + } else { + checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(), + "Blockpool ID"); + checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(), + "Namespace ID"); + checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(), + "Cluster ID"); } } - private boolean shouldRun() { - return shouldServiceRun && dn.shouldRun(); - } - /** - * Process an array of datanode commands - * - * @param cmds an array of datanode commands - * @return true if further processing may be required or false otherwise. + * After one of the BPServiceActors registers successfully with the + * NN, it calls this function to verify that the NN it connected to + * is consistent with other NNs serving the block-pool. */ - private boolean processCommand(DatanodeCommand[] cmds) { - if (cmds != null) { - for (DatanodeCommand cmd : cmds) { - try { - if (processCommand(cmd) == false) { - return false; - } - } catch (IOException ioe) { - LOG.warn("Error processing datanode Command", ioe); - } + void registrationSucceeded(BPServiceActor bpServiceActor, + DatanodeRegistration reg) throws IOException { + if (bpRegistration != null) { + checkNSEquality(bpRegistration.storageInfo.getNamespaceID(), + reg.storageInfo.getNamespaceID(), "namespace ID"); + checkNSEquality(bpRegistration.storageInfo.getClusterID(), + reg.storageInfo.getClusterID(), "cluster ID"); + } else { + bpRegistration = reg; + } + + dn.bpRegistrationSucceeded(bpRegistration, getBlockPoolId()); + } + + /** + * Verify equality of two namespace-related fields, throwing + * an exception if they are unequal. + */ + private static void checkNSEquality( + Object ourID, Object theirID, + String idHelpText) throws IOException { + if (!ourID.equals(theirID)) { + throw new IOException(idHelpText + " mismatch: " + + "previously connected to " + idHelpText + " " + ourID + + " but now connected to " + idHelpText + " " + theirID); + } + } + + synchronized DatanodeRegistration createRegistration() { + Preconditions.checkState(bpNSInfo != null, + "getRegistration() can only be called after initial handshake"); + return dn.createBPRegistration(bpNSInfo); + } + + /** + * Called when an actor shuts down. If this is the last actor + * to shut down, shuts down the whole blockpool in the DN. + */ + synchronized void shutdownActor(BPServiceActor actor) { + if (bpServiceToActive == actor) { + bpServiceToActive = null; + } + + bpServices.remove(actor); + + if (bpServices.isEmpty()) { + dn.shutdownBlockPool(this); + + if(upgradeManager != null) + upgradeManager.shutdownUpgrade(); + } + } + + /** + * Called by the DN to report an error to the NNs. + */ + void trySendErrorReport(int errCode, String errMsg) { + for (BPServiceActor actor : bpServices) { + actor.trySendErrorReport(errCode, errMsg); + } + } + + /** + * Ask each of the actors to schedule a block report after + * the specified delay. + */ + void scheduleBlockReport(long delay) { + for (BPServiceActor actor : bpServices) { + actor.scheduleBlockReport(delay); + } + } + + /** + * Ask each of the actors to report a bad block hosted on another DN. 
+ */ + void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block) { + for (BPServiceActor actor : bpServices) { + try { + actor.reportRemoteBadBlock(dnInfo, block); + } catch (IOException e) { + LOG.warn("Couldn't report bad block " + block + " to " + actor, + e); } } - return true; + } + + /** + * @return a proxy to the active NN, or null if the BPOS has not + * acknowledged any NN as active yet. + */ + synchronized DatanodeProtocolClientSideTranslatorPB getActiveNN() { + if (bpServiceToActive != null) { + return bpServiceToActive.bpNamenode; + } else { + return null; + } + } + + @VisibleForTesting + synchronized List getBPServiceActors() { + return Lists.newArrayList(bpServices); + } + + /** + * Update the BPOS's view of which NN is active, based on a heartbeat + * response from one of the actors. + * + * @param actor the actor which received the heartbeat + * @param nnHaState the HA-related heartbeat contents + */ + synchronized void updateActorStatesFromHeartbeat( + BPServiceActor actor, + NNHAStatusHeartbeat nnHaState) { + final long txid = nnHaState.getTxId(); + + final boolean nnClaimsActive = + nnHaState.getState() == NNHAStatusHeartbeat.State.ACTIVE; + final boolean bposThinksActive = bpServiceToActive == actor; + final boolean isMoreRecentClaim = txid > lastActiveClaimTxId; + + if (nnClaimsActive && !bposThinksActive) { + LOG.info("Namenode " + actor + " trying to claim ACTIVE state with " + + "txid=" + txid); + if (!isMoreRecentClaim) { + // Split-brain scenario - an NN is trying to claim active + // state when a different NN has already claimed it with a higher + // txid. + LOG.warn("NN " + actor + " tried to claim ACTIVE state at txid=" + + txid + " but there was already a more recent claim at txid=" + + lastActiveClaimTxId); + return; + } else { + if (bpServiceToActive == null) { + LOG.info("Acknowledging ACTIVE Namenode " + actor); + } else { + LOG.info("Namenode " + actor + " taking over ACTIVE state from " + + bpServiceToActive + " at higher txid=" + txid); + } + bpServiceToActive = actor; + } + } else if (!nnClaimsActive && bposThinksActive) { + LOG.info("Namenode " + actor + " relinquishing ACTIVE state with " + + "txid=" + nnHaState.getTxId()); + bpServiceToActive = null; + } + + if (bpServiceToActive == actor) { + assert txid >= lastActiveClaimTxId; + lastActiveClaimTxId = txid; + } + } + + /** + * @return true if the given NN address is one of the NNs for this + * block pool + */ + boolean containsNN(InetSocketAddress addr) { + for (BPServiceActor actor : bpServices) { + if (actor.getNNSocketAddress().equals(addr)) { + return true; + } + } + return false; + } + + @VisibleForTesting + int countNameNodes() { + return bpServices.size(); + } + + /** + * Run an immediate block report on this thread. Used by tests. + */ + @VisibleForTesting + void triggerBlockReportForTests() throws IOException { + for (BPServiceActor actor : bpServices) { + actor.triggerBlockReportForTests(); + } + } + + /** + * Run an immediate deletion report on this thread. Used by tests. + */ + @VisibleForTesting + void triggerDeletionReportForTests() throws IOException { + for (BPServiceActor actor : bpServices) { + actor.triggerDeletionReportForTests(); + } + } + + /** + * Run an immediate heartbeat from all actors. Used by tests. 
+ */ + @VisibleForTesting + void triggerHeartbeatForTests() throws IOException { + for (BPServiceActor actor : bpServices) { + actor.triggerHeartbeatForTests(); + } + } + + synchronized boolean processCommandFromActor(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { + assert bpServices.contains(actor); + if (actor == bpServiceToActive) { + return processCommandFromActive(cmd, actor); + } else { + return processCommandFromStandby(cmd, actor); + } } /** @@ -677,7 +554,8 @@ class BPOfferService implements Runnable { * @return true if further processing may be required or false otherwise. * @throws IOException */ - private boolean processCommand(DatanodeCommand cmd) throws IOException { + private boolean processCommandFromActive(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { if (cmd == null) return true; final BlockCommand bcmd = @@ -708,19 +586,13 @@ class BPOfferService implements Runnable { dn.metrics.incrBlocksRemoved(toDelete.length); break; case DatanodeProtocol.DNA_SHUTDOWN: - // shut down the data node - shouldServiceRun = false; - return false; + // TODO: DNA_SHUTDOWN appears to be unused - the NN never sends this command + // See HDFS-2987. + throw new UnsupportedOperationException("Received unimplemented DNA_SHUTDOWN"); case DatanodeProtocol.DNA_REGISTER: // namenode requested a registration - at start or if NN lost contact LOG.info("DatanodeCommand action: DNA_REGISTER"); - if (shouldRun()) { - // re-retrieve namespace info to make sure that, if the NN - // was restarted, we still match its version (HDFS-2120) - retrieveNamespaceInfo(); - // and re-register - register(); - } + actor.reRegister(); break; case DatanodeProtocol.DNA_FINALIZE: String bp = ((FinalizeCommand) cmd).getBlockPoolId(); @@ -740,7 +612,8 @@ class BPOfferService implements Runnable { case DatanodeProtocol.DNA_ACCESSKEYUPDATE: LOG.info("DatanodeCommand action: DNA_ACCESSKEYUPDATE"); if (dn.isBlockTokenEnabled) { - dn.blockPoolTokenSecretManager.setKeys(getBlockPoolId(), + dn.blockPoolTokenSecretManager.setKeys( + getBlockPoolId(), ((KeyUpdateCommand) cmd).getExportedKeys()); } break; @@ -759,41 +632,29 @@ class BPOfferService implements Runnable { } return true; } - - private void processDistributedUpgradeCommand(UpgradeCommand comm) - throws IOException { - UpgradeManagerDatanode upgradeManager = getUpgradeManager(); - upgradeManager.processUpgradeCommand(comm); + + private boolean processCommandFromStandby(DatanodeCommand cmd, + BPServiceActor actor) throws IOException { + if (cmd == null) + return true; + switch(cmd.getAction()) { + case DatanodeProtocol.DNA_REGISTER: + // namenode requested a registration - at start or if NN lost contact + LOG.info("DatanodeCommand action: DNA_REGISTER"); + actor.reRegister(); + return true; + case DatanodeProtocol.DNA_TRANSFER: + case DatanodeProtocol.DNA_INVALIDATE: + case DatanodeProtocol.DNA_SHUTDOWN: + case DatanodeProtocol.DNA_RECOVERBLOCK: + case DatanodeProtocol.DNA_ACCESSKEYUPDATE: + case DatanodeProtocol.DNA_BALANCERBANDWIDTHUPDATE: + LOG.warn("Got a command from standby NN - ignoring command:" + cmd.getAction()); + return true; + default: + LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction()); + } + return true; } - synchronized UpgradeManagerDatanode getUpgradeManager() { - if(upgradeManager == null) - upgradeManager = - new UpgradeManagerDatanode(dn, getBlockPoolId()); - - return upgradeManager; - } - - /** - * Start distributed upgrade if it should be initiated by the data-node. 
- */ - private void startDistributedUpgradeIfNeeded() throws IOException { - UpgradeManagerDatanode um = getUpgradeManager(); - - if(!um.getUpgradeState()) - return; - um.setUpgradeState(false, um.getUpgradeVersion()); - um.startUpgrade(); - return; - } - - @VisibleForTesting - DatanodeProtocolClientSideTranslatorPB getBpNamenode() { - return bpNamenode; - } - - @VisibleForTesting - void setBpNamenode(DatanodeProtocolClientSideTranslatorPB bpNamenode) { - this.bpNamenode = bpNamenode; - } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java new file mode 100644 index 0000000000..75f32cbc04 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java @@ -0,0 +1,730 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import static org.apache.hadoop.hdfs.server.common.Util.now; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.SocketTimeoutException; +import java.net.URI; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.common.IncorrectVersionException; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; +import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; +import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; +import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; +import org.apache.hadoop.hdfs.server.protocol.StorageReport; +import org.apache.hadoop.io.IOUtils; +import 
org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.util.StringUtils; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Maps; + +/** + * A thread per active or standby namenode to perform: + *

<ul>
 + * <li> Pre-registration handshake with namenode</li>
 + * <li> Registration with namenode</li>
 + * <li> Send periodic heartbeats to the namenode</li>
 + * <li> Handle commands received from the namenode</li>
 + * </ul>
    + */ +@InterfaceAudience.Private +class BPServiceActor implements Runnable { + + static final Log LOG = DataNode.LOG; + final InetSocketAddress nnAddr; + + BPOfferService bpos; + + long lastBlockReport = 0; + long lastDeletedReport = 0; + + boolean resetBlockReportTime = true; + + Thread bpThread; + DatanodeProtocolClientSideTranslatorPB bpNamenode; + private long lastHeartbeat = 0; + private volatile boolean initialized = false; + + /** + * Between block reports (which happen on the order of once an hour) the + * DN reports smaller incremental changes to its block list. This map, + * keyed by block ID, contains the pending changes which have yet to be + * reported to the NN. Access should be synchronized on this object. + */ + private final Map pendingIncrementalBR + = Maps.newHashMap(); + + private volatile int pendingReceivedRequests = 0; + private volatile boolean shouldServiceRun = true; + private final DataNode dn; + private final DNConf dnConf; + + private DatanodeRegistration bpRegistration; + + BPServiceActor(InetSocketAddress nnAddr, BPOfferService bpos) { + this.bpos = bpos; + this.dn = bpos.getDataNode(); + this.nnAddr = nnAddr; + this.dnConf = dn.getDnConf(); + } + + /** + * returns true if BP thread has completed initialization of storage + * and has registered with the corresponding namenode + * @return true if initialized + */ + boolean isInitialized() { + return initialized; + } + + boolean isAlive() { + return shouldServiceRun && bpThread.isAlive(); + } + + @Override + public String toString() { + return bpos.toString() + " service to " + nnAddr; + } + + InetSocketAddress getNNSocketAddress() { + return nnAddr; + } + + /** + * Used to inject a spy NN in the unit tests. + */ + @VisibleForTesting + void setNameNode(DatanodeProtocolClientSideTranslatorPB dnProtocol) { + bpNamenode = dnProtocol; + } + + @VisibleForTesting + DatanodeProtocolClientSideTranslatorPB getNameNodeProxy() { + return bpNamenode; + } + + /** + * Perform the first part of the handshake with the NameNode. + * This calls versionRequest to determine the NN's + * namespace and version info. It automatically retries until + * the NN responds or the DN is shutting down. + * + * @return the NamespaceInfo + */ + @VisibleForTesting + NamespaceInfo retrieveNamespaceInfo() throws IOException { + NamespaceInfo nsInfo = null; + while (shouldRun()) { + try { + nsInfo = bpNamenode.versionRequest(); + LOG.debug(this + " received versionRequest response: " + nsInfo); + break; + } catch(SocketTimeoutException e) { // namenode is busy + LOG.warn("Problem connecting to server: " + nnAddr); + } catch(IOException e ) { // namenode is not available + LOG.warn("Problem connecting to server: " + nnAddr); + } + + // try again in a second + sleepAndLogInterrupts(5000, "requesting version info from NN"); + } + + if (nsInfo != null) { + checkNNVersion(nsInfo); + } else { + throw new IOException("DN shut down before block pool connected"); + } + return nsInfo; + } + + private void checkNNVersion(NamespaceInfo nsInfo) + throws IncorrectVersionException { + // build and layout versions should match + String nsBuildVer = nsInfo.getBuildVersion(); + String stBuildVer = Storage.getBuildVersion(); + if (!nsBuildVer.equals(stBuildVer)) { + LOG.warn("Data-node and name-node Build versions must be the same. 
" + + "Namenode build version: " + nsBuildVer + "Datanode " + + "build version: " + stBuildVer); + throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer); + } + + if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) { + LOG.warn("Data-node and name-node layout versions must be the same." + + " Expected: "+ HdfsConstants.LAYOUT_VERSION + + " actual "+ nsInfo.getLayoutVersion()); + throw new IncorrectVersionException( + nsInfo.getLayoutVersion(), "namenode"); + } + } + + private void connectToNNAndHandshake() throws IOException { + // get NN proxy + bpNamenode = dn.connectToNN(nnAddr); + + // First phase of the handshake with NN - get the namespace + // info. + NamespaceInfo nsInfo = retrieveNamespaceInfo(); + + // Verify that this matches the other NN in this HA pair. + // This also initializes our block pool in the DN if we are + // the first NN connection for this BP. + bpos.verifyAndSetNamespaceInfo(nsInfo); + + // Second phase of the handshake with the NN. + register(); + } + + /** + * This methods arranges for the data node to send the block report at + * the next heartbeat. + */ + void scheduleBlockReport(long delay) { + if (delay > 0) { // send BR after random delay + lastBlockReport = System.currentTimeMillis() + - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay))); + } else { // send at next heartbeat + lastBlockReport = lastHeartbeat - dnConf.blockReportInterval; + } + resetBlockReportTime = true; // reset future BRs for randomness + } + + void reportBadBlocks(ExtendedBlock block) { + DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) }; + LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) }; + + try { + bpNamenode.reportBadBlocks(blocks); + } catch (IOException e){ + /* One common reason is that NameNode could be in safe mode. + * Should we keep on retrying in that case? + */ + LOG.warn("Failed to report bad block " + block + " to namenode : " + + " Exception", e); + } + } + + /** + * Report received blocks and delete hints to the Namenode + * + * @throws IOException + */ + private void reportReceivedDeletedBlocks() throws IOException { + + // check if there are newly received blocks + ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null; + synchronized (pendingIncrementalBR) { + int numBlocks = pendingIncrementalBR.size(); + if (numBlocks > 0) { + // + // Send newly-received and deleted blockids to namenode + // + receivedAndDeletedBlockArray = pendingIncrementalBR + .values().toArray(new ReceivedDeletedBlockInfo[numBlocks]); + } + pendingIncrementalBR.clear(); + } + if (receivedAndDeletedBlockArray != null) { + StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( + bpRegistration.getStorageID(), receivedAndDeletedBlockArray) }; + boolean success = false; + try { + bpNamenode.blockReceivedAndDeleted(bpRegistration, bpos.getBlockPoolId(), + report); + success = true; + } finally { + synchronized (pendingIncrementalBR) { + if (!success) { + // If we didn't succeed in sending the report, put all of the + // blocks back onto our queue, but only in the case where we didn't + // put something newer in the meantime. + for (ReceivedDeletedBlockInfo rdbi : receivedAndDeletedBlockArray) { + if (!pendingIncrementalBR.containsKey(rdbi.getBlock().getBlockId())) { + pendingIncrementalBR.put(rdbi.getBlock().getBlockId(), rdbi); + } + } + } + pendingReceivedRequests = pendingIncrementalBR.size(); + } + } + } + } + + /* + * Informing the name node could take a long long time! 
Should we wait + * till namenode is informed before responding with success to the + * client? For now we don't. + */ + void notifyNamenodeBlockImmediately(ReceivedDeletedBlockInfo bInfo) { + synchronized (pendingIncrementalBR) { + pendingIncrementalBR.put( + bInfo.getBlock().getBlockId(), bInfo); + pendingReceivedRequests++; + pendingIncrementalBR.notifyAll(); + } + } + + void notifyNamenodeDeletedBlock(ReceivedDeletedBlockInfo bInfo) { + synchronized (pendingIncrementalBR) { + pendingIncrementalBR.put( + bInfo.getBlock().getBlockId(), bInfo); + } + } + + /** + * Run an immediate block report on this thread. Used by tests. + */ + @VisibleForTesting + void triggerBlockReportForTests() throws IOException { + synchronized (pendingIncrementalBR) { + lastBlockReport = 0; + lastHeartbeat = 0; + pendingIncrementalBR.notifyAll(); + while (lastBlockReport == 0) { + try { + pendingIncrementalBR.wait(100); + } catch (InterruptedException e) { + return; + } + } + } + } + + @VisibleForTesting + void triggerHeartbeatForTests() throws IOException { + synchronized (pendingIncrementalBR) { + lastHeartbeat = 0; + pendingIncrementalBR.notifyAll(); + while (lastHeartbeat == 0) { + try { + pendingIncrementalBR.wait(100); + } catch (InterruptedException e) { + return; + } + } + } + } + + @VisibleForTesting + void triggerDeletionReportForTests() throws IOException { + synchronized (pendingIncrementalBR) { + lastDeletedReport = 0; + pendingIncrementalBR.notifyAll(); + + while (lastDeletedReport == 0) { + try { + pendingIncrementalBR.wait(100); + } catch (InterruptedException e) { + return; + } + } + } + } + + /** + * Report the list blocks to the Namenode + * @throws IOException + */ + DatanodeCommand blockReport() throws IOException { + // send block report if timer has expired. + DatanodeCommand cmd = null; + long startTime = now(); + if (startTime - lastBlockReport > dnConf.blockReportInterval) { + + // Flush any block information that precedes the block report. Otherwise + // we have a chance that we will miss the delHint information + // or we will report an RBW replica after the BlockReport already reports + // a FINALIZED one. + reportReceivedDeletedBlocks(); + + // Create block report + long brCreateStartTime = now(); + BlockListAsLongs bReport = dn.getFSDataset().getBlockReport( + bpos.getBlockPoolId()); + + // Send block report + long brSendStartTime = now(); + StorageBlockReport[] report = { new StorageBlockReport( + bpRegistration.getStorageID(), bReport.getBlockListAsLongs()) }; + cmd = bpNamenode.blockReport(bpRegistration, bpos.getBlockPoolId(), report); + + // Log the block report processing stats from Datanode perspective + long brSendCost = now() - brSendStartTime; + long brCreateCost = brSendStartTime - brCreateStartTime; + dn.getMetrics().addBlockReport(brSendCost); + LOG.info("BlockReport of " + bReport.getNumberOfBlocks() + + " blocks took " + brCreateCost + " msec to generate and " + + brSendCost + " msecs for RPC and NN processing"); + + // If we have sent the first block report, then wait a random + // time before we start the periodic block reports. + if (resetBlockReportTime) { + lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval)); + resetBlockReportTime = false; + } else { + /* say the last block report was at 8:20:14. The current report + * should have started around 9:20:14 (default 1 hour interval). 
+ * If current time is : + * 1) normal like 9:20:18, next report should be at 10:20:14 + * 2) unexpected like 11:35:43, next report should be at 12:20:14 + */ + lastBlockReport += (now() - lastBlockReport) / + dnConf.blockReportInterval * dnConf.blockReportInterval; + } + LOG.info("sent block report, processed command:" + cmd); + } + return cmd; + } + + + HeartbeatResponse sendHeartBeat() throws IOException { + LOG.info("heartbeat: " + this); + // reports number of failed volumes + StorageReport[] report = { new StorageReport(bpRegistration.getStorageID(), + false, + dn.getFSDataset().getCapacity(), + dn.getFSDataset().getDfsUsed(), + dn.getFSDataset().getRemaining(), + dn.getFSDataset().getBlockPoolUsed(bpos.getBlockPoolId())) }; + return bpNamenode.sendHeartbeat(bpRegistration, report, + dn.getXmitsInProgress(), + dn.getXceiverCount(), + dn.getFSDataset().getNumFailedVolumes()); + } + + //This must be called only by BPOfferService + void start() { + if ((bpThread != null) && (bpThread.isAlive())) { + //Thread is started already + return; + } + bpThread = new Thread(this, formatThreadName()); + bpThread.setDaemon(true); // needed for JUnit testing + bpThread.start(); + } + + private String formatThreadName() { + Collection dataDirs = DataNode.getStorageDirs(dn.getConf()); + return "DataNode: [" + + StringUtils.uriToString(dataDirs.toArray(new URI[0])) + "] " + + " heartbeating to " + nnAddr; + } + + //This must be called only by blockPoolManager. + void stop() { + shouldServiceRun = false; + if (bpThread != null) { + bpThread.interrupt(); + } + } + + //This must be called only by blockPoolManager + void join() { + try { + if (bpThread != null) { + bpThread.join(); + } + } catch (InterruptedException ie) { } + } + + //Cleanup method to be called by current thread before exiting. + private synchronized void cleanUp() { + + shouldServiceRun = false; + IOUtils.cleanup(LOG, bpNamenode); + bpos.shutdownActor(this); + } + + /** + * Main loop for each BP thread. Run until shutdown, + * forever calling remote NameNode functions. + */ + private void offerService() throws Exception { + LOG.info("For namenode " + nnAddr + " using DELETEREPORT_INTERVAL of " + + dnConf.deleteReportInterval + " msec " + " BLOCKREPORT_INTERVAL of " + + dnConf.blockReportInterval + "msec" + " Initial delay: " + + dnConf.initialBlockReportDelay + "msec" + "; heartBeatInterval=" + + dnConf.heartBeatInterval); + + // + // Now loop for a long time.... + // + while (shouldRun()) { + try { + long startTime = now(); + + // + // Every so often, send heartbeat or block-report + // + if (startTime - lastHeartbeat > dnConf.heartBeatInterval) { + // + // All heartbeat messages include following info: + // -- Datanode name + // -- data transfer port + // -- Total capacity + // -- Bytes remaining + // + lastHeartbeat = startTime; + if (!dn.areHeartbeatsDisabledForTests()) { + HeartbeatResponse resp = sendHeartBeat(); + assert resp != null; + dn.getMetrics().addHeartbeat(now() - startTime); + + // If the state of this NN has changed (eg STANDBY->ACTIVE) + // then let the BPOfferService update itself. + // + // Important that this happens before processCommand below, + // since the first heartbeat to a new active might have commands + // that we should actually process. 
+ bpos.updateActorStatesFromHeartbeat( + this, resp.getNameNodeHaState()); + + long startProcessCommands = now(); + if (!processCommand(resp.getCommands())) + continue; + long endProcessCommands = now(); + if (endProcessCommands - startProcessCommands > 2000) { + LOG.info("Took " + (endProcessCommands - startProcessCommands) + + "ms to process " + resp.getCommands().length + + " commands from NN"); + } + } + } + if (pendingReceivedRequests > 0 + || (startTime - lastDeletedReport > dnConf.deleteReportInterval)) { + reportReceivedDeletedBlocks(); + lastDeletedReport = startTime; + } + + DatanodeCommand cmd = blockReport(); + processCommand(new DatanodeCommand[]{ cmd }); + + // Now safe to start scanning the block pool. + // If it has already been started, this is a no-op. + if (dn.blockScanner != null) { + dn.blockScanner.addBlockPool(bpos.getBlockPoolId()); + } + + // + // There is no work to do; sleep until hearbeat timer elapses, + // or work arrives, and then iterate again. + // + long waitTime = dnConf.heartBeatInterval - + (System.currentTimeMillis() - lastHeartbeat); + synchronized(pendingIncrementalBR) { + if (waitTime > 0 && pendingReceivedRequests == 0) { + try { + pendingIncrementalBR.wait(waitTime); + } catch (InterruptedException ie) { + LOG.warn("BPOfferService for " + this + " interrupted"); + } + } + } // synchronized + } catch(RemoteException re) { + String reClass = re.getClassName(); + if (UnregisteredNodeException.class.getName().equals(reClass) || + DisallowedDatanodeException.class.getName().equals(reClass) || + IncorrectVersionException.class.getName().equals(reClass)) { + LOG.warn(this + " is shutting down", re); + shouldServiceRun = false; + return; + } + LOG.warn("RemoteException in offerService", re); + try { + long sleepTime = Math.min(1000, dnConf.heartBeatInterval); + Thread.sleep(sleepTime); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } + } catch (IOException e) { + LOG.warn("IOException in offerService", e); + } + } // while (shouldRun()) + } // offerService + + /** + * Register one bp with the corresponding NameNode + *

    + * The bpDatanode needs to register with the namenode on startup in order + * 1) to report which storage it is serving now and + * 2) to receive a registrationID + * + * issued by the namenode to recognize registered datanodes. + * + * @see FSNamesystem#registerDatanode(DatanodeRegistration) + * @throws IOException + */ + void register() throws IOException { + // The handshake() phase loaded the block pool storage + // off disk - so update the bpRegistration object from that info + bpRegistration = bpos.createRegistration(); + + LOG.info(this + " beginning handshake with NN"); + + while (shouldRun()) { + try { + // Use returned registration from namenode with updated machine name. + bpRegistration = bpNamenode.registerDatanode(bpRegistration, + new DatanodeStorage[0]); + break; + } catch(SocketTimeoutException e) { // namenode is busy + LOG.info("Problem connecting to server: " + nnAddr); + sleepAndLogInterrupts(1000, "connecting to server"); + } + } + + LOG.info("Block pool " + this + " successfully registered with NN"); + bpos.registrationSucceeded(this, bpRegistration); + + // random short delay - helps scatter the BR from all DNs + scheduleBlockReport(dnConf.initialBlockReportDelay); + } + + + private void sleepAndLogInterrupts(int millis, + String stateString) { + try { + Thread.sleep(millis); + } catch (InterruptedException ie) { + LOG.info("BPOfferService " + this + + " interrupted while " + stateString); + } + } + + /** + * No matter what kind of exception we get, keep retrying to offerService(). + * That's the loop that connects to the NameNode and provides basic DataNode + * functionality. + * + * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can + * happen either at shutdown or due to refreshNamenodes. + */ + @Override + public void run() { + LOG.info(this + " starting to offer service"); + + try { + // init stuff + try { + // setup storage + connectToNNAndHandshake(); + } catch (IOException ioe) { + // Initial handshake, storage recovery or registration failed + // End BPOfferService thread + LOG.fatal("Initialization failed for block pool " + this, ioe); + return; + } + + initialized = true; // bp is initialized; + + while (shouldRun()) { + try { + bpos.startDistributedUpgradeIfNeeded(); + offerService(); + } catch (Exception ex) { + LOG.error("Exception in BPOfferService for " + this, ex); + sleepAndLogInterrupts(5000, "offering service"); + } + } + } catch (Throwable ex) { + LOG.warn("Unexpected exception in block pool " + this, ex); + } finally { + LOG.warn("Ending block pool service for: " + this); + cleanUp(); + } + } + + private boolean shouldRun() { + return shouldServiceRun && dn.shouldRun(); + } + + /** + * Process an array of datanode commands + * + * @param cmds an array of datanode commands + * @return true if further processing may be required or false otherwise. + */ + boolean processCommand(DatanodeCommand[] cmds) { + if (cmds != null) { + for (DatanodeCommand cmd : cmds) { + try { + if (bpos.processCommandFromActor(cmd, this) == false) { + return false; + } + } catch (IOException ioe) { + LOG.warn("Error processing datanode Command", ioe); + } + } + } + return true; + } + + void trySendErrorReport(int errCode, String errMsg) { + try { + bpNamenode.errorReport(bpRegistration, errCode, errMsg); + } catch(IOException e) { + LOG.warn("Error reporting an error to NameNode " + nnAddr, + e); + } + } + + /** + * Report a bad block from another DN in this cluster. 
+ */ + void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block) + throws IOException { + LocatedBlock lb = new LocatedBlock(block, + new DatanodeInfo[] {dnInfo}); + bpNamenode.reportBadBlocks(new LocatedBlock[] {lb}); + } + + void reRegister() throws IOException { + if (shouldRun()) { + // re-retrieve namespace info to make sure that, if the NN + // was restarted, we still match its version (HDFS-2120) + retrieveNamespaceInfo(); + // and re-register + register(); + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java new file mode 100644 index 0000000000..3355ee269a --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java @@ -0,0 +1,241 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.security.PrivilegedExceptionAction; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.security.UserGroupInformation; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +/** + * Manages the BPOfferService objects for the data node. + * Creation, removal, starting, stopping, shutdown on BPOfferService + * objects must be done via APIs in this class. 
+ */ +@InterfaceAudience.Private +class BlockPoolManager { + private static final Log LOG = DataNode.LOG; + + private final Map bpByNameserviceId = + Maps.newHashMap(); + private final Map bpByBlockPoolId = + Maps.newHashMap(); + private final List offerServices = + Lists.newArrayList(); + + private final DataNode dn; + + //This lock is used only to ensure exclusion of refreshNamenodes + private final Object refreshNamenodesLock = new Object(); + + BlockPoolManager(DataNode dn) { + this.dn = dn; + } + + synchronized void addBlockPool(BPOfferService bpos) { + Preconditions.checkArgument(offerServices.contains(bpos), + "Unknown BPOS: %s", bpos); + if (bpos.getBlockPoolId() == null) { + throw new IllegalArgumentException("Null blockpool id"); + } + bpByBlockPoolId.put(bpos.getBlockPoolId(), bpos); + } + + /** + * Returns the array of BPOfferService objects. + * Caution: The BPOfferService returned could be shutdown any time. + */ + synchronized BPOfferService[] getAllNamenodeThreads() { + BPOfferService[] bposArray = new BPOfferService[offerServices.size()]; + return offerServices.toArray(bposArray); + } + + synchronized BPOfferService get(String bpid) { + return bpByBlockPoolId.get(bpid); + } + + synchronized void remove(BPOfferService t) { + offerServices.remove(t); + bpByBlockPoolId.remove(t.getBlockPoolId()); + + boolean removed = false; + for (Iterator it = bpByNameserviceId.values().iterator(); + it.hasNext() && !removed;) { + BPOfferService bpos = it.next(); + if (bpos == t) { + it.remove(); + LOG.info("Removed " + bpos); + removed = true; + } + } + + if (!removed) { + LOG.warn("Couldn't remove BPOS " + t + " from bpByNameserviceId map"); + } + } + + void shutDownAll() throws InterruptedException { + BPOfferService[] bposArray = this.getAllNamenodeThreads(); + + for (BPOfferService bpos : bposArray) { + bpos.stop(); //interrupts the threads + } + //now join + for (BPOfferService bpos : bposArray) { + bpos.join(); + } + } + + synchronized void startAll() throws IOException { + try { + UserGroupInformation.getLoginUser().doAs( + new PrivilegedExceptionAction() { + public Object run() throws Exception { + for (BPOfferService bpos : offerServices) { + bpos.start(); + } + return null; + } + }); + } catch (InterruptedException ex) { + IOException ioe = new IOException(); + ioe.initCause(ex.getCause()); + throw ioe; + } + } + + void joinAll() { + for (BPOfferService bpos: this.getAllNamenodeThreads()) { + bpos.join(); + } + } + + void refreshNamenodes(Configuration conf) + throws IOException { + LOG.info("Refresh request received for nameservices: " + + conf.get(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES)); + + Map> newAddressMap = + DFSUtil.getNNServiceRpcAddresses(conf); + + synchronized (refreshNamenodesLock) { + doRefreshNamenodes(newAddressMap); + } + } + + private void doRefreshNamenodes( + Map> addrMap) throws IOException { + assert Thread.holdsLock(refreshNamenodesLock); + + Set toRefresh = Sets.newHashSet(); + Set toAdd = Sets.newHashSet(); + Set toRemove; + + synchronized (this) { + // Step 1. For each of the new nameservices, figure out whether + // it's an update of the set of NNs for an existing NS, + // or an entirely new nameservice. + for (String nameserviceId : addrMap.keySet()) { + if (bpByNameserviceId.containsKey(nameserviceId)) { + toRefresh.add(nameserviceId); + } else { + toAdd.add(nameserviceId); + } + } + + // Step 2. Any nameservices we currently have but are no longer present + // need to be removed. 
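Before the Sets.difference call that follows, it may help to see the whole toAdd / toRemove / toRefresh bookkeeping in one place. This is a self-contained sketch using plain java.util collections; the class and variable names are hypothetical, and the assert mirrors the invariant checked a few lines below.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical illustration of Steps 1-2 of doRefreshNamenodes().
public class RefreshPlan {
  final Set<String> toAdd = new HashSet<String>();      // brand-new nameservices
  final Set<String> toRemove = new HashSet<String>();   // no longer configured
  final Set<String> toRefresh = new HashSet<String>();  // existing, NN list may change

  RefreshPlan(Set<String> current, Set<String> refreshed) {
    for (String ns : refreshed) {
      if (current.contains(ns)) {
        toRefresh.add(ns);
      } else {
        toAdd.add(ns);
      }
    }
    toRemove.addAll(current);
    toRemove.removeAll(refreshed);
    // Every new nameservice is either added or refreshed, never both.
    assert toAdd.size() + toRefresh.size() == refreshed.size();
  }

  public static void main(String[] args) {
    RefreshPlan p = new RefreshPlan(
        new HashSet<String>(Arrays.asList("ns1", "ns2")),
        new HashSet<String>(Arrays.asList("ns2", "ns3")));
    // add=[ns3] remove=[ns1] refresh=[ns2]
    System.out.println("add=" + p.toAdd + " remove=" + p.toRemove
        + " refresh=" + p.toRefresh);
  }
}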
+ toRemove = Sets.newHashSet(Sets.difference( + bpByNameserviceId.keySet(), addrMap.keySet())); + + assert toRefresh.size() + toAdd.size() == + addrMap.size() : + "toAdd: " + Joiner.on(",").useForNull("").join(toAdd) + + " toRemove: " + Joiner.on(",").useForNull("").join(toRemove) + + " toRefresh: " + Joiner.on(",").useForNull("").join(toRefresh); + + + // Step 3. Start new nameservices + if (!toAdd.isEmpty()) { + LOG.info("Starting BPOfferServices for nameservices: " + + Joiner.on(",").useForNull("").join(toAdd)); + + for (String nsToAdd : toAdd) { + ArrayList addrs = + Lists.newArrayList(addrMap.get(nsToAdd).values()); + BPOfferService bpos = createBPOS(addrs); + bpByNameserviceId.put(nsToAdd, bpos); + offerServices.add(bpos); + } + } + startAll(); + } + + // Step 4. Shut down old nameservices. This happens outside + // of the synchronized(this) lock since they need to call + // back to .remove() from another thread + if (!toRemove.isEmpty()) { + LOG.info("Stopping BPOfferServices for nameservices: " + + Joiner.on(",").useForNull("").join(toRemove)); + + for (String nsToRemove : toRemove) { + BPOfferService bpos = bpByNameserviceId.get(nsToRemove); + bpos.stop(); + bpos.join(); + // they will call remove on their own + } + } + + // Step 5. Update nameservices whose NN list has changed + if (!toRefresh.isEmpty()) { + LOG.info("Refreshing list of NNs for nameservices: " + + Joiner.on(",").useForNull("").join(toRefresh)); + + for (String nsToRefresh : toRefresh) { + BPOfferService bpos = bpByNameserviceId.get(nsToRefresh); + ArrayList addrs = + Lists.newArrayList(addrMap.get(nsToRefresh).values()); + bpos.refreshNNList(addrs); + } + } + } + + /** + * Extracted out for test purposes. + */ + protected BPOfferService createBPOS(List nnAddrs) { + return new BPOfferService(nnAddrs, dn); + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java index 153fd93854..fd25c1df37 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java @@ -153,6 +153,7 @@ class BlockReceiver implements Closeable { switch (stage) { case PIPELINE_SETUP_CREATE: replicaInfo = datanode.data.createRbw(block); + datanode.notifyNamenodeReceivingBlock(block); break; case PIPELINE_SETUP_STREAMING_RECOVERY: replicaInfo = datanode.data.recoverRbw( @@ -166,6 +167,7 @@ class BlockReceiver implements Closeable { block.getLocalBlock()); } block.setGenerationStamp(newGs); + datanode.notifyNamenodeReceivingBlock(block); break; case PIPELINE_SETUP_APPEND_RECOVERY: replicaInfo = datanode.data.recoverAppend(block, newGs, minBytesRcvd); @@ -174,6 +176,7 @@ class BlockReceiver implements Closeable { block.getLocalBlock()); } block.setGenerationStamp(newGs); + datanode.notifyNamenodeReceivingBlock(block); break; case TRANSFER_RBW: case TRANSFER_FINALIZED: @@ -320,7 +323,6 @@ class BlockReceiver implements Closeable { private void verifyChunks( byte[] dataBuf, int dataOff, int len, byte[] checksumBuf, int checksumOff ) throws IOException { - DatanodeProtocol nn = datanode.getBPNamenode(block.getBlockPoolId()); while (len > 0) { int chunkLen = Math.min(len, bytesPerChecksum); @@ -331,9 +333,7 @@ class BlockReceiver implements Closeable { try { LOG.info("report corrupt block " + 
block + " from datanode " + srcDataNode + " to namenode"); - LocatedBlock lb = new LocatedBlock(block, - new DatanodeInfo[] {srcDataNode}); - nn.reportBadBlocks(new LocatedBlock[] {lb}); + datanode.reportRemoteBadBlock(srcDataNode, block); } catch (IOException e) { LOG.warn("Failed to report bad block " + block + " from datanode " + srcDataNode + " to namenode"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 5681525724..098809cb3a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -45,7 +45,6 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOUR import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_STARTUP_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_USER_NAME_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICES; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HTTPS_ENABLE_KEY; import java.io.BufferedOutputStream; @@ -86,6 +85,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; import org.apache.hadoop.hdfs.HDFSPolicyProvider; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.protocol.Block; @@ -164,6 +164,8 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.VersionInfo; import org.mortbay.util.ajax.JSON; +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; import com.google.protobuf.BlockingService; @@ -230,143 +232,6 @@ public class DataNode extends Configured return NetUtils.createSocketAddr(target); } - /** - * Manages he BPOfferService objects for the data node. - * Creation, removal, starting, stopping, shutdown on BPOfferService - * objects must be done via APIs in this class. - */ - @InterfaceAudience.Private - class BlockPoolManager { - private final Map bpMapping; - private final Map nameNodeThreads; - - //This lock is used only to ensure exclusion of refreshNamenodes - private final Object refreshNamenodesLock = new Object(); - - BlockPoolManager(Configuration conf) - throws IOException { - bpMapping = new HashMap(); - nameNodeThreads = new HashMap(); - - List isas = DFSUtil.getNNServiceRpcAddresses(conf); - for(InetSocketAddress isa : isas) { - BPOfferService bpos = new BPOfferService(isa, DataNode.this); - nameNodeThreads.put(bpos.getNNSocketAddress(), bpos); - } - } - - synchronized void addBlockPool(BPOfferService t) { - if (nameNodeThreads.get(t.getNNSocketAddress()) == null) { - throw new IllegalArgumentException( - "Unknown BPOfferService thread for namenode address:" - + t.getNNSocketAddress()); - } - if (t.getBlockPoolId() == null) { - throw new IllegalArgumentException("Null blockpool id"); - } - bpMapping.put(t.getBlockPoolId(), t); - } - - /** - * Returns the array of BPOfferService objects. - * Caution: The BPOfferService returned could be shutdown any time. 
- */ - synchronized BPOfferService[] getAllNamenodeThreads() { - BPOfferService[] bposArray = new BPOfferService[nameNodeThreads.values() - .size()]; - return nameNodeThreads.values().toArray(bposArray); - } - - synchronized BPOfferService get(InetSocketAddress addr) { - return nameNodeThreads.get(addr); - } - - synchronized BPOfferService get(String bpid) { - return bpMapping.get(bpid); - } - - synchronized void remove(BPOfferService t) { - nameNodeThreads.remove(t.getNNSocketAddress()); - bpMapping.remove(t.getBlockPoolId()); - } - - void shutDownAll() throws InterruptedException { - BPOfferService[] bposArray = this.getAllNamenodeThreads(); - - for (BPOfferService bpos : bposArray) { - bpos.stop(); //interrupts the threads - } - //now join - for (BPOfferService bpos : bposArray) { - bpos.join(); - } - } - - synchronized void startAll() throws IOException { - try { - UserGroupInformation.getLoginUser().doAs( - new PrivilegedExceptionAction() { - public Object run() throws Exception { - for (BPOfferService bpos : nameNodeThreads.values()) { - bpos.start(); - } - return null; - } - }); - } catch (InterruptedException ex) { - IOException ioe = new IOException(); - ioe.initCause(ex.getCause()); - throw ioe; - } - } - - void joinAll() { - for (BPOfferService bpos: this.getAllNamenodeThreads()) { - bpos.join(); - } - } - - void refreshNamenodes(Configuration conf) - throws IOException { - LOG.info("Refresh request received for nameservices: " - + conf.get(DFS_FEDERATION_NAMESERVICES)); - List newAddresses = - DFSUtil.getNNServiceRpcAddresses(conf); - List toShutdown = new ArrayList(); - List toStart = new ArrayList(); - synchronized (refreshNamenodesLock) { - synchronized (this) { - for (InetSocketAddress nnaddr : nameNodeThreads.keySet()) { - if (!(newAddresses.contains(nnaddr))) { - toShutdown.add(nameNodeThreads.get(nnaddr)); - } - } - for (InetSocketAddress nnaddr : newAddresses) { - if (!(nameNodeThreads.containsKey(nnaddr))) { - toStart.add(nnaddr); - } - } - - for (InetSocketAddress nnaddr : toStart) { - BPOfferService bpos = new BPOfferService(nnaddr, DataNode.this); - nameNodeThreads.put(bpos.getNNSocketAddress(), bpos); - } - } - - for (BPOfferService bpos : toShutdown) { - bpos.stop(); - bpos.join(); - } - - // stoping the BPOSes causes them to call remove() on their own when they - // clean up. - - // Now start the threads that are not already running. 
- startAll(); - } - } - } - volatile boolean shouldRun = true; private BlockPoolManager blockPoolManager; public volatile FSDatasetInterface data = null; @@ -653,7 +518,18 @@ public class DataNode extends Configured if(bpos != null) { bpos.notifyNamenodeReceivedBlock(block, delHint); } else { - LOG.warn("Cannot find BPOfferService for reporting block received for bpid=" + LOG.error("Cannot find BPOfferService for reporting block received for bpid=" + + block.getBlockPoolId()); + } + } + + // calls specific to BP + protected void notifyNamenodeReceivingBlock(ExtendedBlock block) { + BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId()); + if(bpos != null) { + bpos.notifyNamenodeReceivingBlock(block); + } else { + LOG.error("Cannot find BPOfferService for reporting block receiving for bpid=" + block.getBlockPoolId()); } } @@ -664,18 +540,66 @@ public class DataNode extends Configured if (bpos != null) { bpos.notifyNamenodeDeletedBlock(block); } else { - LOG.warn("Cannot find BPOfferService for reporting block deleted for bpid=" + LOG.error("Cannot find BPOfferService for reporting block deleted for bpid=" + block.getBlockPoolId()); } } + /** + * Report a bad block which is hosted on the local DN. + */ public void reportBadBlocks(ExtendedBlock block) throws IOException{ - BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId()); - if(bpos == null || bpos.bpNamenode == null) { - throw new IOException("cannot locate OfferService thread for bp="+block.getBlockPoolId()); - } + BPOfferService bpos = getBPOSForBlock(block); bpos.reportBadBlocks(block); } + + /** + * Report a bad block on another DN (eg if we received a corrupt replica + * from a remote host). + * @param srcDataNode the DN hosting the bad block + * @param block the block itself + */ + public void reportRemoteBadBlock(DatanodeInfo srcDataNode, ExtendedBlock block) + throws IOException { + BPOfferService bpos = getBPOSForBlock(block); + bpos.reportRemoteBadBlock(srcDataNode, block); + } + + /** + * Try to send an error report to the NNs associated with the given + * block pool. + * @param bpid the block pool ID + * @param errCode error code to send + * @param errMsg textual message to send + */ + void trySendErrorReport(String bpid, int errCode, String errMsg) { + BPOfferService bpos = blockPoolManager.get(bpid); + if (bpos == null) { + throw new IllegalArgumentException("Bad block pool: " + bpid); + } + bpos.trySendErrorReport(errCode, errMsg); + } + + + + /** + * Return the BPOfferService instance corresponding to the given block. 
+ * @param block + * @return the BPOS + * @throws IOException if no such BPOS can be found + */ + private BPOfferService getBPOSForBlock(ExtendedBlock block) + throws IOException { + Preconditions.checkNotNull(block); + BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId()); + if (bpos == null) { + throw new IOException("cannot locate OfferService thread for bp="+ + block.getBlockPoolId()); + } + return bpos; + } + + // used only for testing void setHeartbeatsDisabledForTests( @@ -728,7 +652,8 @@ public class DataNode extends Configured metrics = DataNodeMetrics.create(conf, getMachineName()); - blockPoolManager = new BlockPoolManager(conf); + blockPoolManager = new BlockPoolManager(this); + blockPoolManager.refreshNamenodes(conf); } /** @@ -961,11 +886,15 @@ public class DataNode extends Configured /** * get BP registration by machine and port name (host:port) - * @param mName + * @param mName - the name that the NN used * @return BP registration * @throws IOException */ DatanodeRegistration getDNRegistrationByMachineName(String mName) { + // TODO: all the BPs should have the same name as each other, they all come + // from getName() here! and the use cases only are in tests where they just + // call with getName(). So we could probably just make this method return + // the first BPOS's registration. See HDFS-2609. BPOfferService [] bposArray = blockPoolManager.getAllNamenodeThreads(); for (BPOfferService bpos : bposArray) { if(bpos.bpRegistration.getName().equals(mName)) @@ -1011,20 +940,6 @@ public class DataNode extends Configured throw new IOException(ie.getMessage()); } } - - /** - * get the name node address based on the block pool id - * @param bpid block pool ID - * @return namenode address corresponding to the bpid - */ - public InetSocketAddress getNameNodeAddr(String bpid) { - BPOfferService bp = blockPoolManager.get(bpid); - if (bp != null) { - return bp.getNNSocketAddress(); - } - LOG.warn("No name node address found for block pool ID " + bpid); - return null; - } public InetSocketAddress getSelfAddr() { return selfAddr; @@ -1251,12 +1166,7 @@ public class DataNode extends Configured //inform NameNodes for(BPOfferService bpos: blockPoolManager.getAllNamenodeThreads()) { - DatanodeProtocolClientSideTranslatorPB nn = bpos.bpNamenode; - try { - nn.errorReport(bpos.bpRegistration, dpError, errMsgr); - } catch(IOException e) { - LOG.warn("Error reporting disk failure to NameNode", e); - } + bpos.trySendErrorReport(dpError, errMsgr); } if(hasEnoughResources) { @@ -1273,6 +1183,10 @@ public class DataNode extends Configured public int getXceiverCount() { return threadGroup == null ? 
0 : threadGroup.activeCount(); } + + int getXmitsInProgress() { + return xmitsInProgress.get(); + } UpgradeManagerDatanode getUpgradeManagerDatanode(String bpid) { BPOfferService bpos = blockPoolManager.get(bpid); @@ -1285,15 +1199,15 @@ public class DataNode extends Configured private void transferBlock( ExtendedBlock block, DatanodeInfo xferTargets[] ) throws IOException { - DatanodeProtocolClientSideTranslatorPB nn = getBPNamenode(block - .getBlockPoolId()); + BPOfferService bpos = getBPOSForBlock(block); DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId()); if (!data.isValidBlock(block)) { // block does not exist or is under-construction String errStr = "Can't send invalid block " + block; LOG.info(errStr); - nn.errorReport(bpReg, DatanodeProtocol.INVALID_BLOCK, errStr); + + bpos.trySendErrorReport(DatanodeProtocol.INVALID_BLOCK, errStr); return; } @@ -1301,9 +1215,7 @@ public class DataNode extends Configured long onDiskLength = data.getLength(block); if (block.getNumBytes() > onDiskLength) { // Shorter on-disk len indicates corruption so report NN the corrupt block - nn.reportBadBlocks(new LocatedBlock[]{ - new LocatedBlock(block, new DatanodeInfo[] { - new DatanodeInfo(bpReg)})}); + bpos.reportBadBlocks(block); LOG.warn("Can't replicate block " + block + " because on-disk length " + onDiskLength + " is shorter than NameNode recorded length " + block.getNumBytes()); @@ -1861,6 +1773,13 @@ public class DataNode extends Configured long newLength) throws IOException { ReplicaInfo r = data.updateReplicaUnderRecovery(oldBlock, recoveryId, newLength); + // Notify the namenode of the updated block info. This is important + // for HA, since otherwise the standby node may lose track of the + // block locations until the next block report. 
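The notification built just below is not sent to the NameNode synchronously; it is handed to the owning BPOfferService and, as far as this patch shows, ends up queued like the pendingIncrementalBR entries seen earlier, where the reporter thread is woken by notifyAll and drains the queue on its next pass. A minimal, hypothetical sketch of that wait/notify handoff pattern (not the actual fields or types):

import java.util.HashMap;
import java.util.Map;

// Illustrative producer/consumer queue in the style of pendingIncrementalBR;
// all names here are hypothetical.
public class IncrementalReportQueue {
  private final Map<Long, String> pending = new HashMap<Long, String>();
  private int pendingCount = 0;

  /** Producer: queue a block by ID and wake the reporter immediately. */
  public void notifyBlock(long blockId, String info) {
    synchronized (pending) {
      pending.put(blockId, info);
      pendingCount++;
      pending.notifyAll();
    }
  }

  /** Reporter: wait up to waitMs unless work is already queued, then drain. */
  public Map<Long, String> drain(long waitMs) throws InterruptedException {
    synchronized (pending) {
      if (pendingCount == 0 && waitMs > 0) {
        pending.wait(waitMs);
      }
      Map<Long, String> batch = new HashMap<Long, String>(pending);
      pending.clear();
      pendingCount = 0;
      return batch;
    }
  }
}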
+ ExtendedBlock newBlock = new ExtendedBlock(oldBlock); + newBlock.setGenerationStamp(recoveryId); + newBlock.setNumBytes(newLength); + notifyNamenodeReceivedBlock(newBlock, ""); return new ExtendedBlock(oldBlock.getBlockPoolId(), r); } @@ -1935,23 +1854,32 @@ public class DataNode extends Configured * @return Namenode corresponding to the bpid * @throws IOException */ - public DatanodeProtocolClientSideTranslatorPB getBPNamenode(String bpid) + public DatanodeProtocolClientSideTranslatorPB getActiveNamenodeForBP(String bpid) throws IOException { BPOfferService bpos = blockPoolManager.get(bpid); if (bpos == null) { throw new IOException("No block pool offer service for bpid=" + bpid); - } else if (bpos.bpNamenode == null) { - throw new IOException("cannot find a namenode proxy for bpid=" + bpid); } - return bpos.bpNamenode; + + DatanodeProtocolClientSideTranslatorPB activeNN = bpos.getActiveNN(); + if (activeNN == null) { + throw new IOException( + "Block pool " + bpid + " has not recognized an active NN"); + } + return activeNN; } /** Block synchronization */ void syncBlock(RecoveringBlock rBlock, List syncList) throws IOException { ExtendedBlock block = rBlock.getBlock(); - DatanodeProtocolClientSideTranslatorPB nn = getBPNamenode(block - .getBlockPoolId()); + DatanodeProtocolClientSideTranslatorPB nn = + getActiveNamenodeForBP(block.getBlockPoolId()); + if (nn == null) { + throw new IOException( + "Unable to synchronize block " + rBlock + ", since this DN " + + " has not acknowledged any NN as active."); + } long recoveryId = rBlock.getNewGenerationStamp(); if (LOG.isDebugEnabled()) { @@ -2172,14 +2100,19 @@ public class DataNode extends Configured /** * Returned information is a JSON representation of a map with - * name node host name as the key and block pool Id as the value + * name node host name as the key and block pool Id as the value. + * Note that, if there are multiple NNs in an NA nameservice, + * a given block pool may be represented twice. */ @Override // DataNodeMXBean public String getNamenodeAddresses() { final Map info = new HashMap(); for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) { - if (bpos != null && bpos.bpThread != null) { - info.put(bpos.getNNSocketAddress().getHostName(), bpos.getBlockPoolId()); + if (bpos != null) { + for (BPServiceActor actor : bpos.getBPServiceActors()) { + info.put(actor.getNNSocketAddress().getHostName(), + bpos.getBlockPoolId()); + } } } return JSON.toString(info); @@ -2228,11 +2161,18 @@ public class DataNode extends Configured /** * @param addr rpc address of the namenode - * @return true - if BPOfferService corresponding to the namenode is alive + * @return true if the datanode is connected to a NameNode at the + * given address */ - public boolean isBPServiceAlive(InetSocketAddress addr) { - BPOfferService bp = blockPoolManager.get(addr); - return bp != null ? 
bp.isAlive() : false; + public boolean isConnectedToNN(InetSocketAddress addr) { + for (BPOfferService bpos : getAllBpOs()) { + for (BPServiceActor bpsa : bpos.getBPServiceActors()) { + if (addr.equals(bpsa.getNNSocketAddress())) { + return bpsa.isAlive(); + } + } + } + return false; } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java index 408a6afc47..89272b2ecf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java @@ -107,6 +107,14 @@ class FSDatasetAsyncDiskService { } + synchronized long countPendingDeletions() { + long count = 0; + for (ThreadPoolExecutor exec : executors.values()) { + count += exec.getTaskCount() - exec.getCompletedTaskCount(); + } + return count; + } + /** * Execute the task sometime in the future, using ThreadPools. */ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java index 478fb5660d..9ada40fd5f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java @@ -92,7 +92,7 @@ class UpgradeManagerDatanode extends UpgradeManager { "UpgradeManagerDatanode.currentUpgrades is not null."; assert upgradeDaemon == null : "UpgradeManagerDatanode.upgradeDaemon is not null."; - DatanodeProtocol nn = dataNode.getBPNamenode(bpid); + DatanodeProtocol nn = dataNode.getActiveNamenodeForBP(bpid); nn.processUpgradeCommand(broadcastCommand); return true; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java index ddb1d6029f..49d26212d0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java @@ -45,7 +45,7 @@ public abstract class UpgradeObjectDatanode extends UpgradeObject implements Run } protected DatanodeProtocol getNamenode() throws IOException { - return dataNode.getBPNamenode(bpid); + return dataNode.getActiveNamenodeForBP(bpid); } void setDatanode(DataNode dataNode, String bpid) { @@ -92,14 +92,7 @@ public abstract class UpgradeObjectDatanode extends UpgradeObject implements Run + " Name-node version = " + nsInfo.getLayoutVersion() + "."; DataNode.LOG.fatal( errorMsg ); String bpid = nsInfo.getBlockPoolID(); - DatanodeProtocol nn = dataNode.getBPNamenode(bpid); - try { - nn.errorReport(dataNode.getDNRegistrationForBP(bpid), - DatanodeProtocol.NOTIFY, errorMsg); - } catch(SocketTimeoutException e) { // namenode is busy - DataNode.LOG.info("Problem connecting to server: " - + dataNode.getNameNodeAddr(nsInfo.getBlockPoolID())); - } + dataNode.trySendErrorReport(bpid, DatanodeProtocol.NOTIFY, errorMsg); throw new IOException(errorMsg); } diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java index fc1fe14af7..ece013fa55 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java @@ -217,7 +217,7 @@ public class BackupImage extends FSImage { int logVersion = storage.getLayoutVersion(); backupInputStream.setBytes(data, logVersion); - int numLoaded = logLoader.loadEditRecords(logVersion, backupInputStream, + long numLoaded = logLoader.loadEditRecords(logVersion, backupInputStream, true, lastAppliedTxId + 1); if (numLoaded != numTxns) { throw new IOException("Batch of txns starting at txnid " + @@ -310,7 +310,7 @@ public class BackupImage extends FSImage { + " txns from in-progress stream " + stream); FSEditLogLoader loader = new FSEditLogLoader(namesystem); - int numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1); + long numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1); lastAppliedTxId += numLoaded; assert numLoaded == remainingTxns : "expected to load " + remainingTxns + " but loaded " + @@ -345,7 +345,7 @@ public class BackupImage extends FSImage { synchronized void namenodeStartedLogSegment(long txid) throws IOException { LOG.info("NameNode started a new log segment at txid " + txid); - if (editLog.isOpen()) { + if (editLog.isSegmentOpen()) { if (editLog.getLastWrittenTxId() == txid - 1) { // We are in sync with the NN, so end and finalize the current segment editLog.endCurrentLogSegment(false); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java index c655ee75bb..de75b76934 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java @@ -58,7 +58,7 @@ class BackupJournalManager implements JournalManager { } @Override - public long getNumberOfTransactions(long fromTxnId) + public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk) throws IOException, CorruptionException { // This JournalManager is never used for input. Therefore it cannot // return any transactions @@ -66,7 +66,8 @@ class BackupJournalManager implements JournalManager { } @Override - public EditLogInputStream getInputStream(long fromTxnId) throws IOException { + public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk) + throws IOException { // This JournalManager is never used for input. 
Therefore it cannot // return any transactions throw new IOException("Unsupported operation"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java index 05154703aa..9cad4eb043 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java @@ -26,13 +26,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.JournalProtocolService; import org.apache.hadoop.hdfs.protocolPB.JournalProtocolPB; import org.apache.hadoop.hdfs.protocolPB.JournalProtocolServerSideTranslatorPB; -import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; @@ -41,7 +41,8 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; -import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.ipc.StandbyException; +import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.UserGroupInformation; @@ -69,7 +70,7 @@ public class BackupNode extends NameNode { private static final String BN_SERVICE_RPC_ADDRESS_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_SERVICE_RPC_ADDRESS_KEY; /** Name-node proxy */ - NamenodeProtocolTranslatorPB namenode; + NamenodeProtocol namenode; /** Name-node RPC address */ String nnRpcAddress; /** Name-node HTTP address */ @@ -89,13 +90,13 @@ public class BackupNode extends NameNode { // Common NameNode methods implementation for backup node. 
///////////////////////////////////////////////////// @Override // NameNode - protected InetSocketAddress getRpcServerAddress(Configuration conf) throws IOException { + protected InetSocketAddress getRpcServerAddress(Configuration conf) { String addr = conf.get(BN_ADDRESS_NAME_KEY, BN_ADDRESS_DEFAULT); return NetUtils.createSocketAddr(addr); } @Override - protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) throws IOException { + protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) { String addr = conf.get(BN_SERVICE_RPC_ADDRESS_KEY); if (addr == null || addr.isEmpty()) { return null; @@ -143,6 +144,7 @@ public class BackupNode extends NameNode { CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT); NamespaceInfo nsInfo = handshake(conf); super.initialize(conf); + if (false == namesystem.isInSafeMode()) { namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER); } @@ -189,7 +191,7 @@ public class BackupNode extends NameNode { } // Stop the RPC client if (namenode != null) { - IOUtils.cleanup(LOG, namenode); + RPC.stopProxy(namenode); } namenode = null; // Stop the checkpoint manager @@ -197,6 +199,11 @@ public class BackupNode extends NameNode { checkpointManager.interrupt(); checkpointManager = null; } + + // Abort current log segment - otherwise the NN shutdown code + // will close it gracefully, which is incorrect. + getFSImage().getEditLog().abortCurrentLogSegment(); + // Stop name-node threads super.stop(); } @@ -221,58 +228,31 @@ public class BackupNode extends NameNode { this.clientRpcServer); nnRpcAddress = nn.nnRpcAddress; } - - ///////////////////////////////////////////////////// - // NamenodeProtocol implementation for backup node. - ///////////////////////////////////////////////////// - @Override // NamenodeProtocol - public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size) - throws IOException { - throw new UnsupportedActionException("getBlocks"); - } - - // Only active name-node can register other nodes. - @Override // NamenodeProtocol - public NamenodeRegistration register(NamenodeRegistration registration - ) throws IOException { - throw new UnsupportedActionException("register"); - } - - @Override // NamenodeProtocol - public NamenodeCommand startCheckpoint(NamenodeRegistration registration) - throws IOException { - throw new UnsupportedActionException("startCheckpoint"); - } - - @Override // NamenodeProtocol - public void endCheckpoint(NamenodeRegistration registration, - CheckpointSignature sig) throws IOException { - throw new UnsupportedActionException("endCheckpoint"); - } - + ///////////////////////////////////////////////////// // BackupNodeProtocol implementation for backup node. 
///////////////////////////////////////////////////// - + @Override + public void startLogSegment(NamenodeRegistration registration, long txid) + throws IOException { + namesystem.checkOperation(OperationCategory.JOURNAL); + verifyRequest(registration); + + getBNImage().namenodeStartedLogSegment(txid); + } + @Override public void journal(NamenodeRegistration nnReg, long firstTxId, int numTxns, byte[] records) throws IOException { + namesystem.checkOperation(OperationCategory.JOURNAL); verifyRequest(nnReg); if(!nnRpcAddress.equals(nnReg.getAddress())) throw new IOException("Journal request from unexpected name-node: " - + nnReg.getAddress() + " expecting " + clientRpcAddress); + + nnReg.getAddress() + " expecting " + nnRpcAddress); getBNImage().journal(firstTxId, numTxns, records); } - - @Override - public void startLogSegment(NamenodeRegistration registration, long txid) - throws IOException { - verifyRequest(registration); - - getBNImage().namenodeStartedLogSegment(txid); - } - + private BackupImage getBNImage() { return (BackupImage)nn.getFSImage(); } @@ -295,8 +275,9 @@ public class BackupNode extends NameNode { private NamespaceInfo handshake(Configuration conf) throws IOException { // connect to name node InetSocketAddress nnAddress = NameNode.getServiceAddress(conf, true); - this.namenode = new NamenodeProtocolTranslatorPB(nnAddress, conf, - UserGroupInformation.getCurrentUser()); + this.namenode = NameNodeProxies.createNonHAProxy(conf, nnAddress, + NamenodeProtocol.class, UserGroupInformation.getCurrentUser(), + true).getProxy(); this.nnRpcAddress = NetUtils.getHostPortString(nnAddress); this.nnHttpAddress = NetUtils.getHostPortString(super.getHttpServerAddress(conf)); // get version and id info from the name-node @@ -408,6 +389,28 @@ public class BackupNode extends NameNode { return clusterId; } + @Override + protected NameNodeHAContext createHAContext() { + return new BNHAContext(); + } + + private class BNHAContext extends NameNodeHAContext { + @Override // NameNode + public void checkOperation(OperationCategory op) + throws StandbyException { + if (op == OperationCategory.UNCHECKED || + op == OperationCategory.CHECKPOINT) { + return; + } + if (OperationCategory.JOURNAL != op && + !(OperationCategory.READ == op && allowStaleStandbyReads)) { + String msg = "Operation category " + op + + " is not supported at the BackupNode"; + throw new StandbyException(msg); + } + } + } + @Override protected String getNameServiceId(Configuration conf) { return DFSUtil.getBackupNameServiceId(conf); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java new file mode 100644 index 0000000000..8b3cf04d74 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; + +import com.google.common.collect.ImmutableList; + +@InterfaceAudience.Private +public class CheckpointConf { + private static final Log LOG = LogFactory.getLog(CheckpointConf.class); + + /** How often to checkpoint regardless of number of txns */ + private final long checkpointPeriod; // in seconds + + /** How often to poll the NN to check checkpointTxnCount */ + private final long checkpointCheckPeriod; // in seconds + + /** checkpoint once every this many transactions, regardless of time */ + private final long checkpointTxnCount; + + + public CheckpointConf(Configuration conf) { + checkpointCheckPeriod = conf.getLong( + DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, + DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT); + + checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, + DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); + checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY, + DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); + warnForDeprecatedConfigs(conf); + } + + private static void warnForDeprecatedConfigs(Configuration conf) { + for (String key : ImmutableList.of( + "fs.checkpoint.size", + "dfs.namenode.checkpoint.size")) { + if (conf.get(key) != null) { + LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." 
+ + " Instead please specify a value for " + + DFS_NAMENODE_CHECKPOINT_TXNS_KEY); + } + } + } + + public long getPeriod() { + return checkpointPeriod; + } + + public long getCheckPeriod() { + return Math.min(checkpointCheckPeriod, checkpointPeriod); + } + + public long getTxnCount() { + return checkpointTxnCount; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java index 39d2abaee7..6ae931fd44 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java @@ -29,7 +29,6 @@ import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; @@ -58,17 +57,16 @@ class Checkpointer extends Daemon { private BackupNode backupNode; volatile boolean shouldRun; - private long checkpointPeriod; // in seconds - // Transactions count to trigger the checkpoint - private long checkpointTxnCount; private String infoBindAddress; + private CheckpointConf checkpointConf; + private BackupImage getFSImage() { return (BackupImage)backupNode.getFSImage(); } - private NamenodeProtocol getNamenode(){ + private NamenodeProtocol getRemoteNamenodeProxy(){ return backupNode.namenode; } @@ -89,26 +87,24 @@ class Checkpointer extends Daemon { /** * Initialize checkpoint. */ - @SuppressWarnings("deprecation") private void initialize(Configuration conf) throws IOException { // Create connection to the namenode. shouldRun = true; // Initialize other scheduling parameters from the configuration - checkpointPeriod = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, - DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); - checkpointTxnCount = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, - DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); - SecondaryNameNode.warnForDeprecatedConfigs(conf); + checkpointConf = new CheckpointConf(conf); // Pull out exact http address for posting url to avoid ip aliasing issues String fullInfoAddr = conf.get(DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY, DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT); infoBindAddress = fullInfoAddr.substring(0, fullInfoAddr.indexOf(":")); - LOG.info("Checkpoint Period : " + checkpointPeriod + " secs " + - "(" + checkpointPeriod/60 + " min)"); - LOG.info("Transactions count is : " + checkpointTxnCount + ", to trigger checkpoint"); + LOG.info("Checkpoint Period : " + + checkpointConf.getPeriod() + " secs " + + "(" + checkpointConf.getPeriod()/60 + " min)"); + LOG.info("Transactions count is : " + + checkpointConf.getTxnCount() + + ", to trigger checkpoint"); } /** @@ -125,8 +121,8 @@ class Checkpointer extends Daemon { public void run() { // Check the size of the edit log once every 5 minutes. 
long periodMSec = 5 * 60; // 5 minutes - if(checkpointPeriod < periodMSec) { - periodMSec = checkpointPeriod; + if(checkpointConf.getPeriod() < periodMSec) { + periodMSec = checkpointConf.getPeriod(); } periodMSec *= 1000; @@ -142,7 +138,7 @@ class Checkpointer extends Daemon { shouldCheckpoint = true; } else { long txns = countUncheckpointedTxns(); - if(txns >= checkpointTxnCount) + if(txns >= checkpointConf.getTxnCount()) shouldCheckpoint = true; } if(shouldCheckpoint) { @@ -165,7 +161,7 @@ class Checkpointer extends Daemon { } private long countUncheckpointedTxns() throws IOException { - long curTxId = getNamenode().getTransactionID(); + long curTxId = getRemoteNamenodeProxy().getTransactionID(); long uncheckpointedTxns = curTxId - getFSImage().getStorage().getMostRecentCheckpointTxId(); assert uncheckpointedTxns >= 0; @@ -183,7 +179,7 @@ class Checkpointer extends Daemon { bnImage.freezeNamespaceAtNextRoll(); NamenodeCommand cmd = - getNamenode().startCheckpoint(backupNode.getRegistration()); + getRemoteNamenodeProxy().startCheckpoint(backupNode.getRegistration()); CheckpointCommand cpCmd = null; switch(cmd.getAction()) { case NamenodeProtocol.ACT_SHUTDOWN: @@ -207,7 +203,7 @@ class Checkpointer extends Daemon { long lastApplied = bnImage.getLastAppliedTxId(); LOG.debug("Doing checkpoint. Last applied: " + lastApplied); RemoteEditLogManifest manifest = - getNamenode().getEditLogManifest(bnImage.getLastAppliedTxId() + 1); + getRemoteNamenodeProxy().getEditLogManifest(bnImage.getLastAppliedTxId() + 1); if (!manifest.getLogs().isEmpty()) { RemoteEditLog firstRemoteLog = manifest.getLogs().get(0); @@ -243,11 +239,16 @@ class Checkpointer extends Daemon { long txid = bnImage.getLastAppliedTxId(); - backupNode.namesystem.dir.setReady(); - backupNode.namesystem.setBlockTotal(); - - bnImage.saveFSImageInAllDirs(backupNode.getNamesystem(), txid); - bnStorage.writeAll(); + backupNode.namesystem.writeLock(); + try { + backupNode.namesystem.dir.setReady(); + backupNode.namesystem.setBlockTotal(); + + bnImage.saveFSImageInAllDirs(backupNode.getNamesystem(), txid); + bnStorage.writeAll(); + } finally { + backupNode.namesystem.writeUnlock(); + } if(cpCmd.needToReturnImage()) { TransferFsImage.uploadImageFromStorage( @@ -255,7 +256,7 @@ class Checkpointer extends Daemon { bnStorage, txid); } - getNamenode().endCheckpoint(backupNode.getRegistration(), sig); + getRemoteNamenodeProxy().endCheckpoint(backupNode.getRegistration(), sig); if (backupNode.getRole() == NamenodeRole.BACKUP) { bnImage.convergeJournalSpool(); @@ -286,7 +287,7 @@ class Checkpointer extends Daemon { log.getStartTxId(), log.getEndTxId()); if (log.getStartTxId() > dstImage.getLastAppliedTxId()) { editsStreams.add(new EditLogFileInputStream(f, log.getStartTxId(), - log.getEndTxId())); + log.getEndTxId(), true)); } } LOG.info("Checkpointer about to load edits from " + diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java index 10601b1723..3ffc852667 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java @@ -39,6 +39,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; import 
org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; import org.apache.hadoop.hdfs.protocol.DatanodeInfo.AdminStates; import org.apache.hadoop.util.StringUtils; import org.codehaus.jackson.JsonNode; @@ -66,9 +67,10 @@ class ClusterJspHelper { ClusterStatus generateClusterHealthReport() { ClusterStatus cs = new ClusterStatus(); Configuration conf = new Configuration(); - List isas = null; + List nns = null; try { - isas = DFSUtil.getNNServiceRpcAddresses(conf); + nns = DFSUtil.flattenAddressMap( + DFSUtil.getNNServiceRpcAddresses(conf)); } catch (Exception e) { // Could not build cluster status cs.setError(e); @@ -76,7 +78,8 @@ class ClusterJspHelper { } // Process each namenode and add it to ClusterStatus - for (InetSocketAddress isa : isas) { + for (ConfiguredNNAddress cnn : nns) { + InetSocketAddress isa = cnn.getAddress(); NamenodeMXBeanHelper nnHelper = null; try { nnHelper = new NamenodeMXBeanHelper(isa, conf); @@ -102,9 +105,10 @@ class ClusterJspHelper { DecommissionStatus generateDecommissioningReport() { String clusterid = ""; Configuration conf = new Configuration(); - List isas = null; + List cnns = null; try { - isas = DFSUtil.getNNServiceRpcAddresses(conf); + cnns = DFSUtil.flattenAddressMap( + DFSUtil.getNNServiceRpcAddresses(conf)); } catch (Exception e) { // catch any exception encountered other than connecting to namenodes DecommissionStatus dInfo = new DecommissionStatus(clusterid, e); @@ -122,7 +126,8 @@ class ClusterJspHelper { new HashMap(); List unreportedNamenode = new ArrayList(); - for (InetSocketAddress isa : isas) { + for (ConfiguredNNAddress cnn : cnns) { + InetSocketAddress isa = cnn.getAddress(); NamenodeMXBeanHelper nnHelper = null; try { nnHelper = new NamenodeMXBeanHelper(isa, conf); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java index 6459ffd0e0..402dcdd0ac 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java @@ -26,8 +26,8 @@ import javax.servlet.http.HttpServletRequest; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.server.common.JspHelper; import org.apache.hadoop.ipc.RemoteException; @@ -77,7 +77,8 @@ abstract class DfsServlet extends HttpServlet { NameNodeHttpServer.getNameNodeAddressFromContext(context); Configuration conf = new HdfsConfiguration( NameNodeHttpServer.getConfFromContext(context)); - return DFSUtil.createNamenode(nnAddr, conf); + return NameNodeProxies.createProxy(conf, NameNode.getUri(nnAddr), + ClientProtocol.class).getProxy(); } protected UserGroupInformation getUGI(HttpServletRequest request, diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java index 637400f926..a0fb8fe629 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java @@ -133,4 +133,9 @@ class EditLogBackupInputStream extends EditLogInputStream { public long getLastTxId() throws IOException { return HdfsConstants.INVALID_TXID; } + + @Override + public boolean isInProgress() { + return true; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java index 8c3ad2ecdb..bdb4c5e773 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java @@ -22,12 +22,14 @@ import java.net.InetSocketAddress; import java.util.Arrays; import org.apache.hadoop.hdfs.HdfsConfiguration; -import org.apache.hadoop.hdfs.protocolPB.JournalProtocolTranslatorPB; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.UserGroupInformation; /** * An implementation of the abstract class {@link EditLogOutputStream}, @@ -40,7 +42,7 @@ import org.apache.hadoop.net.NetUtils; class EditLogBackupOutputStream extends EditLogOutputStream { static int DEFAULT_BUFFER_SIZE = 256; - private JournalProtocolTranslatorPB backupNode; // RPC proxy to backup node + private JournalProtocol backupNode; // RPC proxy to backup node private NamenodeRegistration bnRegistration; // backup node registration private NamenodeRegistration nnRegistration; // active node registration private EditsDoubleBuffer doubleBuf; @@ -55,8 +57,9 @@ class EditLogBackupOutputStream extends EditLogOutputStream { InetSocketAddress bnAddress = NetUtils.createSocketAddr(bnRegistration.getAddress()); try { - this.backupNode = - new JournalProtocolTranslatorPB(bnAddress, new HdfsConfiguration()); + this.backupNode = NameNodeProxies.createNonHAProxy(new HdfsConfiguration(), + bnAddress, JournalProtocol.class, UserGroupInformation.getCurrentUser(), + true).getProxy(); } catch(IOException e) { Storage.LOG.error("Error connecting to: " + bnAddress, e); throw e; @@ -93,14 +96,14 @@ class EditLogBackupOutputStream extends EditLogOutputStream { throw new IOException("BackupEditStream has " + size + " records still to be flushed and cannot be closed."); } - IOUtils.cleanup(Storage.LOG, backupNode); // stop the RPC threads + RPC.stopProxy(backupNode); // stop the RPC threads doubleBuf.close(); doubleBuf = null; } @Override public void abort() throws IOException { - IOUtils.cleanup(Storage.LOG, backupNode); + RPC.stopProxy(backupNode); doubleBuf = null; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java index 952e4a7684..22c1297dac 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java @@ -41,6 +41,7 @@ class EditLogFileInputStream extends EditLogInputStream { private final int logVersion; private final FSEditLogOp.Reader reader; private final FSEditLogLoader.PositionTrackingInputStream tracker; + private final boolean isInProgress; /** * Open an EditLogInputStream for the given file. @@ -53,7 +54,7 @@ class EditLogFileInputStream extends EditLogInputStream { */ EditLogFileInputStream(File name) throws LogHeaderCorruptException, IOException { - this(name, HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID); + this(name, HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID, false); } /** @@ -66,7 +67,8 @@ class EditLogFileInputStream extends EditLogInputStream { * @throws IOException if an actual IO error occurs while reading the * header */ - EditLogFileInputStream(File name, long firstTxId, long lastTxId) + EditLogFileInputStream(File name, long firstTxId, long lastTxId, + boolean isInProgress) throws LogHeaderCorruptException, IOException { file = name; fStream = new FileInputStream(name); @@ -84,6 +86,25 @@ class EditLogFileInputStream extends EditLogInputStream { reader = new FSEditLogOp.Reader(in, logVersion); this.firstTxId = firstTxId; this.lastTxId = lastTxId; + this.isInProgress = isInProgress; + } + + /** + * Skip over a number of transactions. Subsequent calls to + * {@link EditLogFileInputStream#readOp()} will begin after these skipped + * transactions. If more transactions are requested to be skipped than remain + * in the edit log, all edit log ops in the log will be skipped and subsequent + * calls to {@link EditLogInputStream#readOp} will return null. + * + * @param transactionsToSkip number of transactions to skip over. 
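// Illustrative sketch (simplified, not part of the patch): the skip-then-read
// behaviour documented for skipTransactions here. "OpReader" and "LogOp" are
// hypothetical stand-ins for the Hadoop reader and op types; only the control
// flow is the point.
import java.io.IOException;

class SkipTransactionsSketch {
    interface LogOp { long txid(); }
    interface OpReader { LogOp readOp() throws IOException; } // returns null at end of log

    // Skip a number of transactions; if fewer remain, later reads simply return null.
    static void skipTransactions(OpReader reader, long transactionsToSkip) throws IOException {
        for (long i = 0; i < transactionsToSkip; i++) {
            if (reader.readOp() == null) {
                break; // ran out of ops: subsequent readOp() calls will also return null
            }
        }
    }
}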
+ * @throws IOException if there's an error while reading an operation + */ + public void skipTransactions(long transactionsToSkip) throws IOException { + assert firstTxId != HdfsConstants.INVALID_TXID && + lastTxId != HdfsConstants.INVALID_TXID; + for (long i = 0; i < transactionsToSkip; i++) { + reader.readOp(); + } } @Override @@ -132,6 +153,11 @@ class EditLogFileInputStream extends EditLogInputStream { return file.length(); } + @Override + public boolean isInProgress() { + return isInProgress; + } + @Override public String toString() { return getName(); @@ -142,11 +168,11 @@ class EditLogFileInputStream extends EditLogInputStream { try { in = new EditLogFileInputStream(file); } catch (LogHeaderCorruptException corrupt) { - // If it's missing its header, this is equivalent to no transactions + // If the header is malformed or the wrong value, this indicates a corruption FSImage.LOG.warn("Log at " + file + " has no valid header", corrupt); - return new FSEditLogLoader.EditLogValidation(0, HdfsConstants.INVALID_TXID, - HdfsConstants.INVALID_TXID); + return new FSEditLogLoader.EditLogValidation(0, + HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID, true); } try { @@ -172,14 +198,13 @@ class EditLogFileInputStream extends EditLogInputStream { throw new LogHeaderCorruptException( "Reached EOF when reading log header"); } - if (logVersion < HdfsConstants.LAYOUT_VERSION) { // future version + if (logVersion < HdfsConstants.LAYOUT_VERSION || // future version + logVersion > Storage.LAST_UPGRADABLE_LAYOUT_VERSION) { // unsupported throw new LogHeaderCorruptException( "Unexpected version of the file system log file: " + logVersion + ". Current version = " + HdfsConstants.LAYOUT_VERSION + "."); } - assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION : - "Unsupported version " + logVersion; return logVersion; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java index bdc0bd2a56..f7e1f01250 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hdfs.server.namenode; +import java.io.DataOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -27,6 +28,7 @@ import java.nio.channels.FileChannel; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.io.IOUtils; @@ -36,7 +38,8 @@ import com.google.common.annotations.VisibleForTesting; * An implementation of the abstract class {@link EditLogOutputStream}, which * stores edits in a local file. 
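// Illustrative sketch (simplified, not from the patch): writing a layout-version
// header and validating it on read, rejecting both too-new and too-old layouts
// with an exception instead of an assert. The version constants below are made up
// for the example; real HDFS layout versions are negative and decrease over time.
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

class LogHeaderVersionSketch {
    static final int CURRENT_LAYOUT_VERSION = -40;   // hypothetical "current" version
    static final int LAST_UPGRADABLE_VERSION = -16;  // hypothetical oldest supported version

    static void writeHeader(DataOutputStream out) throws IOException {
        out.writeInt(CURRENT_LAYOUT_VERSION);
    }

    static int readAndCheckVersion(DataInputStream in) throws IOException {
        int logVersion = in.readInt();
        // Smaller (more negative) means newer, so "< current" is a future version
        // and "> last upgradable" is an unsupported old version.
        if (logVersion < CURRENT_LAYOUT_VERSION || logVersion > LAST_UPGRADABLE_VERSION) {
            throw new IOException("Unexpected version of the file system log file: "
                + logVersion + ". Current version = " + CURRENT_LAYOUT_VERSION + ".");
        }
        return logVersion;
    }
}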
*/ -class EditLogFileOutputStream extends EditLogOutputStream { +@InterfaceAudience.Private +public class EditLogFileOutputStream extends EditLogOutputStream { private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class); private File file; @@ -96,11 +99,23 @@ class EditLogFileOutputStream extends EditLogOutputStream { public void create() throws IOException { fc.truncate(0); fc.position(0); - doubleBuf.getCurrentBuf().writeInt(HdfsConstants.LAYOUT_VERSION); + writeHeader(doubleBuf.getCurrentBuf()); setReadyToFlush(); flush(); } + /** + * Write header information for this EditLogFileOutputStream to the provided + * DataOutputSream. + * + * @param out the output stream to write the header to. + * @throws IOException in the event of error writing to the stream. + */ + @VisibleForTesting + public static void writeHeader(DataOutputStream out) throws IOException { + out.writeInt(HdfsConstants.LAYOUT_VERSION); + } + @Override public void close() throws IOException { if (fp == null) { @@ -204,6 +219,11 @@ class EditLogFileOutputStream extends EditLogOutputStream { File getFile() { return file; } + + @Override + public String toString() { + return "EditLogFileOutputStream(" + file + ")"; + } /** * @return true if this stream is currently open. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java new file mode 100644 index 0000000000..56edf8cb22 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; + +/** + * Thrown when there's a failure to read an edit log op from disk when loading + * edits. 
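// Illustrative sketch (simplified): how a caller might use an exception that
// carries the number of edits applied before a read failure, in the spirit of the
// EditLogInputException introduced here. "EditLoadException" and "loadEdits" are
// hypothetical stand-ins, not the Hadoop API.
import java.io.IOException;

class PartialLoadCallerSketch {
    static class EditLoadException extends IOException {
        private static final long serialVersionUID = 1L;
        final long numEditsLoaded;
        EditLoadException(String msg, Throwable cause, long numEditsLoaded) {
            super(msg, cause);
            this.numEditsLoaded = numEditsLoaded;
        }
    }

    interface Loader { long loadEdits() throws EditLoadException; }

    static long loadAndReport(Loader loader) {
        try {
            return loader.loadEdits();
        } catch (EditLoadException e) {
            // A standby can keep the edits it already applied and retry the rest later.
            System.err.println("Failed after applying " + e.numEditsLoaded
                + " edits: " + e.getMessage());
            return e.numEditsLoaded;
        }
    }
}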
+ */ +@InterfaceAudience.Private +public class EditLogInputException extends IOException { + + private static final long serialVersionUID = 1L; + + private final long numEditsLoaded; + + public EditLogInputException(String message, Throwable cause, + long numEditsLoaded) { + super(message, cause); + this.numEditsLoaded = numEditsLoaded; + } + + public long getNumEditsLoaded() { + return numEditsLoaded; + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java index 3ad19951d7..7a7f8d8743 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java @@ -22,6 +22,9 @@ import org.apache.hadoop.classification.InterfaceStability; import java.io.Closeable; import java.io.IOException; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + /** * A generic abstract class to support reading edits log data from * persistent storage. @@ -79,4 +82,9 @@ public abstract class EditLogInputStream implements JournalStream, Closeable { * Return the size of the current edits log. */ public abstract long length() throws IOException; + + /** + * Return true if this stream is in progress, false if it is finalized. + */ + public abstract boolean isInProgress(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java index 45ce9df154..ab0f4c4ddd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java @@ -261,113 +261,32 @@ public class FSDirectory implements Closeable { */ INode unprotectedAddFile( String path, PermissionStatus permissions, - BlockInfo[] blocks, short replication, long modificationTime, long atime, long preferredBlockSize, + boolean underConstruction, String clientName, String clientMachine) throws UnresolvedLinkException { INode newNode; assert hasWriteLock(); - if (blocks == null) - newNode = new INodeDirectory(permissions, modificationTime); - else if(blocks.length == 0 || blocks[blocks.length-1].getBlockUCState() - == BlockUCState.UNDER_CONSTRUCTION) { + if (underConstruction) { newNode = new INodeFileUnderConstruction( - permissions, blocks.length, replication, + permissions, replication, preferredBlockSize, modificationTime, clientName, clientMachine, null); } else { - newNode = new INodeFile(permissions, blocks.length, replication, + newNode = new INodeFile(permissions, 0, replication, modificationTime, atime, preferredBlockSize); } - writeLock(); + try { - try { - newNode = addNode(path, newNode, UNKNOWN_DISK_SPACE); - if(newNode != null && blocks != null) { - int nrBlocks = blocks.length; - // Add file->block mapping - INodeFile newF = (INodeFile)newNode; - for (int i = 0; i < nrBlocks; i++) { - newF.setBlock(i, getBlockManager().addINode(blocks[i], newF)); - } - } - } catch (IOException e) { - return null; - } - return newNode; - } finally { - writeUnlock(); - } - - } - - /** - * Update files in-memory data structures with new block 
information. - * @throws IOException - */ - void updateFile(INodeFile file, - String path, - BlockInfo[] blocks, - long mtime, - long atime) throws IOException { - - // Update the salient file attributes. - file.setAccessTime(atime); - file.setModificationTimeForce(mtime); - - // Update its block list - BlockInfo[] oldBlocks = file.getBlocks(); - - // Are we only updating the last block's gen stamp. - boolean isGenStampUpdate = oldBlocks.length == blocks.length; - - // First, update blocks in common - BlockInfo oldBlock = null; - for (int i = 0; i < oldBlocks.length && i < blocks.length; i++) { - oldBlock = oldBlocks[i]; - Block newBlock = blocks[i]; - - boolean isLastBlock = i == oldBlocks.length - 1; - if (oldBlock.getBlockId() != newBlock.getBlockId() || - (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() && - !(isGenStampUpdate && isLastBlock))) { - throw new IOException("Mismatched block IDs or generation stamps, " + - "attempting to replace block " + oldBlock + " with " + newBlock + - " as block # " + i + "/" + blocks.length + " of " + path); - } - - oldBlock.setNumBytes(newBlock.getNumBytes()); - oldBlock.setGenerationStamp(newBlock.getGenerationStamp()); - } - - if (blocks.length < oldBlocks.length) { - // We're removing a block from the file, e.g. abandonBlock(...) - if (!file.isUnderConstruction()) { - throw new IOException("Trying to remove a block from file " + - path + " which is not under construction."); - } - if (blocks.length != oldBlocks.length - 1) { - throw new IOException("Trying to remove more than one block from file " - + path); - } - unprotectedRemoveBlock(path, - (INodeFileUnderConstruction)file, oldBlocks[oldBlocks.length - 1]); - } else if (blocks.length > oldBlocks.length) { - // We're adding blocks - // First complete last old Block - getBlockManager().completeBlock(file, oldBlocks.length-1, true); - // Add the new blocks - for (int i = oldBlocks.length; i < blocks.length; i++) { - // addBlock(); - BlockInfo newBI = blocks[i]; - getBlockManager().addINode(newBI, file); - file.addBlock(newBI); - } + newNode = addNode(path, newNode, UNKNOWN_DISK_SPACE); + } catch (IOException e) { + return null; } + return newNode; } INodeDirectory addToParent(byte[] src, INodeDirectory parentINode, @@ -450,7 +369,7 @@ public class FSDirectory implements Closeable { writeLock(); try { - fsImage.getEditLog().logOpenFile(path, file); + fsImage.getEditLog().logUpdateBlocks(path, file); if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* FSDirectory.persistBlocks: " +path+" with "+ file.getBlocks().length @@ -460,7 +379,7 @@ public class FSDirectory implements Closeable { writeUnlock(); } } - + /** * Close file. */ @@ -483,7 +402,7 @@ public class FSDirectory implements Closeable { } /** - * Remove a block to the file. + * Remove a block from the file. 
*/ boolean removeBlock(String path, INodeFileUnderConstruction fileNode, Block block) throws IOException { @@ -499,7 +418,7 @@ public class FSDirectory implements Closeable { } return true; } - + void unprotectedRemoveBlock(String path, INodeFileUnderConstruction fileNode, Block block) throws IOException { // modify file-> block and blocksMap diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java index 80c608814a..7c630d70db 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java @@ -62,22 +62,36 @@ public class FSEditLog { /** * State machine for edit log. + * + * In a non-HA setup: + * * The log starts in UNITIALIZED state upon construction. Once it's - * initialized, it is usually in IN_SEGMENT state, indicating that edits - * may be written. In the middle of a roll, or while saving the namespace, - * it briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the - * previous segment has been closed, but the new one has not yet been opened. + * initialized, it is usually in IN_SEGMENT state, indicating that edits may + * be written. In the middle of a roll, or while saving the namespace, it + * briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the previous + * segment has been closed, but the new one has not yet been opened. + * + * In an HA setup: + * + * The log starts in UNINITIALIZED state upon construction. Once it's + * initialized, it sits in the OPEN_FOR_READING state the entire time that the + * NN is in standby. Upon the NN transition to active, the log will be CLOSED, + * and then move to being BETWEEN_LOG_SEGMENTS, much as if the NN had just + * started up, and then will move to IN_SEGMENT so it can begin writing to the + * log. The log states will then revert to behaving as they do in a non-HA + * setup. */ private enum State { UNINITIALIZED, BETWEEN_LOG_SEGMENTS, IN_SEGMENT, + OPEN_FOR_READING, CLOSED; } private State state = State.UNINITIALIZED; //initialize - private JournalSet journalSet; + private JournalSet journalSet = null; private EditLogOutputStream editLogStream = null; // a monotonically increasing counter that represents transactionIds. @@ -112,7 +126,12 @@ public class FSEditLog { private NNStorage storage; private Configuration conf; - private Collection editsDirs; + private List editsDirs; + + /** + * The edit directories that are shared between primary and secondary. 
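// Illustrative sketch (simplified): the edit-log lifecycle described in the state
// machine comment above, as a tiny standalone state machine. In the HA case the log
// sits in OPEN_FOR_READING while the NN is in standby, is closed on the transition
// to active, and then reopens for writing. Method names here paraphrase the patch;
// only the transitions are the point.
class EditLogStateSketch {
    enum State { UNINITIALIZED, OPEN_FOR_READING, BETWEEN_LOG_SEGMENTS, IN_SEGMENT, CLOSED }

    private State state = State.UNINITIALIZED;

    synchronized void initForWrite() {          // non-HA startup, or standby -> active
        require(state == State.UNINITIALIZED || state == State.CLOSED);
        state = State.BETWEEN_LOG_SEGMENTS;
    }

    synchronized void initSharedForRead() {     // standby tailing the shared edits dirs
        require(state == State.UNINITIALIZED || state == State.CLOSED);
        state = State.OPEN_FOR_READING;
    }

    synchronized void startSegment() {
        require(state == State.BETWEEN_LOG_SEGMENTS);
        state = State.IN_SEGMENT;
    }

    synchronized void endSegment() {
        require(state == State.IN_SEGMENT);
        state = State.BETWEEN_LOG_SEGMENTS;
    }

    synchronized void close() {
        state = State.CLOSED;
    }

    private void require(boolean ok) {
        if (!ok) {
            throw new IllegalStateException("Bad state: " + state);
        }
    }
}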
+ */ + private List sharedEditsDirs; private static class TransactionId { public long txid; @@ -151,11 +170,11 @@ public class FSEditLog { * @param storage Storage object used by namenode * @param editsDirs List of journals to use */ - FSEditLog(Configuration conf, NNStorage storage, Collection editsDirs) { + FSEditLog(Configuration conf, NNStorage storage, List editsDirs) { init(conf, storage, editsDirs); } - private void init(Configuration conf, NNStorage storage, Collection editsDirs) { + private void init(Configuration conf, NNStorage storage, List editsDirs) { isSyncRunning = false; this.conf = conf; this.storage = storage; @@ -165,19 +184,44 @@ public class FSEditLog { // If this list is empty, an error will be thrown on first use // of the editlog, as no journals will exist this.editsDirs = Lists.newArrayList(editsDirs); + + this.sharedEditsDirs = FSNamesystem.getSharedEditsDirs(conf); + } + + public synchronized void initJournalsForWrite() { + Preconditions.checkState(state == State.UNINITIALIZED || + state == State.CLOSED, "Unexpected state: %s", state); + initJournals(this.editsDirs); + state = State.BETWEEN_LOG_SEGMENTS; + } + + public synchronized void initSharedJournalsForRead() { + if (state == State.OPEN_FOR_READING) { + LOG.warn("Initializing shared journals for READ, already open for READ", + new Exception()); + return; + } + Preconditions.checkState(state == State.UNINITIALIZED || + state == State.CLOSED); + + initJournals(this.sharedEditsDirs); + state = State.OPEN_FOR_READING; + } + + private synchronized void initJournals(List dirs) { int minimumRedundantJournals = conf.getInt( DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY, DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT); journalSet = new JournalSet(minimumRedundantJournals); - for (URI u : this.editsDirs) { + for (URI u : dirs) { boolean required = FSNamesystem.getRequiredNamespaceEditsDirs(conf) .contains(u); if (u.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) { StorageDirectory sd = storage.getStorageDirectory(u); if (sd != null) { - journalSet.add(new FileJournalManager(sd), required); + journalSet.add(new FileJournalManager(sd, storage), required); } } else { journalSet.add(createJournal(u), required); @@ -187,7 +231,6 @@ public class FSEditLog { if (journalSet.isEmpty()) { LOG.error("No edits directories configured!"); } - state = State.BETWEEN_LOG_SEGMENTS; } /** @@ -202,17 +245,50 @@ public class FSEditLog { * Initialize the output stream for logging, opening the first * log segment. */ - synchronized void open() throws IOException { - Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS); + synchronized void openForWrite() throws IOException { + Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS, + "Bad state: %s", state); - startLogSegment(getLastWrittenTxId() + 1, true); + long segmentTxId = getLastWrittenTxId() + 1; + // Safety check: we should never start a segment if there are + // newer txids readable. + EditLogInputStream s = journalSet.getInputStream(segmentTxId, true); + try { + Preconditions.checkState(s == null, + "Cannot start writing at txid %s when there is a stream " + + "available for read: %s", segmentTxId, s); + } finally { + IOUtils.closeStream(s); + } + + startLogSegment(segmentTxId, true); assert state == State.IN_SEGMENT : "Bad state: " + state; } - synchronized boolean isOpen() { + /** + * @return true if the log is currently open in write mode, regardless + * of whether it actually has an open segment. 
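// Illustrative sketch (simplified): the safety check made before opening the log
// for write. Before starting a segment at lastWrittenTxId + 1, the journals are
// asked for a readable stream at that txid; if one exists, something newer is
// already on disk and writing there would diverge. "Journal" is a hypothetical
// stand-in for the journal set.
import java.io.Closeable;
import java.io.IOException;

class OpenForWriteCheckSketch {
    interface Journal {
        Closeable getInputStream(long fromTxId, boolean inProgressOk) throws IOException;
    }

    static void checkCanStartSegment(Journal journal, long segmentTxId) throws IOException {
        Closeable s = journal.getInputStream(segmentTxId, true);
        try {
            if (s != null) {
                throw new IllegalStateException("Cannot start writing at txid " + segmentTxId
                    + " when there is a stream available for read: " + s);
            }
        } finally {
            if (s != null) {
                s.close();
            }
        }
    }
}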
+ */ + synchronized boolean isOpenForWrite() { + return state == State.IN_SEGMENT || + state == State.BETWEEN_LOG_SEGMENTS; + } + + /** + * @return true if the log is open in write mode and has a segment open + * ready to take edits. + */ + synchronized boolean isSegmentOpen() { return state == State.IN_SEGMENT; } + /** + * @return true if the log is open in read mode. + */ + public synchronized boolean isOpenForRead() { + return state == State.OPEN_FOR_READING; + } + /** * Shutdown the file store. */ @@ -242,7 +318,8 @@ public class FSEditLog { */ void logEdit(final FSEditLogOp op) { synchronized (this) { - assert state != State.CLOSED; + assert isOpenForWrite() : + "bad state: " + state; // wait if an automatic sync is scheduled waitIfAutoSyncScheduled(); @@ -329,7 +406,7 @@ public class FSEditLog { /** * Return the transaction ID of the last transaction written to the log. */ - synchronized long getLastWrittenTxId() { + public synchronized long getLastWrittenTxId() { return txid; } @@ -337,7 +414,7 @@ public class FSEditLog { * @return the first transaction ID in the current log segment */ synchronized long getCurSegmentTxId() { - Preconditions.checkState(state == State.IN_SEGMENT, + Preconditions.checkState(isSegmentOpen(), "Bad state: %s", state); return curSegmentTxId; } @@ -549,6 +626,13 @@ public class FSEditLog { logEdit(op); } + public void logUpdateBlocks(String path, INodeFileUnderConstruction file) { + UpdateBlocksOp op = UpdateBlocksOp.getInstance() + .setPath(path) + .setBlocks(file.getBlocks()); + logEdit(op); + } + /** * Add create directory record to edit log */ @@ -724,16 +808,25 @@ public class FSEditLog { * Used only by unit tests. */ @VisibleForTesting - List getJournals() { + synchronized List getJournals() { return journalSet.getAllJournalStreams(); } + /** + * Used only by tests. + */ + @VisibleForTesting + synchronized public JournalSet getJournalSet() { + return journalSet; + } + /** * Used only by unit tests. */ @VisibleForTesting synchronized void setRuntimeForTesting(Runtime runtime) { this.runtime = runtime; + this.journalSet.setRuntimeForTesting(runtime); } /** @@ -796,7 +889,7 @@ public class FSEditLog { editLogStream = journalSet.startLogSegment(segmentTxId); } catch (IOException ex) { throw new IOException("Unable to start log segment " + - segmentTxId + ": no journals successfully started."); + segmentTxId + ": too few journals successfully started.", ex); } curSegmentTxId = segmentTxId; @@ -815,7 +908,7 @@ public class FSEditLog { */ synchronized void endCurrentLogSegment(boolean writeEndTxn) { LOG.info("Ending log segment " + curSegmentTxId); - Preconditions.checkState(state == State.IN_SEGMENT, + Preconditions.checkState(isSegmentOpen(), "Bad state: %s", state); if (writeEndTxn) { @@ -847,6 +940,7 @@ public class FSEditLog { if (editLogStream != null) { editLogStream.abort(); editLogStream = null; + state = State.BETWEEN_LOG_SEGMENTS; } } catch (IOException e) { LOG.warn("All journals failed to abort", e); @@ -856,17 +950,14 @@ public class FSEditLog { /** * Archive any log files that are older than the given txid. */ - public void purgeLogsOlderThan(final long minTxIdToKeep) { - synchronized (this) { - // synchronized to prevent findbugs warning about inconsistent - // synchronization. This will be JIT-ed out if asserts are - // off. 
- assert curSegmentTxId == HdfsConstants.INVALID_TXID || // on format this is no-op - minTxIdToKeep <= curSegmentTxId : - "cannot purge logs older than txid " + minTxIdToKeep + - " when current segment starts at " + curSegmentTxId; - } + public synchronized void purgeLogsOlderThan(final long minTxIdToKeep) { + assert curSegmentTxId == HdfsConstants.INVALID_TXID || // on format this is no-op + minTxIdToKeep <= curSegmentTxId : + "cannot purge logs older than txid " + minTxIdToKeep + + " when current segment starts at " + curSegmentTxId; + // This could be improved to not need synchronization. But currently, + // journalSet is not threadsafe, so we need to synchronize this method. try { journalSet.purgeLogsOlderThan(minTxIdToKeep); } catch (IOException ex) { @@ -898,8 +989,8 @@ public class FSEditLog { // sets the initial capacity of the flush buffer. - public void setOutputBufferCapacity(int size) { - journalSet.setOutputBufferCapacity(size); + synchronized void setOutputBufferCapacity(int size) { + journalSet.setOutputBufferCapacity(size); } /** @@ -975,32 +1066,45 @@ public class FSEditLog { /** * Run recovery on all journals to recover any unclosed segments */ - void recoverUnclosedStreams() { + synchronized void recoverUnclosedStreams() { + Preconditions.checkState( + state == State.BETWEEN_LOG_SEGMENTS, + "May not recover segments - wrong state: %s", state); try { journalSet.recoverUnfinalizedSegments(); } catch (IOException ex) { // All journals have failed, it is handled in logSync. } } + + Collection selectInputStreams(long fromTxId, + long toAtLeastTxId) throws IOException { + return selectInputStreams(fromTxId, toAtLeastTxId, true); + } /** * Select a list of input streams to load. + * * @param fromTxId first transaction in the selected streams * @param toAtLeast the selected streams must contain this transaction + * @param inProgessOk set to true if in-progress streams are OK */ - Collection selectInputStreams(long fromTxId, - long toAtLeastTxId) throws IOException { + public synchronized Collection selectInputStreams(long fromTxId, + long toAtLeastTxId, boolean inProgressOk) throws IOException { List streams = new ArrayList(); - EditLogInputStream stream = journalSet.getInputStream(fromTxId); + EditLogInputStream stream = journalSet.getInputStream(fromTxId, inProgressOk); while (stream != null) { - fromTxId = stream.getLastTxId() + 1; streams.add(stream); - stream = journalSet.getInputStream(fromTxId); + // We're now looking for a higher range, so reset the fromTxId + fromTxId = stream.getLastTxId() + 1; + stream = journalSet.getInputStream(fromTxId, inProgressOk); } + if (fromTxId <= toAtLeastTxId) { closeAllStreams(streams); - throw new IOException("No non-corrupt logs for txid " - + fromTxId); + throw new IOException(String.format("Gap in transactions. 
Expected to " + + "be able to read up until at least txid %d but unable to find any " + + "edit logs containing txid %d", toAtLeastTxId, fromTxId)); } return streams; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java index b93942d951..7c24107150 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java @@ -28,6 +28,7 @@ import java.util.EnumMap; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.permission.PermissionStatus; +import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; import org.apache.hadoop.hdfs.protocol.LayoutVersion; @@ -36,6 +37,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.BlockListUpdatingOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp; @@ -54,9 +56,12 @@ import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateBlocksOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp; import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease; import org.apache.hadoop.hdfs.util.Holder; +import org.apache.hadoop.io.IOUtils; + import com.google.common.base.Joiner; @InterfaceAudience.Private @@ -73,40 +78,32 @@ public class FSEditLogLoader { * This is where we apply edits that we've been writing to disk all * along. 
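// Illustrative sketch (simplified): how selectInputStreams walks the journals and
// detects a gap, per the code above. Streams are collected until no journal covers
// the next txid; if that point is still at or below the required txid, a contiguous
// history cannot be replayed and an error is raised. Types are hypothetical stand-ins.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class StreamSelectionSketch {
    interface Stream { long lastTxId(); }
    interface Journals { Stream getInputStream(long fromTxId, boolean inProgressOk); }

    static List<Stream> select(Journals journals, long fromTxId, long toAtLeastTxId,
                               boolean inProgressOk) throws IOException {
        List<Stream> streams = new ArrayList<>();
        Stream s = journals.getInputStream(fromTxId, inProgressOk);
        while (s != null) {
            streams.add(s);
            fromTxId = s.lastTxId() + 1;        // now look for the next, higher range
            s = journals.getInputStream(fromTxId, inProgressOk);
        }
        if (fromTxId <= toAtLeastTxId) {
            throw new IOException(String.format("Gap in transactions. Expected to be able "
                + "to read up until at least txid %d but unable to find any edit logs "
                + "containing txid %d", toAtLeastTxId, fromTxId));
        }
        return streams;
    }
}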
*/ - int loadFSEdits(EditLogInputStream edits, long expectedStartingTxId) - throws IOException { - long startTime = now(); - int numEdits = loadFSEdits(edits, true, expectedStartingTxId); - FSImage.LOG.info("Edits file " + edits.getName() - + " of size " + edits.length() + " edits # " + numEdits - + " loaded in " + (now()-startTime)/1000 + " seconds."); - return numEdits; - } - - int loadFSEdits(EditLogInputStream edits, boolean closeOnExit, - long expectedStartingTxId) + long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId) throws IOException { - int numEdits = 0; + long numEdits = 0; int logVersion = edits.getVersion(); + fsNamesys.writeLock(); try { + long startTime = now(); numEdits = loadEditRecords(logVersion, edits, false, expectedStartingTxId); + FSImage.LOG.info("Edits file " + edits.getName() + + " of size " + edits.length() + " edits # " + numEdits + + " loaded in " + (now()-startTime)/1000 + " seconds."); } finally { - if(closeOnExit) { - edits.close(); - } + edits.close(); + fsNamesys.writeUnlock(); } return numEdits; } - @SuppressWarnings("deprecation") - int loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit, + long loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit, long expectedStartingTxId) - throws IOException { + throws IOException, EditLogInputException { FSDirectory fsDir = fsNamesys.dir; - int numEdits = 0; + long numEdits = 0; EnumMap> opCounts = new EnumMap>(FSEditLogOpCodes.class); @@ -120,9 +117,20 @@ public class FSEditLogLoader { long txId = expectedStartingTxId - 1; try { try { - FSEditLogOp op; - while ((op = in.readOp()) != null) { - recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] = + while (true) { + FSEditLogOp op; + try { + if ((op = in.readOp()) == null) { + break; + } + } catch (IOException ioe) { + long badTxId = txId + 1; // because txId hasn't been incremented yet + String errorMessage = formatEditLogReplayError(in, recentOpcodeOffsets, badTxId); + FSImage.LOG.error(errorMessage); + throw new EditLogInputException(errorMessage, + ioe, numEdits); + } + recentOpcodeOffsets[(int)(numEdits % recentOpcodeOffsets.length)] = in.getPosition(); if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) { long expectedTxId = txId + 1; @@ -133,310 +141,442 @@ public class FSEditLogLoader { } } - numEdits++; incrOpCount(op.opCode, opCounts); - switch (op.opCode) { - case OP_ADD: - case OP_CLOSE: { - AddCloseOp addCloseOp = (AddCloseOp)op; - - // versions > 0 support per file replication - // get name and replication - final short replication = fsNamesys.getBlockManager( - ).adjustReplication(addCloseOp.replication); - - long blockSize = addCloseOp.blockSize; - BlockInfo blocks[] = new BlockInfo[addCloseOp.blocks.length]; - for (int i = 0; i < addCloseOp.blocks.length; i++) { - if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD - && i == addCloseOp.blocks.length-1) { - blocks[i] = new BlockInfoUnderConstruction(addCloseOp.blocks[i], - replication); - } else { - blocks[i] = new BlockInfo(addCloseOp.blocks[i], replication); - } - } - - PermissionStatus permissions = fsNamesys.getUpgradePermission(); - if (addCloseOp.permissions != null) { - permissions = addCloseOp.permissions; - } - - - // Older versions of HDFS does not store the block size in inode. - // If the file has more than one block, use the size of the - // first block as the blocksize. Otherwise use the default - // block size. 
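// Illustrative sketch (simplified): the block-size fallback used when replaying an
// old OP_ADD whose inode did not record a block size. With more than one block the
// first block's length is taken as the block size; otherwise the larger of the
// configured default and the single block's length is used. Inputs are plain values
// here rather than the Hadoop inode/block types.
class BlockSizeFallbackSketch {
    static long chooseBlockSize(long recordedBlockSize, long[] blockLengths,
                                long defaultBlockSize) {
        if (recordedBlockSize != 0) {
            return recordedBlockSize;           // newer layouts store it explicitly
        }
        if (blockLengths.length > 1) {
            return blockLengths[0];
        }
        long first = (blockLengths.length == 1) ? blockLengths[0] : 0;
        return Math.max(defaultBlockSize, first);
    }
}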
- if (-8 <= logVersion && blockSize == 0) { - if (blocks.length > 1) { - blockSize = blocks[0].getNumBytes(); - } else { - long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0); - blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first); - } - } - - - // The open lease transaction re-creates a file if necessary. - // Delete the file if it already exists. - if (FSNamesystem.LOG.isDebugEnabled()) { - FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path + - " numblocks : " + blocks.length + - " clientHolder " + addCloseOp.clientName + - " clientMachine " + addCloseOp.clientMachine); - } - - // There are four cases here: - // 1. OP_ADD to create a new file - // 2. OP_ADD to update file blocks - // 3. OP_ADD to open file for append - // 4. OP_CLOSE to close the file - - // See if the file already exists - INodeFile oldFile = fsDir.getFileINode(addCloseOp.path); - if (oldFile == null) { // OP_ADD for a new file - assert addCloseOp.opCode == FSEditLogOpCodes.OP_ADD : - "Expected opcode OP_ADD, but got " + addCloseOp.opCode; - fsDir.unprotectedAddFile( - addCloseOp.path, permissions, blocks, replication, - addCloseOp.mtime, addCloseOp.atime, blockSize, - addCloseOp.clientName, addCloseOp.clientMachine); - } else { - fsDir.updateFile(oldFile, addCloseOp.path, blocks, - addCloseOp.mtime, addCloseOp.atime); - if(addCloseOp.opCode == FSEditLogOpCodes.OP_CLOSE) { // OP_CLOSE - if (!oldFile.isUnderConstruction() && - logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) { - // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE - // could show up twice in a row. But after that version, this - // should be fixed, so we should treat it as an error. - throw new IOException( - "File is not under construction: " + addCloseOp.path); - } - fsNamesys.getBlockManager().completeBlock( - oldFile, blocks.length-1, true); - - if (oldFile.isUnderConstruction()) { - INodeFile newFile = - ((INodeFileUnderConstruction)oldFile).convertToInodeFile(); - fsDir.replaceNode(addCloseOp.path, oldFile, newFile); - } - } else if(! 
oldFile.isUnderConstruction()) { // OP_ADD for append - INodeFileUnderConstruction cons = new INodeFileUnderConstruction( - oldFile.getLocalNameBytes(), - oldFile.getReplication(), - oldFile.getModificationTime(), - oldFile.getPreferredBlockSize(), - oldFile.getBlocks(), - oldFile.getPermissionStatus(), - addCloseOp.clientName, - addCloseOp.clientMachine, - null); - fsDir.replaceNode(addCloseOp.path, oldFile, cons); - } - } - // Update file lease - if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD) { - fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path); - } else { // Ops.OP_CLOSE - if (oldFile.isUnderConstruction()) { - fsNamesys.leaseManager.removeLease( - ((INodeFileUnderConstruction)oldFile).getClientName(), addCloseOp.path); - } - } - break; - } - case OP_SET_REPLICATION: { - SetReplicationOp setReplicationOp = (SetReplicationOp)op; - short replication = fsNamesys.getBlockManager().adjustReplication( - setReplicationOp.replication); - fsDir.unprotectedSetReplication(setReplicationOp.path, - replication, null); - break; - } - case OP_CONCAT_DELETE: { - ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op; - fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs, - concatDeleteOp.timestamp); - break; - } - case OP_RENAME_OLD: { - RenameOldOp renameOp = (RenameOldOp)op; - HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false); - fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst, - renameOp.timestamp); - fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo); - break; - } - case OP_DELETE: { - DeleteOp deleteOp = (DeleteOp)op; - fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp); - break; - } - case OP_MKDIR: { - MkdirOp mkdirOp = (MkdirOp)op; - PermissionStatus permissions = fsNamesys.getUpgradePermission(); - if (mkdirOp.permissions != null) { - permissions = mkdirOp.permissions; - } - - fsDir.unprotectedMkdir(mkdirOp.path, permissions, - mkdirOp.timestamp); - break; - } - case OP_SET_GENSTAMP: { - SetGenstampOp setGenstampOp = (SetGenstampOp)op; - fsNamesys.setGenerationStamp(setGenstampOp.genStamp); - break; - } - case OP_SET_PERMISSIONS: { - SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op; - fsDir.unprotectedSetPermission(setPermissionsOp.src, - setPermissionsOp.permissions); - break; - } - case OP_SET_OWNER: { - SetOwnerOp setOwnerOp = (SetOwnerOp)op; - fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username, - setOwnerOp.groupname); - break; - } - case OP_SET_NS_QUOTA: { - SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op; - fsDir.unprotectedSetQuota(setNSQuotaOp.src, - setNSQuotaOp.nsQuota, - HdfsConstants.QUOTA_DONT_SET); - break; - } - case OP_CLEAR_NS_QUOTA: { - ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op; - fsDir.unprotectedSetQuota(clearNSQuotaOp.src, - HdfsConstants.QUOTA_RESET, - HdfsConstants.QUOTA_DONT_SET); - break; - } - - case OP_SET_QUOTA: - SetQuotaOp setQuotaOp = (SetQuotaOp)op; - fsDir.unprotectedSetQuota(setQuotaOp.src, - setQuotaOp.nsQuota, - setQuotaOp.dsQuota); - break; - - case OP_TIMES: { - TimesOp timesOp = (TimesOp)op; - - fsDir.unprotectedSetTimes(timesOp.path, - timesOp.mtime, - timesOp.atime, true); - break; - } - case OP_SYMLINK: { - SymlinkOp symlinkOp = (SymlinkOp)op; - fsDir.unprotectedSymlink(symlinkOp.path, symlinkOp.value, - symlinkOp.mtime, symlinkOp.atime, - symlinkOp.permissionStatus); - break; - } - case OP_RENAME: { - RenameOp renameOp = (RenameOp)op; - - HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false); - fsDir.unprotectedRenameTo(renameOp.src, 
renameOp.dst, - renameOp.timestamp, renameOp.options); - fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo); - break; - } - case OP_GET_DELEGATION_TOKEN: { - GetDelegationTokenOp getDelegationTokenOp - = (GetDelegationTokenOp)op; - - fsNamesys.getDelegationTokenSecretManager() - .addPersistedDelegationToken(getDelegationTokenOp.token, - getDelegationTokenOp.expiryTime); - break; - } - case OP_RENEW_DELEGATION_TOKEN: { - RenewDelegationTokenOp renewDelegationTokenOp - = (RenewDelegationTokenOp)op; - fsNamesys.getDelegationTokenSecretManager() - .updatePersistedTokenRenewal(renewDelegationTokenOp.token, - renewDelegationTokenOp.expiryTime); - break; - } - case OP_CANCEL_DELEGATION_TOKEN: { - CancelDelegationTokenOp cancelDelegationTokenOp - = (CancelDelegationTokenOp)op; - fsNamesys.getDelegationTokenSecretManager() - .updatePersistedTokenCancellation( - cancelDelegationTokenOp.token); - break; - } - case OP_UPDATE_MASTER_KEY: { - UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op; - fsNamesys.getDelegationTokenSecretManager() - .updatePersistedMasterKey(updateMasterKeyOp.key); - break; - } - case OP_REASSIGN_LEASE: { - ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op; - - Lease lease = fsNamesys.leaseManager.getLease( - reassignLeaseOp.leaseHolder); - INodeFileUnderConstruction pendingFile = - (INodeFileUnderConstruction) fsDir.getFileINode( - reassignLeaseOp.path); - fsNamesys.reassignLeaseInternal(lease, - reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile); - break; - } - case OP_START_LOG_SEGMENT: - case OP_END_LOG_SEGMENT: { - // no data in here currently. - break; - } - case OP_DATANODE_ADD: - case OP_DATANODE_REMOVE: - break; - default: - throw new IOException("Invalid operation read " + op.opCode); + try { + applyEditLogOp(op, fsDir, logVersion); + } catch (Throwable t) { + // Catch Throwable because in the case of a truly corrupt edits log, any + // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.) + String errorMessage = formatEditLogReplayError(in, recentOpcodeOffsets, txId); + FSImage.LOG.error(errorMessage); + throw new IOException(errorMessage, t); } + numEdits++; } - } catch (IOException ex) { check203UpgradeFailure(logVersion, ex); } finally { if(closeOnExit) in.close(); } - } catch (Throwable t) { - // Catch Throwable because in the case of a truly corrupt edits log, any - // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.) 
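// Illustrative sketch (simplified): wrapping the application of each edit in its own
// try/catch, as the refactored loader does, so that any failure -- even an unchecked
// one from a truly corrupt record -- is reported with the offending transaction id
// before being rethrown. Names and types are stand-ins, not the Hadoop API.
import java.io.IOException;

class ApplyWithContextSketch {
    interface Op { }
    interface Applier { void apply(Op op) throws IOException; }

    static long applyAll(Iterable<Op> ops, Applier applier, long startTxId) throws IOException {
        long txId = startTxId;
        long numEdits = 0;
        for (Op op : ops) {
            try {
                applier.apply(op);
            } catch (Throwable t) {
                // Catch Throwable: a corrupt record can fail with almost any error type
                // (NumberFormat, NullPointer, EOF, etc.).
                throw new IOException("Error replaying edit log on transaction ID " + txId, t);
            }
            numEdits++;
            txId++;
        }
        return numEdits;
    }
}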
- StringBuilder sb = new StringBuilder(); - sb.append("Error replaying edit log at offset " + in.getPosition()); - sb.append("On transaction ID ").append(txId); - if (recentOpcodeOffsets[0] != -1) { - Arrays.sort(recentOpcodeOffsets); - sb.append("\nRecent opcode offsets:"); - for (long offset : recentOpcodeOffsets) { - if (offset != -1) { - sb.append(' ').append(offset); - } - } - } - String errorMessage = sb.toString(); - FSImage.LOG.error(errorMessage); - throw new IOException(errorMessage, t); } finally { fsDir.writeUnlock(); fsNamesys.writeUnlock(); - } - if (FSImage.LOG.isDebugEnabled()) { - dumpOpCounts(opCounts); + if (FSImage.LOG.isDebugEnabled()) { + dumpOpCounts(opCounts); + } } return numEdits; } + + @SuppressWarnings("deprecation") + private void applyEditLogOp(FSEditLogOp op, FSDirectory fsDir, + int logVersion) throws IOException { + switch (op.opCode) { + case OP_ADD: { + AddCloseOp addCloseOp = (AddCloseOp)op; + if (FSNamesystem.LOG.isDebugEnabled()) { + FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path + + " numblocks : " + addCloseOp.blocks.length + + " clientHolder " + addCloseOp.clientName + + " clientMachine " + addCloseOp.clientMachine); + } + // There three cases here: + // 1. OP_ADD to create a new file + // 2. OP_ADD to update file blocks + // 3. OP_ADD to open file for append + // See if the file already exists (persistBlocks call) + INodeFile oldFile = getINodeFile(fsDir, addCloseOp.path); + INodeFile newFile = oldFile; + if (oldFile == null) { // this is OP_ADD on a new file (case 1) + // versions > 0 support per file replication + // get name and replication + final short replication = fsNamesys.getBlockManager( + ).adjustReplication(addCloseOp.replication); + PermissionStatus permissions = fsNamesys.getUpgradePermission(); + if (addCloseOp.permissions != null) { + permissions = addCloseOp.permissions; + } + long blockSize = addCloseOp.blockSize; + + // Versions of HDFS prior to 0.17 may log an OP_ADD transaction + // which includes blocks in it. When we update the minimum + // upgrade version to something more recent than 0.17, we can + // simplify this code by asserting that OP_ADD transactions + // don't have any blocks. + + // Older versions of HDFS does not store the block size in inode. + // If the file has more than one block, use the size of the + // first block as the blocksize. Otherwise use the default + // block size. + if (-8 <= logVersion && blockSize == 0) { + if (addCloseOp.blocks.length > 1) { + blockSize = addCloseOp.blocks[0].getNumBytes(); + } else { + long first = ((addCloseOp.blocks.length == 1)? + addCloseOp.blocks[0].getNumBytes(): 0); + blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first); + } + } + + // add to the file tree + newFile = (INodeFile)fsDir.unprotectedAddFile( + addCloseOp.path, permissions, + replication, addCloseOp.mtime, + addCloseOp.atime, blockSize, + true, addCloseOp.clientName, addCloseOp.clientMachine); + fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path); + + } else { // This is OP_ADD on an existing file + if (!oldFile.isUnderConstruction()) { + // This is case 3: a call to append() on an already-closed file. + if (FSNamesystem.LOG.isDebugEnabled()) { + FSNamesystem.LOG.debug("Reopening an already-closed file " + + "for append"); + } + fsNamesys.prepareFileForWrite(addCloseOp.path, oldFile, + addCloseOp.clientName, addCloseOp.clientMachine, null, + false); + newFile = getINodeFile(fsDir, addCloseOp.path); + } + } + // Fall-through for case 2. 
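// Illustrative sketch (simplified): the three OP_ADD cases handled above. If the
// path is absent it is a brand-new file; if it exists and is already closed, this
// is an append that reopens it; otherwise it is an update to a file that is still
// under construction. All three then fall through to a common mtime/atime and
// block-list update. The enum and strings are stand-ins for the real inode checks.
class AddOpDispatchSketch {
    enum FileState { ABSENT, CLOSED, UNDER_CONSTRUCTION }

    static String describe(FileState existing) {
        if (existing == FileState.ABSENT) {
            return "case 1: create a new under-construction inode and add a lease";
        }
        if (existing == FileState.CLOSED) {
            return "case 3: reopen the closed file for append";
        }
        // Case 2: the file is already under construction; only its block list changes.
        // All three cases then fall through to the shared block-list update.
        return "case 2: update the blocks of an already-open file";
    }
}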
+ // Regardless of whether it's a new file or an updated file, + // update the block list. + + // Update the salient file attributes. + newFile.setAccessTime(addCloseOp.atime); + newFile.setModificationTimeForce(addCloseOp.mtime); + updateBlocks(fsDir, addCloseOp, newFile); + break; + } + case OP_CLOSE: { + AddCloseOp addCloseOp = (AddCloseOp)op; + + if (FSNamesystem.LOG.isDebugEnabled()) { + FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path + + " numblocks : " + addCloseOp.blocks.length + + " clientHolder " + addCloseOp.clientName + + " clientMachine " + addCloseOp.clientMachine); + } + + INodeFile oldFile = getINodeFile(fsDir, addCloseOp.path); + if (oldFile == null) { + throw new IOException("Operation trying to close non-existent file " + + addCloseOp.path); + } + + // Update in-memory data structures + updateBlocks(fsDir, addCloseOp, oldFile); + + // Now close the file + if (!oldFile.isUnderConstruction() && + logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) { + // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE + // could show up twice in a row. But after that version, this + // should be fixed, so we should treat it as an error. + throw new IOException( + "File is not under construction: " + addCloseOp.path); + } + // One might expect that you could use removeLease(holder, path) here, + // but OP_CLOSE doesn't serialize the holder. So, remove by path. + if (oldFile.isUnderConstruction()) { + INodeFileUnderConstruction ucFile = (INodeFileUnderConstruction) oldFile; + fsNamesys.leaseManager.removeLeaseWithPrefixPath(addCloseOp.path); + INodeFile newFile = ucFile.convertToInodeFile(); + fsDir.replaceNode(addCloseOp.path, ucFile, newFile); + } + break; + } + case OP_UPDATE_BLOCKS: { + UpdateBlocksOp updateOp = (UpdateBlocksOp)op; + if (FSNamesystem.LOG.isDebugEnabled()) { + FSNamesystem.LOG.debug(op.opCode + ": " + updateOp.path + + " numblocks : " + updateOp.blocks.length); + } + INodeFile oldFile = getINodeFile(fsDir, updateOp.path); + if (oldFile == null) { + throw new IOException( + "Operation trying to update blocks in non-existent file " + + updateOp.path); + } + + // Update in-memory data structures + updateBlocks(fsDir, updateOp, oldFile); + break; + } + + case OP_SET_REPLICATION: { + SetReplicationOp setReplicationOp = (SetReplicationOp)op; + short replication = fsNamesys.getBlockManager().adjustReplication( + setReplicationOp.replication); + fsDir.unprotectedSetReplication(setReplicationOp.path, + replication, null); + break; + } + case OP_CONCAT_DELETE: { + ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op; + fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs, + concatDeleteOp.timestamp); + break; + } + case OP_RENAME_OLD: { + RenameOldOp renameOp = (RenameOldOp)op; + HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false); + fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst, + renameOp.timestamp); + fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo); + break; + } + case OP_DELETE: { + DeleteOp deleteOp = (DeleteOp)op; + fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp); + break; + } + case OP_MKDIR: { + MkdirOp mkdirOp = (MkdirOp)op; + PermissionStatus permissions = fsNamesys.getUpgradePermission(); + if (mkdirOp.permissions != null) { + permissions = mkdirOp.permissions; + } + + fsDir.unprotectedMkdir(mkdirOp.path, permissions, + mkdirOp.timestamp); + break; + } + case OP_SET_GENSTAMP: { + SetGenstampOp setGenstampOp = (SetGenstampOp)op; + fsNamesys.setGenerationStamp(setGenstampOp.genStamp); + 
break; + } + case OP_SET_PERMISSIONS: { + SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op; + fsDir.unprotectedSetPermission(setPermissionsOp.src, + setPermissionsOp.permissions); + break; + } + case OP_SET_OWNER: { + SetOwnerOp setOwnerOp = (SetOwnerOp)op; + fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username, + setOwnerOp.groupname); + break; + } + case OP_SET_NS_QUOTA: { + SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op; + fsDir.unprotectedSetQuota(setNSQuotaOp.src, + setNSQuotaOp.nsQuota, + HdfsConstants.QUOTA_DONT_SET); + break; + } + case OP_CLEAR_NS_QUOTA: { + ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op; + fsDir.unprotectedSetQuota(clearNSQuotaOp.src, + HdfsConstants.QUOTA_RESET, + HdfsConstants.QUOTA_DONT_SET); + break; + } + + case OP_SET_QUOTA: + SetQuotaOp setQuotaOp = (SetQuotaOp)op; + fsDir.unprotectedSetQuota(setQuotaOp.src, + setQuotaOp.nsQuota, + setQuotaOp.dsQuota); + break; + + case OP_TIMES: { + TimesOp timesOp = (TimesOp)op; + + fsDir.unprotectedSetTimes(timesOp.path, + timesOp.mtime, + timesOp.atime, true); + break; + } + case OP_SYMLINK: { + SymlinkOp symlinkOp = (SymlinkOp)op; + fsDir.unprotectedSymlink(symlinkOp.path, symlinkOp.value, + symlinkOp.mtime, symlinkOp.atime, + symlinkOp.permissionStatus); + break; + } + case OP_RENAME: { + RenameOp renameOp = (RenameOp)op; + + HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false); + fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst, + renameOp.timestamp, renameOp.options); + fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo); + break; + } + case OP_GET_DELEGATION_TOKEN: { + GetDelegationTokenOp getDelegationTokenOp + = (GetDelegationTokenOp)op; + + fsNamesys.getDelegationTokenSecretManager() + .addPersistedDelegationToken(getDelegationTokenOp.token, + getDelegationTokenOp.expiryTime); + break; + } + case OP_RENEW_DELEGATION_TOKEN: { + RenewDelegationTokenOp renewDelegationTokenOp + = (RenewDelegationTokenOp)op; + fsNamesys.getDelegationTokenSecretManager() + .updatePersistedTokenRenewal(renewDelegationTokenOp.token, + renewDelegationTokenOp.expiryTime); + break; + } + case OP_CANCEL_DELEGATION_TOKEN: { + CancelDelegationTokenOp cancelDelegationTokenOp + = (CancelDelegationTokenOp)op; + fsNamesys.getDelegationTokenSecretManager() + .updatePersistedTokenCancellation( + cancelDelegationTokenOp.token); + break; + } + case OP_UPDATE_MASTER_KEY: { + UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op; + fsNamesys.getDelegationTokenSecretManager() + .updatePersistedMasterKey(updateMasterKeyOp.key); + break; + } + case OP_REASSIGN_LEASE: { + ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op; + + Lease lease = fsNamesys.leaseManager.getLease( + reassignLeaseOp.leaseHolder); + INodeFileUnderConstruction pendingFile = + (INodeFileUnderConstruction) fsDir.getFileINode( + reassignLeaseOp.path); + fsNamesys.reassignLeaseInternal(lease, + reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile); + break; + } + case OP_START_LOG_SEGMENT: + case OP_END_LOG_SEGMENT: { + // no data in here currently. 
+ break; + } + case OP_DATANODE_ADD: + case OP_DATANODE_REMOVE: + break; + default: + throw new IOException("Invalid operation read " + op.opCode); + } + } + + private static String formatEditLogReplayError(EditLogInputStream in, + long recentOpcodeOffsets[], long txid) { + StringBuilder sb = new StringBuilder(); + sb.append("Error replaying edit log at offset " + in.getPosition()); + sb.append(" on transaction ID ").append(txid); + if (recentOpcodeOffsets[0] != -1) { + Arrays.sort(recentOpcodeOffsets); + sb.append("\nRecent opcode offsets:"); + for (long offset : recentOpcodeOffsets) { + if (offset != -1) { + sb.append(' ').append(offset); + } + } + } + return sb.toString(); + } + + private static INodeFile getINodeFile(FSDirectory fsDir, String path) + throws IOException { + INode inode = fsDir.getINode(path); + if (inode != null) { + if (!(inode instanceof INodeFile)) { + throw new IOException("Operation trying to get non-file " + path); + } + } + return (INodeFile)inode; + } + + /** + * Update in-memory data structures with new block information. + * @throws IOException + */ + private void updateBlocks(FSDirectory fsDir, BlockListUpdatingOp op, + INodeFile file) throws IOException { + // Update its block list + BlockInfo[] oldBlocks = file.getBlocks(); + Block[] newBlocks = op.getBlocks(); + String path = op.getPath(); + + // Are we only updating the last block's gen stamp. + boolean isGenStampUpdate = oldBlocks.length == newBlocks.length; + + // First, update blocks in common + for (int i = 0; i < oldBlocks.length && i < newBlocks.length; i++) { + BlockInfo oldBlock = oldBlocks[i]; + Block newBlock = newBlocks[i]; + + boolean isLastBlock = i == newBlocks.length - 1; + if (oldBlock.getBlockId() != newBlock.getBlockId() || + (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() && + !(isGenStampUpdate && isLastBlock))) { + throw new IOException("Mismatched block IDs or generation stamps, " + + "attempting to replace block " + oldBlock + " with " + newBlock + + " as block # " + i + "/" + newBlocks.length + " of " + + path); + } + + oldBlock.setNumBytes(newBlock.getNumBytes()); + boolean changeMade = + oldBlock.getGenerationStamp() != newBlock.getGenerationStamp(); + oldBlock.setGenerationStamp(newBlock.getGenerationStamp()); + + if (oldBlock instanceof BlockInfoUnderConstruction && + (!isLastBlock || op.shouldCompleteLastBlock())) { + changeMade = true; + fsNamesys.getBlockManager().forceCompleteBlock( + (INodeFileUnderConstruction)file, + (BlockInfoUnderConstruction)oldBlock); + } + if (changeMade) { + // The state or gen-stamp of the block has changed. So, we may be + // able to process some messages from datanodes that we previously + // were unable to process. + fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock); + } + } + + if (newBlocks.length < oldBlocks.length) { + // We're removing a block from the file, e.g. abandonBlock(...) 
+ if (!file.isUnderConstruction()) { + throw new IOException("Trying to remove a block from file " + + path + " which is not under construction."); + } + if (newBlocks.length != oldBlocks.length - 1) { + throw new IOException("Trying to remove more than one block from file " + + path); + } + fsDir.unprotectedRemoveBlock(path, + (INodeFileUnderConstruction)file, oldBlocks[oldBlocks.length - 1]); + } else if (newBlocks.length > oldBlocks.length) { + // We're adding blocks + for (int i = oldBlocks.length; i < newBlocks.length; i++) { + Block newBlock = newBlocks[i]; + BlockInfo newBI; + if (!op.shouldCompleteLastBlock()) { + // TODO: shouldn't this only be true for the last block? + // what about an old-version fsync() where fsync isn't called + // until several blocks in? + newBI = new BlockInfoUnderConstruction( + newBlock, file.getReplication()); + } else { + // OP_CLOSE should add finalized blocks. This code path + // is only executed when loading edits written by prior + // versions of Hadoop. Current versions always log + // OP_ADD operations as each block is allocated. + newBI = new BlockInfo(newBlock, file.getReplication()); + } + fsNamesys.getBlockManager().addINode(newBI, file); + file.addBlock(newBI); + fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock); + } + } + } private static void dumpOpCounts( EnumMap> opCounts) { @@ -517,19 +657,21 @@ public class FSEditLogLoader { FSImage.LOG.debug("Caught exception after reading " + numValid + " ops from " + in + " while determining its valid length.", t); } - return new EditLogValidation(lastPos, firstTxId, lastTxId); + return new EditLogValidation(lastPos, firstTxId, lastTxId, false); } static class EditLogValidation { - private long validLength; - private long startTxId; - private long endTxId; + private final long validLength; + private final long startTxId; + private final long endTxId; + private final boolean corruptionDetected; - EditLogValidation(long validLength, - long startTxId, long endTxId) { + EditLogValidation(long validLength, long startTxId, long endTxId, + boolean corruptionDetected) { this.validLength = validLength; this.startTxId = startTxId; this.endTxId = endTxId; + this.corruptionDetected = corruptionDetected; } long getValidLength() { return validLength; } @@ -545,6 +687,8 @@ public class FSEditLogLoader { } return (endTxId - startTxId) + 1; } + + boolean hasCorruptHeader() { return corruptionDetected; } } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java index f075770c33..949554dbda 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java @@ -101,6 +101,7 @@ public abstract class FSEditLogOp { new LogSegmentOp(OP_START_LOG_SEGMENT)); instances.put(OP_END_LOG_SEGMENT, new LogSegmentOp(OP_END_LOG_SEGMENT)); + instances.put(OP_UPDATE_BLOCKS, new UpdateBlocksOp()); return instances; } }; @@ -128,8 +129,14 @@ public abstract class FSEditLogOp { abstract void writeFields(DataOutputStream out) throws IOException; + static interface BlockListUpdatingOp { + Block[] getBlocks(); + String getPath(); + boolean shouldCompleteLastBlock(); + } + @SuppressWarnings("unchecked") - static abstract class AddCloseOp extends FSEditLogOp { + static abstract class AddCloseOp extends 
FSEditLogOp implements BlockListUpdatingOp { int length; String path; short replication; @@ -151,6 +158,10 @@ public abstract class FSEditLogOp { this.path = path; return (T)this; } + + public String getPath() { + return path; + } T setReplication(short replication) { this.replication = replication; @@ -176,6 +187,10 @@ public abstract class FSEditLogOp { this.blocks = blocks; return (T)this; } + + public Block[] getBlocks() { + return blocks; + } T setPermissionStatus(PermissionStatus permissions) { this.permissions = permissions; @@ -347,6 +362,10 @@ public abstract class FSEditLogOp { return (AddOp)opInstances.get().get(OP_ADD); } + public boolean shouldCompleteLastBlock() { + return false; + } + @Override public String toString() { StringBuilder builder = new StringBuilder(); @@ -365,6 +384,10 @@ public abstract class FSEditLogOp { return (CloseOp)opInstances.get().get(OP_CLOSE); } + public boolean shouldCompleteLastBlock() { + return true; + } + @Override public String toString() { StringBuilder builder = new StringBuilder(); @@ -373,6 +396,68 @@ public abstract class FSEditLogOp { return builder.toString(); } } + + static class UpdateBlocksOp extends FSEditLogOp implements BlockListUpdatingOp { + String path; + Block[] blocks; + + private UpdateBlocksOp() { + super(OP_UPDATE_BLOCKS); + } + + static UpdateBlocksOp getInstance() { + return (UpdateBlocksOp)opInstances.get() + .get(OP_UPDATE_BLOCKS); + } + + + UpdateBlocksOp setPath(String path) { + this.path = path; + return this; + } + + public String getPath() { + return path; + } + + UpdateBlocksOp setBlocks(Block[] blocks) { + this.blocks = blocks; + return this; + } + + public Block[] getBlocks() { + return blocks; + } + + @Override + void writeFields(DataOutputStream out) throws IOException { + FSImageSerialization.writeString(path, out); + FSImageSerialization.writeCompactBlockArray(blocks, out); + } + + @Override + void readFields(DataInputStream in, int logVersion) throws IOException { + path = FSImageSerialization.readString(in); + this.blocks = FSImageSerialization.readCompactBlockArray( + in, logVersion); + } + + @Override + public boolean shouldCompleteLastBlock() { + return false; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("UpdateBlocksOp [path=") + .append(path) + .append(", blocks=") + .append(Arrays.toString(blocks)) + .append("]"); + return sb.toString(); + } + } static class SetReplicationOp extends FSEditLogOp { String path; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java index 220c267f08..1f809c12b2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java @@ -55,7 +55,8 @@ public enum FSEditLogOpCodes { OP_UPDATE_MASTER_KEY ((byte) 21), OP_REASSIGN_LEASE ((byte) 22), OP_END_LOG_SEGMENT ((byte) 23), - OP_START_LOG_SEGMENT ((byte) 24); + OP_START_LOG_SEGMENT ((byte) 24), + OP_UPDATE_BLOCKS ((byte) 25); private byte opCode; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java index 463fca5e0a..7fb3d4bdfc 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java @@ -56,6 +56,8 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.util.MD5FileUtils; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; @@ -68,7 +70,7 @@ import com.google.common.collect.Lists; @InterfaceAudience.Private @InterfaceStability.Evolving public class FSImage implements Closeable { - protected static final Log LOG = LogFactory.getLog(FSImage.class.getName()); + public static final Log LOG = LogFactory.getLog(FSImage.class.getName()); protected FSEditLog editLog = null; private boolean isUpgradeFinalized = false; @@ -112,7 +114,8 @@ public class FSImage implements Closeable { * @throws IOException if directories are invalid. */ protected FSImage(Configuration conf, - Collection imageDirs, Collection editsDirs) + Collection imageDirs, + List editsDirs) throws IOException { this.conf = conf; @@ -123,6 +126,12 @@ public class FSImage implements Closeable { } this.editLog = new FSEditLog(conf, storage, editsDirs); + String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); + if (!HAUtil.isHAEnabled(conf, nameserviceId)) { + editLog.initJournalsForWrite(); + } else { + editLog.initSharedJournalsForRead(); + } archivalManager = new NNStorageRetentionManager(conf, storage, editLog); } @@ -251,6 +260,11 @@ public class FSImage implements Closeable { StorageState curState; try { curState = sd.analyzeStorage(startOpt, storage); + String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); + if (curState != StorageState.NORMAL && HAUtil.isHAEnabled(conf, nameserviceId)) { + throw new IOException("Cannot start an HA namenode with name dirs " + + "that need recovery. 
Dir: " + sd + " state: " + curState); + } // sd is locked but not opened switch(curState) { case NON_EXISTENT: @@ -324,9 +338,9 @@ public class FSImage implements Closeable { File prevDir = sd.getPreviousDir(); File tmpDir = sd.getPreviousTmp(); assert curDir.exists() : "Current directory must exist."; - assert !prevDir.exists() : "prvious directory must not exist."; - assert !tmpDir.exists() : "prvious.tmp directory must not exist."; - assert !editLog.isOpen() : "Edits log must not be open."; + assert !prevDir.exists() : "previous directory must not exist."; + assert !tmpDir.exists() : "previous.tmp directory must not exist."; + assert !editLog.isSegmentOpen() : "Edits log must not be open."; // rename current to tmp NNStorage.rename(curDir, tmpDir); @@ -469,7 +483,7 @@ public class FSImage implements Closeable { void doImportCheckpoint(FSNamesystem target) throws IOException { Collection checkpointDirs = FSImage.getCheckpointDirs(conf, null); - Collection checkpointEditsDirs = + List checkpointEditsDirs = FSImage.getCheckpointEditsDirs(conf, null); if (checkpointDirs == null || checkpointDirs.isEmpty()) { @@ -519,11 +533,9 @@ public class FSImage implements Closeable { return editLog; } - void openEditLog() throws IOException { + void openEditLogForWrite() throws IOException { assert editLog != null : "editLog must be initialized"; - Preconditions.checkState(!editLog.isOpen(), - "edit log should not yet be open"); - editLog.open(); + editLog.openForWrite(); storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId()); }; @@ -564,12 +576,19 @@ public class FSImage implements Closeable { Iterable editStreams = null; - editLog.recoverUnclosedStreams(); + if (editLog.isOpenForWrite()) { + // We only want to recover streams if we're going into Active mode. + editLog.recoverUnclosedStreams(); + } if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, getLayoutVersion())) { + // If we're open for write, we're either non-HA or we're the active NN, so + // we better be able to load all the edits. If we're the standby NN, it's + // OK to not be able to read all of edits right now. + long toAtLeastTxId = editLog.isOpenForWrite() ? inspector.getMaxSeenTxId() : 0; editStreams = editLog.selectInputStreams(imageFile.getCheckpointTxId() + 1, - inspector.getMaxSeenTxId()); + toAtLeastTxId, false); } else { editStreams = FSImagePreTransactionalStorageInspector .getEditLogStreams(storage); @@ -644,12 +663,12 @@ public class FSImage implements Closeable { * Load the specified list of edit files into the image. 
* @return the number of transactions loaded */ - protected long loadEdits(Iterable editStreams, - FSNamesystem target) throws IOException { + public long loadEdits(Iterable editStreams, + FSNamesystem target) throws IOException, EditLogInputException { LOG.debug("About to load edits:\n " + Joiner.on("\n ").join(editStreams)); long startingTxId = getLastAppliedTxId() + 1; - int numLoaded = 0; + long numLoaded = 0; try { FSEditLogLoader loader = new FSEditLogLoader(target); @@ -657,17 +676,26 @@ public class FSImage implements Closeable { // Load latest edits for (EditLogInputStream editIn : editStreams) { LOG.info("Reading " + editIn + " expecting start txid #" + startingTxId); - int thisNumLoaded = loader.loadFSEdits(editIn, startingTxId); - startingTxId += thisNumLoaded; - numLoaded += thisNumLoaded; - lastAppliedTxId += thisNumLoaded; + long thisNumLoaded = 0; + try { + thisNumLoaded = loader.loadFSEdits(editIn, startingTxId); + } catch (EditLogInputException elie) { + thisNumLoaded = elie.getNumEditsLoaded(); + throw elie; + } finally { + // Update lastAppliedTxId even in case of error, since some ops may + // have been successfully applied before the error. + lastAppliedTxId = startingTxId + thisNumLoaded - 1; + startingTxId += thisNumLoaded; + numLoaded += thisNumLoaded; + } } } finally { FSEditLog.closeAllStreams(editStreams); + // update the counts + target.dir.updateCountForINodeWithQuota(); } - - // update the counts - target.dir.updateCountForINodeWithQuota(); + return numLoaded; } @@ -688,8 +716,7 @@ public class FSImage implements Closeable { /** * Load in the filesystem image from file. It's a big list of - * filenames and blocks. Return whether we should - * "re-save" and consolidate the edit-logs + * filenames and blocks. */ private void loadFSImage(File curFile, MD5Hash expectedMd5, FSNamesystem target) throws IOException { @@ -786,16 +813,16 @@ public class FSImage implements Closeable { * Save the contents of the FS image to a new image file in each of the * current storage directories. */ - synchronized void saveNamespace(FSNamesystem source) throws IOException { + public synchronized void saveNamespace(FSNamesystem source) throws IOException { assert editLog != null : "editLog must be initialized"; storage.attemptRestoreRemovedStorage(); - boolean editLogWasOpen = editLog.isOpen(); + boolean editLogWasOpen = editLog.isSegmentOpen(); if (editLogWasOpen) { editLog.endCurrentLogSegment(true); } - long imageTxId = editLog.getLastWrittenTxId(); + long imageTxId = getLastAppliedOrWrittenTxId(); try { saveFSImageInAllDirs(source, imageTxId); storage.writeAll(); @@ -812,7 +839,7 @@ public class FSImage implements Closeable { } - void cancelSaveNamespace(String reason) + public void cancelSaveNamespace(String reason) throws InterruptedException { SaveNamespaceContext ctx = curSaveNamespaceContext; if (ctx != null) { @@ -1061,7 +1088,7 @@ public class FSImage implements Closeable { return Util.stringCollectionAsURIs(dirNames); } - static Collection getCheckpointEditsDirs(Configuration conf, + static List getCheckpointEditsDirs(Configuration conf, String defaultName) { Collection dirNames = conf.getStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY); @@ -1095,4 +1122,16 @@ public class FSImage implements Closeable { return lastAppliedTxId; } + public long getLastAppliedOrWrittenTxId() { + return Math.max(lastAppliedTxId, + editLog != null ? 
editLog.getLastWrittenTxId() : 0); + } + + public void updateLastAppliedTxIdFromWritten() { + this.lastAppliedTxId = editLog.getLastWrittenTxId(); + } + + public synchronized long getMostRecentCheckpointTxId() { + return storage.getMostRecentCheckpointTxId(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java index 5b480305b0..f5084339e8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java @@ -40,6 +40,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.ShortWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; /** * Static utility functions for serializing various pieces of data in the correct @@ -277,6 +278,49 @@ public class FSImageSerialization { ustr.getLength(), (byte) Path.SEPARATOR_CHAR); } + + /** + * Write an array of blocks as compactly as possible. This uses + * delta-encoding for the generation stamp and size, following + * the principle that genstamp increases relatively slowly, + * and size is equal for all but the last block of a file. + */ + public static void writeCompactBlockArray( + Block[] blocks, DataOutputStream out) throws IOException { + WritableUtils.writeVInt(out, blocks.length); + Block prev = null; + for (Block b : blocks) { + long szDelta = b.getNumBytes() - + (prev != null ? prev.getNumBytes() : 0); + long gsDelta = b.getGenerationStamp() - + (prev != null ? prev.getGenerationStamp() : 0); + out.writeLong(b.getBlockId()); // blockid is random + WritableUtils.writeVLong(out, szDelta); + WritableUtils.writeVLong(out, gsDelta); + prev = b; + } + } + + public static Block[] readCompactBlockArray( + DataInputStream in, int logVersion) throws IOException { + int num = WritableUtils.readVInt(in); + if (num < 0) { + throw new IOException("Invalid block array length: " + num); + } + Block prev = null; + Block[] ret = new Block[num]; + for (int i = 0; i < num; i++) { + long id = in.readLong(); + long sz = WritableUtils.readVLong(in) + + ((prev != null) ? prev.getNumBytes() : 0); + long gs = WritableUtils.readVLong(in) + + ((prev != null) ? prev.getGenerationStamp() : 0); + ret[i] = new Block(id, sz, gs); + prev = ret[i]; + } + return ret; + } + /** * DatanodeImage is used to store persistent information * about datanodes into the fsImage. 
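Editor's aside: the javadoc on writeCompactBlockArray above describes the delta-encoding used by the new OP_UPDATE_BLOCKS serialization. The following is a minimal worked illustration of that scheme, not part of the patch: CompactBlockDemo and MiniBlock are hypothetical names, and only hadoop-common's WritableUtils is assumed to be on the classpath.

// Illustrative sketch only -- not part of the patch. It mirrors the
// delta-encoding idea behind writeCompactBlockArray()/readCompactBlockArray()
// with a simplified stand-in for Block.
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.WritableUtils;

public class CompactBlockDemo {
  /** Minimal stand-in for org.apache.hadoop.hdfs.protocol.Block. */
  static class MiniBlock {
    final long id, numBytes, genStamp;
    MiniBlock(long id, long numBytes, long genStamp) {
      this.id = id; this.numBytes = numBytes; this.genStamp = genStamp;
    }
  }

  /**
   * Block IDs are random, so they are written verbatim; size and genstamp
   * are written as vlong deltas against the previous block, which stay tiny
   * because block sizes repeat and genstamps grow slowly.
   */
  static byte[] encode(MiniBlock[] blocks) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    WritableUtils.writeVInt(out, blocks.length);
    MiniBlock prev = null;
    for (MiniBlock b : blocks) {
      out.writeLong(b.id);
      WritableUtils.writeVLong(out, b.numBytes - (prev == null ? 0 : prev.numBytes));
      WritableUtils.writeVLong(out, b.genStamp - (prev == null ? 0 : prev.genStamp));
      prev = b;
    }
    out.flush();
    return bos.toByteArray();
  }

  public static void main(String[] args) throws IOException {
    // Three 128MB blocks sharing a genstamp: after the first block the size
    // and genstamp deltas are both 0, so each additional block costs only
    // 8 bytes for the ID plus two 1-byte vlongs.
    MiniBlock[] file = {
      new MiniBlock(7254738L, 134217728L, 1001L),
      new MiniBlock(9912004L, 134217728L, 1001L),
      new MiniBlock(1177320L, 134217728L, 1001L),
    };
    System.out.println("compact size = " + encode(file).length + " bytes");
  }
}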
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 604fbb00bd..f22f808825 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -32,6 +32,8 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT; @@ -47,10 +49,15 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DAT import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERSIST_BLOCKS_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT; @@ -68,6 +75,7 @@ import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.io.StringWriter; import java.lang.management.ManagementFactory; import java.net.InetAddress; import java.net.URI; @@ -80,6 +88,7 @@ import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -108,7 +117,10 @@ import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.PermissionStatus; +import org.apache.hadoop.ha.ServiceFailedException; +import 
org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; import org.apache.hadoop.hdfs.protocol.Block; @@ -147,9 +159,18 @@ import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport; import org.apache.hadoop.hdfs.server.common.Util; import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; +import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState; +import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer; +import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; +import org.apache.hadoop.hdfs.server.namenode.ha.HAState; +import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer; +import org.apache.hadoop.hdfs.server.namenode.ha.StandbyState; import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; @@ -157,6 +178,7 @@ import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.ipc.Server; +import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; @@ -170,13 +192,12 @@ import org.apache.hadoop.security.token.SecretManager.InvalidToken; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.delegation.DelegationKey; import org.apache.hadoop.util.Daemon; -import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.VersionInfo; import org.mortbay.util.ajax.JSON; -import com.google.common.base.Preconditions; - import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; /*************************************************** * FSNamesystem does the actual bookkeeping work for the @@ -194,7 +215,7 @@ import com.google.common.annotations.VisibleForTesting; @Metrics(context="dfs") public class FSNamesystem implements Namesystem, FSClusterStats, FSNamesystemMBean, NameNodeMXBean { - static final Log LOG = LogFactory.getLog(FSNamesystem.class); + public static final Log LOG = LogFactory.getLog(FSNamesystem.class); private static final ThreadLocal auditBuffer = new ThreadLocal() { @@ -243,14 +264,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats, static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100; static int BLOCK_DELETION_INCREMENT = 1000; private boolean isPermissionEnabled; + private boolean persistBlocks; private UserGroupInformation fsOwner; private String supergroup; private PermissionStatus defaultPermission; + private boolean standbyShouldCheckpoint; // Scan interval is not configurable. 
private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL = TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS); private DelegationTokenSecretManager dtSecretManager; + private boolean alwaysUseDelegationTokensForTests; + // // Stores the correct file name hierarchy @@ -264,7 +289,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats, LeaseManager leaseManager = new LeaseManager(this); - Daemon lmthread = null; // LeaseMonitor thread Daemon smmthread = null; // SafeModeMonitor thread Daemon nnrmthread = null; // NamenodeResourceMonitor thread @@ -300,7 +324,26 @@ public class FSNamesystem implements Namesystem, FSClusterStats, // lock to protect FSNamesystem. private ReentrantReadWriteLock fsLock; - + /** + * Used when this NN is in standby state to read from the shared edit log. + */ + private EditLogTailer editLogTailer = null; + + /** + * Used when this NN is in standby state to perform checkpoints. + */ + private StandbyCheckpointer standbyCheckpointer; + + /** + * Reference to the NN's HAContext object. This is only set once + * {@link #startCommonServices(Configuration, HAContext)} is called. + */ + private HAContext haContext; + + private boolean haEnabled; + + private final Configuration conf; + /** * Instantiates an FSNamesystem loaded from the image and edits * directories specified in the passed Configuration. @@ -310,9 +353,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @return an FSNamesystem which contains the loaded namespace * @throws IOException if loading fails */ - public static FSNamesystem loadFromDisk(Configuration conf) throws IOException { + public static FSNamesystem loadFromDisk(Configuration conf) + throws IOException { Collection namespaceDirs = FSNamesystem.getNamespaceDirs(conf); - Collection namespaceEditsDirs = + List namespaceEditsDirs = FSNamesystem.getNamespaceEditsDirs(conf); if (namespaceDirs.size() == 1) { @@ -329,7 +373,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats, long loadStart = now(); StartupOption startOpt = NameNode.getStartupOption(conf); - namesystem.loadFSImage(startOpt, fsImage); + String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); + namesystem.loadFSImage(startOpt, fsImage, + HAUtil.isHAEnabled(conf, nameserviceId)); long timeTakenToLoadFSImage = now() - loadStart; LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs"); NameNode.getNameNodeMetrics().setFsImageLoadTime( @@ -348,6 +394,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @throws IOException on bad configuration */ FSNamesystem(Configuration conf, FSImage fsImage) throws IOException { + this.conf = conf; try { initialize(conf, fsImage); } catch(IOException e) { @@ -375,7 +422,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, this.safeMode = new SafeModeInfo(conf); } - void loadFSImage(StartupOption startOpt, FSImage fsImage) + void loadFSImage(StartupOption startOpt, FSImage fsImage, boolean haEnabled) throws IOException { // format before starting up if requested if (startOpt == StartupOption.FORMAT) { @@ -385,43 +432,71 @@ public class FSNamesystem implements Namesystem, FSClusterStats, startOpt = StartupOption.REGULAR; } boolean success = false; + writeLock(); try { - if (fsImage.recoverTransitionRead(startOpt, this)) { + // We shouldn't be calling saveNamespace if we've come up in standby state. 
+ if (fsImage.recoverTransitionRead(startOpt, this) && !haEnabled) { fsImage.saveNamespace(this); } - fsImage.openEditLog(); + // This will start a new log segment and write to the seen_txid file, so + // we shouldn't do it when coming up in standby state + if (!haEnabled) { + fsImage.openEditLogForWrite(); + } success = true; } finally { if (!success) { fsImage.close(); } + writeUnlock(); } dir.imageLoadComplete(); } - void activateSecretManager() throws IOException { + private void startSecretManager() { if (dtSecretManager != null) { - dtSecretManager.startThreads(); + try { + dtSecretManager.startThreads(); + } catch (IOException e) { + // Inability to start secret manager + // can't be recovered from. + throw new RuntimeException(e); + } } } - /** - * Activate FSNamesystem daemons. - */ - void activate(Configuration conf) throws IOException { - this.registerMBean(); // register the MBean for the FSNamesystemState + private void startSecretManagerIfNecessary() { + boolean shouldRun = shouldUseDelegationTokens() && + !isInSafeMode() && getEditLog().isOpenForWrite(); + boolean running = dtSecretManager.isRunning(); + if (shouldRun && !running) { + startSecretManager(); + } + } + private void stopSecretManager() { + if (dtSecretManager != null) { + dtSecretManager.stopThreads(); + } + } + + /** + * Start services common to both active and standby states + * @param haContext + * @throws IOException + */ + void startCommonServices(Configuration conf, HAContext haContext) throws IOException { + this.registerMBean(); // register the MBean for the FSNamesystemState writeLock(); + this.haContext = haContext; try { nnResourceChecker = new NameNodeResourceChecker(conf); checkAvailableResources(); - + assert safeMode != null && + !safeMode.isPopulatingReplQueues(); setBlockTotal(); blockManager.activate(conf); - - this.lmthread = new Daemon(leaseManager.new Monitor()); - lmthread.start(); this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); nnrmthread.start(); } finally { @@ -431,24 +506,169 @@ public class FSNamesystem implements Namesystem, FSClusterStats, registerMXBean(); DefaultMetricsSystem.instance().register(this); } - - public static Collection getNamespaceDirs(Configuration conf) { - return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY); - } - public static Collection getNamespaceEditsDirs(Configuration conf) { - Collection editsDirs = getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY); - if (editsDirs.isEmpty()) { - // If this is the case, no edit dirs have been explicitly configured. - // Image dirs are to be used for edits too. - return getNamespaceDirs(conf); - } else { - return editsDirs; + /** + * Stop services common to both active and standby states + * @throws IOException + */ + void stopCommonServices() { + writeLock(); + try { + if (blockManager != null) blockManager.close(); + if (nnrmthread != null) nnrmthread.interrupt(); + } finally { + writeUnlock(); } } + /** + * Start services required in active state + * @throws IOException + */ + void startActiveServices() throws IOException { + LOG.info("Starting services required for active state"); + writeLock(); + try { + FSEditLog editLog = dir.fsImage.getEditLog(); + + if (!editLog.isOpenForWrite()) { + // During startup, we're already open for write during initialization. 
+ editLog.initJournalsForWrite(); + // May need to recover + editLog.recoverUnclosedStreams(); + + LOG.info("Catching up to latest edits from old active before " + + "taking over writer role in edits logs."); + editLogTailer.catchupDuringFailover(); + + LOG.info("Reprocessing replication and invalidation queues..."); + blockManager.getDatanodeManager().markAllDatanodesStale(); + blockManager.clearQueues(); + blockManager.processAllPendingDNMessages(); + blockManager.processMisReplicatedBlocks(); + + if (LOG.isDebugEnabled()) { + LOG.debug("NameNode metadata after re-processing " + + "replication and invalidation queues during failover:\n" + + metaSaveAsString()); + } + + long nextTxId = dir.fsImage.getLastAppliedTxId() + 1; + LOG.info("Will take over writing edit logs at txnid " + + nextTxId); + editLog.setNextTxId(nextTxId); + + dir.fsImage.editLog.openForWrite(); + } + if (haEnabled) { + // Renew all of the leases before becoming active. + // This is because, while we were in standby mode, + // the leases weren't getting renewed on this NN. + // Give them all a fresh start here. + leaseManager.renewAllLeases(); + } + leaseManager.startMonitor(); + startSecretManagerIfNecessary(); + } finally { + writeUnlock(); + } + } + + private boolean shouldUseDelegationTokens() { + return UserGroupInformation.isSecurityEnabled() || + alwaysUseDelegationTokensForTests; + } + + /** + * Stop services required in active state + * @throws InterruptedException + */ + void stopActiveServices() { + LOG.info("Stopping services started for active state"); + writeLock(); + try { + stopSecretManager(); + if (leaseManager != null) { + leaseManager.stopMonitor(); + } + if (dir != null && dir.fsImage != null) { + if (dir.fsImage.editLog != null) { + dir.fsImage.editLog.close(); + } + // Update the fsimage with the last txid that we wrote + // so that the tailer starts from the right spot. + dir.fsImage.updateLastAppliedTxIdFromWritten(); + } + } finally { + writeUnlock(); + } + } + + /** Start services required in standby state */ + void startStandbyServices() { + LOG.info("Starting services required for standby state"); + if (!dir.fsImage.editLog.isOpenForRead()) { + // During startup, we're already open for read. + dir.fsImage.editLog.initSharedJournalsForRead(); + } + editLogTailer = new EditLogTailer(this); + editLogTailer.start(); + if (standbyShouldCheckpoint) { + standbyCheckpointer = new StandbyCheckpointer(conf, this); + standbyCheckpointer.start(); + } + } + + + /** + * Called while the NN is in Standby state, but just about to be + * asked to enter Active state. This cancels any checkpoints + * currently being taken. 
+ */ + void prepareToStopStandbyServices() throws ServiceFailedException { + if (standbyCheckpointer != null) { + standbyCheckpointer.cancelAndPreventCheckpoints(); + } + } + + /** Stop services required in standby state */ + void stopStandbyServices() throws IOException { + LOG.info("Stopping services started for standby state"); + if (standbyCheckpointer != null) { + standbyCheckpointer.stop(); + } + if (editLogTailer != null) { + editLogTailer.stop(); + } + if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) { + dir.fsImage.editLog.close(); + } + } + + + void checkOperation(OperationCategory op) throws StandbyException { + if (haContext != null) { + // null in some unit tests + haContext.checkOperation(op); + } + } + + public static Collection getNamespaceDirs(Configuration conf) { + return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY); + } + + /** + * Get all edits dirs which are required. If any shared edits dirs are + * configured, these are also included in the set of required dirs. + * + * @param conf the HDFS configuration. + * @return all required dirs. + */ public static Collection getRequiredNamespaceEditsDirs(Configuration conf) { - return getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY); + Set ret = new HashSet(); + ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY)); + ret.addAll(getSharedEditsDirs(conf)); + return ret; } private static Collection getStorageDirs(Configuration conf, @@ -481,6 +701,75 @@ public class FSNamesystem implements Namesystem, FSClusterStats, return Util.stringCollectionAsURIs(dirNames); } + /** + * Return an ordered list of edits directories to write to. + * The list is ordered such that all shared edits directories + * are ordered before non-shared directories, and any duplicates + * are removed. The order they are specified in the configuration + * is retained. + * @return Collection of shared edits directories. + * @throws IOException if multiple shared edits directories are configured + */ + public static List getNamespaceEditsDirs(Configuration conf) + throws IOException { + // Use a LinkedHashSet so that order is maintained while we de-dup + // the entries. + LinkedHashSet editsDirs = new LinkedHashSet(); + + List sharedDirs = getSharedEditsDirs(conf); + + // Fail until multiple shared edits directories are supported (HDFS-2782) + if (sharedDirs.size() > 1) { + throw new IOException( + "Multiple shared edits directories are not yet supported"); + } + + // First add the shared edits dirs. It's critical that the shared dirs + // are added first, since JournalSet syncs them in the order they are listed, + // and we need to make sure all edits are in place in the shared storage + // before they are replicated locally. See HDFS-2874. + for (URI dir : sharedDirs) { + if (!editsDirs.add(dir)) { + LOG.warn("Edits URI " + dir + " listed multiple times in " + + DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates."); + } + } + + // Now add the non-shared dirs. + for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) { + if (!editsDirs.add(dir)) { + LOG.warn("Edits URI " + dir + " listed multiple times in " + + DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " + + DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates."); + } + } + + if (editsDirs.isEmpty()) { + // If this is the case, no edit dirs have been explicitly configured. + // Image dirs are to be used for edits too. 
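Editor's aside: the comments in getNamespaceEditsDirs above explain why shared edits directories must come before local ones and why duplicates are dropped. The sketch below shows what that ordering looks like in practice; it is not part of the patch, EditsDirOrderingExample is a hypothetical class name, and it assumes a hadoop-hdfs build containing this change on the classpath together with the standard dfs.namenode.shared.edits.dir and dfs.namenode.edits.dir keys.

// Minimal sketch, not part of the patch: demonstrates the ordering and
// de-duplication performed by FSNamesystem.getNamespaceEditsDirs().
import java.io.IOException;
import java.net.URI;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;

public class EditsDirOrderingExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // One shared (e.g. NFS-mounted) dir plus two local dirs, with the shared
    // dir accidentally repeated under dfs.namenode.edits.dir.
    conf.set("dfs.namenode.shared.edits.dir", "file:///mnt/shared/edits");
    conf.set("dfs.namenode.edits.dir",
        "file:///data/1/edits,file:///mnt/shared/edits,file:///data/2/edits");

    List<URI> dirs = FSNamesystem.getNamespaceEditsDirs(conf);
    // Expected order, per the logic above: the shared dir first (JournalSet
    // syncs journals in list order, see HDFS-2874), then the local dirs,
    // with the duplicate removed:
    //   file:///mnt/shared/edits, file:///data/1/edits, file:///data/2/edits
    System.out.println(dirs);
  }
}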
+ return Lists.newArrayList(getNamespaceDirs(conf)); + } else { + return Lists.newArrayList(editsDirs); + } + } + + /** + * Returns edit directories that are shared between primary and secondary. + * @param conf + * @return Collection of edit directories. + */ + public static List getSharedEditsDirs(Configuration conf) { + // don't use getStorageDirs here, because we want an empty default + // rather than the dir in /tmp + Collection dirNames = conf.getTrimmedStringCollection( + DFS_NAMENODE_SHARED_EDITS_DIR_KEY); + return Util.stringCollectionAsURIs(dirNames); + } + + public Configuration getConf() { + return conf; + } + @Override public void readLock() { this.fsLock.readLock().lock(); @@ -494,6 +783,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats, this.fsLock.writeLock().lock(); } @Override + public void writeLockInterruptibly() throws InterruptedException { + this.fsLock.writeLock().lockInterruptibly(); + } + @Override public void writeUnlock() { this.fsLock.writeLock().unlock(); } @@ -526,6 +819,26 @@ public class FSNamesystem implements Namesystem, FSClusterStats, DFS_PERMISSIONS_ENABLED_DEFAULT); LOG.info("supergroup=" + supergroup); LOG.info("isPermissionEnabled=" + isPermissionEnabled); + + this.persistBlocks = conf.getBoolean(DFS_PERSIST_BLOCKS_KEY, + DFS_PERSIST_BLOCKS_DEFAULT); + // block allocation has to be persisted in HA using a shared edits directory + // so that the standby has up-to-date namespace information + String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); + this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId); + this.persistBlocks |= haEnabled && HAUtil.usesSharedEditsDir(conf); + + // Sanity check the HA-related config. + if (nameserviceId != null) { + LOG.info("Determined nameservice ID: " + nameserviceId); + } + LOG.info("HA Enabled: " + haEnabled); + if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) { + LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf)); + throw new IOException("Invalid configuration: a shared edits dir " + + "must not be specified if HA is not enabled."); + } + short filePermission = (short)conf.getInt(DFS_NAMENODE_UPGRADE_PERMISSION_KEY, DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT); this.defaultPermission = PermissionStatus.createImmutable( @@ -546,6 +859,16 @@ public class FSNamesystem implements Namesystem, FSClusterStats, DFS_SUPPORT_APPEND_DEFAULT); this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf); + + this.standbyShouldCheckpoint = conf.getBoolean( + DFS_HA_STANDBY_CHECKPOINTS_KEY, + DFS_HA_STANDBY_CHECKPOINTS_DEFAULT); + + // For testing purposes, allow the DT secret manager to be started regardless + // of whether security is enabled. + alwaysUseDelegationTokensForTests = + conf.getBoolean(DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, + DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT); } /** @@ -566,7 +889,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } /** - * Version of {@see #getNamespaceInfo()} that is not protected by a lock. + * Version of @see #getNamespaceInfo() that is not protected by a lock. 
*/ NamespaceInfo unprotectedGetNamespaceInfo() { return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(), @@ -583,23 +906,16 @@ public class FSNamesystem implements Namesystem, FSClusterStats, void close() { fsRunning = false; try { - if (blockManager != null) blockManager.close(); + stopCommonServices(); if (smmthread != null) smmthread.interrupt(); - if (dtSecretManager != null) dtSecretManager.stopThreads(); - if (nnrmthread != null) nnrmthread.interrupt(); - } catch (Exception e) { - LOG.warn("Exception shutting down FSNamesystem", e); } finally { // using finally to ensure we also wait for lease daemon try { - if (lmthread != null) { - lmthread.interrupt(); - lmthread.join(3000); - } + stopActiveServices(); + stopStandbyServices(); if (dir != null) { dir.close(); } - } catch (InterruptedException ie) { } catch (IOException ie) { LOG.error("Error closing FSDirectory", ie); IOUtils.cleanup(LOG, dir); @@ -611,6 +927,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats, public boolean isRunning() { return fsRunning; } + + @Override + public boolean isInStandbyState() { + if (haContext == null || haContext.getState() == null) { + // We're still starting up. In this case, if HA is + // on for the cluster, we always start in standby. Otherwise + // start in active. + return haEnabled; + } + + return haContext.getState() instanceof StandbyState; + } /** * Dump all metadata into specified file @@ -622,14 +950,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, File file = new File(System.getProperty("hadoop.log.dir"), filename); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file, true))); - - long totalInodes = this.dir.totalInodes(); - long totalBlocks = this.getBlocksTotal(); - out.println(totalInodes + " files and directories, " + totalBlocks - + " blocks = " + (totalInodes + totalBlocks) + " total"); - - blockManager.metaSave(out); - + metaSave(out); out.flush(); out.close(); } finally { @@ -637,11 +958,31 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } } + private void metaSave(PrintWriter out) { + assert hasWriteLock(); + long totalInodes = this.dir.totalInodes(); + long totalBlocks = this.getBlocksTotal(); + out.println(totalInodes + " files and directories, " + totalBlocks + + " blocks = " + (totalInodes + totalBlocks) + " total"); + + blockManager.metaSave(out); + } + + private String metaSaveAsString() { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + metaSave(pw); + pw.flush(); + return sw.toString(); + } + + long getDefaultBlockSize() { return serverDefaults.getBlockSize(); } - FsServerDefaults getServerDefaults() { + FsServerDefaults getServerDefaults() throws StandbyException { + checkOperation(OperationCategory.READ); return serverDefaults; } @@ -668,6 +1009,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, HdfsFileStatus resultingStat = null; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot set permission for " + src, safeMode); } @@ -697,6 +1040,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, HdfsFileStatus resultingStat = null; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot set owner for " + src, safeMode); } @@ -787,13 +1132,14 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } else { // second attempt is with write lock writeLock(); // writelock 
is needed to set accesstime } - - // if the namenode is in safemode, then do not update access time - if (isInSafeMode()) { - doAccessTime = false; - } - try { + checkOperation(OperationCategory.READ); + + // if the namenode is in safemode, then do not update access time + if (isInSafeMode()) { + doAccessTime = false; + } + long now = now(); INodeFile inode = dir.getFileINode(src); if (inode == null) { @@ -861,6 +1207,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, HdfsFileStatus resultingStat = null; writeLock(); try { + checkOperation(OperationCategory.WRITE); if (isInSafeMode()) { throw new SafeModeException("Cannot concat " + target, safeMode); } @@ -992,6 +1339,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } writeLock(); try { + checkOperation(OperationCategory.WRITE); + // Write access is required to set access and modification times if (isPermissionEnabled) { checkPathAccess(src, FsAction.WRITE); @@ -1022,6 +1371,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, HdfsFileStatus resultingStat = null; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (!createParent) { verifyParentDir(link); } @@ -1091,6 +1442,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, final boolean isFile; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot set replication for " + src, safeMode); } @@ -1121,6 +1474,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException, UnresolvedLinkException { readLock(); try { + checkOperation(OperationCategory.READ); if (isPermissionEnabled) { checkTraverse(filename); } @@ -1163,6 +1517,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, FileNotFoundException, ParentNotDirectoryException, IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); + startFileInternal(src, permissions, holder, clientMachine, flag, createParent, replication, blockSize); } finally { @@ -1266,30 +1622,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, blockManager.getDatanodeManager().getDatanodeByHost(clientMachine); if (append && myFile != null) { - // - // Replace current node with a INodeUnderConstruction. - // Recreate in-memory lease record. - // - INodeFile node = (INodeFile) myFile; - INodeFileUnderConstruction cons = new INodeFileUnderConstruction( - node.getLocalNameBytes(), - node.getReplication(), - node.getModificationTime(), - node.getPreferredBlockSize(), - node.getBlocks(), - node.getPermissionStatus(), - holder, - clientMachine, - clientNode); - dir.replaceNode(src, node, cons); - leaseManager.addLease(cons.getClientName(), src); - - // convert last block to under-construction - LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons); - - // add append file record to log, record lease, etc. - getEditLog().logOpenFile(src, cons); - return ret; + return prepareFileForWrite( + src, myFile, holder, clientMachine, clientNode, true); } else { // Now we can add the name to the filesystem. This file has no // blocks associated with it. @@ -1320,6 +1654,45 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } return null; } + + /** + * Replace current node with a INodeUnderConstruction. + * Recreate in-memory lease record. 
+ * + * @param src path to the file + * @param file existing file object + * @param leaseHolder identifier of the lease holder on this file + * @param clientMachine identifier of the client machine + * @param clientNode if the client is collocated with a DN, that DN's descriptor + * @param writeToEditLog whether to persist this change to the edit log + * @return the last block locations if the block is partial or null otherwise + * @throws UnresolvedLinkException + * @throws IOException + */ + public LocatedBlock prepareFileForWrite(String src, INode file, + String leaseHolder, String clientMachine, DatanodeDescriptor clientNode, + boolean writeToEditLog) + throws UnresolvedLinkException, IOException { + INodeFile node = (INodeFile) file; + INodeFileUnderConstruction cons = new INodeFileUnderConstruction( + node.getLocalNameBytes(), + node.getReplication(), + node.getModificationTime(), + node.getPreferredBlockSize(), + node.getBlocks(), + node.getPermissionStatus(), + leaseHolder, + clientMachine, + clientNode); + dir.replaceNode(src, node, cons); + leaseManager.addLease(cons.getClientName(), src); + + LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons); + if (writeToEditLog) { + getEditLog().logOpenFile(src, cons); + } + return ret; + } /** * Recover lease; @@ -1336,6 +1709,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException( "Cannot recover the lease of " + src, safeMode); @@ -1455,6 +1830,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, LocatedBlock lb = null; writeLock(); try { + checkOperation(OperationCategory.WRITE); + lb = startFileInternal(src, null, holder, clientMachine, EnumSet.of(CreateFlag.APPEND), false, blockManager.maxReplication, 0); @@ -1519,6 +1896,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot add block to " + src, safeMode); } @@ -1552,6 +1931,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, // Allocate a new block and record it in the INode. 
writeLock(); try { + checkOperation(OperationCategory.WRITE); if (isInSafeMode()) { throw new SafeModeException("Cannot add block to " + src, safeMode); } @@ -1570,10 +1950,14 @@ public class FSNamesystem implements Namesystem, FSClusterStats, for (DatanodeDescriptor dn : targets) { dn.incBlocksScheduled(); - } + } + dir.persistBlocks(src, pendingFile); } finally { writeUnlock(); } + if (persistBlocks) { + getEditLog().logSync(); + } // Create next block LocatedBlock b = new LocatedBlock(getExtendedBlock(newBlock), targets, fileLength); @@ -1594,6 +1978,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, final List chosen; readLock(); try { + checkOperation(OperationCategory.WRITE); //check safe mode if (isInSafeMode()) { throw new SafeModeException("Cannot add datanode; src=" + src @@ -1635,6 +2020,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, UnresolvedLinkException, IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); // // Remove the block from the pending creates list // @@ -1652,10 +2038,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats, NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b + " is removed from pendingCreates"); } - return true; + dir.persistBlocks(src, file); } finally { writeUnlock(); } + if (persistBlocks) { + getEditLog().logSync(); + } + + return true; } // make sure that we still have the lease on this file. @@ -1705,6 +2096,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, boolean success = false; writeLock(); try { + checkOperation(OperationCategory.WRITE); + success = completeFileInternal(src, holder, ExtendedBlock.getLocalBlock(last)); } finally { @@ -1764,12 +2157,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @throws QuotaExceededException If addition of block exceeds space quota */ private Block allocateBlock(String src, INode[] inodes, - DatanodeDescriptor targets[]) throws QuotaExceededException { + DatanodeDescriptor targets[]) throws QuotaExceededException, + SafeModeException { assert hasWriteLock(); Block b = new Block(DFSUtil.getRandom().nextLong(), 0, 0); while(isValidBlock(b)) { b.setBlockId(DFSUtil.getRandom().nextLong()); } + // Increment the generation stamp for every new block. 
+ nextGenerationStamp(); b.setGenerationStamp(getGenerationStamp()); b = dir.addBlock(src, inodes, b, targets); NameNode.stateChangeLog.info("BLOCK* NameSystem.allocateBlock: " @@ -1841,6 +2237,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } writeLock(); try { + checkOperation(OperationCategory.WRITE); + status = renameToInternal(src, dst); if (status && auditLog.isInfoEnabled() && isExternalInvocation()) { resultingStat = dir.getFileInfo(dst, false); @@ -1896,6 +2294,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } writeLock(); try { + checkOperation(OperationCategory.WRITE); + renameToInternal(src, dst, options); if (auditLog.isInfoEnabled() && isExternalInvocation()) { resultingStat = dir.getFileInfo(dst, false); @@ -1973,6 +2373,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, writeLock(); try { + checkOperation(OperationCategory.WRITE); if (isInSafeMode()) { throw new SafeModeException("Cannot delete " + src, safeMode); } @@ -2028,9 +2429,45 @@ public class FSNamesystem implements Namesystem, FSClusterStats, if (blocks == null) { return; } - for(Block b : blocks) { + + // In the case that we are a Standby tailing edits from the + // active while in safe-mode, we need to track the total number + // of blocks and safe blocks in the system. + boolean trackBlockCounts = isSafeModeTrackingBlocks(); + int numRemovedComplete = 0, numRemovedSafe = 0; + + for (Block b : blocks) { + if (trackBlockCounts) { + BlockInfo bi = blockManager.getStoredBlock(b); + if (bi.isComplete()) { + numRemovedComplete++; + if (bi.numNodes() >= blockManager.minReplication) { + numRemovedSafe++; + } + } + } blockManager.removeBlock(b); } + if (trackBlockCounts) { + if (LOG.isDebugEnabled()) { + LOG.debug("Adjusting safe-mode totals for deletion of " + src + ":" + + "decreasing safeBlocks by " + numRemovedSafe + + ", totalBlocks by " + numRemovedComplete); + } + adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete); + } + } + + /** + * @see SafeModeInfo#shouldIncrementallyTrackBlocks + */ + private boolean isSafeModeTrackingBlocks() { + if (!haEnabled) { + // Never track blocks incrementally in non-HA code. 
+ return false; + } + SafeModeInfo sm = this.safeMode; + return sm != null && sm.shouldIncrementallyTrackBlocks(); } /** @@ -2045,11 +2482,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * * @return object containing information regarding the file * or null if file not found + * @throws StandbyException */ HdfsFileStatus getFileInfo(String src, boolean resolveLink) - throws AccessControlException, UnresolvedLinkException { + throws AccessControlException, UnresolvedLinkException, + StandbyException { readLock(); try { + checkOperation(OperationCategory.READ); + if (!DFSUtil.isValidName(src)) { throw new InvalidPathException("Invalid file name: " + src); } @@ -2073,6 +2514,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } writeLock(); try { + checkOperation(OperationCategory.WRITE); + status = mkdirsInternal(src, permissions, createParent); } finally { writeUnlock(); @@ -2127,9 +2570,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } ContentSummary getContentSummary(String src) throws AccessControlException, - FileNotFoundException, UnresolvedLinkException { + FileNotFoundException, UnresolvedLinkException, StandbyException { readLock(); try { + checkOperation(OperationCategory.READ); + if (isPermissionEnabled) { checkPermission(src, false, null, null, null, FsAction.READ_EXECUTE); } @@ -2148,6 +2593,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException, UnresolvedLinkException { writeLock(); try { + checkOperation(OperationCategory.WRITE); if (isInSafeMode()) { throw new SafeModeException("Cannot set quota on " + path, safeMode); } @@ -2172,6 +2618,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, + src + " for " + clientName); writeLock(); try { + checkOperation(OperationCategory.WRITE); if (isInSafeMode()) { throw new SafeModeException("Cannot fsync file " + src, safeMode); } @@ -2381,6 +2828,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats, String src = ""; writeLock(); try { + checkOperation(OperationCategory.WRITE); + // If a DN tries to commit to the standby, the recovery will + // fail, and the next retry will succeed on the new NN. 
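Editor's aside: the delete path above (removeBlocks with trackBlockCounts) together with isSafeModeTrackingBlocks() keeps the standby's safe-mode totals consistent while it tails edits in safe mode. The tiny self-contained sketch below illustrates only that arithmetic; it is not part of the patch, and the class name and the numbers are invented.

// Tiny illustration (not part of the patch) of the safe-mode bookkeeping
// performed when a deletion is replayed on a standby that is still in
// safe mode.
public class SafeModeTotalsDemo {
  long blockTotal;  // complete blocks known to the namespace
  long blockSafe;   // blocks that have reached minimal replication

  // Mirrors adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete):
  // both counters shrink so the threshold test (blockSafe >= threshold *
  // blockTotal) keeps describing the same fraction of the live namespace.
  void recordDeletion(int removedComplete, int removedSafe) {
    blockTotal -= removedComplete;
    blockSafe -= removedSafe;
  }

  public static void main(String[] args) {
    SafeModeTotalsDemo t = new SafeModeTotalsDemo();
    t.blockTotal = 100;
    t.blockSafe = 97;
    // Tailing an OP_DELETE for a file with 3 complete blocks, 2 of which had
    // reached minimal replication:
    t.recordDeletion(3, 2);
    System.out.println(t.blockTotal + " total, " + t.blockSafe + " safe"); // 97 total, 95 safe
  }
}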
+ if (isInSafeMode()) { throw new SafeModeException( "Cannot commitBlockSynchronization while in safe mode", @@ -2455,8 +2906,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, //remove lease, close file finalizeINodeFileUnderConstruction(src, pendingFile); } else if (supportAppends) { - // If this commit does not want to close the file, persist - // blocks only if append is supported + // If this commit does not want to close the file, persist blocks + // only if append is supported or we're explicitly told to dir.persistBlocks(src, pendingFile); } } finally { @@ -2481,6 +2932,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, void renewLease(String holder) throws IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot renew lease for " + holder, safeMode); } @@ -2508,6 +2961,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, DirectoryListing dl; readLock(); try { + checkOperation(OperationCategory.READ); + if (isPermissionEnabled) { if (dir.isDir(src)) { checkPathAccess(src, FsAction.READ_EXECUTE); @@ -2586,7 +3041,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @return an array of datanode commands * @throws IOException */ - DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg, + HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg, long capacity, long dfsUsed, long remaining, long blockPoolUsed, int xceiverCount, int xmitsInProgress, int failedVolumes) throws IOException { @@ -2597,28 +3052,40 @@ public class FSNamesystem implements Namesystem, FSClusterStats, DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat( nodeReg, blockPoolId, capacity, dfsUsed, remaining, blockPoolUsed, xceiverCount, maxTransfer, failedVolumes); - if (cmds != null) { - return cmds; + if (cmds == null || cmds.length == 0) { + DatanodeCommand cmd = upgradeManager.getBroadcastCommand(); + if (cmd != null) { + cmds = new DatanodeCommand[] {cmd}; + } } - - //check distributed upgrade - DatanodeCommand cmd = upgradeManager.getBroadcastCommand(); - if (cmd != null) { - return new DatanodeCommand[] {cmd}; - } - return null; + + return new HeartbeatResponse(cmds, createHaStatusHeartbeat()); } finally { readUnlock(); } } + private NNHAStatusHeartbeat createHaStatusHeartbeat() { + HAState state = haContext.getState(); + NNHAStatusHeartbeat.State hbState; + if (state instanceof ActiveState) { + hbState = NNHAStatusHeartbeat.State.ACTIVE; + } else if (state instanceof StandbyState) { + hbState = NNHAStatusHeartbeat.State.STANDBY; + } else { + throw new AssertionError("Invalid state: " + state.getClass()); + } + return new NNHAStatusHeartbeat(hbState, + getFSImage().getLastAppliedOrWrittenTxId()); + } + /** * Returns whether or not there were available resources at the last check of * resources. * * @return true if there were sufficient resources available, false otherwise. */ - private boolean nameNodeHasResourcesAvailable() { + boolean nameNodeHasResourcesAvailable() { return hasResourcesAvailable; } @@ -2626,7 +3093,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * Perform resource checks and cache the results. 
* @throws IOException */ - private void checkAvailableResources() throws IOException { + void checkAvailableResources() { Preconditions.checkState(nnResourceChecker != null, "nnResourceChecker not initialized"); hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace(); @@ -2665,11 +3132,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } } - FSImage getFSImage() { + public FSImage getFSImage() { return dir.fsImage; } - FSEditLog getEditLog() { + public FSEditLog getEditLog() { return getFSImage().getEditLog(); } @@ -2701,8 +3168,12 @@ public class FSNamesystem implements Namesystem, FSClusterStats, @Metric({"TransactionsSinceLastLogRoll", "Number of transactions since last edit log roll"}) public long getTransactionsSinceLastLogRoll() { - return (getEditLog().getLastWrittenTxId() - - getEditLog().getCurSegmentTxId()) + 1; + if (isInStandbyState()) { + return 0; + } else { + return getEditLog().getLastWrittenTxId() - + getEditLog().getCurSegmentTxId() + 1; + } } @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"}) @@ -2931,6 +3402,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, boolean initializedReplQueues = false; /** Was safemode entered automatically because available resources were low. */ private boolean resourcesLow = false; + /** Should safemode adjust its block totals as blocks come in */ + private boolean shouldIncrementallyTrackBlocks = false; /** * Creates SafeModeInfo when the name node enters @@ -2958,6 +3431,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats, this.blockSafe = 0; } + /** + * In the HA case, the StandbyNode can be in safemode while the namespace + * is modified by the edit log tailer. In this case, the number of total + * blocks changes as edits are processed (eg blocks are added and deleted). + * However, we don't want to do the incremental tracking during the + * startup-time loading process -- only once the initial total has been + * set after the image has been loaded. + */ + private boolean shouldIncrementallyTrackBlocks() { + return shouldIncrementallyTrackBlocks; + } + /** * Creates SafeModeInfo when safe mode is entered manually, or because * available resources are low. @@ -2986,13 +3471,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @return true if in safe mode */ private synchronized boolean isOn() { - try { - assert isConsistent() : " SafeMode: Inconsistent filesystem state: " - + "Total num of blocks, active blocks, or " - + "total safe blocks don't match."; - } catch(IOException e) { - System.err.print(StringUtils.stringifyException(e)); - } + doConsistencyCheck(); return this.reached >= 0; } @@ -3031,8 +3510,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats, return; } } - // if not done yet, initialize replication queues - if (!isPopulatingReplQueues()) { + // if not done yet, initialize replication queues. + // In the standby, do not populate repl queues + if (!isPopulatingReplQueues() && !isInStandbyState()) { initializeReplQueues(); } long timeInSafemode = now() - systemStart; @@ -3051,6 +3531,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, + nt.getNumOfLeaves() + " datanodes"); NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has " + blockManager.numOfUnderReplicatedBlocks() + " blocks"); + + startSecretManagerIfNecessary(); } /** @@ -3073,7 +3555,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * initializing replication queues. 
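Once the block total has been set with HA enabled, every edit the standby applies while still in safe mode must keep the counters in step. A minimal sketch of the intended delta semantics, using the adjustSafeModeBlockTotals() method added to FSNamesystem later in this hunk; the call site shown here is hypothetical, the real callers live in the block-management code.

// Hypothetical call site, for illustration only.
void applyEditWhileStandbyIsInSafeMode(FSNamesystem fsn) {
  // An edit adds a brand-new block: one more block is expected, but no
  // replicas of it have been reported yet, so the safe count is unchanged.
  fsn.adjustSafeModeBlockTotals(0, +1);

  // An edit deletes a block whose replicas had already reached the safe
  // replication threshold: one fewer safe block and one fewer expected block.
  fsn.adjustSafeModeBlockTotals(-1, -1);
}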
*/ private synchronized boolean canInitializeReplQueues() { - return blockSafe >= blockReplQueueThreshold; + return !isInStandbyState() && blockSafe >= blockReplQueueThreshold; } /** @@ -3106,6 +3588,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * Check and trigger safe mode if needed. */ private void checkMode() { + // Have to have write-lock since leaving safemode initializes + // repl queues, which requires write lock + assert hasWriteLock(); if (needEnter()) { enter(); // check if we are ready to initialize replication queues @@ -3145,6 +3630,13 @@ public class FSNamesystem implements Namesystem, FSClusterStats, this.blockThreshold = (int) (blockTotal * threshold); this.blockReplQueueThreshold = (int) (blockTotal * replQueueThreshold); + if (haEnabled) { + // After we initialize the block count, any further namespace + // modifications done while in safe mode need to keep track + // of the number of total blocks in the system. + this.shouldIncrementallyTrackBlocks = true; + } + checkMode(); } @@ -3154,9 +3646,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @param replication current replication */ private synchronized void incrementSafeBlockCount(short replication) { - if (replication == safeReplication) + if (replication == safeReplication) { this.blockSafe++; - checkMode(); + checkMode(); + } } /** @@ -3165,9 +3658,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @param replication current replication */ private synchronized void decrementSafeBlockCount(short replication) { - if (replication == safeReplication-1) + if (replication == safeReplication-1) { this.blockSafe--; - checkMode(); + assert blockSafe >= 0 || isManual(); + checkMode(); + } } /** @@ -3285,16 +3780,45 @@ public class FSNamesystem implements Namesystem, FSClusterStats, /** * Checks consistency of the class state. - * This is costly and currently called only in assert. - * @throws IOException + * This is costly so only runs if asserts are enabled. 
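The consistency check below is skipped entirely unless the JVM was started with assertions enabled (-ea). It relies on a standard Java idiom in which an assert statement with a side effect doubles as an "are asserts on?" probe; a standalone demonstration:

public class AssertProbe {
  public static void main(String[] args) {
    boolean assertsOn = false;
    // The assignment only runs when assertions are enabled, because the
    // expression of a disabled assert is never evaluated.
    assert assertsOn = true;
    System.out.println("assertions enabled: " + assertsOn);
  }
}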
*/ - private boolean isConsistent() throws IOException { + private void doConsistencyCheck() { + boolean assertsOn = false; + assert assertsOn = true; // set to true if asserts are on + if (!assertsOn) return; + if (blockTotal == -1 && blockSafe == -1) { - return true; // manual safe mode + return; // manual safe mode } int activeBlocks = blockManager.getActiveBlockCount(); - return (blockTotal == activeBlocks) || - (blockSafe >= 0 && blockSafe <= blockTotal); + if ((blockTotal != activeBlocks) && + !(blockSafe >= 0 && blockSafe <= blockTotal)) { + throw new AssertionError( + " SafeMode: Inconsistent filesystem state: " + + "SafeMode data: blockTotal=" + blockTotal + + " blockSafe=" + blockSafe + "; " + + "BlockManager data: active=" + activeBlocks); + } + } + + private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) { + if (!shouldIncrementallyTrackBlocks) { + return; + } + assert haEnabled; + + if (LOG.isDebugEnabled()) { + LOG.debug("Adjusting block totals from " + + blockSafe + "/" + blockTotal + " to " + + (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal)); + } + assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " + + blockSafe + " by " + deltaSafe + ": would be negative"; + assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " + + blockTotal + " by " + deltaTotal + ": would be negative"; + + blockSafe += deltaSafe; + setBlockTotal(blockTotal + deltaTotal); } } @@ -3376,6 +3900,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats, @Override public boolean isPopulatingReplQueues() { + if (isInStandbyState()) { + return false; + } // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) @@ -3398,13 +3925,30 @@ public class FSNamesystem implements Namesystem, FSClusterStats, SafeModeInfo safeMode = this.safeMode; if (safeMode == null) // mostly true return; - safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas()); + BlockInfo storedBlock = blockManager.getStoredBlock(b); + if (storedBlock.isComplete()) { + safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas()); + } + } + + /** + * Adjust the total number of blocks safe and expected during safe mode. + * If safe mode is not currently on, this is a no-op. + * @param deltaSafe the change in number of safe blocks + * @param deltaTotal the change in number of total blocks expected + */ + public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) { + // safeMode is volatile, and may be set to null at any time + SafeModeInfo safeMode = this.safeMode; + if (safeMode == null) + return; + safeMode.adjustBlockTotals(deltaSafe, deltaTotal); } /** * Set the total number of blocks in the system. */ - void setBlockTotal() { + public void setBlockTotal() { // safeMode is volatile, and may be set to null at any time SafeModeInfo safeMode = this.safeMode; if (safeMode == null) @@ -3440,7 +3984,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } assert node != null : "Found a lease for nonexisting file."; assert node.isUnderConstruction() : - "Found a lease for file that is not under construction."; + "Found a lease for file " + path + " that is not under construction." 
+ + " lease=" + lease; INodeFileUnderConstruction cons = (INodeFileUnderConstruction) node; BlockInfo[] blocks = cons.getBlocks(); if(blocks == null) @@ -3465,21 +4010,32 @@ public class FSNamesystem implements Namesystem, FSClusterStats, void enterSafeMode(boolean resourcesLow) throws IOException { writeLock(); try { - // Ensure that any concurrent operations have been fully synced - // before entering safe mode. This ensures that the FSImage - // is entirely stable on disk as soon as we're in safe mode. - getEditLog().logSyncAll(); - if (!isInSafeMode()) { - safeMode = new SafeModeInfo(resourcesLow); - return; - } - if (resourcesLow) { - safeMode.setResourcesLow(); - } - safeMode.setManual(); - getEditLog().logSyncAll(); - NameNode.stateChangeLog.info("STATE* Safe mode is ON. " - + safeMode.getTurnOffTip()); + // Stop the secret manager, since rolling the master key would + // try to write to the edit log + stopSecretManager(); + + // Ensure that any concurrent operations have been fully synced + // before entering safe mode. This ensures that the FSImage + // is entirely stable on disk as soon as we're in safe mode. + boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite(); + // Before Editlog is in OpenForWrite mode, editLogStream will be null. So, + // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode + if (isEditlogOpenForWrite) { + getEditLog().logSyncAll(); + } + if (!isInSafeMode()) { + safeMode = new SafeModeInfo(resourcesLow); + return; + } + if (resourcesLow) { + safeMode.setResourcesLow(); + } + safeMode.setManual(); + if (isEditlogOpenForWrite) { + getEditLog().logSyncAll(); + } + NameNode.stateChangeLog.info("STATE* Safe mode is ON. " + + safeMode.getTurnOffTip()); } finally { writeUnlock(); } @@ -3520,6 +4076,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, CheckpointSignature rollEditLog() throws IOException { writeLock(); try { + checkOperation(OperationCategory.JOURNAL); if (isInSafeMode()) { throw new SafeModeException("Log not rolled", safeMode); } @@ -3536,6 +4093,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException { writeLock(); try { + checkOperation(OperationCategory.CHECKPOINT); + if (isInSafeMode()) { throw new SafeModeException("Checkpoint not started", safeMode); } @@ -3552,6 +4111,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, CheckpointSignature sig) throws IOException { readLock(); try { + checkOperation(OperationCategory.CHECKPOINT); + if (isInSafeMode()) { throw new SafeModeException("Checkpoint not ended", safeMode); } @@ -3704,6 +4265,34 @@ public class FSNamesystem implements Namesystem, FSClusterStats, return blockManager.getExcessBlocksCount(); } + // HA-only metric + @Metric + public long getPostponedMisreplicatedBlocks() { + return blockManager.getPostponedMisreplicatedBlocksCount(); + } + + // HA-only metric + @Metric + public int getPendingDataNodeMessageCount() { + return blockManager.getPendingDataNodeMessageCount(); + } + + // HA-only metric + @Metric + public String getHAState() { + return haContext.getState().toString(); + } + + // HA-only metric + @Metric + public long getMillisSinceLastLoadedEdits() { + if (isInStandbyState() && editLogTailer != null) { + return now() - editLogTailer.getLastLoadTimestamp(); + } else { + return 0; + } + } + @Metric public int getBlockCapacity() { return blockManager.getCapacity(); @@ -3715,6 +4304,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } private ObjectName mbeanName; + 
/** * Register the FSNamesystem MBean using the name * "hadoop:service=NameNode,name=FSNamesystemState" @@ -3812,6 +4402,29 @@ public class FSNamesystem implements Namesystem, FSClusterStats, return pendingFile; } + /** + * Client is reporting some bad block locations. + */ + void reportBadBlocks(LocatedBlock[] blocks) throws IOException { + writeLock(); + try { + checkOperation(OperationCategory.WRITE); + + NameNode.stateChangeLog.info("*DIR* NameNode.reportBadBlocks"); + for (int i = 0; i < blocks.length; i++) { + ExtendedBlock blk = blocks[i].getBlock(); + DatanodeInfo[] nodes = blocks[i].getLocations(); + for (int j = 0; j < nodes.length; j++) { + DatanodeInfo dn = nodes[j]; + blockManager.findAndMarkBlockAsCorrupt(blk, dn, + "client machine reported it"); + } + } + } finally { + writeUnlock(); + } + } + /** * Get a new generation stamp together with an access token for * a block under construction @@ -3829,6 +4442,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, LocatedBlock locatedBlock; writeLock(); try { + checkOperation(OperationCategory.WRITE); + // check vadility of parameters checkUCBlock(block, clientName); @@ -3858,6 +4473,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Pipeline not updated", safeMode); } @@ -3873,7 +4490,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } finally { writeUnlock(); } - if (supportAppends) { + if (supportAppends || persistBlocks) { getEditLog().logSync(); } LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock); @@ -4067,6 +4684,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, readLock(); try { + checkOperation(OperationCategory.READ); + if (!isPopulatingReplQueues()) { throw new IOException("Cannot run listCorruptFileBlocks because " + "replication queues have not been initialized."); @@ -4159,6 +4778,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, Token token; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot issue delegation token", safeMode); } @@ -4203,6 +4824,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, long expiryTime; writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot renew delegation token", safeMode); } @@ -4233,6 +4856,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, throws IOException { writeLock(); try { + checkOperation(OperationCategory.WRITE); + if (isInSafeMode()) { throw new SafeModeException("Cannot cancel delegation token", safeMode); } @@ -4266,16 +4891,14 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * @param key new delegation key. */ public void logUpdateMasterKey(DelegationKey key) throws IOException { - writeLock(); - try { - if (isInSafeMode()) { - throw new SafeModeException( - "Cannot log master key update in safe mode", safeMode); - } - getEditLog().logUpdateMasterKey(key); - } finally { - writeUnlock(); - } + + assert !isInSafeMode() : + "this should never be called while in safemode, since we stop " + + "the DT manager before entering safemode!"; + // No need to hold FSN lock since we don't access any internal + // structures, and this is stopped before the FSN shuts itself + // down, etc. 
+ getEditLog().logUpdateMasterKey(key); getEditLog().logSync(); } @@ -4545,9 +5168,32 @@ public class FSNamesystem implements Namesystem, FSClusterStats, byte[] password) throws InvalidToken { getDelegationTokenSecretManager().verifyToken(identifier, password); } + + public boolean isGenStampInFuture(long genStamp) { + return (genStamp > getGenerationStamp()); + } + @VisibleForTesting + public EditLogTailer getEditLogTailer() { + return editLogTailer; + } + + @VisibleForTesting + void setFsLockForTests(ReentrantReadWriteLock lock) { + this.fsLock = lock; + } + + @VisibleForTesting + ReentrantReadWriteLock getFsLockForTests() { + return fsLock; + } @VisibleForTesting public SafeModeInfo getSafeModeInfoForTests() { return safeMode; } + + @VisibleForTesting + public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) { + this.nnResourceChecker = nnResourceChecker; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java index eeb40c2f57..603dd00090 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java @@ -52,6 +52,7 @@ class FileJournalManager implements JournalManager { private static final Log LOG = LogFactory.getLog(FileJournalManager.class); private final StorageDirectory sd; + private final NNStorage storage; private int outputBufferCapacity = 512*1024; private static final Pattern EDITS_REGEX = Pattern.compile( @@ -60,14 +61,14 @@ class FileJournalManager implements JournalManager { NameNodeFile.EDITS_INPROGRESS.getName() + "_(\\d+)"); private File currentInProgress = null; - private long maxSeenTransaction = 0L; @VisibleForTesting StoragePurger purger = new NNStorageRetentionManager.DeletionStoragePurger(); - public FileJournalManager(StorageDirectory sd) { + public FileJournalManager(StorageDirectory sd, NNStorage storage) { this.sd = sd; + this.storage = storage; } @Override @@ -76,11 +77,16 @@ class FileJournalManager implements JournalManager { @Override synchronized public EditLogOutputStream startLogSegment(long txid) throws IOException { - currentInProgress = NNStorage.getInProgressEditsFile(sd, txid); - EditLogOutputStream stm = new EditLogFileOutputStream(currentInProgress, - outputBufferCapacity); - stm.create(); - return stm; + try { + currentInProgress = NNStorage.getInProgressEditsFile(sd, txid); + EditLogOutputStream stm = new EditLogFileOutputStream(currentInProgress, + outputBufferCapacity); + stm.create(); + return stm; + } catch (IOException e) { + storage.reportErrorsOnDirectory(sd); + throw e; + } } @Override @@ -90,13 +96,14 @@ class FileJournalManager implements JournalManager { File dstFile = NNStorage.getFinalizedEditsFile( sd, firstTxId, lastTxId); - LOG.debug("Finalizing edits file " + inprogressFile + " -> " + dstFile); + LOG.info("Finalizing edits file " + inprogressFile + " -> " + dstFile); Preconditions.checkState(!dstFile.exists(), "Can't finalize edits file " + inprogressFile + " since finalized file " + "already exists"); if (!inprogressFile.renameTo(dstFile)) { - throw new IOException("Unable to finalize edits file " + inprogressFile); + storage.reportErrorsOnDirectory(sd); + throw new IllegalStateException("Unable to finalize edits file " + inprogressFile); } if 
(inprogressFile.equals(currentInProgress)) { currentInProgress = null; @@ -116,6 +123,7 @@ class FileJournalManager implements JournalManager { @Override public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException { + LOG.info("Purging logs older than " + minTxIdToKeep); File[] files = FileUtil.listFiles(sd.getCurrentDir()); List editLogs = FileJournalManager.matchEditLogs(files); @@ -135,18 +143,18 @@ class FileJournalManager implements JournalManager { */ List getRemoteEditLogs(long firstTxId) throws IOException { File currentDir = sd.getCurrentDir(); - List allLogFiles = matchEditLogs( - FileUtil.listFiles(currentDir)); + List allLogFiles = matchEditLogs(currentDir); List ret = Lists.newArrayListWithCapacity( allLogFiles.size()); for (EditLogFile elf : allLogFiles) { - if (elf.isCorrupt() || elf.isInProgress()) continue; + if (elf.hasCorruptHeader() || elf.isInProgress()) continue; if (elf.getFirstTxId() >= firstTxId) { ret.add(new RemoteEditLog(elf.firstTxId, elf.lastTxId)); } else if ((firstTxId > elf.getFirstTxId()) && (firstTxId <= elf.getLastTxId())) { - throw new IOException("Asked for firstTxId " + firstTxId + // Note that this behavior is different from getLogFiles below. + throw new IllegalStateException("Asked for firstTxId " + firstTxId + " which is in the middle of file " + elf.file); } } @@ -154,6 +162,20 @@ class FileJournalManager implements JournalManager { return ret; } + /** + * returns matching edit logs via the log directory. Simple helper function + * that lists the files in the logDir and calls matchEditLogs(File[]) + * + * @param logDir + * directory to match edit logs in + * @return matched edit logs + * @throws IOException + * IOException thrown for invalid logDir + */ + static List matchEditLogs(File logDir) throws IOException { + return matchEditLogs(FileUtil.listFiles(logDir)); + } + static List matchEditLogs(File[] filesInStorage) { List ret = Lists.newArrayList(); for (File f : filesInStorage) { @@ -169,7 +191,7 @@ class FileJournalManager implements JournalManager { LOG.error("Edits file " + f + " has improperly formatted " + "transaction ID"); // skip - } + } } // Check for in-progress edits @@ -190,27 +212,37 @@ class FileJournalManager implements JournalManager { } @Override - synchronized public EditLogInputStream getInputStream(long fromTxId) - throws IOException { + synchronized public EditLogInputStream getInputStream(long fromTxId, + boolean inProgressOk) throws IOException { for (EditLogFile elf : getLogFiles(fromTxId)) { - if (elf.getFirstTxId() == fromTxId) { + if (elf.containsTxId(fromTxId)) { + if (!inProgressOk && elf.isInProgress()) { + continue; + } if (elf.isInProgress()) { elf.validateLog(); } if (LOG.isTraceEnabled()) { LOG.trace("Returning edit stream reading from " + elf); } - return new EditLogFileInputStream(elf.getFile(), - elf.getFirstTxId(), elf.getLastTxId()); + EditLogFileInputStream elfis = new EditLogFileInputStream(elf.getFile(), + elf.getFirstTxId(), elf.getLastTxId(), elf.isInProgress()); + long transactionsToSkip = fromTxId - elf.getFirstTxId(); + if (transactionsToSkip > 0) { + LOG.info(String.format("Log begins at txid %d, but requested start " + + "txid is %d. 
Skipping %d edits.", elf.getFirstTxId(), fromTxId, + transactionsToSkip)); + elfis.skipTransactions(transactionsToSkip); + } + return elfis; } } - throw new IOException("Cannot find editlog file with " + fromTxId - + " as first first txid"); + throw new IOException("Cannot find editlog file containing " + fromTxId); } @Override - public long getNumberOfTransactions(long fromTxId) + public long getNumberOfTransactions(long fromTxId, boolean inProgressOk) throws IOException, CorruptionException { long numTxns = 0L; @@ -222,21 +254,25 @@ class FileJournalManager implements JournalManager { LOG.warn("Gap in transactions in " + sd.getRoot() + ". Gap is " + fromTxId + " - " + (elf.getFirstTxId() - 1)); break; - } else if (fromTxId == elf.getFirstTxId()) { + } else if (elf.containsTxId(fromTxId)) { + if (!inProgressOk && elf.isInProgress()) { + break; + } + if (elf.isInProgress()) { elf.validateLog(); } - if (elf.isCorrupt()) { + if (elf.hasCorruptHeader()) { break; } + numTxns += elf.getLastTxId() + 1 - fromTxId; fromTxId = elf.getLastTxId() + 1; - numTxns += fromTxId - elf.getFirstTxId(); if (elf.isInProgress()) { break; } - } // else skip + } } if (LOG.isDebugEnabled()) { @@ -244,7 +280,8 @@ class FileJournalManager implements JournalManager { + " txns from " + fromTxId); } - long max = findMaxTransaction(); + long max = findMaxTransaction(inProgressOk); + // fromTxId should be greater than max, as it points to the next // transaction we should expect to find. If it is less than or equal // to max, it means that a transaction with txid == max has not been found @@ -261,23 +298,44 @@ class FileJournalManager implements JournalManager { @Override synchronized public void recoverUnfinalizedSegments() throws IOException { File currentDir = sd.getCurrentDir(); - List allLogFiles = matchEditLogs(currentDir.listFiles()); - - // make sure journal is aware of max seen transaction before moving corrupt - // files aside - findMaxTransaction(); + LOG.info("Recovering unfinalized segments in " + currentDir); + List allLogFiles = matchEditLogs(currentDir); for (EditLogFile elf : allLogFiles) { if (elf.getFile().equals(currentInProgress)) { continue; } if (elf.isInProgress()) { - elf.validateLog(); - - if (elf.isCorrupt()) { - elf.moveAsideCorruptFile(); + // If the file is zero-length, we likely just crashed after opening the + // file, but before writing anything to it. Safe to delete it. + if (elf.getFile().length() == 0) { + LOG.info("Deleting zero-length edit log file " + elf); + if (!elf.getFile().delete()) { + throw new IOException("Unable to delete file " + elf.getFile()); + } continue; } + + elf.validateLog(); + + if (elf.hasCorruptHeader()) { + elf.moveAsideCorruptFile(); + throw new CorruptionException("In-progress edit log file is corrupt: " + + elf); + } + + // If the file has a valid header (isn't corrupt) but contains no + // transactions, we likely just crashed after opening the file and + // writing the header, but before syncing any transactions. Safe to + // delete the file. 
+ if (elf.getNumTransactions() == 0) { + LOG.info("Deleting edit log file with zero transactions " + elf); + if (!elf.getFile().delete()) { + throw new IOException("Unable to delete " + elf.getFile()); + } + continue; + } + finalizeLogSegment(elf.getFirstTxId(), elf.getLastTxId()); } } @@ -285,16 +343,12 @@ class FileJournalManager implements JournalManager { private List getLogFiles(long fromTxId) throws IOException { File currentDir = sd.getCurrentDir(); - List allLogFiles = matchEditLogs(currentDir.listFiles()); + List allLogFiles = matchEditLogs(currentDir); List logFiles = Lists.newArrayList(); for (EditLogFile elf : allLogFiles) { - if (fromTxId > elf.getFirstTxId() - && fromTxId <= elf.getLastTxId()) { - throw new IOException("Asked for fromTxId " + fromTxId - + " which is in middle of file " + elf.file); - } - if (fromTxId <= elf.getFirstTxId()) { + if (fromTxId <= elf.getFirstTxId() || + elf.containsTxId(fromTxId)) { logFiles.add(elf); } } @@ -306,21 +360,35 @@ class FileJournalManager implements JournalManager { /** * Find the maximum transaction in the journal. - * This gets stored in a member variable, as corrupt edit logs - * will be moved aside, but we still need to remember their first - * tranaction id in the case that it was the maximum transaction in - * the journal. */ - private long findMaxTransaction() + private long findMaxTransaction(boolean inProgressOk) throws IOException { + boolean considerSeenTxId = true; + long seenTxId = NNStorage.readTransactionIdFile(sd); + long maxSeenTransaction = 0; for (EditLogFile elf : getLogFiles(0)) { + if (elf.isInProgress() && !inProgressOk) { + if (elf.getFirstTxId() != HdfsConstants.INVALID_TXID && + elf.getFirstTxId() <= seenTxId) { + // don't look at the seen_txid file if in-progress logs are not to be + // examined, and the value in seen_txid falls within the in-progress + // segment. + considerSeenTxId = false; + } + continue; + } + if (elf.isInProgress()) { maxSeenTransaction = Math.max(elf.getFirstTxId(), maxSeenTransaction); elf.validateLog(); } maxSeenTransaction = Math.max(elf.getLastTxId(), maxSeenTransaction); } - return maxSeenTransaction; + if (considerSeenTxId) { + return Math.max(maxSeenTransaction, seenTxId); + } else { + return maxSeenTransaction; + } } @Override @@ -335,8 +403,9 @@ class FileJournalManager implements JournalManager { private File file; private final long firstTxId; private long lastTxId; + private long numTx = -1; - private boolean isCorrupt = false; + private boolean hasCorruptHeader = false; private final boolean isInProgress; final static Comparator COMPARE_BY_START_TXID @@ -376,6 +445,10 @@ class FileJournalManager implements JournalManager { long getLastTxId() { return lastTxId; } + + boolean containsTxId(long txId) { + return firstTxId <= txId && txId <= lastTxId; + } /** * Count the number of valid transactions in a log. 
@@ -384,11 +457,13 @@ class FileJournalManager implements JournalManager { */ void validateLog() throws IOException { EditLogValidation val = EditLogFileInputStream.validateEditLog(file); - if (val.getNumTransactions() == 0) { - markCorrupt(); - } else { - this.lastTxId = val.getEndTxId(); - } + this.numTx = val.getNumTransactions(); + this.lastTxId = val.getEndTxId(); + this.hasCorruptHeader = val.hasCorruptHeader(); + } + + long getNumTransactions() { + return numTx; } boolean isInProgress() { @@ -399,16 +474,12 @@ class FileJournalManager implements JournalManager { return file; } - void markCorrupt() { - isCorrupt = true; - } - - boolean isCorrupt() { - return isCorrupt; + boolean hasCorruptHeader() { + return hasCorruptHeader; } void moveAsideCorruptFile() throws IOException { - assert isCorrupt; + assert hasCorruptHeader; File src = file; File dst = new File(src.getParent(), src.getName() + ".corrupt"); @@ -423,8 +494,9 @@ class FileJournalManager implements JournalManager { @Override public String toString() { return String.format("EditLogFile(file=%s,first=%019d,last=%019d," - +"inProgress=%b,corrupt=%b)", file.toString(), - firstTxId, lastTxId, isInProgress(), isCorrupt); + +"inProgress=%b,hasCorruptHeader=%b,numTx=%d)", + file.toString(), firstTxId, lastTxId, + isInProgress(), hasCorruptHeader, numTx); } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java index 8753b270f1..b9860032e6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java @@ -124,16 +124,18 @@ public class GetImageServlet extends HttpServlet { final long txid = parsedParams.getTxId(); if (! currentlyDownloadingCheckpoints.add(txid)) { - throw new IOException( + response.sendError(HttpServletResponse.SC_CONFLICT, "Another checkpointer is already in the process of uploading a" + " checkpoint made at transaction ID " + txid); + return null; } try { if (nnImage.getStorage().findImageFile(txid) != null) { - throw new IOException( + response.sendError(HttpServletResponse.SC_CONFLICT, "Another checkpointer already uploaded an checkpoint " + "for txid " + txid); + return null; } // issue a HTTP get request to download the new fsimage diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java index 0fab53c95f..c5c47fd646 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java @@ -26,6 +26,8 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; +import com.google.common.base.Joiner; + /** * I-node for file being written. 
*/ @@ -41,19 +43,7 @@ public class INodeFileUnderConstruction extends INodeFile { String clientName, String clientMachine, DatanodeDescriptor clientNode) { - this(permissions, 0, replication, preferredBlockSize, modTime, - clientName, clientMachine, clientNode); - } - - INodeFileUnderConstruction(PermissionStatus permissions, - int nrBlocks, - short replication, - long preferredBlockSize, - long modTime, - String clientName, - String clientMachine, - DatanodeDescriptor clientNode) { - super(permissions.applyUMask(UMASK), nrBlocks, replication, + super(permissions.applyUMask(UMASK), 0, replication, modTime, modTime, preferredBlockSize); this.clientName = clientName; this.clientMachine = clientMachine; @@ -106,6 +96,9 @@ public class INodeFileUnderConstruction extends INodeFile { // use the modification time as the access time // INodeFile convertToInodeFile() { + assert allBlocksComplete() : + "Can't finalize inode " + this + " since it contains " + + "non-complete blocks! Blocks are: " + blocksAsString(); INodeFile obj = new INodeFile(getPermissionStatus(), getBlocks(), getReplication(), @@ -115,6 +108,18 @@ public class INodeFileUnderConstruction extends INodeFile { return obj; } + + /** + * @return true if all of the blocks in this file are marked as completed. + */ + private boolean allBlocksComplete() { + for (BlockInfo b : blocks) { + if (!b.isComplete()) { + return false; + } + } + return true; + } /** * Remove a block from the block list. This block should be @@ -153,4 +158,8 @@ public class INodeFileUnderConstruction extends INodeFile { setBlock(numBlocks()-1, ucBlock); return ucBlock; } + + private String blocksAsString() { + return Joiner.on(",").join(this.blocks); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java index d45de18e92..f9c622dc38 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java @@ -48,20 +48,23 @@ public interface JournalManager extends Closeable { /** * Get the input stream starting with fromTxnId from this journal manager * @param fromTxnId the first transaction id we want to read + * @param inProgressOk whether or not in-progress streams should be returned * @return the stream starting with transaction fromTxnId * @throws IOException if a stream cannot be found. */ - EditLogInputStream getInputStream(long fromTxnId) throws IOException; + EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk) + throws IOException; /** * Get the number of transaction contiguously available from fromTxnId. * * @param fromTxnId Transaction id to count from + * @param inProgressOk whether or not in-progress streams should be counted * @return The number of transactions available from fromTxnId * @throws IOException if the journal cannot be read. * @throws CorruptionException if there is a gap in the journal at fromTxnId. 
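Both JournalManager read methods now take an inProgressOk flag, so a caller can choose between replaying only finalized history and tailing the segment that is still being written. A hedged sketch of a caller follows; the helper method is hypothetical, and the real consumers are the edit-log loading and tailing paths outside this hunk.

// Illustrative helper, not part of the patch.
EditLogInputStream openForReplay(JournalManager jm, long nextTxId,
    boolean tailingStandby) throws IOException {
  // A standby tailer passes true so the in-progress segment can be read;
  // ordinary replay of finalized logs passes false. The returned stream is
  // positioned at nextTxId even when that txid falls inside a segment,
  // since the file-based manager now skips the leading transactions.
  return jm.getInputStream(nextTxId, tailingStandby);
}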
*/ - long getNumberOfTransactions(long fromTxnId) + long getNumberOfTransactions(long fromTxnId, boolean inProgressOk) throws IOException, CorruptionException; /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java index b1accd8562..d84d79dcb5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java @@ -25,8 +25,10 @@ import java.util.SortedSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; + import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; @@ -35,8 +37,6 @@ import com.google.common.collect.Lists; import com.google.common.collect.Multimaps; import com.google.common.collect.Sets; -import org.apache.hadoop.classification.InterfaceAudience; - /** * Manages a collection of Journals. None of the methods are synchronized, it is * assumed that FSEditLog methods, that use this class, use proper @@ -148,11 +148,17 @@ public class JournalSet implements JournalManager { private List journals = Lists.newArrayList(); final int minimumRedundantJournals; + private volatile Runtime runtime = Runtime.getRuntime(); JournalSet(int minimumRedundantResources) { this.minimumRedundantJournals = minimumRedundantResources; } + @VisibleForTesting + public void setRuntimeForTesting(Runtime runtime) { + this.runtime = runtime; + } + @Override public EditLogOutputStream startLogSegment(final long txId) throws IOException { mapJournalsAndReportErrors(new JournalClosure() { @@ -201,19 +207,25 @@ public class JournalSet implements JournalManager { * or null if no more exist */ @Override - public EditLogInputStream getInputStream(long fromTxnId) throws IOException { + public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk) + throws IOException { JournalManager bestjm = null; long bestjmNumTxns = 0; CorruptionException corruption = null; for (JournalAndStream jas : journals) { + if (jas.isDisabled()) continue; + JournalManager candidate = jas.getManager(); long candidateNumTxns = 0; try { - candidateNumTxns = candidate.getNumberOfTransactions(fromTxnId); + candidateNumTxns = candidate.getNumberOfTransactions(fromTxnId, + inProgressOk); } catch (CorruptionException ce) { corruption = ce; } catch (IOException ioe) { + LOG.warn("Unable to read input streams from JournalManager " + candidate, + ioe); continue; // error reading disk, just skip } @@ -231,15 +243,20 @@ public class JournalSet implements JournalManager { return null; } } - return bestjm.getInputStream(fromTxnId); + return bestjm.getInputStream(fromTxnId, inProgressOk); } @Override - public long getNumberOfTransactions(long fromTxnId) throws IOException { + public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk) + throws IOException { long num = 0; for (JournalAndStream jas: journals) { - if (jas.isActive()) { - long newNum = jas.getManager().getNumberOfTransactions(fromTxnId); + if (jas.isDisabled()) { + LOG.info("Skipping jas " + jas + " since it's disabled"); + continue; + } 
else { + long newNum = jas.getManager().getNumberOfTransactions(fromTxnId, + inProgressOk); if (newNum > num) { num = newNum; } @@ -298,13 +315,31 @@ public class JournalSet implements JournalManager { */ private void mapJournalsAndReportErrors( JournalClosure closure, String status) throws IOException{ + List badJAS = Lists.newLinkedList(); for (JournalAndStream jas : journals) { try { closure.apply(jas); } catch (Throwable t) { - LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); - badJAS.add(jas); + if (jas.isRequired()) { + String msg = "Error: " + status + " failed for required journal (" + + jas + ")"; + LOG.fatal(msg, t); + // If we fail on *any* of the required journals, then we must not + // continue on any of the other journals. Abort them to ensure that + // retry behavior doesn't allow them to keep going in any way. + abortAllJournals(); + // the current policy is to shutdown the NN on errors to shared edits + // dir. There are many code paths to shared edits failures - syncs, + // roll of edits etc. All of them go through this common function + // where the isRequired() check is made. Applying exit policy here + // to catch all code paths. + runtime.exit(1); + throw new IOException(msg); + } else { + LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); + badJAS.add(jas); + } } } disableAndReportErrorOnJournals(badJAS); @@ -316,6 +351,17 @@ public class JournalSet implements JournalManager { } } + /** + * Abort all of the underlying streams. + */ + private void abortAllJournals() { + for (JournalAndStream jas : journals) { + if (jas.isActive()) { + jas.abort(); + } + } + } + /** * An implementation of EditLogOutputStream that applies a requested method on * all the journals that are currently active. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java index 6f03452195..71e6cbb1e2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java @@ -34,6 +34,10 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; +import org.apache.hadoop.util.Daemon; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; import static org.apache.hadoop.hdfs.server.common.Util.now; @@ -82,6 +86,9 @@ public class LeaseManager { // private SortedMap sortedLeasesByPath = new TreeMap(); + private Daemon lmthread; + private volatile boolean shouldRunMonitor; + LeaseManager(FSNamesystem fsnamesystem) {this.fsnamesystem = fsnamesystem;} Lease getLease(String holder) { @@ -146,6 +153,9 @@ public class LeaseManager { Lease lease = getLease(holder); if (lease != null) { removeLease(lease, src); + } else { + LOG.warn("Removing non-existent lease! holder=" + holder + + " src=" + src); } } @@ -190,6 +200,15 @@ public class LeaseManager { } } + /** + * Renew all of the currently open leases. + */ + synchronized void renewAllLeases() { + for (Lease l : leases.values()) { + renewLease(l); + } + } + /************************************************************ * A Lease governs all the locks held by a single client. 
* For each client there's a corresponding lease, whose @@ -296,6 +315,11 @@ public class LeaseManager { paths.remove(oldpath); paths.add(newpath); } + + @VisibleForTesting + long getLastUpdate() { + return lastUpdate; + } } synchronized void changeLease(String src, String dst, @@ -367,18 +391,18 @@ public class LeaseManager { /** Check leases periodically. */ public void run() { - for(; fsnamesystem.isRunning(); ) { - fsnamesystem.writeLock(); + for(; shouldRunMonitor && fsnamesystem.isRunning(); ) { try { - if (!fsnamesystem.isInSafeMode()) { - checkLeases(); + fsnamesystem.writeLockInterruptibly(); + try { + if (!fsnamesystem.isInSafeMode()) { + checkLeases(); + } + } finally { + fsnamesystem.writeUnlock(); } - } finally { - fsnamesystem.writeUnlock(); - } - - - try { + + Thread.sleep(HdfsServerConstants.NAMENODE_LEASE_RECHECK_INTERVAL); } catch(InterruptedException ie) { if (LOG.isDebugEnabled()) { @@ -437,4 +461,36 @@ public class LeaseManager { + "\n sortedLeasesByPath=" + sortedLeasesByPath + "\n}"; } + + void startMonitor() { + Preconditions.checkState(lmthread == null, + "Lease Monitor already running"); + shouldRunMonitor = true; + lmthread = new Daemon(new Monitor()); + lmthread.start(); + } + + void stopMonitor() { + if (lmthread != null) { + shouldRunMonitor = false; + try { + lmthread.interrupt(); + lmthread.join(3000); + } catch (InterruptedException ie) { + LOG.warn("Encountered exception ", ie); + } + lmthread = null; + } + } + + /** + * Trigger the currently-running Lease monitor to re-check + * its leases immediately. This is for use by unit tests. + */ + @VisibleForTesting + void triggerMonitorCheckNow() { + Preconditions.checkState(lmthread != null, + "Lease monitor is not running"); + lmthread.interrupt(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java index 7bca8f4b31..3f157e01ce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java @@ -28,6 +28,7 @@ import java.net.URI; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -161,7 +162,8 @@ public class NNStorage extends Storage implements Closeable { // this may modify the editsDirs, so copy before passing in setStorageDirectories(imageDirs, - Lists.newArrayList(editsDirs)); + Lists.newArrayList(editsDirs), + FSNamesystem.getSharedEditsDirs(conf)); } @Override // Storage @@ -249,6 +251,16 @@ public class NNStorage extends Storage implements Closeable { List getRemovedStorageDirs() { return this.removedStorageDirs; } + + /** + * See {@link NNStorage#setStorageDirectories(Collection, Collection, Collection)} + */ + @VisibleForTesting + synchronized void setStorageDirectories(Collection fsNameDirs, + Collection fsEditsDirs) + throws IOException { + setStorageDirectories(fsNameDirs, fsEditsDirs, new ArrayList()); + } /** * Set the storage directories which will be used. 
This should only ever be @@ -265,7 +277,8 @@ public class NNStorage extends Storage implements Closeable { */ @VisibleForTesting synchronized void setStorageDirectories(Collection fsNameDirs, - Collection fsEditsDirs) + Collection fsEditsDirs, + Collection sharedEditsDirs) throws IOException { this.storageDirs.clear(); this.removedStorageDirs.clear(); @@ -289,7 +302,8 @@ public class NNStorage extends Storage implements Closeable { if(dirName.getScheme().compareTo(JournalType.FILE.name().toLowerCase()) == 0){ this.addStorageDir(new StorageDirectory(new File(dirName.getPath()), - dirType)); + dirType, + !sharedEditsDirs.contains(dirName))); // Don't lock the dir if it's shared. } } @@ -301,7 +315,7 @@ public class NNStorage extends Storage implements Closeable { if(dirName.getScheme().compareTo(JournalType.FILE.name().toLowerCase()) == 0) this.addStorageDir(new StorageDirectory(new File(dirName.getPath()), - NameNodeDirType.EDITS)); + NameNodeDirType.EDITS, !sharedEditsDirs.contains(dirName))); } } @@ -458,7 +472,7 @@ public class NNStorage extends Storage implements Closeable { /** * @return the transaction ID of the last checkpoint. */ - long getMostRecentCheckpointTxId() { + public long getMostRecentCheckpointTxId() { return mostRecentCheckpointTxId; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java index fe651001aa..fe75247b8e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFil import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile; import org.apache.hadoop.hdfs.util.MD5FileUtils; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -46,6 +47,7 @@ import com.google.common.collect.Sets; public class NNStorageRetentionManager { private final int numCheckpointsToRetain; + private final long numExtraEditsToRetain; private static final Log LOG = LogFactory.getLog( NNStorageRetentionManager.class); private final NNStorage storage; @@ -60,6 +62,15 @@ public class NNStorageRetentionManager { this.numCheckpointsToRetain = conf.getInt( DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY, DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT); + this.numExtraEditsToRetain = conf.getLong( + DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, + DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_DEFAULT); + Preconditions.checkArgument(numCheckpointsToRetain > 0, + "Must retain at least one checkpoint"); + Preconditions.checkArgument(numExtraEditsToRetain >= 0, + DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY + + " must not be negative"); + this.storage = storage; this.editLog = editLog; this.purger = purger; @@ -79,8 +90,12 @@ public class NNStorageRetentionManager { purgeCheckpointsOlderThan(inspector, minImageTxId); // If fsimage_N is the image we want to keep, then we need to keep // all txns > N. We can remove anything < N+1, since fsimage_N - // reflects the state up to and including N. - editLog.purgeLogsOlderThan(minImageTxId + 1); + // reflects the state up to and including N. 
However, we also + // provide a "cushion" of older txns that we keep, which is + // handy for HA, where a remote node may not have as many + // new images. + long purgeLogsFrom = Math.max(0, minImageTxId + 1 - numExtraEditsToRetain); + editLog.purgeLogsOlderThan(purgeLogsFrom); } private void purgeCheckpointsOlderThan( diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index a0f4d4b763..b62f0d5d9e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -17,9 +17,6 @@ */ package org.apache.hadoop.hdfs.server.namenode; -import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.*; - import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; @@ -33,22 +30,40 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.HealthCheckFailedException; +import org.apache.hadoop.ha.ServiceFailedException; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Trash; +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; +import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState; +import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; +import org.apache.hadoop.hdfs.server.namenode.ha.HAState; +import org.apache.hadoop.hdfs.server.namenode.ha.StandbyState; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; +import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; +import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.RefreshUserMappingsProtocol; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; +import org.apache.hadoop.tools.GetUserMappingsProtocol; import org.apache.hadoop.util.ServicePlugin; import org.apache.hadoop.util.StringUtils; @@ -95,6 +110,22 @@ public class NameNode { HdfsConfiguration.init(); } + /** + * Categories of operations supported by the namenode. 
+ */ + public static enum OperationCategory { + /** Operations that are state agnostic */ + UNCHECKED, + /** Read operation that does not change the namespace state */ + READ, + /** Write operation that changes the namespace state */ + WRITE, + /** Operations related to checkpointing */ + CHECKPOINT, + /** Operations related to {@link JournalProtocol} */ + JOURNAL + } + /** * HDFS federation configuration can have two types of parameters: *
      @@ -110,6 +141,7 @@ public class NameNode { DFS_NAMENODE_RPC_ADDRESS_KEY, DFS_NAMENODE_NAME_DIR_KEY, DFS_NAMENODE_EDITS_DIR_KEY, + DFS_NAMENODE_SHARED_EDITS_DIR_KEY, DFS_NAMENODE_CHECKPOINT_DIR_KEY, DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY, DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, @@ -124,14 +156,40 @@ public class NameNode { DFS_NAMENODE_BACKUP_SERVICE_RPC_ADDRESS_KEY }; - + public long getProtocolVersion(String protocol, + long clientVersion) throws IOException { + if (protocol.equals(ClientProtocol.class.getName())) { + return ClientProtocol.versionID; + } else if (protocol.equals(DatanodeProtocol.class.getName())){ + return DatanodeProtocol.versionID; + } else if (protocol.equals(NamenodeProtocol.class.getName())){ + return NamenodeProtocol.versionID; + } else if (protocol.equals(RefreshAuthorizationPolicyProtocol.class.getName())){ + return RefreshAuthorizationPolicyProtocol.versionID; + } else if (protocol.equals(RefreshUserMappingsProtocol.class.getName())){ + return RefreshUserMappingsProtocol.versionID; + } else if (protocol.equals(GetUserMappingsProtocol.class.getName())){ + return GetUserMappingsProtocol.versionID; + } else { + throw new IOException("Unknown protocol to name node: " + protocol); + } + } + public static final int DEFAULT_PORT = 8020; - public static final Log LOG = LogFactory.getLog(NameNode.class.getName()); public static final Log stateChangeLog = LogFactory.getLog("org.apache.hadoop.hdfs.StateChange"); + public static final HAState ACTIVE_STATE = new ActiveState(); + public static final HAState STANDBY_STATE = new StandbyState(); protected FSNamesystem namesystem; + protected final Configuration conf; protected NamenodeRole role; + private HAState state; + private final boolean haEnabled; + private final HAContext haContext; + protected boolean allowStaleStandbyReads; + + /** httpServer */ protected NameNodeHttpServer httpServer; private Thread emptier; @@ -212,7 +270,7 @@ public class NameNode { * @param filesystemURI * @return address of file system */ - static InetSocketAddress getAddress(URI filesystemURI) { + public static InetSocketAddress getAddress(URI filesystemURI) { String authority = filesystemURI.getAuthority(); if (authority == null) { throw new IllegalArgumentException(String.format( @@ -251,13 +309,11 @@ public class NameNode { * Given a configuration get the address of the service rpc server * If the service rpc is not configured returns null */ - protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) - throws IOException { + protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) { return NameNode.getServiceAddress(conf, false); } - protected InetSocketAddress getRpcServerAddress(Configuration conf) - throws IOException { + protected InetSocketAddress getRpcServerAddress(Configuration conf) { return getAddress(conf); } @@ -335,11 +391,7 @@ public class NameNode { throw e; } - activate(conf); - LOG.info(getRole() + " up at: " + rpcServer.getRpcAddress()); - if (rpcServer.getServiceRpcAddress() != null) { - LOG.info(getRole() + " service server is up at: " + rpcServer.getServiceRpcAddress()); - } + startCommonServices(conf); } /** @@ -373,19 +425,11 @@ public class NameNode { } } - /** - * Activate name-node servers and threads. 
- */ - void activate(Configuration conf) throws IOException { - if ((isRole(NamenodeRole.NAMENODE)) - && (UserGroupInformation.isSecurityEnabled())) { - namesystem.activateSecretManager(); - } - namesystem.activate(conf); + /** Start the services common to active and standby states */ + private void startCommonServices(Configuration conf) throws IOException { + namesystem.startCommonServices(conf, haContext); startHttpServer(conf); rpcServer.start(); - startTrashEmptier(conf); - plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY, ServicePlugin.class); for (ServicePlugin p: plugins) { @@ -395,8 +439,28 @@ public class NameNode { LOG.warn("ServicePlugin " + p + " could not be started", t); } } + LOG.info(getRole() + " up at: " + rpcServer.getRpcAddress()); + if (rpcServer.getServiceRpcAddress() != null) { + LOG.info(getRole() + " service server is up at: " + + rpcServer.getServiceRpcAddress()); + } } - + + private void stopCommonServices() { + if(namesystem != null) namesystem.close(); + if(rpcServer != null) rpcServer.stop(); + if (plugins != null) { + for (ServicePlugin p : plugins) { + try { + p.stop(); + } catch (Throwable t) { + LOG.warn("ServicePlugin " + p + " could not be stopped", t); + } + } + } + stopHttpServer(); + } + private void startTrashEmptier(Configuration conf) throws IOException { long trashInterval = conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, @@ -408,11 +472,26 @@ public class NameNode { this.emptier.start(); } + private void stopTrashEmptier() { + if (this.emptier != null) { + emptier.interrupt(); + emptier = null; + } + } + private void startHttpServer(final Configuration conf) throws IOException { httpServer = new NameNodeHttpServer(conf, this, getHttpServerAddress(conf)); httpServer.start(); setHttpServerAddress(conf); } + + private void stopHttpServer() { + try { + if (httpServer != null) httpServer.stop(); + } catch (Exception e) { + LOG.error("Exception while stopping httpserver", e); + } + } /** * Start NameNode. @@ -447,10 +526,23 @@ public class NameNode { protected NameNode(Configuration conf, NamenodeRole role) throws IOException { + this.conf = conf; this.role = role; + String nsId = getNameServiceId(conf); + String namenodeId = HAUtil.getNameNodeId(conf, nsId); + this.haEnabled = HAUtil.isHAEnabled(conf, nsId); + if (!haEnabled) { + state = ACTIVE_STATE; + } else { + state = STANDBY_STATE; + } + this.allowStaleStandbyReads = HAUtil.shouldAllowStandbyReads(conf); + this.haContext = createHAContext(); try { - initializeGenericKeys(conf, getNameServiceId(conf)); + initializeGenericKeys(conf, nsId, namenodeId); initialize(conf); + state.prepareToEnterState(haContext); + state.enterState(haContext); } catch (IOException e) { this.stop(); throw e; @@ -460,6 +552,10 @@ public class NameNode { } } + protected HAContext createHAContext() { + return new NameNodeHAContext(); + } + /** * Wait for service to finish. * (Normally, it runs forever.) 
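The constructor above now resolves the nameservice and namenode IDs before initializing the generic keys, and uses them to pick the node's initial HA state. A minimal sketch of that decision, using only calls that appear elsewhere in this patch; the wrapper method itself is hypothetical.

static HAState initialStateFor(Configuration conf) {
  String nsId = DFSUtil.getNamenodeNameServiceId(conf);
  String namenodeId = HAUtil.getNameNodeId(conf, nsId);
  NameNode.initializeGenericKeys(conf, nsId, namenodeId);
  // With HA enabled the node boots as a standby and waits to be promoted;
  // without HA it goes straight to the active state.
  return HAUtil.isHAEnabled(conf, nsId)
      ? NameNode.STANDBY_STATE
      : NameNode.ACTIVE_STATE;
}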
@@ -468,6 +564,7 @@ public class NameNode { try { this.rpcServer.join(); } catch (InterruptedException ie) { + LOG.info("Caught interrupted exception ", ie); } } @@ -480,23 +577,14 @@ public class NameNode { return; stopRequested = true; } - if (plugins != null) { - for (ServicePlugin p : plugins) { - try { - p.stop(); - } catch (Throwable t) { - LOG.warn("ServicePlugin " + p + " could not be stopped", t); - } - } - } try { - if (httpServer != null) httpServer.stop(); - } catch (Exception e) { - LOG.error("Exception while stopping httpserver", e); + if (state != null) { + state.exitState(haContext); + } + } catch (ServiceFailedException e) { + LOG.warn("Encountered exception while exiting state ", e); } - if(namesystem != null) namesystem.close(); - if(emptier != null) emptier.interrupt(); - if(rpcServer != null) rpcServer.stop(); + stopCommonServices(); if (metrics != null) { metrics.shutdown(); } @@ -561,6 +649,10 @@ public class NameNode { private static boolean format(Configuration conf, boolean isConfirmationNeeded) throws IOException { + String nsId = DFSUtil.getNamenodeNameServiceId(conf); + String namenodeId = HAUtil.getNameNodeId(conf, nsId); + initializeGenericKeys(conf, nsId, namenodeId); + if (!conf.getBoolean(DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY, DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT)) { throw new IOException("The option " + DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY @@ -571,7 +663,7 @@ public class NameNode { } Collection dirsToFormat = FSNamesystem.getNamespaceDirs(conf); - Collection editDirsToFormat = + List editDirsToFormat = FSNamesystem.getNamespaceEditsDirs(conf); for(Iterator it = dirsToFormat.iterator(); it.hasNext();) { File curDir = new File(it.next().getPath()); @@ -605,6 +697,10 @@ public class NameNode { private static boolean finalize(Configuration conf, boolean isConfirmationNeeded ) throws IOException { + String nsId = DFSUtil.getNamenodeNameServiceId(conf); + String namenodeId = HAUtil.getNameNodeId(conf, nsId); + initializeGenericKeys(conf, nsId, namenodeId); + FSNamesystem nsys = new FSNamesystem(conf, new FSImage(conf)); System.err.print( "\"finalize\" will remove the previous state of the files system.\n" @@ -721,6 +817,14 @@ public class NameNode { return null; } setStartupOption(conf, startOpt); + + if (HAUtil.isHAEnabled(conf, DFSUtil.getNamenodeNameServiceId(conf)) && + (startOpt == StartupOption.UPGRADE || + startOpt == StartupOption.ROLLBACK || + startOpt == StartupOption.FINALIZE)) { + throw new HadoopIllegalArgumentException("Invalid startup option. " + + "Cannot perform DFS upgrade with HA enabled."); + } switch (startOpt) { case FORMAT: @@ -761,16 +865,26 @@ public class NameNode { * @param conf * Configuration object to lookup specific key and to set the value * to the key passed. Note the conf object is modified - * @param nameserviceId name service Id - * @see DFSUtil#setGenericConf(Configuration, String, String...) + * @param nameserviceId name service Id (to distinguish federated NNs) + * @param namenodeId the namenode ID (to distinguish HA NNs) + * @see DFSUtil#setGenericConf(Configuration, String, String, String...) 
*/ - public static void initializeGenericKeys(Configuration conf, String - nameserviceId) { - if ((nameserviceId == null) || nameserviceId.isEmpty()) { + public static void initializeGenericKeys(Configuration conf, + String nameserviceId, String namenodeId) { + if ((nameserviceId == null || nameserviceId.isEmpty()) && + (namenodeId == null || namenodeId.isEmpty())) { return; } - DFSUtil.setGenericConf(conf, nameserviceId, NAMESERVICE_SPECIFIC_KEYS); + if (nameserviceId != null) { + conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId); + } + if (namenodeId != null) { + conf.set(DFS_HA_NAMENODE_ID_KEY, namenodeId); + } + + DFSUtil.setGenericConf(conf, nameserviceId, namenodeId, + NAMESERVICE_SPECIFIC_KEYS); if (conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY) != null) { URI defaultUri = URI.create(HdfsConstants.HDFS_URI_SCHEME + "://" + conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY)); @@ -799,4 +913,131 @@ public class NameNode { System.exit(-1); } } + + synchronized void monitorHealth() + throws HealthCheckFailedException, AccessControlException { + namesystem.checkSuperuserPrivilege(); + if (!haEnabled) { + return; // no-op, if HA is not enabled + } + getNamesystem().checkAvailableResources(); + if (!getNamesystem().nameNodeHasResourcesAvailable()) { + throw new HealthCheckFailedException( + "The NameNode has no resources available"); + } + } + + synchronized void transitionToActive() + throws ServiceFailedException, AccessControlException { + namesystem.checkSuperuserPrivilege(); + if (!haEnabled) { + throw new ServiceFailedException("HA for namenode is not enabled"); + } + state.setState(haContext, ACTIVE_STATE); + } + + synchronized void transitionToStandby() + throws ServiceFailedException, AccessControlException { + namesystem.checkSuperuserPrivilege(); + if (!haEnabled) { + throw new ServiceFailedException("HA for namenode is not enabled"); + } + state.setState(haContext, STANDBY_STATE); + } + + synchronized HAServiceState getServiceState() throws AccessControlException { + namesystem.checkSuperuserPrivilege(); + if (state == null) { + return HAServiceState.INITIALIZING; + } + return state.getServiceState(); + } + + synchronized boolean readyToBecomeActive() + throws ServiceFailedException, AccessControlException { + namesystem.checkSuperuserPrivilege(); + if (!haEnabled) { + throw new ServiceFailedException("HA for namenode is not enabled"); + } + return !isInSafeMode(); + } + + + /** + * Class used as expose {@link NameNode} as context to {@link HAState} + * + * TODO(HA): + * When entering and exiting state, on failing to start services, + * appropriate action is needed todo either shutdown the node or recover + * from failure. 
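The three-argument initializeGenericKeys() above extends the federation-era key rewriting to HA: a key qualified by both the nameservice ID and the namenode ID is copied onto its generic, un-suffixed name, so the rest of the NameNode code never needs to know about either suffix. A minimal sketch of that resolution, assuming DFS_NAMENODE_RPC_ADDRESS_KEY is "dfs.namenode.rpc-address" and using hypothetical IDs ns1/nn1:

    Configuration conf = new HdfsConfiguration();
    conf.set("dfs.namenode.rpc-address.ns1.nn1", "nn1.example.com:8020");  // per-NN form
    NameNode.initializeGenericKeys(conf, "ns1", "nn1");
    // The generic key is now populated, and fs.defaultFS is re-derived from it.
    assert "nn1.example.com:8020".equals(conf.get("dfs.namenode.rpc-address"));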
+ */ + protected class NameNodeHAContext implements HAContext { + @Override + public void setState(HAState s) { + state = s; + } + + @Override + public HAState getState() { + return state; + } + + @Override + public void startActiveServices() throws IOException { + namesystem.startActiveServices(); + startTrashEmptier(conf); + } + + @Override + public void stopActiveServices() throws IOException { + if (namesystem != null) { + namesystem.stopActiveServices(); + } + stopTrashEmptier(); + } + + @Override + public void startStandbyServices() throws IOException { + namesystem.startStandbyServices(); + } + + @Override + public void prepareToStopStandbyServices() throws ServiceFailedException { + namesystem.prepareToStopStandbyServices(); + } + + @Override + public void stopStandbyServices() throws IOException { + if (namesystem != null) { + namesystem.stopStandbyServices(); + } + } + + @Override + public void writeLock() { + namesystem.writeLock(); + } + + @Override + public void writeUnlock() { + namesystem.writeUnlock(); + } + + /** Check if an operation of given category is allowed */ + @Override + public void checkOperation(final OperationCategory op) + throws StandbyException { + state.checkOperation(haContext, op); + } + + @Override + public boolean allowStaleReads() { + return allowStaleStandbyReads; + } + + } + + public boolean isStandbyState() { + return (state.equals(STANDBY_STATE)); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java index e4817c7b18..a024a5524a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java @@ -46,7 +46,7 @@ import com.google.common.base.Predicate; * are added by default, and arbitrary extra volumes may be configured as well. */ @InterfaceAudience.Private -class NameNodeResourceChecker { +public class NameNodeResourceChecker { private static final Log LOG = LogFactory.getLog(NameNodeResourceChecker.class.getName()); // Space (in bytes) reserved per volume. @@ -176,8 +176,7 @@ class NameNodeResourceChecker { * least one redundant volume and all of the required volumes, false * otherwise. */ - boolean hasAvailableDiskSpace() - throws IOException { + public boolean hasAvailableDiskSpace() { return NameNodeResourcePolicy.areResourcesAvailable(volumes.values(), minimumRedundantVolumes); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java index 53cd867fbc..3896165ff3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java @@ -37,9 +37,6 @@ final class NameNodeResourcePolicy { * required to continue operation. * @return true if and only if there are sufficient NN resources to * continue logging edits. - * @throws RuntimeException if the number of configured - * redundant resources is fewer than the minimum number of available - * redundant resources. 
*/ static boolean areResourcesAvailable( Collection resources, @@ -63,12 +60,6 @@ final class NameNodeResourcePolicy { } } - if (redundantResourceCount < minimumRedundantResources) { - throw new RuntimeException("Need a minimum of " + minimumRedundantResources - + " for NN to operate but only " + redundantResourceCount - + " are configured."); - } - if (redundantResourceCount == 0) { // If there are no redundant resources, return true if there are any // required resources available. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java index 075060093c..17b387cb42 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java @@ -41,6 +41,13 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnresolvedLinkException; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.PermissionStatus; +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.ha.HealthCheckFailedException; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceProtocolService; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolPB; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolServerSideTranslatorPB; + import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HDFSPolicyProvider; import org.apache.hadoop.hdfs.HdfsConfiguration; @@ -82,6 +89,7 @@ import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifie import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.IncorrectVersionException; import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; @@ -90,6 +98,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; @@ -104,6 +113,7 @@ import org.apache.hadoop.io.EnumSetWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.ipc.WritableRpcEngine; import org.apache.hadoop.net.Node; @@ -127,7 +137,7 @@ class NameNodeRpcServer implements NamenodeProtocols { private static final Log stateChangeLog = NameNode.stateChangeLog; // Dependencies from other parts of NN. 
- private final FSNamesystem namesystem; + protected final FSNamesystem namesystem; protected final NameNode nn; private final NameNodeMetrics metrics; @@ -183,6 +193,11 @@ class NameNodeRpcServer implements NamenodeProtocols { new GetUserMappingsProtocolServerSideTranslatorPB(this); BlockingService getUserMappingService = GetUserMappingsProtocolService .newReflectiveBlockingService(getUserMappingXlator); + + HAServiceProtocolServerSideTranslatorPB haServiceProtocolXlator = + new HAServiceProtocolServerSideTranslatorPB(this); + BlockingService haPbService = HAServiceProtocolService + .newReflectiveBlockingService(haServiceProtocolXlator); WritableRpcEngine.ensureInitialized(); @@ -198,6 +213,8 @@ class NameNodeRpcServer implements NamenodeProtocols { dnSocketAddr.getHostName(), dnSocketAddr.getPort(), serviceHandlerCount, false, conf, namesystem.getDelegationTokenSecretManager()); + DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService, + serviceRpcServer); DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService, serviceRpcServer); DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService, @@ -221,6 +238,8 @@ class NameNodeRpcServer implements NamenodeProtocols { clientNNPbService, socAddr.getHostName(), socAddr.getPort(), handlerCount, false, conf, namesystem.getDelegationTokenSecretManager()); + DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService, + clientRpcServer); DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService, clientRpcServer); DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService, @@ -287,7 +306,7 @@ class NameNodeRpcServer implements NamenodeProtocols { throw new IllegalArgumentException( "Unexpected not positive size: "+size); } - + namesystem.checkOperation(OperationCategory.READ); return namesystem.getBlockManager().getBlocks(datanode, size); } @@ -300,6 +319,7 @@ class NameNodeRpcServer implements NamenodeProtocols { public void errorReport(NamenodeRegistration registration, int errorCode, String msg) throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); verifyRequest(registration); LOG.info("Error report from " + registration + ": " + msg); if(errorCode == FATAL) @@ -327,9 +347,6 @@ class NameNodeRpcServer implements NamenodeProtocols { @Override // NamenodeProtocol public void endCheckpoint(NamenodeRegistration registration, CheckpointSignature sig) throws IOException { - verifyRequest(registration); - if(!nn.isRole(NamenodeRole.NAMENODE)) - throw new IOException("Only an ACTIVE node can invoke endCheckpoint."); namesystem.endCheckpoint(registration, sig); } @@ -478,10 +495,10 @@ class NameNodeRpcServer implements NamenodeProtocols { return namesystem.getAdditionalDatanode(src, blk, existings, excludeSet, numAdditionalNodes, clientName); } - /** * The client needs to give up on the block. */ + @Override // ClientProtocol public void abandonBlock(ExtendedBlock b, String src, String holder) throws IOException { if(stateChangeLog.isDebugEnabled()) { @@ -509,18 +526,9 @@ class NameNodeRpcServer implements NamenodeProtocols { * mark the block as corrupt. In the future we might * check the blocks are actually corrupt. 
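The namesystem.checkOperation(OperationCategory...) calls threaded through the RPC methods above are the standby guard: each operation declares its category, the current HAState decides whether that category is allowed, and a disallowed call surfaces as a StandbyException, which the client-side failover policy treats as a cue to retry against the other NameNode. A hedged caller's-eye sketch of that contract (StandbyException extends IOException; the standby-side decision itself lives in StandbyState, outside this excerpt, and someMutation is a made-up method name):

    public void someMutation(String src) throws IOException {
      namesystem.checkOperation(OperationCategory.WRITE);  // throws StandbyException on a standby NN
      // ... perform the write against the active namesystem ...
    }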
*/ - @Override + @Override // ClientProtocol, DatanodeProtocol public void reportBadBlocks(LocatedBlock[] blocks) throws IOException { - stateChangeLog.info("*DIR* NameNode.reportBadBlocks"); - for (int i = 0; i < blocks.length; i++) { - ExtendedBlock blk = blocks[i].getBlock(); - DatanodeInfo[] nodes = blocks[i].getLocations(); - for (int j = 0; j < nodes.length; j++) { - DatanodeInfo dn = nodes[j]; - namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn, - "client machine reported it"); - } - } + namesystem.reportBadBlocks(blocks); } @Override // ClientProtocol @@ -633,8 +641,7 @@ class NameNodeRpcServer implements NamenodeProtocols { @Override // ClientProtocol public DirectoryListing getListing(String src, byte[] startAfter, - boolean needLocation) - throws IOException { + boolean needLocation) throws IOException { DirectoryListing files = namesystem.getListing( src, startAfter, needLocation); if (files != null) { @@ -656,14 +663,16 @@ class NameNodeRpcServer implements NamenodeProtocols { return namesystem.getFileInfo(src, false); } - @Override - public long[] getStats() { + @Override // ClientProtocol + public long[] getStats() throws IOException { + namesystem.checkOperation(OperationCategory.READ); return namesystem.getStats(); } @Override // ClientProtocol public DatanodeInfo[] getDatanodeReport(DatanodeReportType type) - throws IOException { + throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); DatanodeInfo results[] = namesystem.datanodeReport(type); if (results == null ) { throw new IOException("Cannot find datanode report"); @@ -673,28 +682,32 @@ class NameNodeRpcServer implements NamenodeProtocols { @Override // ClientProtocol public boolean setSafeMode(SafeModeAction action) throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); return namesystem.setSafeMode(action); } @Override // ClientProtocol - public boolean restoreFailedStorage(String arg) - throws AccessControlException { + public boolean restoreFailedStorage(String arg) throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); return namesystem.restoreFailedStorage(arg); } @Override // ClientProtocol public void saveNamespace() throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); namesystem.saveNamespace(); } @Override // ClientProtocol public void refreshNodes() throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); namesystem.getBlockManager().getDatanodeManager().refreshNodes( new HdfsConfiguration()); } @Override // NamenodeProtocol - public long getTransactionID() { + public long getTransactionID() throws IOException { + namesystem.checkOperation(OperationCategory.CHECKPOINT); return namesystem.getEditLog().getSyncTxId(); } @@ -703,32 +716,36 @@ class NameNodeRpcServer implements NamenodeProtocols { return namesystem.rollEditLog(); } - @Override + @Override // NamenodeProtocol public RemoteEditLogManifest getEditLogManifest(long sinceTxId) throws IOException { + namesystem.checkOperation(OperationCategory.READ); return namesystem.getEditLog().getEditLogManifest(sinceTxId); } @Override // ClientProtocol public void finalizeUpgrade() throws IOException { + namesystem.checkOperation(OperationCategory.WRITE); namesystem.finalizeUpgrade(); } @Override // ClientProtocol public UpgradeStatusReport distributedUpgradeProgress(UpgradeAction action) throws IOException { + namesystem.checkOperation(OperationCategory.READ); return namesystem.distributedUpgradeProgress(action); } @Override // 
ClientProtocol public void metaSave(String filename) throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); namesystem.metaSave(filename); } @Override // ClientProtocol public CorruptFileBlocks listCorruptFileBlocks(String path, String cookie) throws IOException { - String[] cookieTab = new String[] { cookie }; + String[] cookieTab = new String[] { cookie }; Collection fbs = namesystem.listCorruptFileBlocks(path, cookieTab); @@ -743,11 +760,12 @@ class NameNodeRpcServer implements NamenodeProtocols { /** * Tell all datanodes to use a new, non-persistent bandwidth value for * dfs.datanode.balance.bandwidthPerSec. - * @param bandwidth Blanacer bandwidth in bytes per second for all datanodes. + * @param bandwidth Balancer bandwidth in bytes per second for all datanodes. * @throws IOException */ @Override // ClientProtocol public void setBalancerBandwidth(long bandwidth) throws IOException { + namesystem.checkOperation(OperationCategory.UNCHECKED); namesystem.getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth); } @@ -822,7 +840,7 @@ class NameNodeRpcServer implements NamenodeProtocols { } @Override // DatanodeProtocol - public DatanodeCommand[] sendHeartbeat(DatanodeRegistration nodeReg, + public HeartbeatResponse sendHeartbeat(DatanodeRegistration nodeReg, StorageReport[] report, int xmitsInProgress, int xceiverCount, int failedVolumes) throws IOException { verifyRequest(nodeReg); @@ -844,7 +862,7 @@ class NameNodeRpcServer implements NamenodeProtocols { } namesystem.getBlockManager().processReport(nodeReg, poolId, blist); - if (nn.getFSImage().isUpgradeFinalized()) + if (nn.getFSImage().isUpgradeFinalized() && !nn.isStandbyState()) return new FinalizeCommand(poolId); return null; } @@ -858,7 +876,7 @@ class NameNodeRpcServer implements NamenodeProtocols { +"from "+nodeReg.getName()+" "+receivedAndDeletedBlocks.length +" blocks."); } - namesystem.getBlockManager().blockReceivedAndDeleted( + namesystem.getBlockManager().processIncrementalBlockReport( nodeReg, poolId, receivedAndDeletedBlocks[0].getBlocks()); } @@ -946,6 +964,35 @@ class NameNodeRpcServer implements NamenodeProtocols { return UserGroupInformation.createRemoteUser(user).getGroupNames(); } + @Override // HAServiceProtocol + public synchronized void monitorHealth() + throws HealthCheckFailedException, AccessControlException { + nn.monitorHealth(); + } + + @Override // HAServiceProtocol + public synchronized void transitionToActive() + throws ServiceFailedException, AccessControlException { + nn.transitionToActive(); + } + + @Override // HAServiceProtocol + public synchronized void transitionToStandby() + throws ServiceFailedException, AccessControlException { + nn.transitionToStandby(); + } + + @Override // HAServiceProtocol + public synchronized HAServiceState getServiceState() + throws AccessControlException { + return nn.getServiceState(); + } + + @Override // HAServiceProtocol + public synchronized boolean readyToBecomeActive() + throws ServiceFailedException, AccessControlException { + return nn.readyToBecomeActive(); + } /** * Verify version. 
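With the HAServiceProtocolPB service registered on both RPC servers above, any HAServiceProtocol client (an HAAdmin-style CLI, or eventually an automatic failover controller) can drive the new endpoints. A minimal sketch of the promote-the-standby sequence, written against only the methods shown in this patch; how the proxy is obtained is deliberately left abstract:

    import java.io.IOException;
    import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;

    class PromoteSketch {
      /** Promote 'standby' if it is healthy and ready; illustrative only. */
      static void promote(HAServiceProtocol standby) throws IOException {
        standby.monitorHealth();                       // HealthCheckFailedException if NN resources are low
        if (standby.getServiceState() == HAServiceState.STANDBY
            && standby.readyToBecomeActive()) {        // false while the NN is still in safe mode
          standby.transitionToActive();
        }
      }
    }

On the NameNode side, transitionToActive() hands the request to the current HAState; entering ActiveState then calls back into NameNodeHAContext.startActiveServices(), which starts the namesystem's active services and the trash emptier, as wired earlier in this patch.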
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java index 64b2723988..44c07510ba 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java @@ -36,6 +36,7 @@ import javax.servlet.http.HttpServletResponse; import javax.servlet.jsp.JspWriter; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.protocol.Block; @@ -309,7 +310,16 @@ class NamenodeJspHelper { long bpUsed = fsnStats[6]; float percentBpUsed = DFSUtil.getPercentUsed(bpUsed, total); - + + // don't show under-replicated/missing blocks or corrupt files for SBN + // since the standby namenode doesn't compute replication queues + String underReplicatedBlocks = ""; + if (nn.getServiceState() == HAServiceState.ACTIVE) { + underReplicatedBlocks = rowTxt() + + colTxt("Excludes missing blocks.") + + "Number of Under-Replicated Blocks" + colTxt() + ":" + colTxt() + + fsn.getBlockManager().getUnderReplicatedNotMissingBlocks(); + } out.print("
      \n" + rowTxt() + colTxt() + "Configured Capacity" + colTxt() + ":" + colTxt() + StringUtils.byteDesc(total) + rowTxt() + colTxt() + "DFS Used" @@ -344,10 +354,8 @@ class NamenodeJspHelper { + rowTxt() + colTxt() + "" + "Decommissioning Nodes " - + colTxt() + ":" + colTxt() + decommissioning.size() - + rowTxt() + colTxt("Excludes missing blocks.") - + "Number of Under-Replicated Blocks" + colTxt() + ":" + colTxt() - + fsn.getBlockManager().getUnderReplicatedNotMissingBlocks() + + colTxt() + ":" + colTxt() + decommissioning.size() + + underReplicatedBlocks + "

      \n"); if (live.isEmpty() && dead.isEmpty()) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java index 6846e959a4..c453db561e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java @@ -32,4 +32,10 @@ public interface Namesystem extends RwLock, SafeMode { /** @return the block pool ID */ public String getBlockPoolId(); + + public boolean isInStandbyState(); + + public boolean isGenStampInFuture(long generationStamp); + + public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal); } \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java index 2731275f26..5b49f0ee47 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java @@ -18,8 +18,10 @@ package org.apache.hadoop.hdfs.server.namenode; import java.io.IOException; +import org.apache.hadoop.classification.InterfaceAudience;; -class SaveNamespaceCancelledException extends IOException { +@InterfaceAudience.Private +public class SaveNamespaceCancelledException extends IOException { private static final long serialVersionUID = 1L; SaveNamespaceCancelledException(String cancelReason) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java index 50dca62e8f..c1ce79e439 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java @@ -45,10 +45,11 @@ import org.apache.hadoop.fs.FileSystem; import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.DFSUtil.ErrorSimulator; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.protocol.HdfsConstants; -import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; import org.apache.hadoop.hdfs.server.common.JspHelper; @@ -112,17 +113,9 @@ public class SecondaryNameNode implements Runnable { private String infoBindAddress; private Collection checkpointDirs; - private Collection checkpointEditsDirs; - - /** How often to checkpoint regardless of number of txns */ - private long checkpointPeriod; // in seconds - - /** How often to poll the NN to check checkpointTxnCount */ - private long checkpointCheckPeriod; // in seconds - - /** checkpoint once every this many transactions, regardless of time */ - private long checkpointTxnCount; + private List checkpointEditsDirs; + private 
CheckpointConf checkpointConf; private FSNamesystem namesystem; @@ -132,9 +125,9 @@ public class SecondaryNameNode implements Runnable { + "\nName Node Address : " + nameNodeAddr + "\nStart Time : " + new Date(starttime) + "\nLast Checkpoint Time : " + (lastCheckpointTime == 0? "--": new Date(lastCheckpointTime)) - + "\nCheckpoint Period : " + checkpointPeriod + " seconds" - + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointTxnCount) - + " (= " + checkpointTxnCount + " bytes)" + + "\nCheckpoint Period : " + checkpointConf.getPeriod() + " seconds" + + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointConf.getTxnCount()) + + " (= " + checkpointConf.getTxnCount() + " bytes)" + "\nCheckpoint Dirs : " + checkpointDirs + "\nCheckpoint Edits Dirs: " + checkpointEditsDirs; } @@ -174,16 +167,19 @@ public class SecondaryNameNode implements Runnable { public SecondaryNameNode(Configuration conf, CommandLineOpts commandLineOpts) throws IOException { try { - NameNode.initializeGenericKeys(conf, - DFSUtil.getSecondaryNameServiceId(conf)); + String nsId = DFSUtil.getSecondaryNameServiceId(conf); + if (HAUtil.isHAEnabled(conf, nsId)) { + throw new IOException( + "Cannot use SecondaryNameNode in an HA cluster." + + " The Standby Namenode will perform checkpointing."); + } + NameNode.initializeGenericKeys(conf, nsId, null); initialize(conf, commandLineOpts); - } catch(IOException e) { + } catch (IOException e) { shutdown(); - LOG.fatal("Failed to start secondary namenode. ", e); throw e; - } catch(HadoopIllegalArgumentException e) { + } catch (HadoopIllegalArgumentException e) { shutdown(); - LOG.fatal("Failed to start secondary namenode. ", e); throw e; } } @@ -216,8 +212,9 @@ public class SecondaryNameNode implements Runnable { nameNodeAddr = NameNode.getServiceAddress(conf, true); this.conf = conf; - this.namenode = new NamenodeProtocolTranslatorPB(nameNodeAddr, conf, - UserGroupInformation.getCurrentUser()); + this.namenode = NameNodeProxies.createNonHAProxy(conf, nameNodeAddr, + NamenodeProtocol.class, UserGroupInformation.getCurrentUser(), + true).getProxy(); // initialize checkpoint directories fsName = getInfoServer(); @@ -231,16 +228,8 @@ public class SecondaryNameNode implements Runnable { namesystem = new FSNamesystem(conf, checkpointImage); // Initialize other scheduling parameters from the configuration - checkpointCheckPeriod = conf.getLong( - DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, - DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT); - - checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, - DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); - checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY, - DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); - warnForDeprecatedConfigs(conf); - + checkpointConf = new CheckpointConf(conf); + // initialize the webserver for uploading files. // Kerberized SSL servers must be run from the host principal... 
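The scattered checkpointPeriod/checkpointTxnCount/checkpointCheckPeriod fields are collapsed above into a single CheckpointConf object, presumably so the standby NameNode's checkpointer can share the same scheduling settings. A small usage sketch of the accessors this part of the patch relies on; the dfs.namenode.checkpoint.* key names in the comments are my reading of the constants, not spelled out in this hunk:

    CheckpointConf checkpointConf = new CheckpointConf(conf);
    long periodSecs   = checkpointConf.getPeriod();      // dfs.namenode.checkpoint.period
    long txnThreshold = checkpointConf.getTxnCount();    // dfs.namenode.checkpoint.txns
    long pollSecs     = checkpointConf.getCheckPeriod(); // min(check period, period), used as the 2NN poll interval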
UserGroupInformation httpUGI = @@ -296,21 +285,9 @@ public class SecondaryNameNode implements Runnable { conf.set(DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, infoBindAddress + ":" +infoPort); LOG.info("Secondary Web-server up at: " + infoBindAddress + ":" +infoPort); LOG.info("Secondary image servlet up at: " + infoBindAddress + ":" + imagePort); - LOG.info("Checkpoint Period :" + checkpointPeriod + " secs " + - "(" + checkpointPeriod/60 + " min)"); - LOG.info("Log Size Trigger :" + checkpointTxnCount + " txns"); - } - - static void warnForDeprecatedConfigs(Configuration conf) { - for (String key : ImmutableList.of( - "fs.checkpoint.size", - "dfs.namenode.checkpoint.size")) { - if (conf.get(key) != null) { - LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." + - " Instead please specify a value for " + - DFS_NAMENODE_CHECKPOINT_TXNS_KEY); - } - } + LOG.info("Checkpoint Period :" + checkpointConf.getPeriod() + " secs " + + "(" + checkpointConf.getPeriod()/60 + " min)"); + LOG.info("Log Size Trigger :" + checkpointConf.getTxnCount() + " txns"); } /** @@ -332,36 +309,24 @@ public class SecondaryNameNode implements Runnable { } public void run() { - if (UserGroupInformation.isSecurityEnabled()) { - UserGroupInformation ugi = null; - try { - ugi = UserGroupInformation.getLoginUser(); - } catch (IOException e) { - LOG.error("Exception while getting login user", e); - e.printStackTrace(); - Runtime.getRuntime().exit(-1); - } - ugi.doAs(new PrivilegedAction() { + SecurityUtil.doAsLoginUserOrFatal( + new PrivilegedAction() { @Override public Object run() { doWork(); return null; } }); - } else { - doWork(); - } } // // The main work loop // public void doWork() { - // // Poll the Namenode (once every checkpointCheckPeriod seconds) to find the // number of transactions in the edit log that haven't yet been checkpointed. // - long period = Math.min(checkpointCheckPeriod, checkpointPeriod); + long period = checkpointConf.getCheckPeriod(); while (shouldRun) { try { @@ -380,7 +345,7 @@ public class SecondaryNameNode implements Runnable { long now = System.currentTimeMillis(); if (shouldCheckpointBasedOnCount() || - now >= lastCheckpointTime + 1000 * checkpointPeriod) { + now >= lastCheckpointTime + 1000 * checkpointConf.getPeriod()) { doCheckpoint(); lastCheckpointTime = now; } @@ -471,19 +436,10 @@ public class SecondaryNameNode implements Runnable { } String configuredAddress = DFSUtil.getInfoServer(null, conf, true); - InetSocketAddress sockAddr = NetUtils.createSocketAddr(configuredAddress); - if (sockAddr.getAddress().isAnyLocalAddress()) { - if(UserGroupInformation.isSecurityEnabled()) { - throw new IOException("Cannot use a wildcard address with security. 
" + - "Must explicitly set bind address for Kerberos"); - } - return fsName.getHost() + ":" + sockAddr.getPort(); - } else { - if(LOG.isDebugEnabled()) { - LOG.debug("configuredAddress = " + configuredAddress); - } - return configuredAddress; - } + String address = DFSUtil.substituteForWildcardAddress(configuredAddress, + fsName.getHost()); + LOG.debug("Will connect to NameNode at HTTP address: " + address); + return address; } /** @@ -574,13 +530,13 @@ public class SecondaryNameNode implements Runnable { switch (opts.getCommand()) { case CHECKPOINT: long count = countUncheckpointedTxns(); - if (count > checkpointTxnCount || + if (count > checkpointConf.getTxnCount() || opts.shouldForceCheckpoint()) { doCheckpoint(); } else { System.err.println("EditLog size " + count + " transactions is " + "smaller than configured checkpoint " + - "interval " + checkpointTxnCount + " transactions."); + "interval " + checkpointConf.getTxnCount() + " transactions."); System.err.println("Skipping checkpoint."); } break; @@ -626,7 +582,7 @@ public class SecondaryNameNode implements Runnable { } boolean shouldCheckpointBasedOnCount() throws IOException { - return countUncheckpointedTxns() >= checkpointTxnCount; + return countUncheckpointedTxns() >= checkpointConf.getTxnCount(); } /** @@ -642,7 +598,13 @@ public class SecondaryNameNode implements Runnable { StringUtils.startupShutdownMessage(SecondaryNameNode.class, argv, LOG); Configuration tconf = new HdfsConfiguration(); - SecondaryNameNode secondary = new SecondaryNameNode(tconf, opts); + SecondaryNameNode secondary = null; + try { + secondary = new SecondaryNameNode(tconf, opts); + } catch (IOException ioe) { + LOG.fatal("Failed to start secondary namenode", ioe); + System.exit(-1); + } if (opts.getCommand() != null) { int ret = secondary.processStartupCommand(opts); @@ -759,7 +721,7 @@ public class SecondaryNameNode implements Runnable { */ CheckpointStorage(Configuration conf, Collection imageDirs, - Collection editsDirs) throws IOException { + List editsDirs) throws IOException { super(conf, imageDirs, editsDirs); // the 2NN never writes edits -- it only downloads them. So diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java index cc8dccaf1a..985d85ba98 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java @@ -24,8 +24,11 @@ import java.security.MessageDigest; import java.util.List; import java.lang.Math; +import javax.servlet.http.HttpServletResponse; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; @@ -41,7 +44,8 @@ import com.google.common.collect.Lists; /** * This class provides fetching a specified file from the NameNode. 
*/ -class TransferFsImage { +@InterfaceAudience.Private +public class TransferFsImage { public final static String CONTENT_LENGTH = "Content-Length"; public final static String MD5_HEADER = "X-MD5-Digest"; @@ -103,7 +107,7 @@ class TransferFsImage { * @param storage the storage directory to transfer the image from * @param txid the transaction ID of the image to be uploaded */ - static void uploadImageFromStorage(String fsName, + public static void uploadImageFromStorage(String fsName, InetSocketAddress imageListenAddress, NNStorage storage, long txid) throws IOException { @@ -111,7 +115,20 @@ class TransferFsImage { txid, imageListenAddress, storage); // this doesn't directly upload an image, but rather asks the NN // to connect back to the 2NN to download the specified image. - TransferFsImage.getFileClient(fsName, fileid, null, null, false); + try { + TransferFsImage.getFileClient(fsName, fileid, null, null, false); + } catch (HttpGetFailedException e) { + if (e.getResponseCode() == HttpServletResponse.SC_CONFLICT) { + // this is OK - this means that a previous attempt to upload + // this checkpoint succeeded even though we thought it failed. + LOG.info("Image upload with txid " + txid + + " conflicted with a previous image upload to the " + + "same NameNode. Continuing...", e); + return; + } else { + throw e; + } + } LOG.info("Uploaded image with txid " + txid + " to namenode at " + fsName); } @@ -194,10 +211,11 @@ class TransferFsImage { HttpURLConnection connection = (HttpURLConnection) url.openConnection(); if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) { - throw new IOException( + throw new HttpGetFailedException( "Image transfer servlet at " + url + " failed with status code " + connection.getResponseCode() + - "\nResponse message:\n" + connection.getResponseMessage()); + "\nResponse message:\n" + connection.getResponseMessage(), + connection); } long advertisedSize; @@ -289,5 +307,19 @@ class TransferFsImage { String header = connection.getHeaderField(MD5_HEADER); return (header != null) ? 
new MD5Hash(header) : null; } + + public static class HttpGetFailedException extends IOException { + private static final long serialVersionUID = 1L; + private final int responseCode; + + HttpGetFailedException(String msg, HttpURLConnection connection) throws IOException { + super(msg); + this.responseCode = connection.getResponseCode(); + } + + public int getResponseCode() { + return responseCode; + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java index 9ac17fc57c..ca7e1d7787 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java @@ -32,8 +32,7 @@ public class UnsupportedActionException extends IOException { /** for java.io.Serializable */ private static final long serialVersionUID = 1L; - public UnsupportedActionException(String action) { - super("Action " + action + "() is not supported."); + public UnsupportedActionException(String msg) { + super(msg); } - } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java new file mode 100644 index 0000000000..a61e134cc7 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; + +/** + * Active state of the namenode. In this state, namenode provides the namenode + * service and handles operations of type {@link OperationCategory#WRITE} and + * {@link OperationCategory#READ}. 
+ */ +@InterfaceAudience.Private +public class ActiveState extends HAState { + public ActiveState() { + super(HAServiceState.ACTIVE); + } + + @Override + public void checkOperation(HAContext context, OperationCategory op) { + return; // Other than journal all operations are allowed in active state + } + + @Override + public boolean shouldPopulateReplQueues() { + return true; + } + + @Override + public void setState(HAContext context, HAState s) throws ServiceFailedException { + if (s == NameNode.STANDBY_STATE) { + setStateInternal(context, s); + return; + } + super.setState(context, s); + } + + @Override + public void enterState(HAContext context) throws ServiceFailedException { + try { + context.startActiveServices(); + } catch (IOException e) { + throw new ServiceFailedException("Failed to start active services", e); + } + } + + @Override + public void exitState(HAContext context) throws ServiceFailedException { + try { + context.stopActiveServices(); + } catch (IOException e) { + throw new ServiceFailedException("Failed to stop active services", e); + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java new file mode 100644 index 0000000000..a20880aad6 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.Closeable; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.NameNodeProxies; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; +import org.apache.hadoop.io.retry.FailoverProxyProvider; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.security.UserGroupInformation; + +import com.google.common.base.Preconditions; + +/** + * A FailoverProxyProvider implementation which allows one to configure two URIs + * to connect to during fail-over. The first configured address is tried first, + * and on a fail-over event the other address is tried. 
+ */ +public class ConfiguredFailoverProxyProvider implements + FailoverProxyProvider { + + private static final Log LOG = + LogFactory.getLog(ConfiguredFailoverProxyProvider.class); + + private final Configuration conf; + private final List> proxies = + new ArrayList>(); + private final UserGroupInformation ugi; + private final Class xface; + + private int currentProxyIndex = 0; + + public ConfiguredFailoverProxyProvider(Configuration conf, URI uri, + Class xface) { + Preconditions.checkArgument( + xface.isAssignableFrom(NamenodeProtocols.class), + "Interface class %s is not a valid NameNode protocol!"); + this.xface = xface; + + this.conf = new Configuration(conf); + int maxRetries = this.conf.getInt( + DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_KEY, + DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_DEFAULT); + this.conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, + maxRetries); + + int maxRetriesOnSocketTimeouts = this.conf.getInt( + DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT); + this.conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + maxRetriesOnSocketTimeouts); + + try { + ugi = UserGroupInformation.getCurrentUser(); + + Map> map = DFSUtil.getHaNnRpcAddresses( + conf); + Map addressesInNN = map.get(uri.getHost()); + + if (addressesInNN == null || addressesInNN.size() == 0) { + throw new RuntimeException("Could not find any configured addresses " + + "for URI " + uri); + } + + for (InetSocketAddress address : addressesInNN.values()) { + proxies.add(new AddressRpcProxyPair(address)); + + // The client may have a delegation token set for the logical + // URI of the cluster. Clone this token to apply to each of the + // underlying IPC addresses so that the IPC code can find it. + HAUtil.cloneDelegationTokenForLogicalUri(ugi, uri, address); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public Class getInterface() { + return xface; + } + + /** + * Lazily initialize the RPC proxy object. + */ + @SuppressWarnings("unchecked") + @Override + public synchronized T getProxy() { + AddressRpcProxyPair current = proxies.get(currentProxyIndex); + if (current.namenode == null) { + try { + current.namenode = NameNodeProxies.createNonHAProxy(conf, + current.address, xface, ugi, false).getProxy(); + } catch (IOException e) { + LOG.error("Failed to create RPC proxy to NameNode", e); + throw new RuntimeException(e); + } + } + return (T)current.namenode; + } + + @Override + public synchronized void performFailover(T currentProxy) { + currentProxyIndex = (currentProxyIndex + 1) % proxies.size(); + } + + /** + * A little pair object to store the address and connected RPC proxy object to + * an NN. Note that {@link AddressRpcProxyPair#namenode} may be null. + */ + private static class AddressRpcProxyPair { + public InetSocketAddress address; + public T namenode; + + public AddressRpcProxyPair(InetSocketAddress address) { + this.address = address; + } + } + + /** + * Close all the proxy objects which have been opened over the lifetime of + * this proxy provider. 
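ConfiguredFailoverProxyProvider above resolves the logical URI's host against the HA RPC-address map, remaps the failover-specific retry settings onto the generic IPC client keys, and simply rotates to the next address on performFailover(). A hedged client-side sketch of wiring it up; the provider's package comes from this patch, but the configuration key names and the "mycluster"/host values are assumptions for illustration:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class HaClientSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new HdfsConfiguration();
        conf.set("fs.defaultFS", "hdfs://mycluster");
        conf.set("dfs.federation.nameservices", "mycluster");
        conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.mycluster.nn1", "nn1.example.com:8020");
        conf.set("dfs.namenode.rpc-address.mycluster.nn2", "nn2.example.com:8020");
        conf.set("dfs.client.failover.proxy.provider.mycluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        FileSystem fs = FileSystem.get(conf);      // proxy failover happens beneath this call
        System.out.println(fs.exists(new Path("/")));
      }
    }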
+ */ + @Override + public synchronized void close() throws IOException { + for (AddressRpcProxyPair proxy : proxies) { + if (proxy.namenode != null) { + if (proxy.namenode instanceof Closeable) { + ((Closeable)proxy.namenode).close(); + } else { + RPC.stopProxy(proxy.namenode); + } + } + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java new file mode 100644 index 0000000000..780bad72e9 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java @@ -0,0 +1,339 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.security.PrivilegedAction; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; +import org.apache.hadoop.hdfs.server.namenode.EditLogInputException; +import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream; +import org.apache.hadoop.hdfs.server.namenode.FSEditLog; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.security.SecurityUtil; + +import static org.apache.hadoop.hdfs.server.common.Util.now; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + + +/** + * EditLogTailer represents a thread which periodically reads from edits + * journals and applies the transactions contained within to a given + * FSNamesystem. 
+ */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class EditLogTailer { + public static final Log LOG = LogFactory.getLog(EditLogTailer.class); + + private final EditLogTailerThread tailerThread; + + private final FSNamesystem namesystem; + private FSEditLog editLog; + + private volatile Runtime runtime = Runtime.getRuntime(); + + private InetSocketAddress activeAddr; + private NamenodeProtocol cachedActiveProxy = null; + + /** + * The last transaction ID at which an edit log roll was initiated. + */ + private long lastRollTriggerTxId = HdfsConstants.INVALID_TXID; + + /** + * The highest transaction ID loaded by the Standby. + */ + private long lastLoadedTxnId = HdfsConstants.INVALID_TXID; + + /** + * The last time we successfully loaded a non-zero number of edits from the + * shared directory. + */ + private long lastLoadTimestamp; + + /** + * How often the Standby should roll edit logs. Since the Standby only reads + * from finalized log segments, the Standby will only be as up-to-date as how + * often the logs are rolled. + */ + private long logRollPeriodMs; + + /** + * How often the Standby should check if there are new finalized segment(s) + * available to be read from. + */ + private long sleepTimeMs; + + public EditLogTailer(FSNamesystem namesystem) { + this.tailerThread = new EditLogTailerThread(); + this.namesystem = namesystem; + this.editLog = namesystem.getEditLog(); + + + Configuration conf = namesystem.getConf(); + lastLoadTimestamp = now(); + + logRollPeriodMs = conf.getInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, + DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_DEFAULT) * 1000; + if (logRollPeriodMs >= 0) { + this.activeAddr = getActiveNodeAddress(); + Preconditions.checkArgument(activeAddr.getPort() > 0, + "Active NameNode must have an IPC port configured. 
" + + "Got address '%s'", activeAddr); + LOG.info("Will roll logs on active node at " + activeAddr + " every " + + (logRollPeriodMs / 1000) + " seconds."); + } else { + LOG.info("Not going to trigger log rolls on active node because " + + DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY + " is negative."); + } + + sleepTimeMs = conf.getInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, + DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_DEFAULT) * 1000; + + LOG.debug("logRollPeriodMs=" + logRollPeriodMs + + " sleepTime=" + sleepTimeMs); + } + + private InetSocketAddress getActiveNodeAddress() { + Configuration conf = namesystem.getConf(); + Configuration activeConf = HAUtil.getConfForOtherNode(conf); + return NameNode.getServiceAddress(activeConf, true); + } + + private NamenodeProtocol getActiveNodeProxy() throws IOException { + if (cachedActiveProxy == null) { + Configuration conf = namesystem.getConf(); + NamenodeProtocolPB proxy = + RPC.waitForProxy(NamenodeProtocolPB.class, + RPC.getProtocolVersion(NamenodeProtocolPB.class), activeAddr, conf); + cachedActiveProxy = new NamenodeProtocolTranslatorPB(proxy); + } + assert cachedActiveProxy != null; + return cachedActiveProxy; + } + + public void start() { + tailerThread.start(); + } + + public void stop() throws IOException { + tailerThread.setShouldRun(false); + tailerThread.interrupt(); + try { + tailerThread.join(); + } catch (InterruptedException e) { + LOG.warn("Edit log tailer thread exited with an exception"); + throw new IOException(e); + } + } + + @VisibleForTesting + FSEditLog getEditLog() { + return editLog; + } + + @VisibleForTesting + void setEditLog(FSEditLog editLog) { + this.editLog = editLog; + } + + @VisibleForTesting + synchronized void setRuntime(Runtime runtime) { + this.runtime = runtime; + } + + public void catchupDuringFailover() throws IOException { + Preconditions.checkState(tailerThread == null || + !tailerThread.isAlive(), + "Tailer thread should not be running once failover starts"); + try { + doTailEdits(); + } catch (InterruptedException e) { + throw new IOException(e); + } + } + + private void doTailEdits() throws IOException, InterruptedException { + // Write lock needs to be interruptible here because the + // transitionToActive RPC takes the write lock before calling + // tailer.stop() -- so if we're not interruptible, it will + // deadlock. + namesystem.writeLockInterruptibly(); + try { + FSImage image = namesystem.getFSImage(); + + long lastTxnId = image.getLastAppliedTxId(); + + if (LOG.isDebugEnabled()) { + LOG.debug("lastTxnId: " + lastTxnId); + } + Collection streams; + try { + streams = editLog.selectInputStreams(lastTxnId + 1, 0, false); + } catch (IOException ioe) { + // This is acceptable. If we try to tail edits in the middle of an edits + // log roll, i.e. the last one has been finalized but the new inprogress + // edits file hasn't been started yet. + LOG.warn("Edits tailer failed to find any streams. Will try again " + + "later.", ioe); + return; + } + if (LOG.isDebugEnabled()) { + LOG.debug("edit streams to load from: " + streams.size()); + } + + // Once we have streams to load, errors encountered are legitimate cause + // for concern, so we don't catch them here. Simple errors reading from + // disk are ignored. 
+ long editsLoaded = 0; + try { + editsLoaded = image.loadEdits(streams, namesystem); + } catch (EditLogInputException elie) { + editsLoaded = elie.getNumEditsLoaded(); + throw elie; + } finally { + if (editsLoaded > 0 || LOG.isDebugEnabled()) { + LOG.info(String.format("Loaded %d edits starting from txid %d ", + editsLoaded, lastTxnId)); + } + } + + if (editsLoaded > 0) { + lastLoadTimestamp = now(); + } + lastLoadedTxnId = image.getLastAppliedTxId(); + } finally { + namesystem.writeUnlock(); + } + } + + /** + * @return timestamp (in msec) of when we last loaded a non-zero number of edits. + */ + public long getLastLoadTimestamp() { + return lastLoadTimestamp; + } + + /** + * @return true if the configured log roll period has elapsed. + */ + private boolean tooLongSinceLastLoad() { + return logRollPeriodMs >= 0 && + (now() - lastLoadTimestamp) > logRollPeriodMs ; + } + + /** + * Trigger the active node to roll its logs. + */ + private void triggerActiveLogRoll() { + LOG.info("Triggering log roll on remote NameNode " + activeAddr); + try { + getActiveNodeProxy().rollEditLog(); + lastRollTriggerTxId = lastLoadedTxnId; + } catch (IOException ioe) { + LOG.warn("Unable to trigger a roll of the active NN", ioe); + } + } + + /** + * The thread which does the actual work of tailing edits journals and + * applying the transactions to the FSNS. + */ + private class EditLogTailerThread extends Thread { + private volatile boolean shouldRun = true; + + private EditLogTailerThread() { + super("Edit log tailer"); + } + + private void setShouldRun(boolean shouldRun) { + this.shouldRun = shouldRun; + } + + @Override + public void run() { + SecurityUtil.doAsLoginUserOrFatal( + new PrivilegedAction() { + @Override + public Object run() { + doWork(); + return null; + } + }); + } + + private void doWork() { + while (shouldRun) { + try { + // There's no point in triggering a log roll if the Standby hasn't + // read any more transactions since the last time a roll was + // triggered. + if (tooLongSinceLastLoad() && + lastRollTriggerTxId < lastLoadedTxnId) { + triggerActiveLogRoll(); + } + /** + * Check again in case someone calls {@link EditLogTailer#stop} while + * we're triggering an edit log roll, since ipc.Client catches and + * ignores {@link InterruptedException} in a few places. This fixes + * the bug described in HDFS-2823. + */ + if (!shouldRun) { + break; + } + doTailEdits(); + } catch (EditLogInputException elie) { + LOG.warn("Error while reading edits from disk. Will try again.", elie); + } catch (InterruptedException ie) { + // interrupter should have already set shouldRun to false + continue; + } catch (Throwable t) { + LOG.error("Unknown error encountered while tailing edits. 
" + + "Shutting down standby NN.", t); + runtime.exit(1); + } + + try { + Thread.sleep(sleepTimeMs); + } catch (InterruptedException e) { + LOG.warn("Edit log tailer interrupted", e); + } + } + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java new file mode 100644 index 0000000000..6b070b25f5 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java @@ -0,0 +1,61 @@ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; +import org.apache.hadoop.ipc.StandbyException; + +/** + * Context that is to be used by {@link HAState} for getting/setting the + * current state and performing required operations. + */ +@InterfaceAudience.Private +public interface HAContext { + /** Set the state of the context to given {@code state} */ + public void setState(HAState state); + + /** Get the state from the context */ + public HAState getState(); + + /** Start the services required in active state */ + public void startActiveServices() throws IOException; + + /** Stop the services when exiting active state */ + public void stopActiveServices() throws IOException; + + /** Start the services required in standby state */ + public void startStandbyServices() throws IOException; + + /** Prepare to exit the standby state */ + public void prepareToStopStandbyServices() throws ServiceFailedException; + + /** Stop the services when exiting standby state */ + public void stopStandbyServices() throws IOException; + + /** + * Take a write-lock on the underlying namesystem + * so that no concurrent state transitions or edits + * can be made. + */ + void writeLock(); + + /** + * Unlock the lock taken by {@link #writeLock()} + */ + void writeUnlock(); + + /** + * Verify that the given operation category is allowed in the + * current state. This is to allow NN implementations (eg BackupNode) + * to override it with node-specific handling. + */ + void checkOperation(OperationCategory op) throws StandbyException; + + /** + * @return true if the node should allow stale reads (ie reads + * while the namespace is not up to date) + */ + boolean allowStaleReads(); +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java new file mode 100644 index 0000000000..20ea854b46 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; +import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException; +import org.apache.hadoop.ipc.StandbyException; + +/** + * Namenode base state to implement state machine pattern. + */ +@InterfaceAudience.Private +abstract public class HAState { + protected final HAServiceState state; + + /** + * Constructor + * @param state HA service state. + */ + public HAState(HAServiceState state) { + this.state = state; + } + + /** + * @return the generic service state + */ + public HAServiceState getServiceState() { + return state; + } + + /** + * Internal method to transition the state of a given namenode to a new state. + * @param context HA context + * @param s new state + * @throws ServiceFailedException on failure to transition to new state. + */ + protected final void setStateInternal(final HAContext context, final HAState s) + throws ServiceFailedException { + prepareToExitState(context); + s.prepareToEnterState(context); + context.writeLock(); + try { + exitState(context); + context.setState(s); + s.enterState(context); + } finally { + context.writeUnlock(); + } + } + + /** + * Method to be overridden by subclasses to prepare to enter a state. + * This method is called without the context being locked, + * and after {@link #prepareToExitState(HAContext)} has been called + * for the previous state, but before {@link #exitState(HAContext)} + * has been called for the previous state. + * @param context HA context + * @throws ServiceFailedException on precondition failure + */ + public void prepareToEnterState(final HAContext context) + throws ServiceFailedException {} + + /** + * Method to be overridden by subclasses to perform steps necessary for + * entering a state. + * @param context HA context + * @throws ServiceFailedException on failure to enter the state. + */ + public abstract void enterState(final HAContext context) + throws ServiceFailedException; + + /** + * Method to be overridden by subclasses to prepare to exit a state. + * This method is called without the context being locked. + * This is used by the standby state to cancel any checkpoints + * that are going on. It can also be used to check any preconditions + * for the state transition. + * + * This method should not make any destructive changes to the state + * (eg stopping threads) since {@link #prepareToEnterState(HAContext)} + * may subsequently cancel the state transition. + * @param context HA context + * @throws ServiceFailedException on precondition failure + */ + public void prepareToExitState(final HAContext context) + throws ServiceFailedException {} + + /** + * Method to be overridden by subclasses to perform steps necessary for + * exiting a state. + * @param context HA context + * @throws ServiceFailedException on failure to exit the state. 
+ */ + public abstract void exitState(final HAContext context) + throws ServiceFailedException; + + /** + * Move from the existing state to a new state + * @param context HA context + * @param s new state + * @throws ServiceFailedException on failure to transition to new state. + */ + public void setState(HAContext context, HAState s) throws ServiceFailedException { + if (this == s) { // Already in the new state + return; + } + throw new ServiceFailedException("Transition from state " + this + " to " + + s + " is not allowed."); + } + + /** + * Check if an operation is supported in a given state. + * @param context HA context + * @param op Type of the operation. + * @throws UnsupportedActionException if a given type of operation is not + * supported in this state. + */ + public abstract void checkOperation(final HAContext context, final OperationCategory op) + throws StandbyException; + + public abstract boolean shouldPopulateReplQueues(); + + /** + * @return String representation of the service state. + */ + @Override + public String toString() { + return state.toString(); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java new file mode 100644 index 0000000000..036dd431ad --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java @@ -0,0 +1,299 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.security.PrivilegedAction; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.server.namenode.CheckpointConf; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceCancelledException; +import org.apache.hadoop.hdfs.server.namenode.TransferFsImage; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UserGroupInformation; +import static org.apache.hadoop.hdfs.server.common.Util.now; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + +/** + * Thread which runs inside the NN when it's in Standby state, + * periodically waking up to take a checkpoint of the namespace. + * When it takes a checkpoint, it saves it to its local + * storage and then uploads it to the remote NameNode. + */ +@InterfaceAudience.Private +public class StandbyCheckpointer { + private static final Log LOG = LogFactory.getLog(StandbyCheckpointer.class); + private static final long PREVENT_AFTER_CANCEL_MS = 2*60*1000L; + private final CheckpointConf checkpointConf; + private final FSNamesystem namesystem; + private long lastCheckpointTime; + private final CheckpointerThread thread; + private String activeNNAddress; + private InetSocketAddress myNNAddress; + + // Keep track of how many checkpoints were canceled. + // This is for use in tests. + private static int canceledCount = 0; + + public StandbyCheckpointer(Configuration conf, FSNamesystem ns) { + this.namesystem = ns; + this.checkpointConf = new CheckpointConf(conf); + this.thread = new CheckpointerThread(); + + setNameNodeAddresses(conf); + } + + /** + * Determine the address of the NN we are checkpointing + * as well as our own HTTP address from the configuration. + */ + private void setNameNodeAddresses(Configuration conf) { + // Look up our own address. + String myAddrString = getHttpAddress(conf); + + // Look up the active node's address + Configuration confForActive = HAUtil.getConfForOtherNode(conf); + activeNNAddress = getHttpAddress(confForActive); + + // Sanity-check. + Preconditions.checkArgument(checkAddress(activeNNAddress), + "Bad address for active NN: %s", activeNNAddress); + Preconditions.checkArgument(checkAddress(myAddrString), + "Bad address for standby NN: %s", myAddrString); + myNNAddress = NetUtils.createSocketAddr(myAddrString); + } + + private String getHttpAddress(Configuration conf) { + String configuredAddr = DFSUtil.getInfoServer(null, conf, true); + + // Use the hostname from the RPC address as a default, in case + // the HTTP address is configured to 0.0.0.0. + String hostnameFromRpc = NameNode.getServiceAddress( + conf, true).getHostName(); + try { + return DFSUtil.substituteForWildcardAddress( + configuredAddr, hostnameFromRpc); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + /** + * Ensure that the given address is valid and has a port + * specified. 
+ */ + private boolean checkAddress(String addrStr) { + InetSocketAddress addr = NetUtils.createSocketAddr(addrStr); + return addr.getPort() != 0 && !addr.getAddress().isAnyLocalAddress(); + } + + public void start() { + LOG.info("Starting standby checkpoint thread...\n" + + "Checkpointing active NN at " + activeNNAddress + "\n" + + "Serving checkpoints at " + myNNAddress); + thread.start(); + } + + public void stop() throws IOException { + thread.setShouldRun(false); + thread.interrupt(); + try { + thread.join(); + } catch (InterruptedException e) { + LOG.warn("Edit log tailer thread exited with an exception"); + throw new IOException(e); + } + } + + private void doCheckpoint() throws InterruptedException, IOException { + long txid; + + namesystem.writeLockInterruptibly(); + try { + assert namesystem.getEditLog().isOpenForRead() : + "Standby Checkpointer should only attempt a checkpoint when " + + "NN is in standby mode, but the edit logs are in an unexpected state"; + + FSImage img = namesystem.getFSImage(); + + long prevCheckpointTxId = img.getStorage().getMostRecentCheckpointTxId(); + long thisCheckpointTxId = img.getLastAppliedOrWrittenTxId(); + assert thisCheckpointTxId >= prevCheckpointTxId; + if (thisCheckpointTxId == prevCheckpointTxId) { + LOG.info("A checkpoint was triggered but the Standby Node has not " + + "received any transactions since the last checkpoint at txid " + + thisCheckpointTxId + ". Skipping..."); + return; + } + + img.saveNamespace(namesystem); + txid = img.getStorage().getMostRecentCheckpointTxId(); + assert txid == thisCheckpointTxId : "expected to save checkpoint at txid=" + + thisCheckpointTxId + " but instead saved at txid=" + txid; + } finally { + namesystem.writeUnlock(); + } + + // Upload the saved checkpoint back to the active + TransferFsImage.uploadImageFromStorage( + activeNNAddress, myNNAddress, + namesystem.getFSImage().getStorage(), txid); + } + + /** + * Cancel any checkpoint that's currently being made, + * and prevent any new checkpoints from starting for the next + * minute or so. + */ + public void cancelAndPreventCheckpoints() throws ServiceFailedException { + try { + thread.preventCheckpointsFor(PREVENT_AFTER_CANCEL_MS); + // TODO(HA): there is a really narrow race here if we are just + // about to start a checkpoint - this won't cancel it! + namesystem.getFSImage().cancelSaveNamespace( + "About to exit standby state"); + } catch (InterruptedException e) { + throw new ServiceFailedException( + "Interrupted while trying to cancel checkpoint"); + } + } + + @VisibleForTesting + static int getCanceledCount() { + return canceledCount; + } + + private long countUncheckpointedTxns() { + FSImage img = namesystem.getFSImage(); + return img.getLastAppliedOrWrittenTxId() - + img.getStorage().getMostRecentCheckpointTxId(); + } + + private class CheckpointerThread extends Thread { + private volatile boolean shouldRun = true; + private volatile long preventCheckpointsUntil = 0; + + private CheckpointerThread() { + super("Standby State Checkpointer"); + } + + private void setShouldRun(boolean shouldRun) { + this.shouldRun = shouldRun; + } + + @Override + public void run() { + // We have to make sure we're logged in as far as JAAS + // is concerned, in order to use kerberized SSL properly. + SecurityUtil.doAsLoginUserOrFatal( + new PrivilegedAction() { + @Override + public Object run() { + doWork(); + return null; + } + }); + } + + /** + * Prevent checkpoints from occurring for some time period + * in the future. 
This is used when preparing to enter active + * mode. We need to not only cancel any concurrent checkpoint, + * but also prevent any checkpoints from racing to start just + * after the cancel call. + * + * @param delayMs the number of MS for which checkpoints will be + * prevented + */ + private void preventCheckpointsFor(long delayMs) { + preventCheckpointsUntil = now() + delayMs; + } + + private void doWork() { + // Reset checkpoint time so that we don't always checkpoint + // on startup. + lastCheckpointTime = now(); + while (shouldRun) { + try { + Thread.sleep(1000 * checkpointConf.getCheckPeriod()); + } catch (InterruptedException ie) { + } + if (!shouldRun) { + break; + } + try { + // We may have lost our ticket since last checkpoint, log in again, just in case + if (UserGroupInformation.isSecurityEnabled()) { + UserGroupInformation.getCurrentUser().reloginFromKeytab(); + } + + long now = now(); + long uncheckpointed = countUncheckpointedTxns(); + long secsSinceLast = (now - lastCheckpointTime)/1000; + + boolean needCheckpoint = false; + if (uncheckpointed >= checkpointConf.getTxnCount()) { + LOG.info("Triggering checkpoint because there have been " + + uncheckpointed + " txns since the last checkpoint, which " + + "exceeds the configured threshold " + + checkpointConf.getTxnCount()); + needCheckpoint = true; + } else if (secsSinceLast >= checkpointConf.getPeriod()) { + LOG.info("Triggering checkpoint because it has been " + + secsSinceLast + " seconds since the last checkpoint, which " + + "exceeds the configured interval " + checkpointConf.getPeriod()); + needCheckpoint = true; + } + if (needCheckpoint && now < preventCheckpointsUntil) { + LOG.info("But skipping this checkpoint since we are about to failover!"); + canceledCount++; + } else if (needCheckpoint) { + doCheckpoint(); + lastCheckpointTime = now; + } + } catch (SaveNamespaceCancelledException ce) { + LOG.info("Checkpoint was cancelled: " + ce.getMessage()); + canceledCount++; + } catch (InterruptedException ie) { + // Probably requested shutdown. + continue; + } catch (Throwable t) { + LOG.error("Exception in doCheckpoint", t); + } + } + } + } + + @VisibleForTesting + String getActiveNNAddress() { + return activeNNAddress; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java new file mode 100644 index 0000000000..60e83713b8 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; +import org.apache.hadoop.ipc.StandbyException; + +/** + * Namenode standby state. In this state the namenode acts as warm standby and + * keeps the following updated: + *
+ * <ul>
+ * <li>Namespace by getting the edits.</li>
+ * <li>Block location information by receiving block reports and blocks
+ * received from the datanodes.</li>
+ * </ul>
      + * + * It does not handle read/write/checkpoint operations. + */ +@InterfaceAudience.Private +public class StandbyState extends HAState { + public StandbyState() { + super(HAServiceState.STANDBY); + } + + @Override + public void setState(HAContext context, HAState s) throws ServiceFailedException { + if (s == NameNode.ACTIVE_STATE) { + setStateInternal(context, s); + return; + } + super.setState(context, s); + } + + @Override + public void enterState(HAContext context) throws ServiceFailedException { + try { + context.startStandbyServices(); + } catch (IOException e) { + throw new ServiceFailedException("Failed to start standby services", e); + } + } + + @Override + public void prepareToExitState(HAContext context) throws ServiceFailedException { + context.prepareToStopStandbyServices(); + } + + @Override + public void exitState(HAContext context) throws ServiceFailedException { + try { + context.stopStandbyServices(); + } catch (IOException e) { + throw new ServiceFailedException("Failed to stop standby services", e); + } + } + + @Override + public void checkOperation(HAContext context, OperationCategory op) + throws StandbyException { + if (op == OperationCategory.UNCHECKED || + (op == OperationCategory.READ && context.allowStaleReads())) { + return; + } + String msg = "Operation category " + op + " is not supported in state " + + context.getState(); + throw new StandbyException(msg); + } + + @Override + public boolean shouldPopulateReplQueues() { + return false; + } +} + diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java index c9c7150def..5669497ed6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java @@ -22,8 +22,8 @@ import java.io.*; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.DFSConfigKeys; -import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.security.KerberosInfo; @@ -93,7 +93,7 @@ public interface DatanodeProtocol { * sendHeartbeat() tells the NameNode that the DataNode is still * alive and well. Includes some status info, too. * It also gives the NameNode a chance to return - * an array of "DatanodeCommand" objects. + * an array of "DatanodeCommand" objects in HeartbeatResponse. * A DatanodeCommand tells the DataNode to invalidate local block(s), * or to copy them to other DataNodes, etc. * @param registration datanode registration information @@ -103,7 +103,7 @@ public interface DatanodeProtocol { * @param failedVolumes number of failed volumes * @throws IOException on error */ - public DatanodeCommand[] sendHeartbeat(DatanodeRegistration registration, + public HeartbeatResponse sendHeartbeat(DatanodeRegistration registration, StorageReport[] reports, int xmitsInProgress, int xceiverCount, @@ -118,7 +118,8 @@ public interface DatanodeProtocol { * @param registration * @param poolId - the block pool ID for the blocks * @param reports - report of blocks per storage - * Each block is represented as 2 longs. + * Each finalized block is represented as 3 longs. 
Each under- + * construction replica is represented as 4 longs. * This is done instead of Block[] to reduce memory used by block reports. * * @return - the next command for DN to process. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java new file mode 100644 index 0000000000..96f74a0c79 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.protocol; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.io.ObjectWritable; +import org.apache.hadoop.io.Writable; + +@InterfaceAudience.Private +@InterfaceStability.Evolving +/** + * Response to {@link DatanodeProtocol#sendHeartbeat} + */ +public class HeartbeatResponse implements Writable { + /** Commands returned from the namenode to the datanode */ + private DatanodeCommand[] commands; + + /** Information about the current HA-related state of the NN */ + private NNHAStatusHeartbeat haStatus; + + public HeartbeatResponse() { + // Empty constructor required for Writable + } + + public HeartbeatResponse(DatanodeCommand[] cmds, + NNHAStatusHeartbeat haStatus) { + commands = cmds; + this.haStatus = haStatus; + } + + public DatanodeCommand[] getCommands() { + return commands; + } + + public NNHAStatusHeartbeat getNameNodeHaState() { + return haStatus; + } + + /////////////////////////////////////////// + // Writable + /////////////////////////////////////////// + @Override + public void write(DataOutput out) throws IOException { + int length = commands == null ? 
0 : commands.length; + out.writeInt(length); + for (int i = 0; i < length; i++) { + ObjectWritable.writeObject(out, commands[i], commands[i].getClass(), + null, true); + } + haStatus.write(out); + } + + @Override + public void readFields(DataInput in) throws IOException { + int length = in.readInt(); + commands = new DatanodeCommand[length]; + ObjectWritable objectWritable = new ObjectWritable(); + for (int i = 0; i < length; i++) { + commands[i] = (DatanodeCommand) ObjectWritable.readObject(in, + objectWritable, null); + } + haStatus = new NNHAStatusHeartbeat(); + haStatus.readFields(in); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java new file mode 100644 index 0000000000..633aa850df --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.protocol; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; + +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class NNHAStatusHeartbeat implements Writable { + + private State state; + private long txid = HdfsConstants.INVALID_TXID; + + public NNHAStatusHeartbeat() { + } + + public NNHAStatusHeartbeat(State state, long txid) { + this.state = state; + this.txid = txid; + } + + public State getState() { + return state; + } + + public long getTxId() { + return txid; + } + + /////////////////////////////////////////// + // Writable + /////////////////////////////////////////// + @Override + public void write(DataOutput out) throws IOException { + WritableUtils.writeEnum(out, state); + out.writeLong(txid); + } + + @Override + public void readFields(DataInput in) throws IOException { + state = WritableUtils.readEnum(in, State.class); + txid = in.readLong(); + } + + @InterfaceAudience.Private + public enum State { + ACTIVE, + STANDBY; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java index 59b279cd4d..a75308a3d8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java @@ -101,10 +101,7 @@ public interface NamenodeProtocol { * call fails if the file system is in SafeMode. * @throws IOException * @return a unique token to identify this transaction. 
- * @deprecated - * See {@link org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode} */ - @Deprecated public CheckpointSignature rollEditLog() throws IOException; /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java index 4de386f368..e05b8fef28 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.protocol; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.ha.HAServiceProtocol; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; import org.apache.hadoop.security.RefreshUserMappingsProtocol; @@ -32,5 +33,6 @@ public interface NamenodeProtocols NamenodeProtocol, RefreshAuthorizationPolicyProtocol, RefreshUserMappingsProtocol, - GetUserMappingsProtocol { + GetUserMappingsProtocol, + HAServiceProtocol { } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java index 45014add97..bde5a5e2d7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java @@ -25,22 +25,47 @@ import java.io.IOException; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; /** - * A data structure to store Block and delHints together, used to send - * received/deleted ACKs. + * A data structure to store the blocks in an incremental block report. 
*/ public class ReceivedDeletedBlockInfo implements Writable { Block block; + BlockStatus status; String delHints; - public final static String TODELETE_HINT = "-"; + public static enum BlockStatus { + RECEIVING_BLOCK(1), + RECEIVED_BLOCK(2), + DELETED_BLOCK(3); + + private final int code; + BlockStatus(int code) { + this.code = code; + } + + public int getCode() { + return code; + } + + public static BlockStatus fromCode(int code) { + for (BlockStatus bs : BlockStatus.values()) { + if (bs.code == code) { + return bs; + } + } + return null; + } + } public ReceivedDeletedBlockInfo() { } - public ReceivedDeletedBlockInfo(Block blk, String delHints) { + public ReceivedDeletedBlockInfo( + Block blk, BlockStatus status, String delHints) { this.block = blk; + this.status = status; this.delHints = delHints; } @@ -60,13 +85,19 @@ public class ReceivedDeletedBlockInfo implements Writable { this.delHints = hints; } + public BlockStatus getStatus() { + return status; + } + public boolean equals(Object o) { if (!(o instanceof ReceivedDeletedBlockInfo)) { return false; } ReceivedDeletedBlockInfo other = (ReceivedDeletedBlockInfo) o; return this.block.equals(other.getBlock()) - && this.delHints.equals(other.delHints); + && this.status == other.status + && (this.delHints == other.delHints || + this.delHints != null && this.delHints.equals(other.delHints)); } public int hashCode() { @@ -79,23 +110,30 @@ public class ReceivedDeletedBlockInfo implements Writable { } public boolean isDeletedBlock() { - return delHints.equals(TODELETE_HINT); + return status == BlockStatus.DELETED_BLOCK; } @Override public void write(DataOutput out) throws IOException { this.block.write(out); - Text.writeString(out, this.delHints); + WritableUtils.writeVInt(out, this.status.code); + if (this.status == BlockStatus.DELETED_BLOCK) { + Text.writeString(out, this.delHints); + } } @Override public void readFields(DataInput in) throws IOException { this.block = new Block(); this.block.readFields(in); - this.delHints = Text.readString(in); + this.status = BlockStatus.fromCode(WritableUtils.readVInt(in)); + if (this.status == BlockStatus.DELETED_BLOCK) { + this.delHints = Text.readString(in); + } } public String toString() { - return block.toString() + ", delHint: " + delHints; + return block.toString() + ", status: " + status + + ", delHint: " + delHints; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java index 1025880c9d..edbbb2250b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java @@ -38,19 +38,20 @@ import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.HdfsConstants.UpgradeAction; -import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB; -import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB; import 
org.apache.hadoop.hdfs.server.common.UpgradeStatusReport; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.RefreshUserMappingsProtocol; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolRunner; @@ -791,9 +792,9 @@ public class DFSAdmin extends FsShell { conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "")); // Create the client - RefreshAuthorizationPolicyProtocolClientSideTranslatorPB refreshProtocol = - new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB( - NameNode.getAddress(conf), getUGI(), conf); + RefreshAuthorizationPolicyProtocol refreshProtocol = + NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf), + RefreshAuthorizationPolicyProtocol.class).getProxy(); // Refresh the authorization policy in-effect refreshProtocol.refreshServiceAcl(); @@ -817,9 +818,9 @@ public class DFSAdmin extends FsShell { conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "")); // Create the client - RefreshUserMappingsProtocolClientSideTranslatorPB refreshProtocol = - new RefreshUserMappingsProtocolClientSideTranslatorPB( - NameNode.getAddress(conf), getUGI(), conf); + RefreshUserMappingsProtocol refreshProtocol = + NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf), + RefreshUserMappingsProtocol.class).getProxy(); // Refresh the user-to-groups mappings refreshProtocol.refreshUserToGroupsMappings(); @@ -844,9 +845,9 @@ public class DFSAdmin extends FsShell { conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "")); // Create the client - RefreshUserMappingsProtocolClientSideTranslatorPB refreshProtocol = - new RefreshUserMappingsProtocolClientSideTranslatorPB( - NameNode.getAddress(conf), getUGI(), conf); + RefreshUserMappingsProtocol refreshProtocol = + NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf), + RefreshUserMappingsProtocol.class).getProxy(); // Refresh the user-to-groups mappings refreshProtocol.refreshSuperUserGroupsConfiguration(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java new file mode 100644 index 0000000000..13bde2ae53 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.tools; + +import java.io.PrintStream; +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.ha.HAAdmin; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.util.ToolRunner; + +/** + * Class to extend HAAdmin to do a little bit of HDFS-specific configuration. + */ +public class DFSHAAdmin extends HAAdmin { + + private static final Log LOG = LogFactory.getLog(DFSHAAdmin.class); + + private String nameserviceId; + + protected void setErrOut(PrintStream errOut) { + this.errOut = errOut; + } + + @Override + public void setConf(Configuration conf) { + if (conf != null) { + // Make a copy so we don't mutate it. Also use an HdfsConfiguration to + // force loading of hdfs-site.xml. + conf = new HdfsConfiguration(conf); + String nameNodePrincipal = conf.get( + DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, ""); + if (LOG.isDebugEnabled()) { + LOG.debug("Using NN principal: " + nameNodePrincipal); + } + + conf.set(CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY, + nameNodePrincipal); + } + super.setConf(conf); + } + + /** + * Try to map the given namenode ID to its service address. + */ + @Override + protected String getServiceAddr(String nnId) { + HdfsConfiguration conf = (HdfsConfiguration)getConf(); + String serviceAddr = + DFSUtil.getNamenodeServiceAddr(conf, nameserviceId, nnId); + if (serviceAddr == null) { + throw new IllegalArgumentException( + "Unable to determine service address for namenode '" + nnId + "'"); + } + return serviceAddr; + } + + @Override + protected String getUsageString() { + return "Usage: DFSHAAdmin [-ns ]"; + } + + @Override + protected int runCmd(String[] argv) throws Exception { + if (argv.length < 1) { + printUsage(errOut); + return -1; + } + + int i = 0; + String cmd = argv[i++]; + + if ("-ns".equals(cmd)) { + if (i == argv.length) { + errOut.println("Missing nameservice ID"); + printUsage(errOut); + return -1; + } + nameserviceId = argv[i++]; + if (i >= argv.length) { + errOut.println("Missing command"); + printUsage(errOut); + return -1; + } + argv = Arrays.copyOfRange(argv, i, argv.length); + } + + return super.runCmd(argv); + } + + public static void main(String[] argv) throws Exception { + int res = ToolRunner.run(new DFSHAAdmin(), argv); + System.exit(res); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java index bc98995af3..1a99fcb62a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java @@ -32,11 +32,13 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hdfs.server.namenode.NameNode; -import org.apache.hadoop.hdfs.server.namenode.NamenodeFsck; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; +import 
org.apache.hadoop.hdfs.server.namenode.NamenodeFsck; +import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.security.Krb5AndCertsSslSocketConnector; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; @@ -204,8 +206,9 @@ public class DFSck extends Configured implements Tool { * Derive the namenode http address from the current file system, * either default or as set by "-fs" in the generic options. * @return Returns http address or null if failure. + * @throws IOException if we can't determine the active NN address */ - private String getCurrentNamenodeAddress() { + private String getCurrentNamenodeAddress() throws IOException { //String nnAddress = null; Configuration conf = getConf(); @@ -222,16 +225,21 @@ public class DFSck extends Configured implements Tool { System.err.println("FileSystem is " + fs.getUri()); return null; } - DistributedFileSystem dfs = (DistributedFileSystem) fs; - - // Derive the nameservice ID from the filesystem URI. - // The URI may have been provided by a human, and the server name may be - // aliased, so compare InetSocketAddresses instead of URI strings, and - // test against both possible variants of RPC address. - InetSocketAddress namenode = - NameNode.getAddress(dfs.getUri().getAuthority()); - return DFSUtil.getInfoServer(namenode, conf, true); + // force client address resolution. + fs.exists(new Path("/")); + + // Derive the nameservice ID from the filesystem connection. The URI may + // have been provided by a human, the server name may be aliased, or there + // may be multiple possible actual addresses (e.g. in an HA setup) so + // compare InetSocketAddresses instead of URI strings, and test against both + // possible configurations of RPC address (DFS_NAMENODE_RPC_ADDRESS_KEY and + // DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY). 
+ DistributedFileSystem dfs = (DistributedFileSystem) fs; + DFSClient dfsClient = dfs.getClient(); + InetSocketAddress addr = RPC.getServerAddress(dfsClient.getNamenode()); + + return DFSUtil.getInfoServer(addr, conf, true); } private int doWork(final String[] args) throws IOException { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java index 0c7517739a..e3a67edebc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java @@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; @@ -164,7 +165,7 @@ public class GetConf extends Configured implements Tool { static class NameNodesCommandHandler extends CommandHandler { @Override int doWorkInternal(GetConf tool) throws IOException { - tool.printList(DFSUtil.getNNServiceRpcAddresses(tool.getConf())); + tool.printMap(DFSUtil.getNNServiceRpcAddresses(tool.getConf())); return 0; } } @@ -175,7 +176,7 @@ public class GetConf extends Configured implements Tool { static class BackupNodesCommandHandler extends CommandHandler { @Override public int doWorkInternal(GetConf tool) throws IOException { - tool.printList(DFSUtil.getBackupNodeAddresses(tool.getConf())); + tool.printMap(DFSUtil.getBackupNodeAddresses(tool.getConf())); return 0; } } @@ -186,7 +187,7 @@ public class GetConf extends Configured implements Tool { static class SecondaryNameNodesCommandHandler extends CommandHandler { @Override public int doWorkInternal(GetConf tool) throws IOException { - tool.printList(DFSUtil.getSecondaryNameNodeAddresses(tool.getConf())); + tool.printMap(DFSUtil.getSecondaryNameNodeAddresses(tool.getConf())); return 0; } } @@ -200,9 +201,11 @@ public class GetConf extends Configured implements Tool { @Override public int doWorkInternal(GetConf tool) throws IOException { Configuration config = tool.getConf(); - List rpclist = DFSUtil.getNNServiceRpcAddresses(config); - if (rpclist != null) { - for (InetSocketAddress rpc : rpclist) { + List cnnlist = DFSUtil.flattenAddressMap( + DFSUtil.getNNServiceRpcAddresses(config)); + if (!cnnlist.isEmpty()) { + for (ConfiguredNNAddress cnn : cnnlist) { + InetSocketAddress rpc = cnn.getAddress(); tool.printOut(rpc.getHostName()+":"+rpc.getPort()); } return 0; @@ -232,10 +235,13 @@ public class GetConf extends Configured implements Tool { void printOut(String message) { out.println(message); } - - void printList(List list) { + + void printMap(Map> map) { StringBuilder buffer = new StringBuilder(); - for (InetSocketAddress address : list) { + + List cnns = DFSUtil.flattenAddressMap(map); + for (ConfiguredNNAddress cnn : cnns) { + InetSocketAddress address = cnn.getAddress(); if (buffer.length() > 0) { buffer.append(" "); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java index 5ad227d9e0..51612befff 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java @@ -21,8 +21,11 @@ import java.io.IOException; import java.io.PrintStream; import java.net.InetSocketAddress; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolClientSideTranslatorPB; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.security.UserGroupInformation; @@ -34,6 +37,7 @@ import org.apache.hadoop.util.ToolRunner; * HDFS implementation of a tool for getting the groups which a given user * belongs to. */ +@InterfaceAudience.Private public class GetGroups extends GetGroupsBase { static{ @@ -41,11 +45,11 @@ public class GetGroups extends GetGroupsBase { } - GetGroups(Configuration conf) { + public GetGroups(Configuration conf) { super(conf); } - GetGroups(Configuration conf, PrintStream out) { + public GetGroups(Configuration conf, PrintStream out) { super(conf, out); } @@ -57,9 +61,8 @@ public class GetGroups extends GetGroupsBase { @Override protected GetUserMappingsProtocol getUgmProtocol() throws IOException { - return new GetUserMappingsProtocolClientSideTranslatorPB( - NameNode.getAddress(getConf()), UserGroupInformation.getCurrentUser(), - getConf()); + return NameNodeProxies.createProxy(getConf(), FileSystem.getDefaultUri(getConf()), + GetUserMappingsProtocol.class).getProxy(); } public static void main(String[] argv) throws Exception { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java index a01083065d..3a460e021d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java @@ -48,6 +48,8 @@ public enum EditsElement { BLOCK_ID, BLOCK_NUM_BYTES, BLOCK_GENERATION_STAMP, + BLOCK_DELTA_NUM_BYTES, // delta-encoded relative to previous block + BLOCK_DELTA_GEN_STAMP, // delta-encoded relative to previous block PERMISSION_STATUS, FS_PERMISSIONS, CLIENT_NAME, diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java index f1da4c6175..d34bff92d7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java @@ -41,7 +41,7 @@ import static org.apache.hadoop.hdfs.tools.offlineEditsViewer.Tokenizer.VIntToke class EditsLoaderCurrent implements EditsLoader { private static int[] supportedVersions = { -18, -19, -20, -21, -22, -23, -24, - -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39}; + -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40}; private EditsVisitor v; private int editsVersion = 0; @@ -150,6 +150,25 @@ class EditsLoaderCurrent 
implements EditsLoader { } } + private void visit_OP_UPDATE_BLOCKS() throws IOException { + visitTxId(); + v.visitStringUTF8(EditsElement.PATH); + VIntToken numBlocksToken = v.visitVInt(EditsElement.NUMBLOCKS); + for (int i = 0; i < numBlocksToken.value; i++) { + v.visitEnclosingElement(EditsElement.BLOCK); + + v.visitLong(EditsElement.BLOCK_ID); + if (i == 0) { + v.visitVLong(EditsElement.BLOCK_NUM_BYTES); + v.visitVLong(EditsElement.BLOCK_GENERATION_STAMP); + } else { + v.visitVLong(EditsElement.BLOCK_DELTA_NUM_BYTES); + v.visitVLong(EditsElement.BLOCK_DELTA_GEN_STAMP); + } + v.leaveEnclosingElement(); + } + } + /** * Visit OP_RENAME_OLD */ @@ -521,6 +540,9 @@ class EditsLoaderCurrent implements EditsLoader { case OP_START_LOG_SEGMENT: // 24 visit_OP_BEGIN_LOG_SEGMENT(); break; + case OP_UPDATE_BLOCKS: // 25 + visit_OP_UPDATE_BLOCKS(); + break; default: { throw new IOException("Unknown op code " + editsOpCode); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java index 8960cbce31..fdc9892e1a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java @@ -122,7 +122,8 @@ class ImageLoaderCurrent implements ImageLoader { protected final DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm"); private static int[] versions = { -16, -17, -18, -19, -20, -21, -22, -23, - -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39}; + -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, + -40}; private int imageVersion = 0; /* (non-Javadoc) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java index cd88963e3d..8a0f992364 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java @@ -30,6 +30,9 @@ public interface RwLock { /** Acquire write lock. */ public void writeLock(); + + /** Acquire write lock, unless interrupted while waiting */ + void writeLockInterruptibly() throws InterruptedException; /** Release write lock. */ public void writeUnlock(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto index 81ca74b8ee..2a96544624 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto @@ -188,11 +188,26 @@ message StorageReportProto { optional uint64 blockPoolUsed = 6 [ default = 0 ]; } +/** + * state - State the NN is in when returning response to the DN + * txid - Highest transaction ID this NN has seen + */ +message NNHAStatusHeartbeatProto { + enum State { + ACTIVE = 0; + STANDBY = 1; + } + required State state = 1; + required uint64 txid = 2; +} + /** * cmds - Commands from namenode to datanode. 
+ * haStatus - Status (from an HA perspective) of the NN sending this response */ message HeartbeatResponseProto { repeated DatanodeCommandProto cmds = 1; // Returned commands can be null + required NNHAStatusHeartbeatProto haStatus = 2; } /** @@ -226,12 +241,16 @@ message BlockReportResponseProto { /** * Data structure to send received or deleted block information * from datanode to namenode. - * - * deleteHint set to "-" indicates block deletion. - * other deleteHint indicates block addition. */ message ReceivedDeletedBlockInfoProto { + enum BlockStatus { + RECEIVING = 1; // block being created + RECEIVED = 2; // block creation complete + DELETED = 3; + } + required BlockProto block = 1; + required BlockStatus status = 3; optional string deleteHint = 2; } @@ -350,7 +369,9 @@ service DatanodeProtocolService { rpc blockReport(BlockReportRequestProto) returns(BlockReportResponseProto); /** - * Report from datanode about recently received or deleted block + * Incremental block report from the DN. This contains info about recently + * received and deleted blocks, as well as when blocks start being + * received. */ rpc blockReceivedAndDeleted(BlockReceivedAndDeletedRequestProto) returns(BlockReceivedAndDeletedResponseProto); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index 1ce090be94..9ec25d2833 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -33,9 +33,11 @@ dfs.namenode.logging.level info - The logging level for dfs namenode. Other values are "dir"(trac -e namespace mutations), "block"(trace block under/over replications and block -creations/deletions), or "all". + + The logging level for dfs namenode. Other values are "dir" (trace + namespace mutations), "block" (trace block under/over replications + and block creations/deletions), or "all". + @@ -226,6 +228,18 @@ creations/deletions), or "all". directories, for redundancy. Default value is same as dfs.namenode.name.dir + + + dfs.namenode.shared.edits.dir + + A directory on shared storage between the multiple namenodes + in an HA cluster. This directory will be written by the active and read + by the standby in order to keep the namespaces synchronized. This directory + does not need to be listed in dfs.namenode.edits.dir above. It should be + left empty in a non-HA cluster. + + + dfs.web.ugi webuser,webgroup @@ -623,6 +637,19 @@ creations/deletions), or "all". + + dfs.namenode.num.extra.edits.retained + 1000000 + The number of extra transactions which should be retained + beyond what is minimally necessary for a NN restart. This can be useful for + audit purposes or for an HA setup where a remote Standby Node may have + been offline for some time and need to have a longer backlog of retained + edits in order to start again. + Typically each edit is on the order of a few hundred bytes, so the default + of 1 million edits should be on the order of hundreds of MBs or low GBs. + + + dfs.namenode.delegation.key.update-interval 86400000 @@ -698,4 +725,118 @@ creations/deletions), or "all". + + dfs.client.failover.max.attempts + 15 + + Expert only. The number of client failover attempts that should be + made before the failover is considered failed. + + + + + dfs.client.failover.sleep.base.millis + 500 + + Expert only. 
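[Editor's note] The proto changes above attach an NNHAStatusHeartbeatProto to every heartbeat response and replace the old "deleteHint set to '-' means deletion" convention with an explicit BlockStatus enum. A hedged sketch of how the generated Java builders might be used; the outer wrapper class name (DatanodeProtocolProtos) follows the usual protoc-java conventions and is an assumption, only the message, field and enum names come from the diff.

    // Sketch, assuming protoc's standard Java generation for DatanodeProtocol.proto.
    import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatResponseProto;
    import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.NNHAStatusHeartbeatProto;
    import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReceivedDeletedBlockInfoProto;

    public class HaHeartbeatSketch {
      static HeartbeatResponseProto activeResponse(long lastSeenTxid) {
        NNHAStatusHeartbeatProto haStatus = NNHAStatusHeartbeatProto.newBuilder()
            .setState(NNHAStatusHeartbeatProto.State.ACTIVE) // or STANDBY
            .setTxid(lastSeenTxid)
            .build();
        // haStatus is now a required field of every heartbeat response.
        return HeartbeatResponseProto.newBuilder()
            .setHaStatus(haStatus)
            .build();
      }

      static boolean isDeletion(ReceivedDeletedBlockInfoProto info) {
        // The explicit status enum replaces the old deleteHint == "-" convention.
        return info.getStatus() == ReceivedDeletedBlockInfoProto.BlockStatus.DELETED;
      }
    }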
The time to wait, in milliseconds, between failover + attempts increases exponentially as a function of the number of + attempts made so far, with a random factor of +/- 50%. This option + specifies the base value used in the failover calculation. The + first failover will retry immediately. The 2nd failover attempt + will delay at least dfs.client.failover.sleep.base.millis + milliseconds. And so on. + + + + + dfs.client.failover.sleep.max.millis + 15000 + + Expert only. The time to wait, in milliseconds, between failover + attempts increases exponentially as a function of the number of + attempts made so far, with a random factor of +/- 50%. This option + specifies the maximum value to wait between failovers. + Specifically, the time between two failover attempts will not + exceed +/- 50% of dfs.client.failover.sleep.max.millis + milliseconds. + + + + + dfs.client.failover.connection.retries + 0 + + Expert only. Indicates the number of retries a failover IPC client + will make to establish a server connection. + + + + + dfs.client.failover.connection.retries.on.timeouts + 0 + + Expert only. The number of retry attempts a failover IPC client + will make on socket timeout when establishing a server connection. + + + + + dfs.federation.nameservices + + + Comma-separated list of nameservices. + + + + + dfs.federation.nameservice.id + + + The ID of this nameservice. If the nameservice ID is not + configured or more than one nameservice is configured for + dfs.federation.nameservices it is determined automatically by + matching the local node's address with the configured address. + + + + + dfs.ha.namenodes.EXAMPLENAMESERVICE + + + The prefix for a given nameservice, contains a comma-separated + list of namenodes for a given nameservice (eg EXAMPLENAMESERVICE). + + + + + dfs.ha.namenode.id + + + The ID of this namenode. If the namenode ID is not configured it + is determined automatically by matching the local node's address + with the configured address. + + + + + dfs.ha.log-roll.period + 120 + + How often, in seconds, the StandbyNode should ask the active to + roll edit logs. Since the StandbyNode only reads from finalized + log segments, the StandbyNode will only be as up-to-date as how + often the logs are rolled. Note that failover triggers a log roll + so the StandbyNode will be up to date before it becomes active. + + + + + dfs.ha.tail-edits.period + 60 + + How often, in seconds, the StandbyNode should check for new + finalized log segments in the shared edits log. 
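[Editor's note] The two sleep settings above describe an exponential backoff with a +/- 50% random factor, capped by dfs.client.failover.sleep.max.millis. The sketch below is only an approximation of that description, not the actual retry-policy code in this patch.

    import java.util.Random;

    // Approximate sketch of the documented failover backoff: no delay before
    // the first failover, then an exponentially growing delay with a +/- 50%
    // random factor, capped at the configured maximum.
    public class FailoverBackoffSketch {
      private static final Random RANDOM = new Random();

      static long sleepBeforeFailover(int failovers, long baseMillis, long maxMillis) {
        if (failovers == 0) {
          return 0; // first failover retries immediately
        }
        // base * 2^(failovers-1), with the exponent clamped to avoid overflow
        long exponential = baseMillis << Math.min(failovers - 1, 30);
        long capped = Math.min(exponential, maxMillis);
        double jitter = 0.5 + RANDOM.nextDouble(); // uniform in [0.5, 1.5)
        return (long) (capped * jitter);
      }

      public static void main(String[] args) {
        for (int i = 0; i <= 5; i++) {
          System.out.println("failover " + i + ": ~"
              + sleepBeforeFailover(i, 500, 15000) + " ms");
        }
      }
    }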
+ + + diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp index a71f40f26e..a4906a5880 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp @@ -23,6 +23,7 @@ import="org.apache.hadoop.fs.FileStatus" import="org.apache.hadoop.fs.FileUtil" import="org.apache.hadoop.fs.Path" + import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState" import="java.util.Collection" import="java.util.Arrays" %> <%!//for java.io.Serializable @@ -30,6 +31,8 @@ <% NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application); FSNamesystem fsn = nn.getNamesystem(); + HAServiceState nnHAState = nn.getServiceState(); + boolean isActive = (nnHAState == HAServiceState.ACTIVE); String namenodeRole = nn.getRole().toString(); String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameNodeAddress().getPort(); @@ -46,8 +49,10 @@
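[Editor's note] The hdfs-default.xml additions above are the server-side half of an HA setup; a client also needs a failover proxy provider bound to the logical nameservice (ConfiguredFailoverProxyProvider, exercised by the tests later in this patch). A hedged sketch of such a client configuration; the rpc-address key names are written out by hand following the dfs.namenode.rpc-address.<nameservice>.<nnId> suffixing scheme used elsewhere in this patch and are assumptions, as are the host names and ports.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Sketch: client configuration for one HA nameservice ("minidfs-ns" with
    // namenodes nn1/nn2, mirroring MiniDFSNNTopology.simpleHATopology()).
    public class HaClientConfSketch {
      public static FileSystem connect() throws Exception {
        Configuration conf = new Configuration();
        conf.set("dfs.federation.nameservices", "minidfs-ns");
        conf.set("dfs.ha.namenodes.minidfs-ns", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.minidfs-ns.nn1", "nn1.example.com:8020");
        conf.set("dfs.namenode.rpc-address.minidfs-ns.nn2", "nn2.example.com:8020");
        conf.set("dfs.client.failover.proxy.provider.minidfs-ns",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        conf.set("fs.defaultFS", "hdfs://minidfs-ns");
        FileSystem fs = FileSystem.get(conf);
        fs.exists(new Path("/")); // first RPC; failover kicks in if nn1 is standby
        return fs;
      }
    }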

      <%=namenodeRole%> '<%=namenodeLabel%>'

      <%=NamenodeJspHelper.getVersionTable(fsn)%>
      -Browse the filesystem -
      +<% if (isActive) { %> + Browse the filesystem +
      +<% } %> <%=namenodeRole%> Logs
      Go back to DFS home diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp index ecce30ae88..81e595d718 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp @@ -20,6 +20,7 @@ <%@ page contentType="text/html; charset=UTF-8" import="org.apache.hadoop.util.ServletUtil" + import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState" %> <%! //for java.io.Serializable @@ -29,7 +30,10 @@ final NamenodeJspHelper.HealthJsp healthjsp = new NamenodeJspHelper.HealthJsp(); NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application); FSNamesystem fsn = nn.getNamesystem(); + HAServiceState nnHAState = nn.getServiceState(); + boolean isActive = (nnHAState == HAServiceState.ACTIVE); String namenodeRole = nn.getRole().toString(); + String namenodeState = nnHAState.toString(); String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameNodeAddress().getPort(); %> @@ -40,10 +44,12 @@ Hadoop <%=namenodeRole%> <%=namenodeLabel%> -

      <%=namenodeRole%> '<%=namenodeLabel%>'

      +

      <%=namenodeRole%> '<%=namenodeLabel%>' (<%=namenodeState%>)

      <%= NamenodeJspHelper.getVersionTable(fsn) %>
      -Browse the filesystem
      +<% if (isActive) { %> + Browse the filesystem
      +<% } %> <%=namenodeRole%> Logs
      diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp index 886fbeaa35..35deb05f85 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp @@ -20,6 +20,7 @@ <%@ page contentType="text/html; charset=UTF-8" import="org.apache.hadoop.util.ServletUtil" + import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState" %> <%! //for java.io.Serializable @@ -30,6 +31,8 @@ final NamenodeJspHelper.NodeListJsp nodelistjsp = new NamenodeJspHelper.NodeList NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application); String namenodeRole = nn.getRole().toString(); FSNamesystem fsn = nn.getNamesystem(); +HAServiceState nnHAState = nn.getServiceState(); +boolean isActive = (nnHAState == HAServiceState.ACTIVE); String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameNodeAddress().getPort(); %> @@ -43,7 +46,9 @@ String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameN

      <%=namenodeRole%> '<%=namenodeLabel%>'

      <%= NamenodeJspHelper.getVersionTable(fsn) %>
      -Browse the filesystem
      +<% if (isActive) { %> + Browse the filesystem
      +<% } %> <%=namenodeRole%> Logs
      Go back to DFS home
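[Editor's note] All three web UI JSPs above now hide the "Browse the filesystem" link unless the NameNode reports HAServiceState.ACTIVE, since a standby cannot serve reads. The same check, expressed as plain Java rather than a scriptlet:

    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
    import org.apache.hadoop.hdfs.server.namenode.NameNode;

    // Sketch of the gating logic the JSPs apply before rendering the link.
    public class BrowseLinkGate {
      static boolean shouldShowBrowseLink(NameNode nn) {
        HAServiceState state = nn.getServiceState();
        return state == HAServiceState.ACTIVE;
      }
    }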
      diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java index 17608ac1f7..1d5def6b48 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Set; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; @@ -47,9 +48,11 @@ public class TestResolveHdfsSymlink { @BeforeClass public static void setUp() throws IOException { Configuration conf = new HdfsConfiguration(); + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); cluster = new MiniDFSCluster.Builder(conf).build(); cluster.waitActive(); - NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads(); + } @AfterClass diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java index 83115edaad..7ad56c0e93 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java @@ -27,8 +27,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystemTestHelper; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.security.UserGroupInformation; import org.junit.After; import org.junit.AfterClass; @@ -51,12 +52,15 @@ public class TestViewFileSystemHdfs extends ViewFileSystemBaseTest { public static void clusterSetupAtBegining() throws IOException, LoginException, URISyntaxException { SupportsBlocks = true; + CONF.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + cluster = - new MiniDFSCluster.Builder(CONF).numNameNodes(2).numDataNodes(2) + new MiniDFSCluster.Builder(CONF).nnTopology( + MiniDFSNNTopology.simpleFederatedTopology(2)) + .numDataNodes(2) .build(); cluster.waitClusterUp(); - NameNodeAdapter.getDtSecretManager(cluster.getNamesystem(0)).startThreads(); - NameNodeAdapter.getDtSecretManager(cluster.getNamesystem(1)).startThreads(); fHdfs = cluster.getFileSystem(0); fHdfs2 = cluster.getFileSystem(1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java index 4a60556a43..0e94b4eb3d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java @@ -26,9 +26,9 @@ import javax.security.auth.login.LoginException; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; import 
org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; @@ -51,9 +51,11 @@ public class TestViewFsHdfs extends ViewFsBaseTest { public static void clusterSetupAtBegining() throws IOException, LoginException, URISyntaxException { SupportsBlocks = true; + CONF.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(2).build(); cluster.waitClusterUp(); - NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads(); fc = FileContext.getFileContext(cluster.getURI(0), CONF); defaultWorkingDirectory = fc.makeQualified( new Path("/user/" + UserGroupInformation.getCurrentUser().getShortUserName())); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java index 50a34a8a04..f28648189d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java @@ -109,12 +109,18 @@ public class AppendTestUtil { out.write(bytes); } - static void check(FileSystem fs, Path p, long length) throws IOException { + public static void check(FileSystem fs, Path p, long length) throws IOException { int i = -1; try { final FileStatus status = fs.getFileStatus(p); - TestCase.assertEquals(length, status.getLen()); - InputStream in = fs.open(p); + FSDataInputStream in = fs.open(p); + if (in.getWrappedStream() instanceof DFSInputStream) { + long len = ((DFSInputStream)in.getWrappedStream()).getFileLength(); + TestCase.assertEquals(length, len); + } else { + TestCase.assertEquals(length, status.getLen()); + } + for(i++; i < length; i++) { TestCase.assertEquals((byte)i, (byte)in.read()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java index 409dd37525..7854f95f88 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY; import static org.junit.Assert.assertEquals; import java.io.BufferedOutputStream; @@ -38,9 +40,11 @@ import java.net.URLConnection; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.concurrent.TimeoutException; import org.apache.hadoop.conf.Configuration; @@ -52,6 +56,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSClient.DFSDataInputStream; +import org.apache.hadoop.hdfs.MiniDFSCluster.NameNodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.HdfsConstants; @@ -74,6 +79,8 
@@ import org.apache.hadoop.security.ShellBasedUnixGroupsMapping; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; +import com.google.common.base.Joiner; + /** Utilities for HDFS tests */ public class DFSTestUtil { @@ -681,4 +688,21 @@ public class DFSTestUtil { return BlockOpResponseProto.parseDelimitedFrom(in); } + + public static void setFederatedConfiguration(MiniDFSCluster cluster, + Configuration conf) { + Set nameservices = new HashSet(); + for (NameNodeInfo info : cluster.getNameNodeInfos()) { + assert info.nameserviceId != null; + nameservices.add(info.nameserviceId); + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, + info.nameserviceId), DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME, + info.nameNode.getNameNodeAddress()).toString()); + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, + info.nameserviceId), DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME, + info.nameNode.getNameNodeAddress()).toString()); + } + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, Joiner.on(",") + .join(nameservices)); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java index c3cc6bbcab..8888bec1cf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java @@ -32,6 +32,7 @@ import java.nio.channels.FileChannel; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collection; +import java.util.List; import java.util.Random; import org.apache.commons.logging.Log; @@ -41,15 +42,25 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; + import static org.apache.hadoop.hdfs.DFSConfigKeys.*; + +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.ha.HAServiceProtocolHelper; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; +import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.datanode.DataStorage; import org.apache.hadoop.hdfs.server.datanode.FSDatasetInterface; @@ -60,6 +71,7 @@ import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.hdfs.tools.DFSAdmin; +import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import 
org.apache.hadoop.net.DNSToSwitchMapping; import org.apache.hadoop.net.NetUtils; @@ -69,6 +81,11 @@ import org.apache.hadoop.security.authorize.ProxyUsers; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolRunner; +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.io.Files; + /** * This class creates a single-process DFS cluster for junit testing. * The data directories for non-simulated DFS are under the testing directory. @@ -94,7 +111,6 @@ public class MiniDFSCluster { private int nameNodePort = 0; private int nameNodeHttpPort = 0; private final Configuration conf; - private int numNameNodes = 1; private int numDataNodes = 1; private boolean format = true; private boolean manageNameDfsDirs = true; @@ -106,21 +122,12 @@ public class MiniDFSCluster { private String clusterId = null; private boolean waitSafeMode = true; private boolean setupHostsFile = false; - private boolean federation = false; + private MiniDFSNNTopology nnTopology = null; public Builder(Configuration conf) { this.conf = conf; } - /** - * default false - non federated cluster - * @param val - * @return Builder object - */ - public Builder federation (boolean val){ - this.federation = val; - return this; - } /** * Default: 0 */ @@ -137,14 +144,6 @@ public class MiniDFSCluster { return this; } - /** - * Default: 1 - */ - public Builder numNameNodes(int val) { - this.numNameNodes = val; - return this; - } - /** * Default: 1 */ @@ -234,6 +233,16 @@ public class MiniDFSCluster { return this; } + /** + * Default: a single namenode. + * See {@link MiniDFSNNTopology#simpleFederatedTopology(int)} to set up + * federated nameservices + */ + public Builder nnTopology(MiniDFSNNTopology topology) { + this.nnTopology = topology; + return this; + } + /** * Construct the actual MiniDFSCluster */ @@ -246,15 +255,17 @@ public class MiniDFSCluster { * Used by builder to create and return an instance of MiniDFSCluster */ private MiniDFSCluster(Builder builder) throws IOException { - LOG.info("starting cluster with " + builder.numNameNodes + " namenodes."); - nameNodes = new NameNodeInfo[builder.numNameNodes]; - // try to determine if in federation mode - if(builder.numNameNodes > 1) - builder.federation = true; + if (builder.nnTopology == null) { + // If no topology is specified, build a single NN. + builder.nnTopology = MiniDFSNNTopology.simpleSingleNN( + builder.nameNodePort, builder.nameNodeHttpPort); + } + + LOG.info("starting cluster with " + + builder.nnTopology.countNameNodes() + " namenodes."); + nameNodes = new NameNodeInfo[builder.nnTopology.countNameNodes()]; - initMiniDFSCluster(builder.nameNodePort, - builder.nameNodeHttpPort, - builder.conf, + initMiniDFSCluster(builder.conf, builder.numDataNodes, builder.format, builder.manageNameDfsDirs, @@ -266,7 +277,7 @@ public class MiniDFSCluster { builder.clusterId, builder.waitSafeMode, builder.setupHostsFile, - builder.federation); + builder.nnTopology); } public class DataNodeProperties { @@ -288,8 +299,16 @@ public class MiniDFSCluster { new ArrayList(); private File base_dir; private File data_dir; - private boolean federation = false; private boolean waitSafeMode = true; + private boolean federation; + + /** + * A unique instance identifier for the cluster. This + * is used to disambiguate HA filesystems in the case where + * multiple MiniDFSClusters are used in the same test suite. 
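[Editor's note] The Builder changes above replace numNameNodes()/federation() with a single nnTopology() call. Combined with DFSTestUtil.setFederatedConfiguration() (added earlier in this patch), a test can bring up a two-nameservice cluster and point a client Configuration at every namenode. A sketch following the usage already visible in TestViewFileSystemHdfs:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.hdfs.DFSTestUtil;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology;

    // Sketch of the new builder style: a topology object instead of the
    // removed numNameNodes()/federation() builder methods.
    public class FederatedMiniClusterSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
            .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) // ns1, ns2
            .numDataNodes(2)
            .build();
        try {
          cluster.waitClusterUp();
          // Publish every namenode's RPC address into a client configuration.
          DFSTestUtil.setFederatedConfiguration(cluster, conf);
          FileSystem fs0 = cluster.getFileSystem(0);
          FileSystem fs1 = cluster.getFileSystem(1);
          System.out.println(fs0.getUri() + " / " + fs1.getUri());
        } finally {
          cluster.shutdown();
        }
      }
    }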
+ */ + private int instanceId; + private static int instanceCount = 0; /** * Stores the information related to a namenode in the cluster @@ -297,8 +316,13 @@ public class MiniDFSCluster { static class NameNodeInfo { final NameNode nameNode; final Configuration conf; - NameNodeInfo(NameNode nn, Configuration conf) { + final String nameserviceId; + final String nnId; + NameNodeInfo(NameNode nn, String nameserviceId, String nnId, + Configuration conf) { this.nameNode = nn; + this.nameserviceId = nameserviceId; + this.nnId = nnId; this.conf = conf; } } @@ -309,6 +333,9 @@ public class MiniDFSCluster { */ public MiniDFSCluster() { nameNodes = new NameNodeInfo[0]; // No namenode in the cluster + synchronized (MiniDFSCluster.class) { + instanceId = instanceCount++; + } } /** @@ -480,22 +507,27 @@ public class MiniDFSCluster { String[] racks, String hosts[], long[] simulatedCapacities) throws IOException { this.nameNodes = new NameNodeInfo[1]; // Single namenode in the cluster - initMiniDFSCluster(nameNodePort, 0, conf, numDataNodes, format, + initMiniDFSCluster(conf, numDataNodes, format, manageNameDfsDirs, manageDataDfsDirs, operation, racks, hosts, - simulatedCapacities, null, true, false, false); + simulatedCapacities, null, true, false, + MiniDFSNNTopology.simpleSingleNN(nameNodePort, 0)); } - private void initMiniDFSCluster(int nameNodePort, int nameNodeHttpPort, + private void initMiniDFSCluster( Configuration conf, int numDataNodes, boolean format, boolean manageNameDfsDirs, boolean manageDataDfsDirs, StartupOption operation, String[] racks, String[] hosts, long[] simulatedCapacities, String clusterId, - boolean waitSafeMode, boolean setupHostsFile, boolean federation) + boolean waitSafeMode, boolean setupHostsFile, + MiniDFSNNTopology nnTopology) throws IOException { + synchronized (MiniDFSCluster.class) { + instanceId = instanceCount++; + } + this.conf = conf; base_dir = new File(determineDfsBaseDir()); data_dir = new File(base_dir, "data"); - this.federation = federation; this.waitSafeMode = waitSafeMode; int replication = conf.getInt(DFS_REPLICATION_KEY, 3); @@ -505,28 +537,25 @@ public class MiniDFSCluster { conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, StaticMapping.class, DNSToSwitchMapping.class); - Collection nameserviceIds = DFSUtil.getNameServiceIds(conf); - if(nameserviceIds.size() > 1) - federation = true; - - if (!federation) { - conf.set(FS_DEFAULT_NAME_KEY, "127.0.0.1:" + nameNodePort); - conf.set(DFS_NAMENODE_HTTP_ADDRESS_KEY, "127.0.0.1:" - + nameNodeHttpPort); - NameNode nn = createNameNode(0, conf, numDataNodes, manageNameDfsDirs, - format, operation, clusterId); - nameNodes[0] = new NameNodeInfo(nn, conf); - FileSystem.setDefaultUri(conf, getURI(0)); - } else { - if (nameserviceIds.isEmpty()) { - for (int i = 0; i < nameNodes.length; i++) { - nameserviceIds.add(NAMESERVICE_ID_PREFIX + i); - } - } - initFederationConf(conf, nameserviceIds, numDataNodes, nameNodePort); - createFederationNamenodes(conf, nameserviceIds, manageNameDfsDirs, format, - operation, clusterId); + // In an HA cluster, in order for the StandbyNode to perform checkpoints, + // it needs to know the HTTP port of the Active. So, if ephemeral ports + // are chosen, disable checkpoints for the test. 
+ if (!nnTopology.allHttpPortsSpecified() && + nnTopology.isHA()) { + LOG.info("MiniDFSCluster disabling checkpointing in the Standby node " + + "since no HTTP ports have been specified."); + conf.setBoolean(DFS_HA_STANDBY_CHECKPOINTS_KEY, false); } + if (!nnTopology.allIpcPortsSpecified() && + nnTopology.isHA()) { + LOG.info("MiniDFSCluster disabling log-roll triggering in the " + + "Standby node since no IPC ports have been specified."); + conf.setInt(DFS_HA_LOGROLL_PERIOD_KEY, -1); + } + + federation = nnTopology.isFederated(); + createNameNodesAndSetConf( + nnTopology, manageNameDfsDirs, format, operation, clusterId, conf); if (format) { if (data_dir.exists() && !FileUtil.fullyDelete(data_dir)) { @@ -542,51 +571,130 @@ public class MiniDFSCluster { ProxyUsers.refreshSuperUserGroupsConfiguration(conf); } - /** Initialize configuration for federated cluster */ - private static void initFederationConf(Configuration conf, - Collection nameserviceIds, int numDataNodes, int nnPort) { - String nameserviceIdList = ""; - for (String nameserviceId : nameserviceIds) { - // Create comma separated list of nameserviceIds - if (nameserviceIdList.length() > 0) { - nameserviceIdList += ","; + private void createNameNodesAndSetConf(MiniDFSNNTopology nnTopology, + boolean manageNameDfsDirs, boolean format, StartupOption operation, + String clusterId, + Configuration conf) throws IOException { + Preconditions.checkArgument(nnTopology.countNameNodes() > 0, + "empty NN topology: no namenodes specified!"); + + if (!federation && nnTopology.countNameNodes() == 1) { + NNConf onlyNN = nnTopology.getOnlyNameNode(); + // we only had one NN, set DEFAULT_NAME for it + conf.set(FS_DEFAULT_NAME_KEY, "127.0.0.1:" + onlyNN.getIpcPort()); + } + + List allNsIds = Lists.newArrayList(); + for (MiniDFSNNTopology.NSConf nameservice : nnTopology.getNameservices()) { + if (nameservice.getId() != null) { + allNsIds.add(nameservice.getId()); } - nameserviceIdList += nameserviceId; - initFederatedNamenodeAddress(conf, nameserviceId, nnPort); - nnPort = nnPort == 0 ? 
0 : nnPort + 2; } - conf.set(DFS_FEDERATION_NAMESERVICES, nameserviceIdList); - } - - /* For federated namenode initialize the address:port */ - private static void initFederatedNamenodeAddress(Configuration conf, - String nameserviceId, int nnPort) { - // Set nameserviceId specific key - String key = DFSUtil.getNameServiceIdKey( - DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId); - conf.set(key, "127.0.0.1:0"); - - key = DFSUtil.getNameServiceIdKey( - DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId); - conf.set(key, "127.0.0.1:" + nnPort); - } - - private void createFederationNamenodes(Configuration conf, - Collection nameserviceIds, boolean manageNameDfsDirs, - boolean format, StartupOption operation, String clusterId) - throws IOException { - // Create namenodes in the cluster + if (!allNsIds.isEmpty()) { + conf.set(DFS_FEDERATION_NAMESERVICES, Joiner.on(",").join(allNsIds)); + } + int nnCounter = 0; - for (String nameserviceId : nameserviceIds) { - createFederatedNameNode(nnCounter++, conf, numDataNodes, manageNameDfsDirs, - format, operation, clusterId, nameserviceId); + for (MiniDFSNNTopology.NSConf nameservice : nnTopology.getNameservices()) { + String nsId = nameservice.getId(); + + Preconditions.checkArgument( + !federation || nsId != null, + "if there is more than one NS, they must have names"); + + // First set up the configuration which all of the NNs + // need to have - have to do this a priori before starting + // *any* of the NNs, so they know to come up in standby. + List nnIds = Lists.newArrayList(); + // Iterate over the NNs in this nameservice + for (NNConf nn : nameservice.getNNs()) { + nnIds.add(nn.getNnId()); + + initNameNodeAddress(conf, nameservice.getId(), nn); + } + + // If HA is enabled on this nameservice, enumerate all the namenodes + // in the configuration. Also need to set a shared edits dir + if (nnIds.size() > 1) { + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, nameservice.getId()), + Joiner.on(",").join(nnIds)); + if (manageNameDfsDirs) { + URI sharedEditsUri = getSharedEditsDir(nnCounter, nnCounter+nnIds.size()-1); + conf.set(DFS_NAMENODE_SHARED_EDITS_DIR_KEY, sharedEditsUri.toString()); + } + } + + // Now format first NN and copy the storage directory from that node to the others. + int i = 0; + Collection prevNNDirs = null; + int nnCounterForFormat = nnCounter; + for (NNConf nn : nameservice.getNNs()) { + initNameNodeConf(conf, nsId, nn.getNnId(), manageNameDfsDirs, + nnCounterForFormat); + Collection namespaceDirs = FSNamesystem.getNamespaceDirs(conf); + if (format) { + for (URI nameDirUri : namespaceDirs) { + File nameDir = new File(nameDirUri); + if (nameDir.exists() && !FileUtil.fullyDelete(nameDir)) { + throw new IOException("Could not fully delete " + nameDir); + } + } + } + + boolean formatThisOne = format; + if (format && i++ > 0) { + // Don't format the second NN in an HA setup - that + // would result in it having a different clusterID, + // block pool ID, etc. Instead, copy the name dirs + // from the first one. 
+ formatThisOne = false; + assert (null != prevNNDirs); + copyNameDirs(prevNNDirs, namespaceDirs, conf); + } + + nnCounterForFormat++; + if (formatThisOne) { + DFSTestUtil.formatNameNode(conf); + } + prevNNDirs = namespaceDirs; + } + + // Start all Namenodes + for (NNConf nn : nameservice.getNNs()) { + initNameNodeConf(conf, nsId, nn.getNnId(), manageNameDfsDirs, nnCounter); + createNameNode(nnCounter++, conf, numDataNodes, false, operation, + clusterId, nsId, nn.getNnId()); + } + } + } - private NameNode createNameNode(int nnIndex, Configuration conf, - int numDataNodes, boolean manageNameDfsDirs, boolean format, - StartupOption operation, String clusterId) + public URI getSharedEditsDir(int minNN, int maxNN) throws IOException { + return formatSharedEditsDir(base_dir, minNN, maxNN); + } + + public static URI formatSharedEditsDir(File baseDir, int minNN, int maxNN) throws IOException { + return fileAsURI(new File(baseDir, "shared-edits-" + + minNN + "-through-" + maxNN)); + } + + public NameNodeInfo[] getNameNodeInfos() { + return this.nameNodes; + } + + private void initNameNodeConf(Configuration conf, + String nameserviceId, String nnId, + boolean manageNameDfsDirs, int nnIndex) + throws IOException { + if (nameserviceId != null) { + conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId); + } + if (nnId != null) { + conf.set(DFS_HA_NAMENODE_ID_KEY, nnId); + } + if (manageNameDfsDirs) { conf.set(DFS_NAMENODE_NAME_DIR_KEY, fileAsURI(new File(base_dir, "name" + (2*nnIndex + 1)))+","+ @@ -595,7 +703,50 @@ public class MiniDFSCluster { fileAsURI(new File(base_dir, "namesecondary" + (2*nnIndex + 1)))+","+ fileAsURI(new File(base_dir, "namesecondary" + (2*nnIndex + 2)))); } - + } + + private void copyNameDirs(Collection srcDirs, Collection dstDirs, + Configuration dstConf) throws IOException { + URI srcDir = Lists.newArrayList(srcDirs).get(0); + FileSystem dstFS = FileSystem.getLocal(dstConf).getRaw(); + for (URI dstDir : dstDirs) { + Preconditions.checkArgument(!dstDir.equals(srcDir)); + File dstDirF = new File(dstDir); + if (dstDirF.exists()) { + Files.deleteRecursively(dstDirF); + } + LOG.info("Copying namedir from primary node dir " + + srcDir + " to " + dstDir); + FileUtil.copy( + new File(srcDir), + dstFS, new Path(dstDir), false, dstConf); + } + } + + /** + * Initialize the address and port for this NameNode. In the + * non-federated case, the nameservice and namenode ID may be + * null. + */ + private static void initNameNodeAddress(Configuration conf, + String nameserviceId, NNConf nnConf) { + // Set NN-specific specific key + String key = DFSUtil.addKeySuffixes( + DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId, + nnConf.getNnId()); + conf.set(key, "127.0.0.1:" + nnConf.getHttpPort()); + + key = DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId, + nnConf.getNnId()); + conf.set(key, "127.0.0.1:" + nnConf.getIpcPort()); + } + + private void createNameNode(int nnIndex, Configuration conf, + int numDataNodes, boolean format, StartupOption operation, + String clusterId, String nameserviceId, + String nnId) + throws IOException { // Format and clean out DataNode directories if (format) { DFSTestUtil.formatNameNode(conf); @@ -609,25 +760,20 @@ public class MiniDFSCluster { operation == StartupOption.FORMAT || operation == StartupOption.REGULAR) ? 
new String[] {} : new String[] {operation.getName()}; - return NameNode.createNameNode(args, conf); - } - - private void createFederatedNameNode(int nnIndex, Configuration conf, - int numDataNodes, boolean manageNameDfsDirs, boolean format, - StartupOption operation, String clusterId, String nameserviceId) - throws IOException { - conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId); - NameNode nn = createNameNode(nnIndex, conf, numDataNodes, manageNameDfsDirs, - format, operation, clusterId); - conf.set(DFSUtil.getNameServiceIdKey( - DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId), NetUtils + NameNode nn = NameNode.createNameNode(args, conf); + + // After the NN has started, set back the bound ports into + // the conf + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId, nnId), NetUtils .getHostPortString(nn.getNameNodeAddress())); - conf.set(DFSUtil.getNameServiceIdKey( - DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId), NetUtils + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId, nnId), NetUtils .getHostPortString(nn.getHttpAddress())); - DFSUtil.setGenericConf(conf, nameserviceId, + DFSUtil.setGenericConf(conf, nameserviceId, nnId, DFS_NAMENODE_HTTP_ADDRESS_KEY); - nameNodes[nnIndex] = new NameNodeInfo(nn, new Configuration(conf)); + nameNodes[nnIndex] = new NameNodeInfo(nn, nameserviceId, nnId, + new Configuration(conf)); } /** @@ -652,6 +798,10 @@ public class MiniDFSCluster { } return uri; } + + public int getInstanceId() { + return instanceId; + } /** * @return Configuration of for the given namenode @@ -1005,7 +1155,14 @@ public class MiniDFSCluster { */ public NamenodeProtocols getNameNodeRpc() { checkSingleNameNode(); - return getNameNode(0).getRpcServer(); + return getNameNodeRpc(0); + } + + /** + * Get an instance of the NameNode's RPC handler. + */ + public NamenodeProtocols getNameNodeRpc(int nnIndex) { + return getNameNode(nnIndex).getRpcServer(); } /** @@ -1075,6 +1232,7 @@ public class MiniDFSCluster { LOG.info("Shutting down the Mini HDFS Cluster"); shutdownDataNodes(); for (NameNodeInfo nnInfo : nameNodes) { + if (nnInfo == null) continue; NameNode nameNode = nnInfo.nameNode; if (nameNode != null) { nameNode.stop(); @@ -1116,7 +1274,16 @@ public class MiniDFSCluster { nn.stop(); nn.join(); Configuration conf = nameNodes[nnIndex].conf; - nameNodes[nnIndex] = new NameNodeInfo(null, conf); + nameNodes[nnIndex] = new NameNodeInfo(null, null, null, conf); + } + } + + /** + * Restart all namenodes. + */ + public synchronized void restartNameNodes() throws IOException { + for (int i = 0; i < nameNodes.length; i++) { + restartNameNode(i); } } @@ -1150,10 +1317,12 @@ public class MiniDFSCluster { */ public synchronized void restartNameNode(int nnIndex, boolean waitActive) throws IOException { + String nameserviceId = nameNodes[nnIndex].nameserviceId; + String nnId = nameNodes[nnIndex].nnId; Configuration conf = nameNodes[nnIndex].conf; shutdownNameNode(nnIndex); NameNode nn = NameNode.createNameNode(new String[] {}, conf); - nameNodes[nnIndex] = new NameNodeInfo(nn, conf); + nameNodes[nnIndex] = new NameNodeInfo(nn, nameserviceId, nnId, conf); if (waitActive) { waitClusterUp(); LOG.info("Restarted the namenode"); @@ -1345,17 +1514,11 @@ public class MiniDFSCluster { return false; } long[] sizes; - try { - sizes = nameNode.getRpcServer().getStats(); - } catch (IOException ioe) { - // This method above should never throw. 
- // It only throws IOE since it is exposed via RPC - throw (AssertionError)(new AssertionError("Unexpected IOE thrown: " - + StringUtils.stringifyException(ioe)).initCause(ioe)); - } + sizes = NameNodeAdapter.getStats(nameNode.getNamesystem()); boolean isUp = false; synchronized (this) { - isUp = ((!nameNode.isInSafeMode() || !waitSafeMode) && sizes[0] != 0); + isUp = ((!nameNode.isInSafeMode() || !waitSafeMode) && + sizes[ClientProtocol.GET_STATS_CAPACITY_IDX] != 0); } return isUp; } @@ -1459,9 +1622,48 @@ public class MiniDFSCluster { /** * Get the directories where the namenode stores its edits. */ - public Collection getNameEditsDirs(int nnIndex) { + public Collection getNameEditsDirs(int nnIndex) throws IOException { return FSNamesystem.getNamespaceEditsDirs(nameNodes[nnIndex].conf); } + + private HAServiceProtocol getHaServiceClient(int nnIndex) throws IOException { + InetSocketAddress addr = nameNodes[nnIndex].nameNode.getServiceRpcAddress(); + return new HAServiceProtocolClientSideTranslatorPB(addr, conf); + } + + public void transitionToActive(int nnIndex) throws IOException, + ServiceFailedException { + HAServiceProtocolHelper.transitionToActive(getHaServiceClient(nnIndex)); + } + + public void transitionToStandby(int nnIndex) throws IOException, + ServiceFailedException { + HAServiceProtocolHelper.transitionToStandby(getHaServiceClient(nnIndex)); + } + + + public void triggerBlockReports() + throws IOException { + for (DataNode dn : getDataNodes()) { + DataNodeAdapter.triggerBlockReport(dn); + } + } + + + public void triggerDeletionReports() + throws IOException { + for (DataNode dn : getDataNodes()) { + DataNodeAdapter.triggerDeletionReport(dn); + } + } + + public void triggerHeartbeats() + throws IOException { + for (DataNode dn : getDataNodes()) { + DataNodeAdapter.triggerHeartbeat(dn); + } + } + /** Wait until the given namenode gets registration from all the datanodes */ public void waitActive(int nnIndex) throws IOException { @@ -1469,6 +1671,7 @@ public class MiniDFSCluster { return; } InetSocketAddress addr = nameNodes[nnIndex].nameNode.getServiceRpcAddress(); + assert addr.getPort() != 0; DFSClient client = new DFSClient(addr, conf); // ensure all datanodes have registered and sent heartbeat to the namenode @@ -1512,9 +1715,9 @@ public class MiniDFSCluster { // If a datanode failed to start, then do not wait for (DataNodeProperties dn : dataNodes) { // the datanode thread communicating with the namenode should be alive - if (!dn.datanode.isBPServiceAlive(addr)) { - LOG.warn("BPOfferService failed to start in datanode " + dn.datanode - + " for namenode at " + addr); + if (!dn.datanode.isConnectedToNN(addr)) { + LOG.warn("BPOfferService in datanode " + dn.datanode + + " failed to connect to namenode at " + addr); return false; } } @@ -1653,6 +1856,10 @@ public class MiniDFSCluster { public void setLeasePeriod(long soft, long hard) { NameNodeAdapter.setLeasePeriod(getNamesystem(), soft, hard); } + + public void setWaitSafeMode(boolean wait) { + this.waitSafeMode = wait; + } /** * Returns the current set of datanodes @@ -1869,7 +2076,7 @@ public class MiniDFSCluster { throws IOException { if(!federation) throw new IOException("cannot add namenode to non-federated cluster"); - + int nnIndex = nameNodes.length; int numNameNodes = nameNodes.length + 1; NameNodeInfo[] newlist = new NameNodeInfo[numNameNodes]; @@ -1880,10 +2087,13 @@ public class MiniDFSCluster { String nameserviceIds = conf.get(DFS_FEDERATION_NAMESERVICES); nameserviceIds += "," + nameserviceId; 
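[Editor's note] The new transitionToActive()/transitionToStandby() and trigger*() helpers above let HA tests drive state changes and force datanode reports without waiting for timers. A short sketch of a typical test-driven failover sequence; the graceful standby-then-active ordering is an illustrative choice, not something this hunk prescribes.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology;

    // Sketch of a test-driven failover using the helpers added above.
    public class FailoverInTestSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
            .nnTopology(MiniDFSNNTopology.simpleHATopology())
            .numDataNodes(1)
            .build();
        try {
          cluster.waitActive();
          cluster.transitionToActive(0);  // nn0 serves clients
          cluster.triggerBlockReports();  // don't wait for the report interval
          cluster.triggerHeartbeats();

          cluster.transitionToStandby(0); // hand over to nn1
          cluster.transitionToActive(1);
          cluster.triggerHeartbeats();    // DNs see the new active's HA status
        } finally {
          cluster.shutdown();
        }
      }
    }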
conf.set(DFS_FEDERATION_NAMESERVICES, nameserviceIds); - - initFederatedNamenodeAddress(conf, nameserviceId, namenodePort); - createFederatedNameNode(nnIndex, conf, numDataNodes, true, true, null, - null, nameserviceId); + + String nnId = null; + initNameNodeAddress(conf, nameserviceId, + new NNConf(nnId).setIpcPort(namenodePort)); + initNameNodeConf(conf, nameserviceId, nnId, true, nnIndex); + createNameNode(nnIndex, conf, numDataNodes, true, null, null, + nameserviceId, nnId); // Refresh datanodes with the newly started namenode for (DataNodeProperties dn : dataNodes) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java new file mode 100644 index 0000000000..4dfbfd81d9 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java @@ -0,0 +1,227 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import java.util.List; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +/** + * This class is used to specify the setup of namenodes when instantiating + * a MiniDFSCluster. It consists of a set of nameservices, each of which + * may have one or more namenodes (in the case of HA) + */ +@InterfaceAudience.LimitedPrivate({"HBase", "HDFS", "Hive", "MapReduce", "Pig"}) +@InterfaceStability.Unstable +public class MiniDFSNNTopology { + private final List nameservices = Lists.newArrayList(); + private boolean federation; + + public MiniDFSNNTopology() { + } + + /** + * Set up a simple non-federated non-HA NN. + */ + public static MiniDFSNNTopology simpleSingleNN( + int nameNodePort, int nameNodeHttpPort) { + return new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf(null) + .addNN(new MiniDFSNNTopology.NNConf(null) + .setHttpPort(nameNodeHttpPort) + .setIpcPort(nameNodePort))); + } + + + /** + * Set up an HA topology with a single HA nameservice. + */ + public static MiniDFSNNTopology simpleHATopology() { + return new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("minidfs-ns") + .addNN(new MiniDFSNNTopology.NNConf("nn1")) + .addNN(new MiniDFSNNTopology.NNConf("nn2"))); + } + + /** + * Set up federated cluster with the given number of nameservices, each + * of which has only a single NameNode. 
+ */ + public static MiniDFSNNTopology simpleFederatedTopology( + int numNameservices) { + MiniDFSNNTopology topology = new MiniDFSNNTopology(); + for (int i = 1; i <= numNameservices; i++) { + topology.addNameservice(new MiniDFSNNTopology.NSConf("ns" + i) + .addNN(new MiniDFSNNTopology.NNConf(null))); + } + topology.setFederation(true); + return topology; + } + + /** + * Set up federated cluster with the given number of nameservices, each + * of which has two NameNodes. + */ + public static MiniDFSNNTopology simpleHAFederatedTopology( + int numNameservices) { + MiniDFSNNTopology topology = new MiniDFSNNTopology(); + for (int i = 0; i < numNameservices; i++) { + topology.addNameservice(new MiniDFSNNTopology.NSConf("ns" + i) + .addNN(new MiniDFSNNTopology.NNConf("nn0")) + .addNN(new MiniDFSNNTopology.NNConf("nn1"))); + } + topology.setFederation(true); + return topology; + } + + public MiniDFSNNTopology setFederation(boolean federation) { + this.federation = federation; + return this; + } + + public MiniDFSNNTopology addNameservice(NSConf nameservice) { + Preconditions.checkArgument(!nameservice.getNNs().isEmpty(), + "Must have at least one NN in a nameservice"); + this.nameservices.add(nameservice); + return this; + } + + public int countNameNodes() { + int count = 0; + for (NSConf ns : nameservices) { + count += ns.nns.size(); + } + return count; + } + + public NNConf getOnlyNameNode() { + Preconditions.checkState(countNameNodes() == 1, + "must have exactly one NN!"); + return nameservices.get(0).getNNs().get(0); + } + + public boolean isFederated() { + return nameservices.size() > 1 || federation; + } + + /** + * @return true if at least one of the nameservices + * in the topology has HA enabled. + */ + public boolean isHA() { + for (NSConf ns : nameservices) { + if (ns.getNNs().size() > 1) { + return true; + } + } + return false; + } + + /** + * @return true if all of the NNs in the cluster have their HTTP + * port specified to be non-ephemeral. + */ + public boolean allHttpPortsSpecified() { + for (NSConf ns : nameservices) { + for (NNConf nn : ns.getNNs()) { + if (nn.getHttpPort() == 0) { + return false; + } + } + } + return true; + } + + /** + * @return true if all of the NNs in the cluster have their IPC + * port specified to be non-ephemeral. 
+ */ + public boolean allIpcPortsSpecified() { + for (NSConf ns : nameservices) { + for (NNConf nn : ns.getNNs()) { + if (nn.getIpcPort() == 0) { + return false; + } + } + } + return true; + } + + public List getNameservices() { + return nameservices; + } + + public static class NSConf { + private final String id; + private final List nns = Lists.newArrayList(); + + public NSConf(String id) { + this.id = id; + } + + public NSConf addNN(NNConf nn) { + this.nns.add(nn); + return this; + } + + public String getId() { + return id; + } + + public List getNNs() { + return nns; + } + } + + public static class NNConf { + private String nnId; + private int httpPort; + private int ipcPort; + + public NNConf(String nnId) { + this.nnId = nnId; + } + + String getNnId() { + return nnId; + } + + int getIpcPort() { + return ipcPort; + } + + int getHttpPort() { + return httpPort; + } + + public NNConf setHttpPort(int httpPort) { + this.httpPort = httpPort; + return this; + } + + public NNConf setIpcPort(int ipcPort) { + this.ipcPort = ipcPort; + return this; + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java new file mode 100644 index 0000000000..a88e8a74ed --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
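[Editor's note] MiniDFSNNTopology, defined above, is a small fluent builder; beyond the simple*() factory methods a test can describe any mix of federation and HA, optionally pinning ports so that allHttpPortsSpecified()/allIpcPortsSpecified() hold and MiniDFSCluster does not disable standby checkpointing or log-roll triggering. A sketch; the nameservice names and port numbers are arbitrary examples.

    import org.apache.hadoop.hdfs.MiniDFSNNTopology;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology.NSConf;

    // Sketch: one HA nameservice plus one single-NN nameservice. Every NN gets
    // an explicit IPC/HTTP port so the ephemeral-port fallbacks in
    // MiniDFSCluster (disabling checkpoints and log-roll) are not triggered.
    public class CustomTopologySketch {
      static MiniDFSNNTopology build() {
        return new MiniDFSNNTopology()
            .addNameservice(new NSConf("ha-ns")
                .addNN(new NNConf("nn1").setIpcPort(10021).setHttpPort(10022))
                .addNN(new NNConf("nn2").setIpcPort(10023).setHttpPort(10024)))
            .addNameservice(new NSConf("simple-ns")
                .addNN(new NNConf(null).setIpcPort(10031).setHttpPort(10032)))
            .setFederation(true); // redundant here, since >1 nameservice already implies federation
      }
    }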
+ */ +package org.apache.hadoop.hdfs; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider; +import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.util.StringUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestDFSClientFailover { + + private static final Log LOG = LogFactory.getLog(TestDFSClientFailover.class); + + private static final Path TEST_FILE = new Path("/tmp/failover-test-file"); + private static final int FILE_LENGTH_TO_VERIFY = 100; + + private Configuration conf = new Configuration(); + private MiniDFSCluster cluster; + + @Before + public void setUpCluster() throws IOException { + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .build(); + cluster.transitionToActive(0); + cluster.waitActive(); + } + + @After + public void tearDownCluster() throws IOException { + cluster.shutdown(); + } + + /** + * Make sure that client failover works when an active NN dies and the standby + * takes over. + */ + @Test + public void testDfsClientFailover() throws IOException, URISyntaxException { + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + + DFSTestUtil.createFile(fs, TEST_FILE, + FILE_LENGTH_TO_VERIFY, (short)1, 1L); + + assertEquals(fs.getFileStatus(TEST_FILE).getLen(), FILE_LENGTH_TO_VERIFY); + cluster.shutdownNameNode(0); + cluster.transitionToActive(1); + assertEquals(fs.getFileStatus(TEST_FILE).getLen(), FILE_LENGTH_TO_VERIFY); + + // Check that it functions even if the URL becomes canonicalized + // to include a port number. + Path withPort = new Path("hdfs://" + + HATestUtil.getLogicalHostname(cluster) + ":" + + NameNode.DEFAULT_PORT + "/" + TEST_FILE.toUri().getPath()); + FileSystem fs2 = withPort.getFileSystem(fs.getConf()); + assertTrue(fs2.exists(withPort)); + + fs.close(); + } + + /** + * Regression test for HDFS-2683. + */ + @Test + public void testLogicalUriShouldNotHavePorts() { + Configuration conf = new HdfsConfiguration(); + conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + ".foo", + ConfiguredFailoverProxyProvider.class.getName()); + Path p = new Path("hdfs://foo:12345/"); + try { + p.getFileSystem(conf).exists(p); + fail("Did not fail with fake FS"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "does not use port information", ioe); + } + } + + /** + * Make sure that a helpful error message is shown if a proxy provider is + * configured for a given URI, but no actual addresses are configured for that + * URI. + */ + @Test + public void testFailureWithMisconfiguredHaNNs() throws Exception { + String logicalHost = "misconfigured-ha-uri"; + Configuration conf = new Configuration(); + conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." 
+ logicalHost, + ConfiguredFailoverProxyProvider.class.getName()); + + URI uri = new URI("hdfs://" + logicalHost + "/test"); + try { + FileSystem.get(uri, conf).exists(new Path("/test")); + fail("Successfully got proxy provider for misconfigured FS"); + } catch (IOException ioe) { + LOG.info("got expected exception", ioe); + assertTrue("expected exception did not contain helpful message", + StringUtils.stringifyException(ioe).contains( + "Could not find any configured addresses for URI " + uri)); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java index a308c230cb..ad3e6d8c55 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hdfs.server.namenode.TestParallelImageWrite; import static org.apache.hadoop.hdfs.server.namenode.NNStorage.getInProgressEditsFileName; import static org.apache.hadoop.hdfs.server.namenode.NNStorage.getImageFileName; +import static org.apache.hadoop.test.GenericTestUtils.assertExists; import org.apache.hadoop.util.StringUtils; import org.junit.BeforeClass; import org.junit.Ignore; @@ -51,7 +52,7 @@ import static org.junit.Assert.*; */ public class TestDFSUpgrade { - private static final int EXPECTED_TXID = 17; + private static final int EXPECTED_TXID = 49; private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName()); private Configuration conf; private int testCounter = 0; @@ -80,16 +81,16 @@ public class TestDFSUpgrade { Joiner.on(" \n").join(new File(baseDir, "current").list())); LOG.info("=================="); - assertTrue(new File(baseDir,"current").isDirectory()); - assertTrue(new File(baseDir,"current/VERSION").isFile()); - assertTrue(new File(baseDir,"current/" - + getInProgressEditsFileName(imageTxId + 1)).isFile()); - assertTrue(new File(baseDir,"current/" - + getImageFileName(imageTxId)).isFile()); - assertTrue(new File(baseDir,"current/seen_txid").isFile()); + assertExists(new File(baseDir,"current")); + assertExists(new File(baseDir,"current/VERSION")); + assertExists(new File(baseDir,"current/" + + getInProgressEditsFileName(imageTxId + 1))); + assertExists(new File(baseDir,"current/" + + getImageFileName(imageTxId))); + assertExists(new File(baseDir,"current/seen_txid")); File previous = new File(baseDir, "previous"); - assertTrue(previous.isDirectory()); + assertExists(previous); assertEquals(UpgradeUtilities.checksumContents(NAME_NODE, previous), UpgradeUtilities.checksumMasterNameNodeContents()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java index 3b93aebbfa..ef8f850395 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java @@ -18,16 +18,20 @@ package org.apache.hadoop.hdfs; +import org.junit.Before; import org.junit.Test; import static org.junit.Assert.*; import java.io.IOException; import java.net.InetSocketAddress; +import java.net.URI; +import java.net.URISyntaxException; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; +import java.util.Map; import 
org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; @@ -39,9 +43,20 @@ import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; + import static org.apache.hadoop.hdfs.DFSConfigKeys.*; public class TestDFSUtil { + + /** + * Reset to default UGI settings since some tests change them. + */ + @Before + public void resetUGI() { + UserGroupInformation.setConfiguration(new Configuration()); + } + /** * Test conversion of LocatedBlock to BlockLocation */ @@ -86,7 +101,7 @@ public class TestDFSUtil { private Configuration setupAddress(String key) { HdfsConfiguration conf = new HdfsConfiguration(); conf.set(DFS_FEDERATION_NAMESERVICES, "nn1"); - conf.set(DFSUtil.getNameServiceIdKey(key, "nn1"), "localhost:9000"); + conf.set(DFSUtil.addKeySuffixes(key, "nn1"), "localhost:9000"); return conf; } @@ -102,7 +117,7 @@ public class TestDFSUtil { } /** - * Test {@link DFSUtil#getNameNodeNameServiceId(Configuration)} to ensure + * Test {@link DFSUtil#getNamenodeNameServiceId(Configuration)} to ensure * nameserviceId for namenode is determined based on matching the address with * local node's address */ @@ -135,7 +150,7 @@ public class TestDFSUtil { } /** - * Test {@link DFSUtil#getNameServiceId(Configuration, String))} to ensure + * Test {@link DFSUtil#getNamenodeNameServiceId(Configuration)} to ensure * exception is thrown when multiple rpc addresses match the local node's * address */ @@ -143,9 +158,9 @@ public class TestDFSUtil { public void testGetNameServiceIdException() { HdfsConfiguration conf = new HdfsConfiguration(); conf.set(DFS_FEDERATION_NAMESERVICES, "nn1,nn2"); - conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"), + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"), "localhost:9000"); - conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"), + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"), "localhost:9001"); DFSUtil.getNamenodeNameServiceId(conf); fail("Expected exception is not thrown"); @@ -178,19 +193,24 @@ public class TestDFSUtil { final String NN1_ADDRESS = "localhost:9000"; final String NN2_ADDRESS = "localhost:9001"; final String NN3_ADDRESS = "localhost:9002"; - conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"), + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"), NN1_ADDRESS); - conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"), + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"), NN2_ADDRESS); - Collection nnAddresses = DFSUtil + Map> nnMap = DFSUtil .getNNServiceRpcAddresses(conf); - assertEquals(2, nnAddresses.size()); - Iterator iterator = nnAddresses.iterator(); - InetSocketAddress addr = iterator.next(); + assertEquals(2, nnMap.size()); + + Map nn1Map = nnMap.get("nn1"); + assertEquals(1, nn1Map.size()); + InetSocketAddress addr = nn1Map.get(null); assertEquals("localhost", addr.getHostName()); assertEquals(9000, addr.getPort()); - addr = iterator.next(); + + Map nn2Map = nnMap.get("nn2"); + assertEquals(1, nn2Map.size()); + addr = nn2Map.get(null); assertEquals("localhost", addr.getHostName()); assertEquals(9001, addr.getPort()); @@ -198,6 +218,10 @@ public class TestDFSUtil { checkNameServiceId(conf, NN1_ADDRESS, "nn1"); checkNameServiceId(conf, NN2_ADDRESS, "nn2"); 
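// A minimal, self-contained sketch of the dot-separated key-suffixing convention that the
// assertions above rely on. This is not the actual DFSUtil source; the shape is inferred from
// suffix patterns such as key + ".ns1" and key + ".ns1.nn1" used elsewhere in this patch, and
// details like null/empty handling are assumptions.
class KeySuffixSketch {
  static String addKeySuffixes(String key, String... suffixes) {
    StringBuilder sb = new StringBuilder(key);
    for (String suffix : suffixes) {
      if (suffix != null && !suffix.isEmpty()) {
        sb.append('.').append(suffix);
      }
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    // Per-nameservice key, as in the federation-only configuration above:
    System.out.println(addKeySuffixes("dfs.namenode.rpc-address", "nn1"));
    // Per-nameservice, per-namenode key, as in the HA variants later in this file:
    System.out.println(addKeySuffixes("dfs.namenode.rpc-address", "ns1", "nn1"));
  }
}
// The nested map returned by DFSUtil.getNNServiceRpcAddresses(conf) mirrors this keying:
// the outer key is the nameservice ID and the inner key is the namenode ID, with null
// entries when no nameservice or HA namenode IDs are configured.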
checkNameServiceId(conf, NN3_ADDRESS, null); + + // HA is not enabled in a purely federated config + assertFalse(HAUtil.isHAEnabled(conf, "nn1")); + assertFalse(HAUtil.isHAEnabled(conf, "nn2")); } public void checkNameServiceId(Configuration conf, String addr, @@ -216,9 +240,14 @@ public class TestDFSUtil { conf.set(FS_DEFAULT_NAME_KEY, hdfs_default); // If DFS_FEDERATION_NAMESERVICES is not set, verify that // default namenode address is returned. - List addrList = DFSUtil.getNNServiceRpcAddresses(conf); - assertEquals(1, addrList.size()); - assertEquals(9999, addrList.get(0).getPort()); + Map> addrMap = + DFSUtil.getNNServiceRpcAddresses(conf); + assertEquals(1, addrMap.size()); + + Map defaultNsMap = addrMap.get(null); + assertEquals(1, defaultNsMap.size()); + + assertEquals(9999, defaultNsMap.get(null).getPort()); } /** @@ -226,20 +255,51 @@ public class TestDFSUtil { * copied to generic keys when the namenode starts. */ @Test - public void testConfModification() throws IOException { + public void testConfModificationFederationOnly() { final HdfsConfiguration conf = new HdfsConfiguration(); - conf.set(DFS_FEDERATION_NAMESERVICES, "nn1"); - conf.set(DFS_FEDERATION_NAMESERVICE_ID, "nn1"); - final String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); + String nsId = "ns1"; + + conf.set(DFS_FEDERATION_NAMESERVICES, nsId); + conf.set(DFS_FEDERATION_NAMESERVICE_ID, nsId); // Set the nameservice specific keys with nameserviceId in the config key for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) { // Note: value is same as the key - conf.set(DFSUtil.getNameServiceIdKey(key, nameserviceId), key); + conf.set(DFSUtil.addKeySuffixes(key, nsId), key); } // Initialize generic keys from specific keys - NameNode.initializeGenericKeys(conf, nameserviceId); + NameNode.initializeGenericKeys(conf, nsId, null); + + // Retrieve the keys without nameserviceId and Ensure generic keys are set + // to the correct value + for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) { + assertEquals(key, conf.get(key)); + } + } + + /** + * Test to ensure nameservice specific keys in the configuration are + * copied to generic keys when the namenode starts. + */ + @Test + public void testConfModificationFederationAndHa() { + final HdfsConfiguration conf = new HdfsConfiguration(); + String nsId = "ns1"; + String nnId = "nn1"; + + conf.set(DFS_FEDERATION_NAMESERVICES, nsId); + conf.set(DFS_FEDERATION_NAMESERVICE_ID, nsId); + conf.set(DFS_HA_NAMENODES_KEY_PREFIX + "." + nsId, nnId); + + // Set the nameservice specific keys with nameserviceId in the config key + for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) { + // Note: value is same as the key + conf.set(DFSUtil.addKeySuffixes(key, nsId, nnId), key); + } + + // Initialize generic keys from specific keys + NameNode.initializeGenericKeys(conf, nsId, nnId); // Retrieve the keys without nameserviceId and Ensure generic keys are set // to the correct value @@ -248,6 +308,39 @@ public class TestDFSUtil { } } + /** + * Regression test for HDFS-2934. + */ + @Test + public void testSomeConfsNNSpecificSomeNSSpecific() { + final HdfsConfiguration conf = new HdfsConfiguration(); + + String key = DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY; + conf.set(key, "global-default"); + conf.set(key + ".ns1", "ns1-override"); + conf.set(key + ".ns1.nn1", "nn1-override"); + + // A namenode in another nameservice should get the global default. 
+ Configuration newConf = new Configuration(conf); + NameNode.initializeGenericKeys(newConf, "ns2", "nn1"); + assertEquals("global-default", newConf.get(key)); + + // A namenode in another non-HA nameservice should get global default. + newConf = new Configuration(conf); + NameNode.initializeGenericKeys(newConf, "ns2", null); + assertEquals("global-default", newConf.get(key)); + + // A namenode in the same nameservice should get the ns setting + newConf = new Configuration(conf); + NameNode.initializeGenericKeys(newConf, "ns1", "nn2"); + assertEquals("ns1-override", newConf.get(key)); + + // The nn with the nn-specific setting should get its own override + newConf = new Configuration(conf); + NameNode.initializeGenericKeys(newConf, "ns1", "nn1"); + assertEquals("nn1-override", newConf.get(key)); + } + /** * Tests for empty configuration, an exception is thrown from * {@link DFSUtil#getNNServiceRpcAddresses(Configuration)} @@ -258,21 +351,30 @@ public class TestDFSUtil { public void testEmptyConf() { HdfsConfiguration conf = new HdfsConfiguration(false); try { - DFSUtil.getNNServiceRpcAddresses(conf); - fail("Expected IOException is not thrown"); + Map> map = + DFSUtil.getNNServiceRpcAddresses(conf); + fail("Expected IOException is not thrown, result was: " + + DFSUtil.addressMapToString(map)); } catch (IOException expected) { + /** Expected */ } try { - DFSUtil.getBackupNodeAddresses(conf); - fail("Expected IOException is not thrown"); + Map> map = + DFSUtil.getBackupNodeAddresses(conf); + fail("Expected IOException is not thrown, result was: " + + DFSUtil.addressMapToString(map)); } catch (IOException expected) { + /** Expected */ } try { - DFSUtil.getSecondaryNameNodeAddresses(conf); - fail("Expected IOException is not thrown"); + Map> map = + DFSUtil.getSecondaryNameNodeAddresses(conf); + fail("Expected IOException is not thrown, result was: " + + DFSUtil.addressMapToString(map)); } catch (IOException expected) { + /** Expected */ } } @@ -286,5 +388,144 @@ public class TestDFSUtil { String httpport = DFSUtil.getInfoServer(null, conf, false); assertEquals("0.0.0.0:50070", httpport); } + + @Test + public void testHANameNodesWithFederation() throws URISyntaxException { + HdfsConfiguration conf = new HdfsConfiguration(); + + final String NS1_NN1_HOST = "ns1-nn1.example.com:8020"; + final String NS1_NN2_HOST = "ns1-nn2.example.com:8020"; + final String NS2_NN1_HOST = "ns2-nn1.example.com:8020"; + final String NS2_NN2_HOST = "ns2-nn2.example.com:8020"; + conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, "hdfs://ns1"); + + // Two nameservices, each with two NNs. 
+ conf.set(DFS_FEDERATION_NAMESERVICES, "ns1,ns2"); + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"), + "ns1-nn1,ns1-nn2"); + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns2"), + "ns2-nn1,ns2-nn2"); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "ns1-nn1"), + NS1_NN1_HOST); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "ns1-nn2"), + NS1_NN2_HOST); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns2", "ns2-nn1"), + NS2_NN1_HOST); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns2", "ns2-nn2"), + NS2_NN2_HOST); + + Map> map = + DFSUtil.getHaNnRpcAddresses(conf); -} \ No newline at end of file + assertTrue(HAUtil.isHAEnabled(conf, "ns1")); + assertTrue(HAUtil.isHAEnabled(conf, "ns2")); + assertFalse(HAUtil.isHAEnabled(conf, "ns3")); + + assertEquals(NS1_NN1_HOST, map.get("ns1").get("ns1-nn1").toString()); + assertEquals(NS1_NN2_HOST, map.get("ns1").get("ns1-nn2").toString()); + assertEquals(NS2_NN1_HOST, map.get("ns2").get("ns2-nn1").toString()); + assertEquals(NS2_NN2_HOST, map.get("ns2").get("ns2-nn2").toString()); + + assertEquals(NS1_NN1_HOST, + DFSUtil.getNamenodeServiceAddr(conf, "ns1", "ns1-nn1")); + assertEquals(NS1_NN2_HOST, + DFSUtil.getNamenodeServiceAddr(conf, "ns1", "ns1-nn2")); + assertEquals(NS2_NN1_HOST, + DFSUtil.getNamenodeServiceAddr(conf, "ns2", "ns2-nn1")); + + // No nameservice was given and we can't determine which service addr + // to use as two nameservices could share a namenode ID. + assertEquals(null, DFSUtil.getNamenodeServiceAddr(conf, null, "ns1-nn1")); + + // Ditto for nameservice IDs, if multiple are defined + assertEquals(null, DFSUtil.getNamenodeNameServiceId(conf)); + assertEquals(null, DFSUtil.getSecondaryNameServiceId(conf)); + + Collection uris = DFSUtil.getNameServiceUris(conf, DFS_NAMENODE_RPC_ADDRESS_KEY); + assertEquals(2, uris.size()); + assertTrue(uris.contains(new URI("hdfs://ns1"))); + assertTrue(uris.contains(new URI("hdfs://ns2"))); + } + + @Test + public void getNameNodeServiceAddr() throws IOException { + HdfsConfiguration conf = new HdfsConfiguration(); + + // One nameservice with two NNs + final String NS1_NN1_HOST = "ns1-nn1.example.com:8020"; + final String NS1_NN1_HOST_SVC = "ns1-nn2.example.com:8021"; + final String NS1_NN2_HOST = "ns1-nn1.example.com:8020"; + final String NS1_NN2_HOST_SVC = "ns1-nn2.example.com:8021"; + + conf.set(DFS_FEDERATION_NAMESERVICES, "ns1"); + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"),"nn1,nn2"); + + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST); + + // The rpc address is used if no service address is defined + assertEquals(NS1_NN1_HOST, DFSUtil.getNamenodeServiceAddr(conf, null, "nn1")); + assertEquals(NS1_NN2_HOST, DFSUtil.getNamenodeServiceAddr(conf, null, "nn2")); + + // A nameservice is specified explicitly + assertEquals(NS1_NN1_HOST, DFSUtil.getNamenodeServiceAddr(conf, "ns1", "nn1")); + assertEquals(null, DFSUtil.getNamenodeServiceAddr(conf, "invalid", "nn1")); + + // The service addrs are used when they are defined + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST_SVC); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST_SVC); + + assertEquals(NS1_NN1_HOST_SVC, DFSUtil.getNamenodeServiceAddr(conf, 
null, "nn1")); + assertEquals(NS1_NN2_HOST_SVC, DFSUtil.getNamenodeServiceAddr(conf, null, "nn2")); + + // We can determine the nameservice ID, there's only one listed + assertEquals("ns1", DFSUtil.getNamenodeNameServiceId(conf)); + assertEquals("ns1", DFSUtil.getSecondaryNameServiceId(conf)); + } + + @Test + public void testSubstituteForWildcardAddress() throws IOException { + assertEquals("foo:12345", + DFSUtil.substituteForWildcardAddress("0.0.0.0:12345", "foo")); + assertEquals("127.0.0.1:12345", + DFSUtil.substituteForWildcardAddress("127.0.0.1:12345", "foo")); + } + + @Test + public void testGetNNUris() throws Exception { + HdfsConfiguration conf = new HdfsConfiguration(); + + final String NS1_NN1_HOST = "ns1-nn1.example.com:8020"; + final String NS1_NN2_HOST = "ns1-nn1.example.com:8020"; + final String NS2_NN_HOST = "ns2-nn.example.com:8020"; + final String NN_HOST = "nn.example.com:8020"; + + conf.set(DFS_FEDERATION_NAMESERVICES, "ns1,ns2"); + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"),"nn1,nn2"); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST); + conf.set(DFSUtil.addKeySuffixes( + DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST); + + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns2"), + NS2_NN_HOST); + + conf.set(DFS_NAMENODE_RPC_ADDRESS_KEY, "hdfs://" + NN_HOST); + + Collection uris = DFSUtil.getNameServiceUris(conf, DFS_NAMENODE_RPC_ADDRESS_KEY, + DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY); + + assertEquals(3, uris.size()); + assertTrue(uris.contains(new URI("hdfs://ns1"))); + assertTrue(uris.contains(new URI("hdfs://" + NS2_NN_HOST))); + assertTrue(uris.contains(new URI("hdfs://" + NN_HOST))); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java index 089ab4d837..af0bf6a19d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java @@ -302,7 +302,7 @@ public class TestDataTransferProtocol extends TestCase { testWrite(firstBlock, BlockConstructionStage.PIPELINE_SETUP_CREATE, 0L, "Cannot create a RBW block", true); // test PIPELINE_SETUP_APPEND on an existing block - newGS = newBlock.getGenerationStamp() + 1; + newGS = firstBlock.getGenerationStamp() + 1; testWrite(firstBlock, BlockConstructionStage.PIPELINE_SETUP_APPEND, newGS, "Cannot append to a RBW replica", true); // test PIPELINE_SETUP_APPEND on an existing block diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index faf7efd536..6997ebc2e7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -279,7 +279,8 @@ public class TestDecommission { * @throws IOException */ private void startCluster(int numNameNodes, int numDatanodes, Configuration conf) throws IOException { - cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes) + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(numNameNodes)) .numDataNodes(numDatanodes).build(); cluster.waitActive(); 
for (int i = 0; i < numNameNodes; i++) { @@ -507,7 +508,8 @@ public class TestDecommission { InterruptedException { conf.set(DFSConfigKeys.DFS_HOSTS, hostsFile.toUri().getPath()); int numDatanodes = 1; - cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes) + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(numNameNodes)) .numDataNodes(numDatanodes).setupHostsFile(true).build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java index 033478f3a8..e10eab8c57 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java @@ -90,7 +90,7 @@ public class TestFileAppendRestart { cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build(); FileSystem fs = cluster.getFileSystem(); File editLog = - new File(FSImageTestUtil.getNameNodeCurrentDirs(cluster).get(0), + new File(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0).get(0), NNStorage.getInProgressEditsFileName(1)); EnumMap> counts; @@ -98,18 +98,31 @@ public class TestFileAppendRestart { writeAndAppend(fs, p1, BLOCK_SIZE, BLOCK_SIZE); counts = FSImageTestUtil.countEditLogOpTypes(editLog); + // OP_ADD to create file + // OP_UPDATE_BLOCKS for first block + // OP_CLOSE to close file + // OP_ADD to reopen file + // OP_UPDATE_BLOCKS for second block + // OP_CLOSE to close file assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_ADD).held); + assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_UPDATE_BLOCKS).held); assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held); Path p2 = new Path("/not-block-boundaries"); writeAndAppend(fs, p2, BLOCK_SIZE/2, BLOCK_SIZE); counts = FSImageTestUtil.countEditLogOpTypes(editLog); - // We get *3* OP_ADDS from this test rather than two. 
The first - // OP_ADD comes from re-opening the file to establish the lease, - // the second comes from the updatePipeline call when the block - // itself has its generation stamp incremented - assertEquals(5, (int)counts.get(FSEditLogOpCodes.OP_ADD).held); - assertEquals(4, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held); + // OP_ADD to create file + // OP_UPDATE_BLOCKS for first block + // OP_CLOSE to close file + // OP_ADD to re-establish the lease + // OP_UPDATE_BLOCKS from the updatePipeline call (increments genstamp of last block) + // OP_UPDATE_BLOCKS at the start of the second block + // OP_CLOSE to close file + // Total: 2 OP_ADDs, 3 OP_UPDATE_BLOCKS, and 2 OP_CLOSEs in addition + // to the ones above + assertEquals(2+2, (int)counts.get(FSEditLogOpCodes.OP_ADD).held); + assertEquals(2+3, (int)counts.get(FSEditLogOpCodes.OP_UPDATE_BLOCKS).held); + assertEquals(2+2, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held); cluster.restartNameNode(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java index af9d05c061..d3df0c0c04 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java @@ -146,8 +146,14 @@ public class TestFileCorruption extends TestCase { // report corrupted block by the third datanode DatanodeRegistration dnR = DataNodeTestUtils.getDNRegistrationForBP(dataNode, blk.getBlockPoolId()); - cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt( - blk, new DatanodeInfo(dnR), "TEST"); + FSNamesystem ns = cluster.getNamesystem(); + ns.writeLock(); + try { + cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt( + blk, new DatanodeInfo(dnR), "TEST"); + } finally { + ns.writeUnlock(); + } // open the file fs.open(FILE_PATH); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java index 8693885ec6..b0878d1eb8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java @@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.LocatedBlock; -import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.server.common.GenerationStamp; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; @@ -34,8 +33,6 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ipc.RemoteException; -import org.apache.hadoop.security.UserGroupInformation; - import junit.framework.TestCase; /** * This class tests if block replacement request to data nodes work correctly. 
@@ -97,8 +94,8 @@ public class TestGetBlocks extends TestCase { // get RPC client to namenode InetSocketAddress addr = new InetSocketAddress("localhost", cluster.getNameNodePort()); - NamenodeProtocol namenode = new NamenodeProtocolTranslatorPB(addr, CONF, - UserGroupInformation.getCurrentUser()); + NamenodeProtocol namenode = NameNodeProxies.createProxy(CONF, + NameNode.getUri(addr), NamenodeProtocol.class).getProxy(); // get blocks of size fileLen from dataNodes[0] BlockWithLocations[] locs; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java index 0d8174e55c..3e90665590 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java @@ -22,6 +22,7 @@ import java.net.InetSocketAddress; import junit.framework.Assert; +import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; @@ -31,8 +32,13 @@ import org.apache.hadoop.hdfs.protocolPB.JournalProtocolTranslatorPB; import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB; import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB; import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.RefreshUserMappingsProtocol; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol; +import org.apache.hadoop.tools.GetUserMappingsProtocol; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -49,7 +55,7 @@ public class TestIsMethodSupported { @BeforeClass public static void setUp() throws Exception { - cluster = (new MiniDFSCluster.Builder(conf)).numNameNodes(1) + cluster = (new MiniDFSCluster.Builder(conf)) .numDataNodes(1).build(); nnAddress = cluster.getNameNode().getNameNodeAddress(); dnAddress = new InetSocketAddress(cluster.getDataNodes().get(0) @@ -66,8 +72,9 @@ public class TestIsMethodSupported { @Test public void testNamenodeProtocol() throws IOException { NamenodeProtocolTranslatorPB translator = - new NamenodeProtocolTranslatorPB(nnAddress, conf, - UserGroupInformation.getCurrentUser()); + (NamenodeProtocolTranslatorPB) NameNodeProxies.createNonHAProxy(conf, + nnAddress, NamenodeProtocol.class, UserGroupInformation.getCurrentUser(), + true).getProxy(); boolean exists = translator.isMethodSupported("rollEditLog"); Assert.assertTrue(exists); exists = translator.isMethodSupported("bogusMethod"); @@ -99,15 +106,17 @@ public class TestIsMethodSupported { @Test public void testClientNamenodeProtocol() throws IOException { ClientNamenodeProtocolTranslatorPB translator = - new ClientNamenodeProtocolTranslatorPB(nnAddress, conf, - UserGroupInformation.getCurrentUser()); + (ClientNamenodeProtocolTranslatorPB) NameNodeProxies.createNonHAProxy( + conf, nnAddress, ClientProtocol.class, + UserGroupInformation.getCurrentUser(), 
true).getProxy(); Assert.assertTrue(translator.isMethodSupported("mkdirs")); } @Test public void tesJournalProtocol() throws IOException { - JournalProtocolTranslatorPB translator = - new JournalProtocolTranslatorPB(nnAddress, conf); + JournalProtocolTranslatorPB translator = (JournalProtocolTranslatorPB) + NameNodeProxies.createNonHAProxy(conf, nnAddress, JournalProtocol.class, + UserGroupInformation.getCurrentUser(), true).getProxy(); //Nameode doesn't implement JournalProtocol Assert.assertFalse(translator.isMethodSupported("startLogSegment")); } @@ -130,24 +139,30 @@ public class TestIsMethodSupported { @Test public void testGetUserMappingsProtocol() throws IOException { GetUserMappingsProtocolClientSideTranslatorPB translator = - new GetUserMappingsProtocolClientSideTranslatorPB( - nnAddress, UserGroupInformation.getCurrentUser(), conf); + (GetUserMappingsProtocolClientSideTranslatorPB) + NameNodeProxies.createNonHAProxy(conf, nnAddress, + GetUserMappingsProtocol.class, UserGroupInformation.getCurrentUser(), + true).getProxy(); Assert.assertTrue(translator.isMethodSupported("getGroupsForUser")); } @Test public void testRefreshAuthorizationPolicyProtocol() throws IOException { - RefreshAuthorizationPolicyProtocolClientSideTranslatorPB translator = - new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB( - nnAddress, UserGroupInformation.getCurrentUser(), conf); + RefreshAuthorizationPolicyProtocolClientSideTranslatorPB translator = + (RefreshAuthorizationPolicyProtocolClientSideTranslatorPB) + NameNodeProxies.createNonHAProxy(conf, nnAddress, + RefreshAuthorizationPolicyProtocol.class, + UserGroupInformation.getCurrentUser(), true).getProxy(); Assert.assertTrue(translator.isMethodSupported("refreshServiceAcl")); } @Test public void testRefreshUserMappingsProtocol() throws IOException { RefreshUserMappingsProtocolClientSideTranslatorPB translator = - new RefreshUserMappingsProtocolClientSideTranslatorPB( - nnAddress, UserGroupInformation.getCurrentUser(), conf); + (RefreshUserMappingsProtocolClientSideTranslatorPB) + NameNodeProxies.createNonHAProxy(conf, nnAddress, + RefreshUserMappingsProtocol.class, + UserGroupInformation.getCurrentUser(), true).getProxy(); Assert.assertTrue( translator.isMethodSupported("refreshUserToGroupsMappings")); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java index 4e3152385c..0eec0d1877 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hdfs; import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.protocol.FSConstants; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -37,11 +38,13 @@ public class TestMiniDFSCluster { private static final String CLUSTER_1 = "cluster1"; private static final String CLUSTER_2 = "cluster2"; private static final String CLUSTER_3 = "cluster3"; + private static final String CLUSTER_4 = "cluster4"; protected String testDataPath; protected File testDataDir; @Before public void setUp() { - testDataPath = System.getProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA); + testDataPath = System.getProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA, + "build/test/data"); testDataDir = new File(new 
File(testDataPath).getParentFile(), "miniclusters"); @@ -103,5 +106,21 @@ public class TestMiniDFSCluster { } } - + @Test(timeout=100000) + public void testIsClusterUpAfterShutdown() throws Throwable { + Configuration conf = new HdfsConfiguration(); + File testDataCluster4 = new File(testDataPath, CLUSTER_4); + String c4Path = testDataCluster4.getAbsolutePath(); + conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, c4Path); + MiniDFSCluster cluster4 = new MiniDFSCluster.Builder(conf).build(); + try { + DistributedFileSystem dfs = (DistributedFileSystem) cluster4.getFileSystem(); + dfs.setSafeMode(FSConstants.SafeModeAction.SAFEMODE_ENTER); + cluster4.shutdown(); + } finally { + while(cluster4.isClusterUp()){ + Thread.sleep(1000); + } + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java new file mode 100644 index 0000000000..cb989298fa --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java @@ -0,0 +1,353 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdfs; + +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.LocatedBlocks; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; +import org.apache.hadoop.hdfs.server.namenode.FSEditLog; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.INodeFileUnderConstruction; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.log4j.Level; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.Collection; +import java.util.List; +import java.util.Random; +import static org.junit.Assert.*; +import org.junit.Test; + +import com.google.common.collect.Lists; + +/** + * A JUnit test for checking if restarting DFS preserves the + * blocks that are part of an unclosed file. 
+ */ +public class TestPersistBlocks { + static { + ((Log4JLogger)FSImage.LOG).getLogger().setLevel(Level.ALL); + ((Log4JLogger)FSNamesystem.LOG).getLogger().setLevel(Level.ALL); + } + + private static final int BLOCK_SIZE = 4096; + private static final int NUM_BLOCKS = 5; + + private static final String FILE_NAME = "/data"; + private static final Path FILE_PATH = new Path(FILE_NAME); + + static final byte[] DATA_BEFORE_RESTART = new byte[BLOCK_SIZE * NUM_BLOCKS]; + static final byte[] DATA_AFTER_RESTART = new byte[BLOCK_SIZE * NUM_BLOCKS]; + + private static final String HADOOP_1_0_MULTIBLOCK_TGZ = + "hadoop-1.0-multiblock-file.tgz"; + static { + Random rand = new Random(); + rand.nextBytes(DATA_BEFORE_RESTART); + rand.nextBytes(DATA_AFTER_RESTART); + } + + /** check if DFS remains in proper condition after a restart */ + @Test + public void testRestartDfs() throws Exception { + final Configuration conf = new HdfsConfiguration(); + // Turn off persistent IPC, so that the DFSClient can survive NN restart + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, + 0); + conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true); + MiniDFSCluster cluster = null; + + long len = 0; + FSDataOutputStream stream; + try { + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + FileSystem fs = cluster.getFileSystem(); + // Creating a file with 4096 blockSize to write multiple blocks + stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); + stream.write(DATA_BEFORE_RESTART); + stream.hflush(); + + // Wait for at least a few blocks to get through + while (len <= BLOCK_SIZE) { + FileStatus status = fs.getFileStatus(FILE_PATH); + len = status.getLen(); + Thread.sleep(100); + } + + // explicitly do NOT close the file. + cluster.restartNameNode(); + + // Check that the file has no less bytes than before the restart + // This would mean that blocks were successfully persisted to the log + FileStatus status = fs.getFileStatus(FILE_PATH); + assertTrue("Length too short: " + status.getLen(), + status.getLen() >= len); + + // And keep writing (ensures that leases are also persisted correctly) + stream.write(DATA_AFTER_RESTART); + stream.close(); + + // Verify that the data showed up, both from before and after the restart. 
+ FSDataInputStream readStream = fs.open(FILE_PATH); + try { + byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length]; + IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length); + assertArrayEquals(DATA_BEFORE_RESTART, verifyBuf); + + IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length); + assertArrayEquals(DATA_AFTER_RESTART, verifyBuf); + } finally { + IOUtils.closeStream(readStream); + } + } finally { + if (cluster != null) { cluster.shutdown(); } + } + } + + @Test + public void testRestartDfsWithAbandonedBlock() throws Exception { + final Configuration conf = new HdfsConfiguration(); + // Turn off persistent IPC, so that the DFSClient can survive NN restart + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, + 0); + conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true); + MiniDFSCluster cluster = null; + + long len = 0; + FSDataOutputStream stream; + try { + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + FileSystem fs = cluster.getFileSystem(); + // Creating a file with 4096 blockSize to write multiple blocks + stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); + stream.write(DATA_BEFORE_RESTART); + stream.hflush(); + + // Wait for all of the blocks to get through + while (len < BLOCK_SIZE * (NUM_BLOCKS - 1)) { + FileStatus status = fs.getFileStatus(FILE_PATH); + len = status.getLen(); + Thread.sleep(100); + } + + // Abandon the last block + DFSClient dfsclient = DFSClientAdapter.getDFSClient((DistributedFileSystem)fs); + LocatedBlocks blocks = dfsclient.getNamenode().getBlockLocations( + FILE_NAME, 0, BLOCK_SIZE * NUM_BLOCKS); + assertEquals(NUM_BLOCKS, blocks.getLocatedBlocks().size()); + LocatedBlock b = blocks.getLastLocatedBlock(); + dfsclient.getNamenode().abandonBlock(b.getBlock(), FILE_NAME, + dfsclient.clientName); + + // explicitly do NOT close the file. + cluster.restartNameNode(); + + // Check that the file has no less bytes than before the restart + // This would mean that blocks were successfully persisted to the log + FileStatus status = fs.getFileStatus(FILE_PATH); + assertTrue("Length incorrect: " + status.getLen(), + status.getLen() != len - BLOCK_SIZE); + + // Verify the data showed up from before restart, sans abandoned block. 
+ FSDataInputStream readStream = fs.open(FILE_PATH); + try { + byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length - BLOCK_SIZE]; + IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length); + byte[] expectedBuf = new byte[DATA_BEFORE_RESTART.length - BLOCK_SIZE]; + System.arraycopy(DATA_BEFORE_RESTART, 0, + expectedBuf, 0, expectedBuf.length); + assertArrayEquals(expectedBuf, verifyBuf); + } finally { + IOUtils.closeStream(readStream); + } + } finally { + if (cluster != null) { cluster.shutdown(); } + } + } + + @Test + public void testRestartWithPartialBlockHflushed() throws IOException { + final Configuration conf = new HdfsConfiguration(); + // Turn off persistent IPC, so that the DFSClient can survive NN restart + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, + 0); + conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true); + MiniDFSCluster cluster = null; + + FSDataOutputStream stream; + try { + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + FileSystem fs = cluster.getFileSystem(); + NameNode.getAddress(conf).getPort(); + // Creating a file with 4096 blockSize to write multiple blocks + stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); + stream.write(DATA_BEFORE_RESTART); + stream.write((byte)1); + stream.hflush(); + + // explicitly do NOT close the file before restarting the NN. + cluster.restartNameNode(); + + // this will fail if the final block of the file is prematurely COMPLETEd + stream.write((byte)2); + stream.hflush(); + stream.close(); + + assertEquals(DATA_BEFORE_RESTART.length + 2, + fs.getFileStatus(FILE_PATH).getLen()); + + FSDataInputStream readStream = fs.open(FILE_PATH); + try { + byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length + 2]; + IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length); + byte[] expectedBuf = new byte[DATA_BEFORE_RESTART.length + 2]; + System.arraycopy(DATA_BEFORE_RESTART, 0, expectedBuf, 0, + DATA_BEFORE_RESTART.length); + System.arraycopy(new byte[]{1, 2}, 0, expectedBuf, + DATA_BEFORE_RESTART.length, 2); + assertArrayEquals(expectedBuf, verifyBuf); + } finally { + IOUtils.closeStream(readStream); + } + } finally { + if (cluster != null) { cluster.shutdown(); } + } + } + + @Test + public void testRestartWithAppend() throws IOException { + final Configuration conf = new HdfsConfiguration(); + // Turn off persistent IPC, so that the DFSClient can survive NN restart + conf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, + 0); + conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true); + MiniDFSCluster cluster = null; + + FSDataOutputStream stream; + try { + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + FileSystem fs = cluster.getFileSystem(); + NameNode.getAddress(conf).getPort(); + // Creating a file with 4096 blockSize to write multiple blocks + stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); + stream.write(DATA_BEFORE_RESTART, 0, DATA_BEFORE_RESTART.length / 2); + stream.close(); + stream = fs.append(FILE_PATH, BLOCK_SIZE); + stream.write(DATA_BEFORE_RESTART, DATA_BEFORE_RESTART.length / 2, + DATA_BEFORE_RESTART.length / 2); + stream.close(); + + assertEquals(DATA_BEFORE_RESTART.length, + fs.getFileStatus(FILE_PATH).getLen()); + + cluster.restartNameNode(); + + assertEquals(DATA_BEFORE_RESTART.length, + fs.getFileStatus(FILE_PATH).getLen()); + + FSDataInputStream readStream = fs.open(FILE_PATH); + try { + byte[] verifyBuf = new 
byte[DATA_BEFORE_RESTART.length]; + IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length); + assertArrayEquals(DATA_BEFORE_RESTART, verifyBuf); + } finally { + IOUtils.closeStream(readStream); + } + } finally { + if (cluster != null) { cluster.shutdown(); } + } + } + + /** + * Earlier versions of HDFS didn't persist block allocation to the edit log. + * This makes sure that we can still load an edit log when the OP_CLOSE + * is the opcode which adds all of the blocks. This is a regression + * test for HDFS-2773. + * This test uses a tarred pseudo-distributed cluster from Hadoop 1.0 + * which has a multi-block file. This is similar to the tests in + * {@link TestDFSUpgradeFromImage} but none of those images include + * a multi-block file. + */ + @Test + public void testEarlierVersionEditLog() throws Exception { + final Configuration conf = new HdfsConfiguration(); + + String tarFile = System.getProperty("test.cache.data", "build/test/cache") + + "/" + HADOOP_1_0_MULTIBLOCK_TGZ; + String testDir = System.getProperty("test.build.data", "build/test/data"); + File dfsDir = new File(testDir, "image-1.0"); + if (dfsDir.exists() && !FileUtil.fullyDelete(dfsDir)) { + throw new IOException("Could not delete dfs directory '" + dfsDir + "'"); + } + FileUtil.unTar(new File(tarFile), new File(testDir)); + + File nameDir = new File(dfsDir, "name"); + GenericTestUtils.assertExists(nameDir); + File dataDir = new File(dfsDir, "data"); + GenericTestUtils.assertExists(dataDir); + + conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, nameDir.getAbsolutePath()); + conf.set(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, dataDir.getAbsolutePath()); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0) + .format(false) + .manageDataDfsDirs(false) + .manageNameDfsDirs(false) + .numDataNodes(1) + .startupOption(StartupOption.UPGRADE) + .build(); + try { + FileSystem fs = cluster.getFileSystem(); + Path testPath = new Path("/user/todd/4blocks"); + // Read it without caring about the actual data within - we just need + // to make sure that the block states and locations are OK. + DFSTestUtil.readFile(fs, testPath); + + // Ensure that we can append to it - if the blocks were in some funny + // state we'd get some kind of issue here. 
+ FSDataOutputStream stm = fs.append(testPath); + try { + stm.write(1); + } finally { + IOUtils.closeStream(stm); + } + } finally { + cluster.shutdown(); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java index a488b0a5cd..e211d20977 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java @@ -75,7 +75,8 @@ public class TestReplication extends TestCase { private void checkFile(FileSystem fileSys, Path name, int repl) throws IOException { Configuration conf = fileSys.getConf(); - ClientProtocol namenode = DFSUtil.createNamenode(conf); + ClientProtocol namenode = NameNodeProxies.createProxy(conf, fileSys.getUri(), + ClientProtocol.class).getProxy(); waitForBlockReplication(name.toString(), namenode, Math.min(numDatanodes, repl), -1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java index 4d18e98d1d..c2aaf0615c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java @@ -20,6 +20,8 @@ package org.apache.hadoop.hdfs.security; +import static org.junit.Assert.*; + import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; @@ -32,12 +34,16 @@ import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.impl.Log4JLogger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; +import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; import org.apache.hadoop.hdfs.web.WebHdfsFileSystem; @@ -64,6 +70,7 @@ public class TestDelegationToken { config.setBoolean(DFSConfigKeys.DFS_WEBHDFS_ENABLED_KEY, true); config.setLong(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY, 10000); config.setLong(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, 5000); + config.setBoolean(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); config.set("hadoop.security.auth_to_local", "RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT"); FileSystem.setDefaultUri(config, "hdfs://localhost:" + "0"); @@ -71,7 +78,6 @@ public class TestDelegationToken { cluster.waitActive(); dtSecretManager = NameNodeAdapter.getDtSecretManager( cluster.getNamesystem()); - dtSecretManager.startThreads(); } @After @@ -269,5 +275,51 @@ public class TestDelegationToken { } }); } - + + /** + * Test that the delegation token secret manager only 
runs when the + * NN is out of safe mode. This is because the secret manager + * has to log to the edit log, which should not be written in + * safe mode. Regression test for HDFS-2579. + */ + @Test + public void testDTManagerInSafeMode() throws Exception { + cluster.startDataNodes(config, 1, true, StartupOption.REGULAR, null); + FileSystem fs = cluster.getFileSystem(); + for (int i = 0; i < 5; i++) { + DFSTestUtil.createFile(fs, new Path("/test-" + i), 100, (short)1, 1L); + } + cluster.getConfiguration(0).setInt( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, 500); + cluster.getConfiguration(0).setInt( + DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); + cluster.setWaitSafeMode(false); + cluster.restartNameNode(); + NameNode nn = cluster.getNameNode(); + assertTrue(nn.isInSafeMode()); + DelegationTokenSecretManager sm = + NameNodeAdapter.getDtSecretManager(nn.getNamesystem()); + assertFalse("Secret manager should not run in safe mode", sm.isRunning()); + + NameNodeAdapter.leaveSafeMode(nn, false); + assertTrue("Secret manager should start when safe mode is exited", + sm.isRunning()); + + LOG.info("========= entering safemode again"); + + NameNodeAdapter.enterSafeMode(nn, false); + assertFalse("Secret manager should stop again when safe mode " + + "is manually entered", sm.isRunning()); + + // Set the cluster to leave safemode quickly on its own. + cluster.getConfiguration(0).setInt( + DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); + cluster.setWaitSafeMode(true); + cluster.restartNameNode(); + nn = cluster.getNameNode(); + sm = NameNodeAdapter.getDtSecretManager(nn.getNamesystem()); + + assertFalse(nn.isInSafeMode()); + assertTrue(sm.isRunning()); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java index cdad31cc9b..6837f65afc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java @@ -48,7 +48,6 @@ import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; -import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; import org.apache.hadoop.hdfs.web.WebHdfsFileSystem; import org.apache.hadoop.hdfs.web.WebHdfsTestUtil; @@ -114,11 +113,12 @@ public class TestDelegationTokenForProxyUser { DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, 5000); config.setStrings(ProxyUsers.getProxySuperuserGroupConfKey(REAL_USER), "group1"); + config.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); configureSuperUserIPAddresses(config, REAL_USER); FileSystem.setDefaultUri(config, "hdfs://localhost:" + "0"); cluster = new MiniDFSCluster.Builder(config).build(); cluster.waitActive(); - NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads(); ProxyUsers.refreshSuperUserGroupsConfiguration(config); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java index 61953c85b0..01725b1bce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java @@ -373,7 +373,7 @@ public class TestBlockToken { Configuration conf = new HdfsConfiguration(); conf.setBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 512); - MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(1) + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) .numDataNodes(1).build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java index eb567469ab..81b03a568e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java @@ -18,9 +18,10 @@ package org.apache.hadoop.hdfs.server.balancer; import java.io.IOException; -import java.net.InetSocketAddress; +import java.net.URI; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Random; import java.util.concurrent.TimeoutException; @@ -37,28 +38,28 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.NameNodeProxies; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; -import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset; -import org.apache.hadoop.hdfs.server.namenode.NameNode; /** * This class tests if a balancer schedules tasks correctly. 
*/ public class TestBalancer extends TestCase { private static final Log LOG = LogFactory.getLog( - "org.apache.hadoop.hdfs.TestReplication"); + "org.apache.hadoop.hdfs.TestBalancer"); - final private static long CAPACITY = 500L; - final private static String RACK0 = "/rack0"; - final private static String RACK1 = "/rack1"; - final private static String RACK2 = "/rack2"; - final static private String fileName = "/tmp.txt"; - final static private Path filePath = new Path(fileName); + final static long CAPACITY = 500L; + final static String RACK0 = "/rack0"; + final static String RACK1 = "/rack1"; + final static String RACK2 = "/rack2"; + final private static String fileName = "/tmp.txt"; + final static Path filePath = new Path(fileName); private MiniDFSCluster cluster; ClientProtocol client; @@ -82,9 +83,10 @@ public class TestBalancer extends TestCase { } /* create a file with a length of fileLen */ - private void createFile(long fileLen, short replicationFactor) + static void createFile(MiniDFSCluster cluster, Path filePath, long fileLen, + short replicationFactor, int nnIndex) throws IOException { - FileSystem fs = cluster.getFileSystem(); + FileSystem fs = cluster.getFileSystem(nnIndex); DFSTestUtil.createFile(fs, filePath, fileLen, replicationFactor, r.nextLong()); DFSTestUtil.waitReplication(fs, filePath, replicationFactor); @@ -99,11 +101,12 @@ public class TestBalancer extends TestCase { cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numNodes).build(); try { cluster.waitActive(); - client = DFSUtil.createNamenode(conf); + client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(), + ClientProtocol.class).getProxy(); short replicationFactor = (short)(numNodes-1); long fileLen = size/replicationFactor; - createFile(fileLen, replicationFactor); + createFile(cluster , filePath, fileLen, replicationFactor, 0); List locatedBlocks = client. getBlockLocations(fileName, 0, fileLen).getLocatedBlocks(); @@ -193,7 +196,8 @@ public class TestBalancer extends TestCase { .simulatedCapacities(capacities) .build(); cluster.waitActive(); - client = DFSUtil.createNamenode(conf); + client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(), + ClientProtocol.class).getProxy(); for(int i = 0; i < blocksDN.length; i++) cluster.injectBlocks(i, Arrays.asList(blocksDN[i])); @@ -211,7 +215,8 @@ public class TestBalancer extends TestCase { * @throws IOException - if getStats() fails * @throws TimeoutException */ - private void waitForHeartBeat(long expectedUsedSpace, long expectedTotalSpace) + static void waitForHeartBeat(long expectedUsedSpace, + long expectedTotalSpace, ClientProtocol client, MiniDFSCluster cluster) throws IOException, TimeoutException { long timeout = TIMEOUT; long failtime = (timeout <= 0L) ? Long.MAX_VALUE @@ -248,7 +253,8 @@ public class TestBalancer extends TestCase { * @throws IOException * @throws TimeoutException */ - private void waitForBalancer(long totalUsedSpace, long totalCapacity) + static void waitForBalancer(long totalUsedSpace, long totalCapacity, + ClientProtocol client, MiniDFSCluster cluster) throws IOException, TimeoutException { long timeout = TIMEOUT; long failtime = (timeout <= 0L) ? 
Long.MAX_VALUE @@ -305,13 +311,15 @@ public class TestBalancer extends TestCase { .build(); try { cluster.waitActive(); - client = DFSUtil.createNamenode(conf); + client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(), + ClientProtocol.class).getProxy(); long totalCapacity = sum(capacities); // fill up the cluster to be 30% full long totalUsedSpace = totalCapacity*3/10; - createFile(totalUsedSpace/numOfDatanodes, (short)numOfDatanodes); + createFile(cluster, filePath, totalUsedSpace / numOfDatanodes, + (short) numOfDatanodes, 0); // start up an empty node with the same capacity and on the same rack cluster.startDataNodes(conf, 1, true, null, new String[]{newRack}, new long[]{newCapacity}); @@ -327,17 +335,16 @@ public class TestBalancer extends TestCase { private void runBalancer(Configuration conf, long totalUsedSpace, long totalCapacity) throws Exception { - waitForHeartBeat(totalUsedSpace, totalCapacity); + waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster); // start rebalancing - final List namenodes =new ArrayList(); - namenodes.add(NameNode.getServiceAddress(conf, true)); + Collection namenodes = DFSUtil.getNsServiceRpcUris(conf); final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, conf); assertEquals(Balancer.ReturnStatus.SUCCESS.code, r); - waitForHeartBeat(totalUsedSpace, totalCapacity); + waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster); LOG.info("Rebalancing with default ctor."); - waitForBalancer(totalUsedSpace, totalCapacity); + waitForBalancer(totalUsedSpace, totalCapacity, client, cluster); } /** one-node cluster test*/ @@ -396,13 +403,15 @@ public class TestBalancer extends TestCase { .build(); try { cluster.waitActive(); - client = DFSUtil.createNamenode(conf); + client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(), + ClientProtocol.class).getProxy(); long totalCapacity = sum(capacities); // fill up the cluster to be 30% full long totalUsedSpace = totalCapacity * 3 / 10; - createFile(totalUsedSpace / numOfDatanodes, (short) numOfDatanodes); + createFile(cluster, filePath, totalUsedSpace / numOfDatanodes, + (short) numOfDatanodes, 0); // start up an empty node with the same capacity and on the same rack cluster.startDataNodes(conf, 1, true, null, new String[] { newRack }, new long[] { newCapacity }); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java new file mode 100644 index 0000000000..9d13a2b619 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.balancer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.net.URI; +import java.util.Collection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.NameNodeProxies; +import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf; +import org.apache.hadoop.hdfs.protocol.ClientProtocol; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; +import org.junit.Test; + +/** + * Test balancer with HA NameNodes + */ +public class TestBalancerWithHANameNodes { + private MiniDFSCluster cluster; + ClientProtocol client; + + static { + Balancer.setBlockMoveWaitTime(1000L); + } + + /** + * Test a cluster with even distribution, then a new empty node is added to + * the cluster. Test start a cluster with specified number of nodes, and fills + * it to be 30% full (with a single file replicated identically to all + * datanodes); It then adds one new empty node and starts balancing. + */ + @Test(timeout = 60000) + public void testBalancerWithHANameNodes() throws Exception { + Configuration conf = new HdfsConfiguration(); + TestBalancer.initConf(conf); + long newNodeCapacity = TestBalancer.CAPACITY; // new node's capacity + String newNodeRack = TestBalancer.RACK2; // new node's rack + // array of racks for original nodes in cluster + String[] racks = new String[] { TestBalancer.RACK0, TestBalancer.RACK1 }; + // array of capacities of original nodes in cluster + long[] capacities = new long[] { TestBalancer.CAPACITY, + TestBalancer.CAPACITY }; + assertEquals(capacities.length, racks.length); + int numOfDatanodes = capacities.length; + NNConf nn1Conf = new MiniDFSNNTopology.NNConf("nn1"); + nn1Conf.setIpcPort(NameNode.DEFAULT_PORT); + Configuration copiedConf = new Configuration(conf); + cluster = new MiniDFSCluster.Builder(copiedConf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(capacities.length) + .racks(racks) + .simulatedCapacities(capacities) + .build(); + HATestUtil.setFailoverConfigurations(cluster, conf); + try { + cluster.waitActive(); + cluster.transitionToActive(1); + Thread.sleep(500); + client = NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf), + ClientProtocol.class).getProxy(); + long totalCapacity = TestBalancer.sum(capacities); + // fill up the cluster to be 30% full + long totalUsedSpace = totalCapacity * 3 / 10; + TestBalancer.createFile(cluster, TestBalancer.filePath, totalUsedSpace + / numOfDatanodes, (short) numOfDatanodes, 1); + + // start up an empty node with the same capacity and on the same rack + cluster.startDataNodes(conf, 1, true, null, new String[] { newNodeRack }, + new long[] { newNodeCapacity }); + totalCapacity += newNodeCapacity; + TestBalancer.waitForHeartBeat(totalUsedSpace, totalCapacity, client, + cluster); + Collection namenodes = DFSUtil.getNsServiceRpcUris(conf); + assertEquals(1, namenodes.size()); + assertTrue(namenodes.contains(HATestUtil.getLogicalUri(cluster))); + final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, conf); + assertEquals(Balancer.ReturnStatus.SUCCESS.code, r); + 
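[Editor's sketch, not part of the patch] A minimal sketch, assuming only calls that already appear in these tests (DFSUtil.getNsServiceRpcUris, Balancer.run with Balancer.Parameters.DEFALUT, Balancer.ReturnStatus), of how one balancing pass is driven once the nameservice URIs are resolved from the client configuration. With HA enabled the collection holds one logical URI per nameservice, so the balancer fails over transparently. The class and method names of the sketch itself are illustrative.

package org.apache.hadoop.hdfs.server.balancer;

import java.net.URI;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;

public class BalancerRunSketch {
  /** Run a single balancing pass against every nameservice configured in conf. */
  static void runOnePass(Configuration conf) throws Exception {
    // One URI per nameservice (a logical URI when HA is configured).
    Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
    int rc = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, conf);
    if (rc != Balancer.ReturnStatus.SUCCESS.code) {
      throw new IllegalStateException("Balancer exited with status " + rc);
    }
  }
}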
TestBalancer.waitForBalancer(totalUsedSpace, totalCapacity, client, + cluster); + } finally { + cluster.shutdown(); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java index 6d06da4968..b130e027b0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java @@ -18,8 +18,9 @@ package org.apache.hadoop.hdfs.server.balancer; import java.io.IOException; -import java.net.InetSocketAddress; +import java.net.URI; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Random; @@ -34,12 +35,13 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; -import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.LeaseManager; import org.apache.hadoop.hdfs.server.namenode.NameNode; @@ -155,7 +157,7 @@ public class TestBalancerWithMultipleNameNodes { LOG.info("BALANCER 1"); // start rebalancing - final List namenodes = DFSUtil.getNNServiceRpcAddresses(s.conf); + final Collection namenodes = DFSUtil.getNsServiceRpcUris(s.conf); final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, s.conf); Assert.assertEquals(Balancer.ReturnStatus.SUCCESS.code, r); @@ -249,8 +251,9 @@ public class TestBalancerWithMultipleNameNodes { final ExtendedBlock[][] blocks; { LOG.info("UNEVEN 1"); - final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) - .numNameNodes(nNameNodes) + final MiniDFSCluster cluster = new MiniDFSCluster + .Builder(new Configuration(conf)) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) .numDataNodes(nDataNodes) .racks(racks) .simulatedCapacities(capacities) @@ -258,6 +261,7 @@ public class TestBalancerWithMultipleNameNodes { LOG.info("UNEVEN 2"); try { cluster.waitActive(); + DFSTestUtil.setFederatedConfiguration(cluster, conf); LOG.info("UNEVEN 3"); final Suite s = new Suite(cluster, nNameNodes, nDataNodes, conf); blocks = generateBlocks(s, usedSpacePerNN); @@ -271,7 +275,7 @@ public class TestBalancerWithMultipleNameNodes { { LOG.info("UNEVEN 10"); final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) - .numNameNodes(nNameNodes) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes)) .numDataNodes(nDataNodes) .racks(racks) .simulatedCapacities(capacities) @@ -324,13 +328,15 @@ public class TestBalancerWithMultipleNameNodes { Assert.assertEquals(nDataNodes, racks.length); LOG.info("RUN_TEST -1"); - final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) - .numNameNodes(nNameNodes) + final MiniDFSCluster cluster = new MiniDFSCluster + .Builder(new 
Configuration(conf)) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes)) .numDataNodes(nDataNodes) .racks(racks) .simulatedCapacities(capacities) .build(); LOG.info("RUN_TEST 0"); + DFSTestUtil.setFederatedConfiguration(cluster, conf); try { cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java index 8c9b4b3820..1ec75112f8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java @@ -24,8 +24,11 @@ import java.util.Iterator; import java.util.Set; import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.util.Daemon; +import org.junit.Assert; import com.google.common.base.Preconditions; @@ -124,6 +127,58 @@ public class BlockManagerTestUtil { return blockManager.computeDatanodeWork(); } + public static int computeInvalidationWork(BlockManager bm) { + return bm.computeInvalidateWork(Integer.MAX_VALUE); + } + + /** + * Compute all the replication and invalidation work for the + * given BlockManager. + * + * This differs from the above functions in that it computes + * replication work for all DNs rather than a particular subset, + * regardless of invalidation/replication limit configurations. + * + * NB: you may want to set + * {@link DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY} to + * a high value to ensure that all work is calculated. + */ + public static int computeAllPendingWork(BlockManager bm) + throws IOException { + int work = computeInvalidationWork(bm); + work += bm.computeReplicationWork(Integer.MAX_VALUE); + return work; + } + + /** + * Ensure that the given NameNode marks the specified DataNode as + * entirely dead/expired. 
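[Editor's sketch, not part of the patch] A hedged usage sketch for the new noticeDeadDatanode() helper, mirroring its call sites in TestNodeCount later in this patch. The wrapper method and its parameters are illustrative; dnName is the name the NameNode tracks the node by (what DatanodeDescriptor.getName() returns in that test).

package org.apache.hadoop.hdfs.server.blockmanagement;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;

class DeadDatanodeSketch {
  /** Stop a DataNode, make the NameNode see it as dead, and wait for re-replication. */
  static void expireAndRereplicate(MiniDFSCluster cluster, FileSystem fs,
      Path path, short replication, String dnName) throws Exception {
    cluster.stopDataNode(dnName);
    // Replaces the old pattern of taking the FSNamesystem write lock and
    // resetting lastUpdate on the DatanodeDescriptor by hand.
    BlockManagerTestUtil.noticeDeadDatanode(cluster.getNameNode(), dnName);
    DFSTestUtil.waitReplication(fs, path, replication);
  }
}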
+ * @param nn the NameNode to manipulate + * @param dnName the name of the DataNode + */ + public static void noticeDeadDatanode(NameNode nn, String dnName) { + FSNamesystem namesystem = nn.getNamesystem(); + namesystem.writeLock(); + try { + DatanodeManager dnm = namesystem.getBlockManager().getDatanodeManager(); + HeartbeatManager hbm = dnm.getHeartbeatManager(); + DatanodeDescriptor[] dnds = hbm.getDatanodes(); + DatanodeDescriptor theDND = null; + for (DatanodeDescriptor dnd : dnds) { + if (dnd.getName().equals(dnName)) { + theDND = dnd; + } + } + Assert.assertNotNull("Could not find DN with name: " + dnName, theDND); + + synchronized (hbm) { + theDND.setLastUpdate(0); + hbm.heartbeatCheck(); + } + } finally { + namesystem.writeUnlock(); + } + } /** * Change whether the block placement policy will prefer the writer's diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java index c18a5c04fe..2d7a122c46 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java @@ -41,7 +41,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; public class TestHeartbeatHandling extends TestCase { /** * Test if - * {@link FSNamesystem#handleHeartbeat(DatanodeRegistration, long, long, long, long, int, int)} + * {@link FSNamesystem#handleHeartbeat} * can pick up replication and/or invalidate requests and observes the max * limit */ @@ -75,7 +75,8 @@ public class TestHeartbeatHandling extends TestCase { dd.addBlockToBeReplicated( new Block(i, 0, GenerationStamp.FIRST_VALID_STAMP), ONE_TARGET); } - DatanodeCommand[]cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem); + DatanodeCommand[] cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, + namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length); @@ -85,27 +86,31 @@ public class TestHeartbeatHandling extends TestCase { blockList.add(new Block(i, 0, GenerationStamp.FIRST_VALID_STAMP)); } dd.addBlocksToBeInvalidated(blockList); - cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem); + cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem) + .getCommands(); assertEquals(2, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction()); assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand)cmds[1]).getBlocks().length); - cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem); + cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem) + .getCommands(); assertEquals(2, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(REMAINING_BLOCKS, ((BlockCommand)cmds[0]).getBlocks().length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction()); assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand)cmds[1]).getBlocks().length); - cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem); + cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem) + .getCommands(); assertEquals(1, 
cmds.length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[0].getAction()); assertEquals(REMAINING_BLOCKS, ((BlockCommand)cmds[0]).getBlocks().length); - cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem); - assertEquals(null, cmds); + cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem) + .getCommands(); + assertEquals(0, cmds.length); } } finally { namesystem.writeUnlock(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java index 986ca13ed1..d47f110344 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java @@ -81,15 +81,8 @@ public class TestNodeCount extends TestCase { DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName()); // make sure that NN detects that the datanode is down - try { - namesystem.writeLock(); - synchronized (hm) { - datanode.setLastUpdate(0); // mark it dead - hm.heartbeatCheck(); - } - } finally { - namesystem.writeUnlock(); - } + BlockManagerTestUtil.noticeDeadDatanode( + cluster.getNameNode(), datanode.getName()); // the block will be replicated DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR); @@ -121,16 +114,8 @@ public class TestNodeCount extends TestCase { // bring down non excessive datanode dnprop = cluster.stopDataNode(nonExcessDN.getName()); // make sure that NN detects that the datanode is down - - try { - namesystem.writeLock(); - synchronized(hm) { - nonExcessDN.setLastUpdate(0); // mark it dead - hm.heartbeatCheck(); - } - } finally { - namesystem.writeUnlock(); - } + BlockManagerTestUtil.noticeDeadDatanode( + cluster.getNameNode(), nonExcessDN.getName()); // The block should be replicated initializeTimeout(TIMEOUT); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java new file mode 100644 index 0000000000..16977bb820 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.blockmanagement; + +import static org.junit.Assert.*; + +import java.util.Queue; + +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; +import org.junit.Test; + +import com.google.common.base.Joiner; + + +public class TestPendingDataNodeMessages { + PendingDataNodeMessages msgs = new PendingDataNodeMessages(); + + private final Block block1Gs1 = new Block(1, 0, 1); + private final Block block1Gs2 = new Block(1, 0, 2); + private final Block block1Gs2DifferentInstance = + new Block(1, 0, 2); + private final Block block2Gs1 = new Block(2, 0, 1); + + private final DatanodeDescriptor fakeDN = new DatanodeDescriptor( + new DatanodeID("fake")); + + @Test + public void testQueues() { + msgs.enqueueReportedBlock(fakeDN, block1Gs1, ReplicaState.FINALIZED); + msgs.enqueueReportedBlock(fakeDN, block1Gs2, ReplicaState.FINALIZED); + + assertEquals(2, msgs.count()); + + // Nothing queued yet for block 2 + assertNull(msgs.takeBlockQueue(block2Gs1)); + assertEquals(2, msgs.count()); + + Queue q = + msgs.takeBlockQueue(block1Gs2DifferentInstance); + assertEquals( + "ReportedBlockInfo [block=blk_1_1, dn=fake, reportedState=FINALIZED]," + + "ReportedBlockInfo [block=blk_1_2, dn=fake, reportedState=FINALIZED]", + Joiner.on(",").join(q)); + assertEquals(0, msgs.count()); + + // Should be null if we pull again + assertNull(msgs.takeBlockQueue(block1Gs1)); + assertEquals(0, msgs.count()); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java index fb015a2f73..6ab878c561 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java @@ -50,6 +50,29 @@ public class DataNodeAdapter { boolean heartbeatsDisabledForTests) { dn.setHeartbeatsDisabledForTests(heartbeatsDisabledForTests); } + + public static void triggerDeletionReport(DataNode dn) throws IOException { + for (BPOfferService bpos : dn.getAllBpOs()) { + bpos.triggerDeletionReportForTests(); + } + } + + public static void triggerHeartbeat(DataNode dn) throws IOException { + for (BPOfferService bpos : dn.getAllBpOs()) { + bpos.triggerHeartbeatForTests(); + } + } + + public static void triggerBlockReport(DataNode dn) throws IOException { + for (BPOfferService bpos : dn.getAllBpOs()) { + bpos.triggerBlockReportForTests(); + } + } + + public static long getPendingAsyncDeletions(DataNode dn) { + FSDataset fsd = (FSDataset)dn.getFSDataset(); + return fsd.asyncDiskService.countPendingDeletions(); + } /** * Insert a Mockito spy object between the given DataNode and @@ -69,10 +92,20 @@ public class DataNodeAdapter { } Preconditions.checkArgument(bpos != null, "No such bpid: %s", bpid); + + BPServiceActor bpsa = null; + for (BPServiceActor thisBpsa : bpos.getBPServiceActors()) { + if (thisBpsa.getNNSocketAddress().equals(nn.getServiceRpcAddress())) { + bpsa = thisBpsa; + break; + } + } + Preconditions.checkArgument(bpsa != null, + "No service actor to NN at %s", nn.getServiceRpcAddress()); - DatanodeProtocolClientSideTranslatorPB origNN = bpos.getBpNamenode(); + 
DatanodeProtocolClientSideTranslatorPB origNN = bpsa.getNameNodeProxy(); DatanodeProtocolClientSideTranslatorPB spy = Mockito.spy(origNN); - bpos.setBpNamenode(spy); + bpsa.setNameNode(spy); return spy; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java new file mode 100644 index 0000000000..41e7c8b3fd --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java @@ -0,0 +1,373 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics; +import org.apache.hadoop.hdfs.server.protocol.BlockCommand; +import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; +import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat.State; +import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; +import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; +import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; +import org.apache.hadoop.hdfs.server.protocol.StorageReport; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.log4j.Level; +import org.junit.Before; +import org.junit.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + +import com.google.common.base.Supplier; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +public class TestBPOfferService { + + private static final String FAKE_BPID = "fake bpid"; + private static final String FAKE_CLUSTERID = "fake 
cluster"; + protected static final Log LOG = LogFactory.getLog( + TestBPOfferService.class); + private static final ExtendedBlock FAKE_BLOCK = + new ExtendedBlock(FAKE_BPID, 12345L); + + static { + ((Log4JLogger)DataNode.LOG).getLogger().setLevel(Level.ALL); + } + + private DatanodeProtocolClientSideTranslatorPB mockNN1; + private DatanodeProtocolClientSideTranslatorPB mockNN2; + private NNHAStatusHeartbeat[] mockHaStatuses = new NNHAStatusHeartbeat[2]; + private int heartbeatCounts[] = new int[2]; + private DataNode mockDn; + private FSDatasetInterface mockFSDataset; + + @Before + public void setupMocks() throws Exception { + mockNN1 = setupNNMock(0); + mockNN2 = setupNNMock(1); + + // Set up a mock DN with the bare-bones configuration + // objects, etc. + mockDn = Mockito.mock(DataNode.class); + Mockito.doReturn(true).when(mockDn).shouldRun(); + Configuration conf = new Configuration(); + Mockito.doReturn(conf).when(mockDn).getConf(); + Mockito.doReturn(new DNConf(conf)).when(mockDn).getDnConf(); + Mockito.doReturn(DataNodeMetrics.create(conf, "fake dn")) + .when(mockDn).getMetrics(); + + // Set up a simulated dataset with our fake BP + mockFSDataset = Mockito.spy(new SimulatedFSDataset(null, null, conf)); + mockFSDataset.addBlockPool(FAKE_BPID, conf); + + // Wire the dataset to the DN. + Mockito.doReturn(mockFSDataset).when(mockDn).getFSDataset(); + } + + /** + * Set up a mock NN with the bare minimum for a DN to register to it. + */ + private DatanodeProtocolClientSideTranslatorPB setupNNMock(int nnIdx) + throws Exception { + DatanodeProtocolClientSideTranslatorPB mock = + Mockito.mock(DatanodeProtocolClientSideTranslatorPB.class); + Mockito.doReturn( + new NamespaceInfo(1, FAKE_CLUSTERID, FAKE_BPID, + 0, HdfsConstants.LAYOUT_VERSION)) + .when(mock).versionRequest(); + + Mockito.doReturn(new DatanodeRegistration("fake-node")) + .when(mock).registerDatanode(Mockito.any(DatanodeRegistration.class), + Mockito.any(DatanodeStorage[].class)); + + Mockito.doAnswer(new HeartbeatAnswer(nnIdx)) + .when(mock).sendHeartbeat( + Mockito.any(DatanodeRegistration.class), + Mockito.any(StorageReport[].class), + Mockito.anyInt(), + Mockito.anyInt(), + Mockito.anyInt()); + mockHaStatuses[nnIdx] = new NNHAStatusHeartbeat(State.STANDBY, 0); + return mock; + } + + /** + * Mock answer for heartbeats which returns an empty set of commands + * and the HA status for the chosen NN from the + * {@link TestBPOfferService#mockHaStatuses} array. + */ + private class HeartbeatAnswer implements Answer { + private final int nnIdx; + + public HeartbeatAnswer(int nnIdx) { + this.nnIdx = nnIdx; + } + + @Override + public HeartbeatResponse answer(InvocationOnMock invocation) throws Throwable { + heartbeatCounts[nnIdx]++; + return new HeartbeatResponse(new DatanodeCommand[0], + mockHaStatuses[nnIdx]); + } + } + + + /** + * Test that the BPOS can register to talk to two different NNs, + * sends block reports to both, etc. + */ + @Test + public void testBasicFunctionality() throws Exception { + BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2); + bpos.start(); + try { + waitForInitialization(bpos); + + // The DN should have register to both NNs. 
+ Mockito.verify(mockNN1).registerDatanode(Mockito.any(DatanodeRegistration.class), + Mockito.any(DatanodeStorage[].class)); + Mockito.verify(mockNN2).registerDatanode(Mockito.any(DatanodeRegistration.class), + Mockito.any(DatanodeStorage[].class)); + + // Should get block reports from both NNs + waitForBlockReport(mockNN1); + waitForBlockReport(mockNN2); + + // When we receive a block, it should report it to both NNs + bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, ""); + + ReceivedDeletedBlockInfo[] ret = waitForBlockReceived(FAKE_BLOCK, mockNN1); + assertEquals(1, ret.length); + assertEquals(FAKE_BLOCK.getLocalBlock(), ret[0].getBlock()); + + ret = waitForBlockReceived(FAKE_BLOCK, mockNN2); + assertEquals(1, ret.length); + assertEquals(FAKE_BLOCK.getLocalBlock(), ret[0].getBlock()); + + } finally { + bpos.stop(); + } + } + + /** + * Test that DNA_INVALIDATE commands from the standby are ignored. + */ + @Test + public void testIgnoreDeletionsFromNonActive() throws Exception { + BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2); + + // Ask to invalidate FAKE_BLOCK when block report hits the + // standby + Mockito.doReturn(new BlockCommand(DatanodeProtocol.DNA_INVALIDATE, + FAKE_BPID, new Block[] { FAKE_BLOCK.getLocalBlock() })) + .when(mockNN2).blockReport( + Mockito.anyObject(), + Mockito.eq(FAKE_BPID), + Mockito.anyObject()); + + bpos.start(); + try { + waitForInitialization(bpos); + + // Should get block reports from both NNs + waitForBlockReport(mockNN1); + waitForBlockReport(mockNN2); + + } finally { + bpos.stop(); + } + + // Should ignore the delete command from the standby + Mockito.verify(mockFSDataset, Mockito.never()) + .invalidate(Mockito.eq(FAKE_BPID), + (Block[]) Mockito.anyObject()); + } + + /** + * Ensure that, if the two NNs configured for a block pool + * have different block pool IDs, they will refuse to both + * register. + */ + @Test + public void testNNsFromDifferentClusters() throws Exception { + Mockito.doReturn( + new NamespaceInfo(1, "fake foreign cluster", FAKE_BPID, + 0, HdfsConstants.LAYOUT_VERSION)) + .when(mockNN1).versionRequest(); + + BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2); + bpos.start(); + try { + waitForOneToFail(bpos); + } finally { + bpos.stop(); + } + } + + /** + * Test that the DataNode determines the active NameNode correctly + * based on the HA-related information in heartbeat responses. + * See HDFS-2627. + */ + @Test + public void testPickActiveNameNode() throws Exception { + BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2); + bpos.start(); + try { + waitForInitialization(bpos); + + // Should start with neither NN as active. + assertNull(bpos.getActiveNN()); + + // Have NN1 claim active at txid 1 + mockHaStatuses[0] = new NNHAStatusHeartbeat(State.ACTIVE, 1); + bpos.triggerHeartbeatForTests(); + assertSame(mockNN1, bpos.getActiveNN()); + + // NN2 claims active at a higher txid + mockHaStatuses[1] = new NNHAStatusHeartbeat(State.ACTIVE, 2); + bpos.triggerHeartbeatForTests(); + assertSame(mockNN2, bpos.getActiveNN()); + + // Even after another heartbeat from the first NN, it should + // think NN2 is active, since it claimed a higher txid + bpos.triggerHeartbeatForTests(); + assertSame(mockNN2, bpos.getActiveNN()); + + // Even if NN2 goes to standby, DN shouldn't reset to talking to NN1, + // because NN1's txid is lower than the last active txid. Instead, + // it should consider neither active. 
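[Editor's sketch, not part of the patch] Restating the rule that testPickActiveNameNode exercises as a tiny stand-alone sketch. This is not the BPOfferService implementation, only the behaviour the assertions check: the DN trusts whichever NN most recently claimed ACTIVE at the highest txid, and trusts nobody once that NN drops back to STANDBY.

class ActiveNnRuleSketch {
  private int activeIdx = -1;      // -1 means no active NN is currently trusted
  private long lastActiveTxId = 0; // highest txid seen with an ACTIVE claim

  /** Feed in each heartbeat's claimed HA state for NN number idx. */
  void onHeartbeat(int idx, boolean claimsActive, long txid) {
    if (claimsActive && txid >= lastActiveTxId) {
      activeIdx = idx;             // a newer ACTIVE claim wins (ties simplified here)
      lastActiveTxId = txid;
    } else if (!claimsActive && idx == activeIdx) {
      activeIdx = -1;              // the NN we trusted stepped down: nobody is active
    }
  }

  int getActiveIdx() { return activeIdx; }
}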
+ mockHaStatuses[1] = new NNHAStatusHeartbeat(State.STANDBY, 2); + bpos.triggerHeartbeatForTests(); + assertNull(bpos.getActiveNN()); + + // Now if NN1 goes back to a higher txid, it should be considered active + mockHaStatuses[0] = new NNHAStatusHeartbeat(State.ACTIVE, 3); + bpos.triggerHeartbeatForTests(); + assertSame(mockNN1, bpos.getActiveNN()); + + } finally { + bpos.stop(); + } + } + + private void waitForOneToFail(final BPOfferService bpos) + throws Exception { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + return bpos.countNameNodes() == 1; + } + }, 100, 10000); + } + + /** + * Create a BPOfferService which registers with and heartbeats with the + * specified namenode proxy objects. + * @throws IOException + */ + private BPOfferService setupBPOSForNNs( + DatanodeProtocolClientSideTranslatorPB ... nns) throws IOException { + // Set up some fake InetAddresses, then override the connectToNN + // function to return the corresponding proxies. + + final Map nnMap = Maps.newLinkedHashMap(); + for (int port = 0; port < nns.length; port++) { + nnMap.put(new InetSocketAddress(port), nns[port]); + Mockito.doReturn(nns[port]).when(mockDn).connectToNN( + Mockito.eq(new InetSocketAddress(port))); + } + + return new BPOfferService(Lists.newArrayList(nnMap.keySet()), mockDn); + } + + private void waitForInitialization(final BPOfferService bpos) + throws Exception { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + return bpos.isAlive() && bpos.isInitialized(); + } + }, 100, 10000); + } + + private void waitForBlockReport(final DatanodeProtocolClientSideTranslatorPB mockNN) + throws Exception { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + try { + Mockito.verify(mockNN).blockReport( + Mockito.anyObject(), + Mockito.eq(FAKE_BPID), + Mockito.anyObject()); + return true; + } catch (Throwable t) { + LOG.info("waiting on block report: " + t.getMessage()); + return false; + } + } + }, 500, 10000); + } + + private ReceivedDeletedBlockInfo[] waitForBlockReceived( + ExtendedBlock fakeBlock, + DatanodeProtocolClientSideTranslatorPB mockNN) throws Exception { + final ArgumentCaptor captor = + ArgumentCaptor.forClass(StorageReceivedDeletedBlocks[].class); + GenericTestUtils.waitFor(new Supplier() { + + @Override + public Boolean get() { + try { + Mockito.verify(mockNN1).blockReceivedAndDeleted( + Mockito.anyObject(), + Mockito.eq(FAKE_BPID), + captor.capture()); + return true; + } catch (Throwable t) { + return false; + } + } + }, 100, 10000); + return captor.getValue()[0].getBlocks(); + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java new file mode 100644 index 0000000000..c0301ac814 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + + +public class TestBlockPoolManager { + private Log LOG = LogFactory.getLog(TestBlockPoolManager.class); + private DataNode mockDN = Mockito.mock(DataNode.class); + private BlockPoolManager bpm; + private StringBuilder log = new StringBuilder(); + private int mockIdx = 1; + + @Before + public void setupBPM() { + bpm = new BlockPoolManager(mockDN){ + + @Override + protected BPOfferService createBPOS(List nnAddrs) { + final int idx = mockIdx++; + doLog("create #" + idx); + final BPOfferService bpos = Mockito.mock(BPOfferService.class); + Mockito.doReturn("Mock BPOS #" + idx).when(bpos).toString(); + // Log refreshes + try { + Mockito.doAnswer( + new Answer() { + @Override + public Void answer(InvocationOnMock invocation) throws Throwable { + doLog("refresh #" + idx); + return null; + } + }).when(bpos).refreshNNList( + Mockito.>any()); + } catch (IOException e) { + throw new RuntimeException(e); + } + // Log stops + Mockito.doAnswer( + new Answer() { + @Override + public Void answer(InvocationOnMock invocation) throws Throwable { + doLog("stop #" + idx); + bpm.remove(bpos); + return null; + } + }).when(bpos).stop(); + return bpos; + } + }; + } + + private void doLog(String string) { + synchronized(log) { + LOG.info(string); + log.append(string).append("\n"); + } + } + + @Test + public void testSimpleSingleNS() throws Exception { + Configuration conf = new Configuration(); + conf.set(DFSConfigKeys.FS_DEFAULT_NAME_KEY, + "hdfs://mock1:8020"); + bpm.refreshNamenodes(conf); + assertEquals("create #1\n", log.toString()); + } + + @Test + public void testFederationRefresh() throws Exception { + Configuration conf = new Configuration(); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, + "ns1,ns2"); + addNN(conf, "ns1", "mock1:8020"); + addNN(conf, "ns2", "mock1:8020"); + bpm.refreshNamenodes(conf); + assertEquals( + "create #1\n" + + "create #2\n", log.toString()); + log.setLength(0); + + // Remove the first NS + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, + "ns1"); + bpm.refreshNamenodes(conf); + assertEquals( + "stop #1\n" + + "refresh #2\n", log.toString()); + log.setLength(0); + + // Add back an NS -- this creates a new BPOS since the old + // one for ns2 should have been previously retired + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, + "ns1,ns2"); + bpm.refreshNamenodes(conf); + assertEquals( + "create #3\n" + + "refresh #2\n", log.toString()); + } + + private static void addNN(Configuration conf, String ns, String addr) { + String key = DFSUtil.addKeySuffixes( + 
DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, ns); + conf.set(key, addr); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java index cb4244132b..59a61cf2ea 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java @@ -42,10 +42,13 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock; +import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat.State; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Daemon; @@ -137,7 +140,9 @@ public class TestBlockRecovery { Mockito.anyInt(), Mockito.anyInt(), Mockito.anyInt())) - .thenReturn(new DatanodeCommand[0]); + .thenReturn(new HeartbeatResponse( + new DatanodeCommand[0], + new NNHAStatusHeartbeat(State.ACTIVE, 1))); dn = new DataNode(conf, dirs, null) { @Override @@ -147,14 +152,8 @@ public class TestBlockRecovery { return namenode; } }; - dn.runDatanodeDaemon(); - while (!dn.isDatanodeFullyStarted()) { - try { - Thread.sleep(50); - } catch (InterruptedException e) { - fail("Interrupted starting DN"); - } - } + // Trigger a heartbeat so that it acknowledges the NN as active. 
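[Editor's sketch, not part of the patch] The trigger hooks introduced in DataNodeAdapter above give tests a way to force DataNode bookkeeping instead of sleeping until timers fire. A short sketch using only calls present in this patch (getAllBpOs, triggerHeartbeatForTests, triggerBlockReportForTests); the wrapper itself is illustrative.

package org.apache.hadoop.hdfs.server.datanode;

import java.io.IOException;

class TriggerSketch {
  /** Make every block pool on this DataNode heartbeat and block-report right now. */
  static void pushReports(DataNode dn) throws IOException {
    for (BPOfferService bpos : dn.getAllBpOs()) {
      bpos.triggerHeartbeatForTests();    // send a heartbeat immediately
      bpos.triggerBlockReportForTests();  // then send a full block report
    }
  }
}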
+ dn.getAllBpOs()[0].triggerHeartbeatForTests(); } /** @@ -462,7 +461,7 @@ public class TestBlockRecovery { initReplicaRecovery(any(RecoveringBlock.class)); Daemon d = spyDN.recoverBlocks(initRecoveringBlocks()); d.join(); - DatanodeProtocol dnP = dn.getBPNamenode(POOL_ID); + DatanodeProtocol dnP = dn.getActiveNamenodeForBP(POOL_ID); verify(dnP).commitBlockSynchronization( block, RECOVERY_ID, 0, true, true, DatanodeID.EMPTY_ARRAY); } @@ -519,7 +518,7 @@ public class TestBlockRecovery { } catch (IOException e) { e.getMessage().startsWith("Cannot recover "); } - DatanodeProtocol namenode = dn.getBPNamenode(POOL_ID); + DatanodeProtocol namenode = dn.getActiveNamenodeForBP(POOL_ID); verify(namenode, never()).commitBlockSynchronization( any(ExtendedBlock.class), anyLong(), anyLong(), anyBoolean(), anyBoolean(), any(DatanodeID[].class)); @@ -548,7 +547,7 @@ public class TestBlockRecovery { } catch (IOException e) { e.getMessage().startsWith("Cannot recover "); } - DatanodeProtocol namenode = dn.getBPNamenode(POOL_ID); + DatanodeProtocol namenode = dn.getActiveNamenodeForBP(POOL_ID); verify(namenode, never()).commitBlockSynchronization( any(ExtendedBlock.class), anyLong(), anyLong(), anyBoolean(), anyBoolean(), any(DatanodeID[].class)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java index b7a10177c1..0faa5b1d05 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java @@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -36,7 +37,6 @@ import org.junit.Test; * Tests if DataNode process exits if all Block Pool services exit. 
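[Editor's sketch, not part of the patch] The MiniDFSNNTopology builder used throughout these test changes replaces the old numNameNodes()/nameNodePort() knobs. A minimal sketch of bringing up a federated mini-cluster with it, assuming nothing beyond the calls visible in this patch; the helper name is illustrative.

package org.apache.hadoop.hdfs;

import org.apache.hadoop.conf.Configuration;

class FederatedMiniClusterSketch {
  /** Start a MiniDFSCluster with nNameservices federated namenodes and one DataNode. */
  static MiniDFSCluster start(Configuration conf, int nNameservices) throws Exception {
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameservices))
        .numDataNodes(1)
        .build();
    for (int i = 0; i < nNameservices; i++) {
      cluster.waitActive(i);  // wait for each namespace to become active
    }
    return cluster;
  }
}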
*/ public class TestDataNodeExit { - private static int BASEPORT = 9923; private static long WAIT_TIME_IN_MILLIS = 10; Configuration conf; MiniDFSCluster cluster = null; @@ -46,8 +46,9 @@ public class TestDataNodeExit { conf = new HdfsConfiguration(); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100); conf.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 100); - cluster = new MiniDFSCluster.Builder(conf).numNameNodes(3) - .nameNodePort(BASEPORT).build(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(3)) + .build(); for (int i = 0; i < 3; i++) { cluster.waitActive(i); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java index 4a84ce87ab..20a16c3166 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java @@ -23,6 +23,8 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotSame; import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; @@ -30,6 +32,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil; import org.apache.hadoop.hdfs.server.namenode.NameNode; @@ -55,8 +58,9 @@ public class TestDataNodeMultipleRegistrations { */ @Test public void test2NNRegistration() throws IOException { - MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2) - .nameNodePort(9928).build(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .build(); try { cluster.waitActive(); NameNode nn1 = cluster.getNameNode(0); @@ -90,23 +94,22 @@ public class TestDataNodeMultipleRegistrations { assertEquals("number of volumes is wrong", 2, volInfos.size()); for (BPOfferService bpos : dn.getAllBpOs()) { - LOG.info("reg: bpid=" + "; name=" + bpos.bpRegistration.name + "; sid=" - + bpos.bpRegistration.storageID + "; nna=" + bpos.nnAddr); + LOG.info("BP: " + bpos); } BPOfferService bpos1 = dn.getAllBpOs()[0]; BPOfferService bpos2 = dn.getAllBpOs()[1]; // The order of bpos is not guaranteed, so fix the order - if (bpos1.nnAddr.equals(nn2.getNameNodeAddress())) { + if (getNNSocketAddress(bpos1).equals(nn2.getNameNodeAddress())) { BPOfferService tmp = bpos1; bpos1 = bpos2; bpos2 = tmp; } - assertEquals("wrong nn address", bpos1.nnAddr, + assertEquals("wrong nn address", getNNSocketAddress(bpos1), nn1.getNameNodeAddress()); - assertEquals("wrong nn address", bpos2.nnAddr, + assertEquals("wrong nn address", getNNSocketAddress(bpos2), nn2.getNameNodeAddress()); assertEquals("wrong bpid", bpos1.getBlockPoolId(), bpid1); assertEquals("wrong bpid", bpos2.getBlockPoolId(), bpid2); @@ -120,6 +123,12 @@ public class TestDataNodeMultipleRegistrations { cluster.shutdown(); } } + + private static InetSocketAddress 
getNNSocketAddress(BPOfferService bpos) { + List actors = bpos.getBPServiceActors(); + assertEquals(1, actors.size()); + return actors.get(0).getNNSocketAddress(); + } /** * starts single nn and single dn and verifies registration and handshake @@ -153,15 +162,16 @@ public class TestDataNodeMultipleRegistrations { for (BPOfferService bpos : dn.getAllBpOs()) { LOG.info("reg: bpid=" + "; name=" + bpos.bpRegistration.name + "; sid=" - + bpos.bpRegistration.storageID + "; nna=" + bpos.nnAddr); + + bpos.bpRegistration.storageID + "; nna=" + + getNNSocketAddress(bpos)); } // try block report BPOfferService bpos1 = dn.getAllBpOs()[0]; - bpos1.lastBlockReport = 0; - bpos1.blockReport(); + bpos1.triggerBlockReportForTests(); - assertEquals("wrong nn address", bpos1.nnAddr, + assertEquals("wrong nn address", + getNNSocketAddress(bpos1), nn1.getNameNodeAddress()); assertEquals("wrong bpid", bpos1.getBlockPoolId(), bpid1); assertEquals("wrong cid", dn.getClusterId(), cid1); @@ -179,8 +189,9 @@ public class TestDataNodeMultipleRegistrations { @Test public void testClusterIdMismatch() throws IOException { - MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2). - nameNodePort(9928).build(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .build(); try { cluster.waitActive(); @@ -215,25 +226,27 @@ public class TestDataNodeMultipleRegistrations { Configuration conf = new HdfsConfiguration(); // start Federated cluster and add a node. - MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2). - nameNodePort(9928).build(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .build(); Assert.assertNotNull(cluster); Assert.assertEquals("(1)Should be 2 namenodes", 2, cluster.getNumNameNodes()); // add a node - cluster.addNameNode(conf, 9929); + cluster.addNameNode(conf, 0); Assert.assertEquals("(1)Should be 3 namenodes", 3, cluster.getNumNameNodes()); cluster.shutdown(); // 2. start with Federation flag set conf = new HdfsConfiguration(); - cluster = new MiniDFSCluster.Builder(conf).federation(true). 
- nameNodePort(9928).build(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(1)) + .build(); Assert.assertNotNull(cluster); Assert.assertEquals("(2)Should be 1 namenodes", 1, cluster.getNumNameNodes()); // add a node - cluster.addNameNode(conf, 9929); + cluster.addNameNode(conf, 0); Assert.assertEquals("(2)Should be 2 namenodes", 2, cluster.getNumNameNodes()); cluster.shutdown(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java index ca9b3dcfb3..dbbaedd6f8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java @@ -42,7 +42,10 @@ public class TestDatanodeRegister { DataNode mockDN = mock(DataNode.class); Mockito.doReturn(true).when(mockDN).shouldRun(); - BPOfferService bpos = new BPOfferService(INVALID_ADDR, mockDN); + BPOfferService mockBPOS = Mockito.mock(BPOfferService.class); + Mockito.doReturn(mockDN).when(mockBPOS).getDataNode(); + + BPServiceActor actor = new BPServiceActor(INVALID_ADDR, mockBPOS); NamespaceInfo fakeNSInfo = mock(NamespaceInfo.class); when(fakeNSInfo.getBuildVersion()).thenReturn("NSBuildVersion"); @@ -50,10 +53,9 @@ public class TestDatanodeRegister { mock(DatanodeProtocolClientSideTranslatorPB.class); when(fakeDNProt.versionRequest()).thenReturn(fakeNSInfo); - bpos.setNameNode( fakeDNProt ); - bpos.bpNSInfo = fakeNSInfo; + actor.setNameNode( fakeDNProt ); try { - bpos.retrieveNamespaceInfo(); + actor.retrieveNamespaceInfo(); fail("register() did not throw exception! 
" + "Expected: IncorrectVersionException"); } catch (IncorrectVersionException ie) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java index 0b0ca7bd74..2ff075c8ad 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.tools.DFSAdmin; import org.junit.Test; @@ -47,8 +48,9 @@ public class TestDeleteBlockPool { try { conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "namesServerId1,namesServerId2"); - cluster = new MiniDFSCluster.Builder(conf).federation(true).numNameNodes( - 2).numDataNodes(2).build(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .numDataNodes(2).build(); cluster.waitActive(); @@ -155,8 +157,9 @@ public class TestDeleteBlockPool { try { conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "namesServerId1,namesServerId2"); - cluster = new MiniDFSCluster.Builder(conf).federation(true).numNameNodes( - 2).numDataNodes(1).build(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .numDataNodes(1).build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java index 6862628263..a21cab5756 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.junit.Test; @@ -41,12 +42,13 @@ public class TestMulitipleNNDataBlockScanner { String bpids[] = new String[3]; FileSystem fs[] = new FileSystem[3]; - public void setUp(int port) throws IOException { + public void setUp() throws IOException { conf = new HdfsConfiguration(); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100); conf.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 100); - cluster = new MiniDFSCluster.Builder(conf).numNameNodes(3).nameNodePort( - port).build(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(3)) + .build(); for (int i = 0; i < 3; i++) { cluster.waitActive(i); } @@ -65,7 +67,7 @@ public class TestMulitipleNNDataBlockScanner { @Test public void testDataBlockScanner() throws IOException, InterruptedException { - setUp(9923); + setUp(); try { DataNode dn = cluster.getDataNodes().get(0); for (int i = 0; i < 3; i++) { @@ -89,9 +91,10 @@ public class TestMulitipleNNDataBlockScanner { @Test public void testBlockScannerAfterRefresh() throws IOException, 
InterruptedException { - setUp(9933); + setUp(); try { - Configuration conf = new HdfsConfiguration(cluster.getConfiguration(0)); + Configuration dnConf = cluster.getDataNodes().get(0).getConf(); + Configuration conf = new HdfsConfiguration(dnConf); StringBuilder namenodesBuilder = new StringBuilder(); String bpidToShutdown = cluster.getNamesystem(2).getBlockPoolId(); @@ -140,7 +143,7 @@ public class TestMulitipleNNDataBlockScanner { @Test public void testBlockScannerAfterRestart() throws IOException, InterruptedException { - setUp(9943); + setUp(); try { cluster.restartDataNode(0); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java index 150f117840..2d6f210379 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java @@ -22,12 +22,18 @@ import static org.junit.Assert.*; import java.io.IOException; import java.net.InetSocketAddress; +import java.util.Set; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf; +import org.apache.hadoop.hdfs.MiniDFSNNTopology.NSConf; import org.junit.Test; +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; + /** * Tests datanode refresh namenode list functionality. */ @@ -43,9 +49,13 @@ public class TestRefreshNamenodes { Configuration conf = new Configuration(); MiniDFSCluster cluster = null; try { - conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "namesServerId1"); - cluster = new MiniDFSCluster.Builder(conf).federation(true). 
- numNameNodes(1).nameNodePort(nnPort1).build(); + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new NSConf("ns1").addNN( + new NNConf(null).setIpcPort(nnPort1))) + .setFederation(true); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(topology) + .build(); DataNode dn = cluster.getDataNodes().get(0); assertEquals(1, dn.getAllBpOs().length); @@ -58,21 +68,24 @@ public class TestRefreshNamenodes { cluster.addNameNode(conf, nnPort4); - BPOfferService[] bpoList = dn.getAllBpOs(); // Ensure a BPOfferService in the datanodes corresponds to // a namenode in the cluster + Set nnAddrsFromCluster = Sets.newHashSet(); for (int i = 0; i < 4; i++) { - InetSocketAddress addr = cluster.getNameNode(i).getNameNodeAddress(); - boolean found = false; - for (int j = 0; j < bpoList.length; j++) { - if (bpoList[j] != null && addr.equals(bpoList[j].nnAddr)) { - found = true; - bpoList[j] = null; // Erase the address that matched - break; - } - } - assertTrue("NameNode address " + addr + " is not found.", found); + assertTrue(nnAddrsFromCluster.add( + cluster.getNameNode(i).getNameNodeAddress())); } + + Set nnAddrsFromDN = Sets.newHashSet(); + for (BPOfferService bpos : dn.getAllBpOs()) { + for (BPServiceActor bpsa : bpos.getBPServiceActors()) { + assertTrue(nnAddrsFromDN.add(bpsa.getNNSocketAddress())); + } + } + + assertEquals("", + Joiner.on(",").join( + Sets.symmetricDifference(nnAddrsFromCluster, nnAddrsFromDN))); } finally { if (cluster != null) { cluster.shutdown(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java index 4d09815283..7962d4a9e2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java @@ -195,7 +195,7 @@ public class CreateEditsLog { FileNameGenerator nameGenerator = new FileNameGenerator(BASE_PATH, 100); FSEditLog editLog = FSImageTestUtil.createStandaloneEditLog(editsLogDir); - editLog.open(); + editLog.openForWrite(); addFiles(editLog, numFiles, replication, numBlocksPerFile, startingBlockId, nameGenerator); editLog.logSync(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java index b6c69c3193..6e9aa8c8a5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java @@ -34,8 +34,11 @@ import java.util.Map.Entry; import java.util.Properties; import java.util.Set; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.permission.PermissionStatus; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; @@ -187,12 +190,35 @@ public abstract class FSImageTestUtil { Mockito.doReturn(sd).when(storage) .getStorageDirectory(Matchers.anyObject()); - return new FSEditLog(new Configuration(), + FSEditLog 
editLog = new FSEditLog(new Configuration(), storage, ImmutableList.of(logDir.toURI())); + editLog.initJournalsForWrite(); + return editLog; } + /** + * Create an aborted in-progress log in the given directory, containing + * only a specified number of "mkdirs" operations. + */ + public static void createAbortedLogWithMkdirs(File editsLogDir, int numDirs, + long firstTxId) throws IOException { + FSEditLog editLog = FSImageTestUtil.createStandaloneEditLog(editsLogDir); + editLog.setNextTxId(firstTxId); + editLog.openForWrite(); + + PermissionStatus perms = PermissionStatus.createImmutable("fakeuser", "fakegroup", + FsPermission.createImmutable((short)0755)); + for (int i = 1; i <= numDirs; i++) { + String dirName = "dir" + i; + INodeDirectory dir = new INodeDirectory(dirName, perms); + editLog.logMkDir("/" + dirName, dir); + } + editLog.logSync(); + editLog.abortCurrentLogSegment(); + } + /** * @param editLog a path of an edit log file * @return the count of each type of operation in the log file @@ -410,13 +436,20 @@ public abstract class FSImageTestUtil { * Assert that the NameNode has checkpoints at the expected * transaction IDs. */ - static void assertNNHasCheckpoints(MiniDFSCluster cluster, + public static void assertNNHasCheckpoints(MiniDFSCluster cluster, List txids) { + assertNNHasCheckpoints(cluster, 0, txids); + } + + public static void assertNNHasCheckpoints(MiniDFSCluster cluster, + int nnIdx, List txids) { - for (File nameDir : getNameNodeCurrentDirs(cluster)) { + for (File nameDir : getNameNodeCurrentDirs(cluster, nnIdx)) { LOG.info("examining name dir with files: " + Joiner.on(",").join(nameDir.listFiles())); // Should have fsimage_N for the three checkpoints + LOG.info("Examining storage dir " + nameDir + " with contents: " + + StringUtils.join(nameDir.listFiles(), ", ")); for (long checkpointTxId : txids) { File image = new File(nameDir, NNStorage.getImageFileName(checkpointTxId)); @@ -425,9 +458,9 @@ public abstract class FSImageTestUtil { } } - public static List getNameNodeCurrentDirs(MiniDFSCluster cluster) { + public static List getNameNodeCurrentDirs(MiniDFSCluster cluster, int nnIdx) { List nameDirs = Lists.newArrayList(); - for (URI u : cluster.getNameDirs(0)) { + for (URI u : cluster.getNameDirs(nnIdx)) { nameDirs.add(new File(u.getPath(), "current")); } return nameDirs; @@ -441,7 +474,7 @@ public abstract class FSImageTestUtil { throws IOException { File currentDir = sd.getCurrentDir(); List foundEditLogs - = Lists.newArrayList(FileJournalManager.matchEditLogs(currentDir.listFiles())); + = Lists.newArrayList(FileJournalManager.matchEditLogs(currentDir)); return Collections.max(foundEditLogs, EditLogFile.COMPARE_BY_START_TXID); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java index d128167e5b..7f18811e40 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java @@ -80,7 +80,7 @@ import org.apache.log4j.LogManager; *
 * <li>-logLevel L specifies the logging level when the benchmark runs.
 * The default logging level is {@link Level#ERROR}.</li>
 * <li>-UGCacheRefreshCount G will cause the benchmark to call
- * {@link NameNode#refreshUserToGroupsMappings()} after
+ * {@link NameNodeRpcServer#refreshUserToGroupsMappings} after
 * every G operations, which purges the name-node's user group cache.
 * By default the refresh is never called.</li>
 * <li>-keepResults do not clean up the name-space after execution.</li>
    6. @@ -813,7 +813,7 @@ public class NNThroughputBenchmark { StorageReport[] rep = { new StorageReport(dnRegistration.getStorageID(), false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) }; DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration, - rep, 0, 0, 0); + rep, 0, 0, 0).getCommands(); if(cmds != null) { for (DatanodeCommand cmd : cmds ) { if(LOG.isDebugEnabled()) { @@ -859,7 +859,7 @@ public class NNThroughputBenchmark { StorageReport[] rep = { new StorageReport(dnRegistration.getStorageID(), false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) }; DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration, - rep, 0, 0, 0); + rep, 0, 0, 0).getCommands(); if (cmds != null) { for (DatanodeCommand cmd : cmds) { if (cmd.getAction() == DatanodeProtocol.DNA_TRANSFER) { @@ -889,8 +889,10 @@ public class NNThroughputBenchmark { receivedDNReg.setStorageInfo( new DataStorage(nsInfo, dnInfo.getStorageID())); receivedDNReg.setInfoPort(dnInfo.getInfoPort()); - ReceivedDeletedBlockInfo[] rdBlocks = { new ReceivedDeletedBlockInfo( - blocks[i], DataNode.EMPTY_DEL_HINT) }; + ReceivedDeletedBlockInfo[] rdBlocks = { + new ReceivedDeletedBlockInfo( + blocks[i], ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, + null) }; StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( receivedDNReg.getStorageID(), rdBlocks) }; nameNodeProto.blockReceivedAndDeleted(receivedDNReg, nameNode @@ -1007,7 +1009,8 @@ public class NNThroughputBenchmark { int dnIdx = Arrays.binarySearch(datanodes, dnInfo.getName()); datanodes[dnIdx].addBlock(loc.getBlock().getLocalBlock()); ReceivedDeletedBlockInfo[] rdBlocks = { new ReceivedDeletedBlockInfo( - loc.getBlock().getLocalBlock(), "") }; + loc.getBlock().getLocalBlock(), + ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, null) }; StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( datanodes[dnIdx].dnRegistration.getStorageID(), rdBlocks) }; nameNodeProto.blockReceivedAndDeleted(datanodes[dnIdx].dnRegistration, loc diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java index fb1fc6b5de..fead3b6162 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java @@ -17,16 +17,27 @@ */ package org.apache.hadoop.hdfs.server.namenode; +import java.io.File; import java.io.IOException; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import org.apache.hadoop.fs.UnresolvedLinkException; +import org.apache.hadoop.fs.permission.PermissionStatus; import org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; -import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.SafeModeInfo; +import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease; import 
org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; import org.apache.hadoop.ipc.Server; +import org.apache.hadoop.ipc.StandbyException; +import org.apache.hadoop.security.AccessControlException; +import org.mockito.Mockito; /** * This is a utility class to expose NameNode functionality for unit tests. @@ -48,6 +59,38 @@ public class NameNodeAdapter { src, offset, length, false, true); } + public static HdfsFileStatus getFileInfo(NameNode namenode, String src, + boolean resolveLink) throws AccessControlException, UnresolvedLinkException, + StandbyException { + return namenode.getNamesystem().getFileInfo(src, resolveLink); + } + + public static boolean mkdirs(NameNode namenode, String src, + PermissionStatus permissions, boolean createParent) + throws UnresolvedLinkException, IOException { + return namenode.getNamesystem().mkdirs(src, permissions, createParent); + } + + public static void saveNamespace(NameNode namenode) + throws AccessControlException, IOException { + namenode.getNamesystem().saveNamespace(); + } + + public static void enterSafeMode(NameNode namenode, boolean resourcesLow) + throws IOException { + namenode.getNamesystem().enterSafeMode(resourcesLow); + } + + public static void leaveSafeMode(NameNode namenode, boolean checkForUpgrades) + throws SafeModeException { + namenode.getNamesystem().leaveSafeMode(checkForUpgrades); + } + + public static void abortEditLogs(NameNode nn) { + FSEditLog el = nn.getFSImage().getEditLog(); + el.abortCurrentLogSegment(); + } + /** * Get the internal RPC server instance. * @return rpc server @@ -61,7 +104,7 @@ public class NameNodeAdapter { return ns.getDelegationTokenSecretManager(); } - public static DatanodeCommand[] sendHeartBeat(DatanodeRegistration nodeReg, + public static HeartbeatResponse sendHeartBeat(DatanodeRegistration nodeReg, DatanodeDescriptor dd, FSNamesystem namesystem) throws IOException { return namesystem.handleHeartbeat(nodeReg, dd.getCapacity(), dd.getDfsUsed(), dd.getRemaining(), dd.getBlockPoolUsed(), 0, 0, 0); @@ -79,13 +122,26 @@ public class NameNodeAdapter { /** Set the softLimit and hardLimit of client lease periods. */ public static void setLeasePeriod(final FSNamesystem namesystem, long soft, long hard) { getLeaseManager(namesystem).setLeasePeriod(soft, hard); - namesystem.lmthread.interrupt(); + namesystem.leaseManager.triggerMonitorCheckNow(); } public static String getLeaseHolderForPath(NameNode namenode, String path) { return namenode.getNamesystem().leaseManager.getLeaseByPath(path).getHolder(); } + /** + * @return the timestamp of the last renewal of the given lease, + * or -1 in the case that the lease doesn't exist. + */ + public static long getLeaseRenewalTime(NameNode nn, String path) { + LeaseManager lm = nn.getNamesystem().leaseManager; + Lease l = lm.getLeaseByPath(path); + if (l == null) { + return -1; + } + return l.getLastUpdate(); + } + /** * Return the datanode descriptor for the given datanode. 
*/ @@ -99,6 +155,33 @@ public class NameNodeAdapter { } } + /** + * Return the FSNamesystem stats + */ + public static long[] getStats(final FSNamesystem fsn) { + return fsn.getStats(); + } + + public static ReentrantReadWriteLock spyOnFsLock(FSNamesystem fsn) { + ReentrantReadWriteLock spy = Mockito.spy(fsn.getFsLockForTests()); + fsn.setFsLockForTests(spy); + return spy; + } + + public static FSImage spyOnFsImage(NameNode nn1) { + FSImage spy = Mockito.spy(nn1.getNamesystem().dir.fsImage); + nn1.getNamesystem().dir.fsImage = spy; + return spy; + } + + public static String getMkdirOpPath(FSEditLogOp op) { + if (op.opCode == FSEditLogOpCodes.OP_MKDIR) { + return ((MkdirOp) op).path; + } else { + return null; + } + } + /** * @return the number of blocks marked safe by safemode, or -1 * if safemode is not running. @@ -122,4 +205,8 @@ public class NameNodeAdapter { } return smi.initializedReplQueues; } + + public static File getInProgressEditsFile(StorageDirectory sd, long startTxId) { + return NNStorage.getInProgressEditsFile(sd, startTxId); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java index e22fa29927..392cc9dd91 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java @@ -108,10 +108,11 @@ public class OfflineEditsViewerHelper { // for security to work (fake JobTracker user) config.set("hadoop.security.auth_to_local", "RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT"); + config.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); cluster = new MiniDFSCluster.Builder(config).manageNameDfsDirs(false).build(); cluster.waitClusterUp(); - cluster.getNamesystem().getDelegationTokenSecretManager().startThreads(); } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java index 2d8a115a9a..5d93b8cf45 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java @@ -33,6 +33,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.HAUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; @@ -123,6 +124,7 @@ public class TestBackupNode { @Test public void testBackupNodeTailsEdits() throws Exception { Configuration conf = new HdfsConfiguration(); + HAUtil.setAllowStandbyReads(conf, true); MiniDFSCluster cluster = null; FileSystem fileSys = null; BackupNode backup = null; @@ -244,11 +246,12 @@ public class TestBackupNode { } void testCheckpoint(StartupOption op) throws Exception { - Path file1 = new Path("checkpoint.dat"); - Path file2 = new Path("checkpoint2.dat"); - Path file3 = new Path("backup.dat"); + Path file1 = new Path("/checkpoint.dat"); + Path file2 = new 
Path("/checkpoint2.dat"); + Path file3 = new Path("/backup.dat"); Configuration conf = new HdfsConfiguration(); + HAUtil.setAllowStandbyReads(conf, true); short replication = (short)conf.getInt("dfs.replication", 3); int numDatanodes = Math.max(3, replication); conf.set(DFSConfigKeys.DFS_BLOCKREPORT_INITIAL_DELAY_KEY, "0"); @@ -345,11 +348,13 @@ public class TestBackupNode { TestCheckpoint.checkFile(fileSys, file3, replication); // should also be on BN right away assertTrue("file3 does not exist on BackupNode", - op != StartupOption.BACKUP || bnFS.exists(file3)); + op != StartupOption.BACKUP || + backup.getNamesystem().getFileInfo( + file3.toUri().getPath(), false) != null); } catch(IOException e) { LOG.error("Error in TestBackupNode:", e); - assertTrue(e.getLocalizedMessage(), false); + throw new AssertionError(e); } finally { if(backup != null) backup.stop(); if(fileSys != null) fileSys.close(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java index fbbcfc72f8..20d4c720de 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java @@ -22,6 +22,7 @@ import junit.framework.Assert; import java.io.*; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; @@ -57,18 +58,19 @@ public class TestCheckPointForSecurityTokens { } /** - * Tests save namepsace. + * Tests save namespace. 
*/ @Test public void testSaveNamespace() throws IOException { DistributedFileSystem fs = null; try { Configuration conf = new HdfsConfiguration(); + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes).build(); cluster.waitActive(); fs = (DistributedFileSystem)(cluster.getFileSystem()); FSNamesystem namesystem = cluster.getNamesystem(); - namesystem.getDelegationTokenSecretManager().startThreads(); String renewer = UserGroupInformation.getLoginUser().getUserName(); Token token1 = namesystem .getDelegationToken(new Text(renewer)); @@ -122,7 +124,6 @@ public class TestCheckPointForSecurityTokens { } namesystem = cluster.getNamesystem(); - namesystem.getDelegationTokenSecretManager().startThreads(); Token token3 = namesystem .getDelegationToken(new Text(renewer)); Token token4 = namesystem @@ -136,7 +137,6 @@ public class TestCheckPointForSecurityTokens { cluster.waitActive(); namesystem = cluster.getNamesystem(); - namesystem.getDelegationTokenSecretManager().startThreads(); Token token5 = namesystem .getDelegationToken(new Text(renewer)); @@ -159,7 +159,6 @@ public class TestCheckPointForSecurityTokens { cluster.waitActive(); namesystem = cluster.getNamesystem(); - namesystem.getDelegationTokenSecretManager().startThreads(); try { renewToken(token1); cancelToken(token1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java index 19f481212c..daed09bf1a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java @@ -46,6 +46,7 @@ import org.apache.hadoop.hdfs.DFSUtil.ErrorSimulator; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; @@ -655,6 +656,7 @@ public class TestCheckpoint extends TestCase { sdToLock.lock(); try { MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .format(false) .manageNameDfsDirs(false) .numDataNodes(0) .build(); @@ -861,7 +863,7 @@ public class TestCheckpoint extends TestCase { } /** - * Tests save namepsace. + * Tests save namespace. 
*/ public void testSaveNamespace() throws IOException { MiniDFSCluster cluster = null; @@ -911,10 +913,12 @@ public class TestCheckpoint extends TestCase { throw new IOException(e); } + final int EXPECTED_TXNS_FIRST_SEG = 12; + // the following steps should have happened: - // edits_inprogress_1 -> edits_1-8 (finalized) - // fsimage_8 created - // edits_inprogress_9 created + // edits_inprogress_1 -> edits_1-12 (finalized) + // fsimage_12 created + // edits_inprogress_13 created // for(URI uri : editsDirs) { File ed = new File(uri.getPath()); @@ -926,19 +930,21 @@ public class TestCheckpoint extends TestCase { NNStorage.getInProgressEditsFileName(1)); assertFalse(originalEdits.exists()); File finalizedEdits = new File(curDir, - NNStorage.getFinalizedEditsFileName(1,8)); - assertTrue(finalizedEdits.exists()); + NNStorage.getFinalizedEditsFileName(1, EXPECTED_TXNS_FIRST_SEG)); + GenericTestUtils.assertExists(finalizedEdits); assertTrue(finalizedEdits.length() > Integer.SIZE/Byte.SIZE); - assertTrue(new File(ed, "current/" - + NNStorage.getInProgressEditsFileName(9)).exists()); + GenericTestUtils.assertExists(new File(ed, "current/" + + NNStorage.getInProgressEditsFileName( + EXPECTED_TXNS_FIRST_SEG + 1))); } Collection imageDirs = cluster.getNameDirs(0); for (URI uri : imageDirs) { File imageDir = new File(uri.getPath()); File savedImage = new File(imageDir, "current/" - + NNStorage.getImageFileName(8)); + + NNStorage.getImageFileName( + EXPECTED_TXNS_FIRST_SEG)); assertTrue("Should have saved image at " + savedImage, savedImage.exists()); } @@ -1059,8 +1065,9 @@ public class TestCheckpoint extends TestCase { String nameserviceId2 = "ns2"; conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, nameserviceId1 + "," + nameserviceId2); - MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2) - .nameNodePort(9928).build(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2)) + .build(); Configuration snConf1 = new HdfsConfiguration(cluster.getConfiguration(0)); Configuration snConf2 = new HdfsConfiguration(cluster.getConfiguration(1)); InetSocketAddress nn1RpcAddress = @@ -1076,9 +1083,9 @@ public class TestCheckpoint extends TestCase { snConf2.set(DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, ""); // Set the nameserviceIds - snConf1.set(DFSUtil.getNameServiceIdKey( + snConf1.set(DFSUtil.addKeySuffixes( DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nameserviceId1), nn1); - snConf2.set(DFSUtil.getNameServiceIdKey( + snConf2.set(DFSUtil.addKeySuffixes( DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nameserviceId2), nn2); SecondaryNameNode secondary1 = startSecondaryNameNode(snConf1); @@ -1317,17 +1324,11 @@ public class TestCheckpoint extends TestCase { // Let the first one finish delayer.proceed(); - // Letting the first node continue should catch an exception + // Letting the first node continue, it should try to upload the + // same image, and gracefully ignore it, while logging an + // error message. 
checkpointThread.join(); - try { - checkpointThread.propagateExceptions(); - fail("Didn't throw!"); - } catch (Exception ioe) { - assertTrue("Unexpected exception: " + - StringUtils.stringifyException(ioe), - ioe.toString().contains("Another checkpointer already uploaded")); - LOG.info("Caught expected exception", ioe); - } + checkpointThread.propagateExceptions(); // primary should still consider fsimage_4 the latest assertEquals(4, storage.getMostRecentCheckpointTxId()); @@ -1763,7 +1764,7 @@ public class TestCheckpoint extends TestCase { private void assertParallelFilesInvariant(MiniDFSCluster cluster, ImmutableList secondaries) throws Exception { List allCurrentDirs = Lists.newArrayList(); - allCurrentDirs.addAll(getNameNodeCurrentDirs(cluster)); + allCurrentDirs.addAll(getNameNodeCurrentDirs(cluster, 0)); for (SecondaryNameNode snn : secondaries) { allCurrentDirs.addAll(getCheckpointCurrentDirs(snn)); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java index 68dc9f5b54..98c17a7b4d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java @@ -26,6 +26,7 @@ import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; +import java.util.List; import java.util.Properties; import org.apache.commons.logging.Log; @@ -47,7 +48,7 @@ public class TestClusterId { private String getClusterId(Configuration config) throws IOException { // see if cluster id not empty. Collection dirsToFormat = FSNamesystem.getNamespaceDirs(config); - Collection editsToFormat = FSNamesystem.getNamespaceEditsDirs(config); + List editsToFormat = FSNamesystem.getNamespaceEditsDirs(config); FSImage fsImage = new FSImage(config, dirsToFormat, editsToFormat); Iterator sdit = diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java index 301c4d4f36..82730ea40f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java @@ -110,7 +110,9 @@ public class TestDeadDatanode { DatanodeProtocol dnp = cluster.getNameNodeRpc(); ReceivedDeletedBlockInfo[] blocks = { new ReceivedDeletedBlockInfo( - new Block(0), "") }; + new Block(0), + ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, + null) }; StorageReceivedDeletedBlocks[] storageBlocks = { new StorageReceivedDeletedBlocks(reg.getStorageID(), blocks) }; @@ -136,7 +138,7 @@ public class TestDeadDatanode { // that asks datanode to register again StorageReport[] rep = { new StorageReport(reg.getStorageID(), false, 0, 0, 0, 0) }; - DatanodeCommand[] cmd = dnp.sendHeartbeat(reg, rep, 0, 0, 0); + DatanodeCommand[] cmd = dnp.sendHeartbeat(reg, rep, 0, 0, 0).getCommands(); Assert.assertEquals(1, cmd.length); Assert.assertEquals(cmd[0].getAction(), RegisterCommand.REGISTER .getAction()); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java index b3eeeab41d..bc41e7bf30 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java @@ -147,7 +147,7 @@ public class TestEditLog extends TestCase { public void testPreTxIdEditLogNoEdits() throws Exception { FSNamesystem namesys = Mockito.mock(FSNamesystem.class); namesys.dir = Mockito.mock(FSDirectory.class); - int numEdits = testLoad( + long numEdits = testLoad( StringUtils.hexStringToByte("ffffffed"), // just version number namesys); assertEquals(0, numEdits); @@ -166,7 +166,7 @@ public class TestEditLog extends TestCase { cluster.waitActive(); final FSNamesystem namesystem = cluster.getNamesystem(); - int numEdits = testLoad(HADOOP20_SOME_EDITS, namesystem); + long numEdits = testLoad(HADOOP20_SOME_EDITS, namesystem); assertEquals(3, numEdits); // Sanity check the edit HdfsFileStatus fileInfo = namesystem.getFileInfo("/myfile", false); @@ -177,7 +177,7 @@ public class TestEditLog extends TestCase { } } - private int testLoad(byte[] data, FSNamesystem namesys) throws IOException { + private long testLoad(byte[] data, FSNamesystem namesys) throws IOException { FSEditLogLoader loader = new FSEditLogLoader(namesys); return loader.loadFSEdits(new EditLogByteInputStream(data), 1); } @@ -321,7 +321,7 @@ public class TestEditLog extends TestCase { assertTrue("Expect " + editFile + " exists", editFile.exists()); System.out.println("Verifying file: " + editFile); - int numEdits = loader.loadFSEdits( + long numEdits = loader.loadFSEdits( new EditLogFileInputStream(editFile), 3); int numLeases = namesystem.leaseManager.countLease(); System.out.println("Number of outstanding leases " + numLeases); @@ -589,7 +589,6 @@ public class TestEditLog extends TestCase { currentDir.getAbsolutePath()); assertNotNull("No image found in " + nameDir, imageFile); assertEquals(NNStorage.getImageFileName(0), imageFile.getName()); - // Try to start a new cluster LOG.info("\n===========================================\n" + "Starting same cluster after simulated crash"); @@ -636,22 +635,26 @@ public class TestEditLog extends TestCase { } } + // should succeed - only one corrupt log dir public void testCrashRecoveryEmptyLogOneDir() throws Exception { - doTestCrashRecoveryEmptyLog(false, true); + doTestCrashRecoveryEmptyLog(false, true, true); } + // should fail - seen_txid updated to 3, but no log dir contains txid 3 public void testCrashRecoveryEmptyLogBothDirs() throws Exception { - doTestCrashRecoveryEmptyLog(true, true); + doTestCrashRecoveryEmptyLog(true, true, false); } + // should succeed - only one corrupt log dir public void testCrashRecoveryEmptyLogOneDirNoUpdateSeenTxId() throws Exception { - doTestCrashRecoveryEmptyLog(false, false); + doTestCrashRecoveryEmptyLog(false, false, true); } + // should succeed - both log dirs corrupt, but seen_txid never updated public void testCrashRecoveryEmptyLogBothDirsNoUpdateSeenTxId() throws Exception { - doTestCrashRecoveryEmptyLog(true, false); + doTestCrashRecoveryEmptyLog(true, false, true); } /** @@ -667,12 +670,13 @@ public class TestEditLog extends TestCase { * NN should fail to start up, because it's aware that txid 3 * was reached, but unable to find a non-corrupt log starting there. * @param updateTransactionIdFile if true update the seen_txid file. - * If false, the it will not be updated. 
This will simulate a case - * where the NN crashed between creating the new segment and updating - * seen_txid. + * If false, it will not be updated. This will simulate a case where + * the NN crashed between creating the new segment and updating the + * seen_txid file. + * @param shouldSucceed true if the test is expected to succeed. */ private void doTestCrashRecoveryEmptyLog(boolean inBothDirs, - boolean updateTransactionIdFile) + boolean updateTransactionIdFile, boolean shouldSucceed) throws Exception { // start a cluster Configuration conf = new HdfsConfiguration(); @@ -691,29 +695,40 @@ public class TestEditLog extends TestCase { // Make a truncated edits_3_inprogress File log = new File(currentDir, NNStorage.getInProgressEditsFileName(3)); - NNStorage storage = new NNStorage(conf, - Collections.emptyList(), - Lists.newArrayList(uri)); - if (updateTransactionIdFile) { - storage.writeTransactionIdFileToStorage(3); - } - storage.close(); new EditLogFileOutputStream(log, 1024).create(); if (!inBothDirs) { break; } + + NNStorage storage = new NNStorage(conf, + Collections.emptyList(), + Lists.newArrayList(uri)); + + if (updateTransactionIdFile) { + storage.writeTransactionIdFileToStorage(3); + } + storage.close(); } try { cluster = new MiniDFSCluster.Builder(conf) .numDataNodes(NUM_DATA_NODES).format(false).build(); - fail("Did not fail to start with all-corrupt logs"); + if (!shouldSucceed) { + fail("Should not have succeeded in startin cluster"); + } } catch (IOException ioe) { - GenericTestUtils.assertExceptionContains( - "No non-corrupt logs for txid 3", ioe); + if (shouldSucceed) { + LOG.info("Should have succeeded in starting cluster, but failed", ioe); + throw ioe; + } else { + GenericTestUtils.assertExceptionContains( + "No non-corrupt logs for txid 3", + ioe); + } + } finally { + cluster.shutdown(); } - cluster.shutdown(); } @@ -781,6 +796,11 @@ public class TestEditLog extends TestCase { public JournalType getType() { return JournalType.FILE; } + + @Override + public boolean isInProgress() { + return true; + } } public void testFailedOpen() throws Exception { @@ -789,11 +809,11 @@ public class TestEditLog extends TestCase { FSEditLog log = FSImageTestUtil.createStandaloneEditLog(logDir); try { logDir.setWritable(false); - log.open(); + log.openForWrite(); fail("Did no throw exception on only having a bad dir"); } catch (IOException ioe) { GenericTestUtils.assertExceptionContains( - "no journals successfully started", ioe); + "too few journals successfully started", ioe); } finally { logDir.setWritable(true); log.close(); @@ -813,7 +833,7 @@ public class TestEditLog extends TestCase { new byte[500]); try { - log.open(); + log.openForWrite(); NameNodeMetrics mockMetrics = Mockito.mock(NameNodeMetrics.class); log.setMetricsForTests(mockMetrics); @@ -848,6 +868,7 @@ public class TestEditLog extends TestCase { "[1,100]|[101,200]|[201,]", "[1,100]|[101,200]|[201,]"); log = new FSEditLog(storage); + log.initJournalsForWrite(); assertEquals("[[1,100], [101,200]]", log.getEditLogManifest(1).toString()); assertEquals("[[101,200]]", @@ -859,6 +880,7 @@ public class TestEditLog extends TestCase { "[1,100]|[101,200]", "[1,100]|[201,300]|[301,400]"); // nothing starting at 101 log = new FSEditLog(storage); + log.initJournalsForWrite(); assertEquals("[[1,100], [101,200], [201,300], [301,400]]", log.getEditLogManifest(1).toString()); @@ -868,6 +890,7 @@ public class TestEditLog extends TestCase { "[1,100]|[301,400]", // gap from 101 to 300 "[301,400]|[401,500]"); log = new FSEditLog(storage); 
+ log.initJournalsForWrite(); assertEquals("[[301,400], [401,500]]", log.getEditLogManifest(1).toString()); @@ -877,6 +900,7 @@ public class TestEditLog extends TestCase { "[1,100]|[101,150]", // short log at 101 "[1,50]|[101,200]"); // short log at 1 log = new FSEditLog(storage); + log.initJournalsForWrite(); assertEquals("[[1,100], [101,200]]", log.getEditLogManifest(1).toString()); assertEquals("[[101,200]]", @@ -889,6 +913,7 @@ public class TestEditLog extends TestCase { "[1,100]|[101,]", "[1,100]|[101,200]"); log = new FSEditLog(storage); + log.initJournalsForWrite(); assertEquals("[[1,100], [101,200]]", log.getEditLogManifest(1).toString()); assertEquals("[[101,200]]", @@ -967,11 +992,11 @@ public class TestEditLog extends TestCase { * * @param editUris directories to create edit logs in * @param numrolls number of times to roll the edit log during setup + * @param closeOnFinish whether to close the edit log after setup * @param abortAtRolls Specifications for when to fail, see AbortSpec */ - public static NNStorage setupEdits(List editUris, int numrolls, - AbortSpec... abortAtRolls) - throws IOException { + public static NNStorage setupEdits(List editUris, int numrolls, + boolean closeOnFinish, AbortSpec... abortAtRolls) throws IOException { List aborts = new ArrayList(Arrays.asList(abortAtRolls)); NNStorage storage = new NNStorage(new Configuration(), Collections.emptyList(), @@ -981,7 +1006,8 @@ public class TestEditLog extends TestCase { // open the edit log and add two transactions // logGenerationStamp is used, simply because it doesn't // require complex arguments. - editlog.open(); + editlog.initJournalsForWrite(); + editlog.openForWrite(); for (int i = 2; i < TXNS_PER_ROLL; i++) { editlog.logGenerationStamp((long)0); } @@ -1009,16 +1035,34 @@ public class TestEditLog extends TestCase { } editlog.logSync(); } - editlog.close(); + + if (closeOnFinish) { + editlog.close(); + } FSImageTestUtil.logStorageContents(LOG, storage); return storage; } + + /** + * Set up directories for tests. + * + * Each rolled file is 10 txns long. + * A failed file is 2 txns long. + * + * @param editUris directories to create edit logs in + * @param numrolls number of times to roll the edit log during setup + * @param abortAtRolls Specifications for when to fail, see AbortSpec + */ + public static NNStorage setupEdits(List editUris, int numrolls, + AbortSpec... abortAtRolls) throws IOException { + return setupEdits(editUris, numrolls, true, abortAtRolls); + } /** * Test loading an editlog which has had both its storage fail * on alternating rolls. Two edit log directories are created. - * The first on fails on odd rolls, the second on even. Test + * The first one fails on odd rolls, the second on even. Test * that we are able to load the entire editlog regardless. 
*/ @Test @@ -1041,6 +1085,7 @@ public class TestEditLog extends TestCase { new AbortSpec(10, 1)); long totaltxnread = 0; FSEditLog editlog = new FSEditLog(storage); + editlog.initJournalsForWrite(); long startTxId = 1; Iterable editStreams = editlog.selectInputStreams(startTxId, TXNS_PER_ROLL*11); @@ -1090,11 +1135,10 @@ public class TestEditLog extends TestCase { assertTrue(files[0].delete()); FSEditLog editlog = new FSEditLog(storage); + editlog.initJournalsForWrite(); long startTxId = 1; try { - Iterable editStreams - = editlog.selectInputStreams(startTxId, 4*TXNS_PER_ROLL); - + editlog.selectInputStreams(startTxId, 4*TXNS_PER_ROLL); fail("Should have thrown exception"); } catch (IOException ioe) { GenericTestUtils.assertExceptionContains( diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java index 77fd68637e..d14b2b24df 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream; import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.mockito.Mockito; import org.mockito.verification.VerificationMode; public class TestEditLogJournalFailures { @@ -144,21 +145,35 @@ public class TestEditLogJournalFailures { DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY); shutDownMiniCluster(); Configuration conf = new HdfsConfiguration(); - conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, editsDirs[1]); + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, editsDirs[0]); conf.setInt(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY, 0); conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY, 0); setUpMiniCluster(conf, true); assertTrue(doAnEdit()); // Invalidated the one required edits journal. - invalidateEditsDirAtIndex(1, false, false); + invalidateEditsDirAtIndex(0, false, false); + JournalAndStream nonRequiredJas = getJournalAndStream(1); + EditLogFileOutputStream nonRequiredSpy = + spyOnStream(nonRequiredJas); + // Make sure runtime.exit(...) hasn't been called at all yet. assertExitInvocations(0); + // ..and that the other stream is active. + assertTrue(nonRequiredJas.isActive()); + // This will actually return true in the tests, since the NN will not in // fact call Runtime.exit(); doAnEdit(); + // Since the required directory failed setReadyToFlush, and that + // directory was listed prior to the non-required directory, + // we should not call setReadyToFlush on the non-required + // directory. Regression test for HDFS-2874. + Mockito.verify(nonRequiredSpy, Mockito.never()).setReadyToFlush(); + assertFalse(nonRequiredJas.isActive()); + // A single failure of a required journal should result in a call to // runtime.exit(...). assertExitInvocations(atLeast(1)); @@ -217,15 +232,10 @@ public class TestEditLogJournalFailures { * @param index the index of the journal to take offline. * @return the original EditLogOutputStream of the journal. 
*/ - private EditLogOutputStream invalidateEditsDirAtIndex(int index, + private void invalidateEditsDirAtIndex(int index, boolean failOnFlush, boolean failOnWrite) throws IOException { - FSImage fsimage = cluster.getNamesystem().getFSImage(); - FSEditLog editLog = fsimage.getEditLog(); - - JournalAndStream jas = editLog.getJournals().get(index); - EditLogFileOutputStream elos = - (EditLogFileOutputStream) jas.getCurrentStream(); - EditLogFileOutputStream spyElos = spy(elos); + JournalAndStream jas = getJournalAndStream(index); + EditLogFileOutputStream spyElos = spyOnStream(jas); if (failOnWrite) { doThrow(new IOException("fail on write()")).when(spyElos).write( (FSEditLogOp) any()); @@ -237,25 +247,24 @@ public class TestEditLogJournalFailures { .setReadyToFlush(); } doNothing().when(spyElos).abort(); - + } + + private EditLogFileOutputStream spyOnStream(JournalAndStream jas) { + EditLogFileOutputStream elos = + (EditLogFileOutputStream) jas.getCurrentStream(); + EditLogFileOutputStream spyElos = spy(elos); jas.setCurrentStreamForTests(spyElos); - - return elos; + return spyElos; } /** - * Restore the journal at index index with the passed - * {@link EditLogOutputStream}. - * - * @param index index of the journal to restore. - * @param elos the {@link EditLogOutputStream} to put at that index. + * Pull out one of the JournalAndStream objects from the edit log. */ - private void restoreEditsDirAtIndex(int index, EditLogOutputStream elos) { + private JournalAndStream getJournalAndStream(int index) { FSImage fsimage = cluster.getNamesystem().getFSImage(); FSEditLog editLog = fsimage.getEditLog(); - JournalAndStream jas = editLog.getJournals().get(index); - jas.setCurrentStreamForTests(elos); + return editLog.getJournals().get(index); } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java index d3d64594ac..da66b45da2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java @@ -237,7 +237,7 @@ public class TestEditLogRace { System.out.println("Verifying file: " + editFile); FSEditLogLoader loader = new FSEditLogLoader(namesystem); - int numEditsThisLog = loader.loadFSEdits(new EditLogFileInputStream(editFile), + long numEditsThisLog = loader.loadFSEdits(new EditLogFileInputStream(editFile), startTxId); System.out.println("Number of edits: " + numEditsThisLog); @@ -375,6 +375,7 @@ public class TestEditLogRace { true); LOG.info("mkdirs complete"); } catch (Throwable ioe) { + LOG.fatal("Got exception", ioe); deferredException.set(ioe); waitToEnterFlush.countDown(); } @@ -469,6 +470,7 @@ public class TestEditLogRace { true); LOG.info("mkdirs complete"); } catch (Throwable ioe) { + LOG.fatal("Got exception", ioe); deferredException.set(ioe); waitToEnterSync.countDown(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java index dcbeea6c2f..fd1733a584 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java @@ -92,7 +92,7 @@ public class TestFSEditLogLoader { StringBuilder bld = new StringBuilder(); bld.append("^Error replaying edit log at offset \\d+"); - bld.append("On transaction ID \\d+\n"); + bld.append(" on transaction ID \\d+\n"); bld.append("Recent opcode offsets: (\\d+\\s*){4}$"); try { cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES) @@ -167,7 +167,7 @@ public class TestFSEditLogLoader { SortedMap offsetToTxId = Maps.newTreeMap(); try { fsel = FSImageTestUtil.createStandaloneEditLog(testDir); - fsel.open(); + fsel.openForWrite(); assertTrue("should exist: " + logFile, logFile.exists()); for (int i = 0; i < NUM_TXNS; i++) { @@ -245,7 +245,9 @@ public class TestFSEditLogLoader { Files.copy(logFileBak, logFile); corruptByteInFile(logFile, offset); EditLogValidation val = EditLogFileInputStream.validateEditLog(logFile); - assertTrue(val.getNumTransactions() >= prevNumValid); + assertTrue(String.format("%d should have been >= %d", + val.getNumTransactions(), prevNumValid), + val.getNumTransactions() >= prevNumValid); prevNumValid = val.getNumTransactions(); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java new file mode 100644 index 0000000000..de3a89c083 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hdfs.server.namenode; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +import org.apache.hadoop.conf.Configuration; +import org.junit.Test; + +public class TestFSNamesystem { + + /** + * Tests that the namenode edits dirs are gotten with duplicates removed + */ + @Test + public void testUniqueEditDirs() throws IOException { + Configuration config = new Configuration(); + + config.set(DFS_NAMENODE_EDITS_DIR_KEY, "file://edits/dir, " + + "file://edits/dir1,file://edits/dir1"); // overlapping internally + + // getNamespaceEditsDirs removes duplicates + Collection editsDirs = FSNamesystem.getNamespaceEditsDirs(config); + assertEquals(2, editsDirs.size()); + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java index e4ff4bb732..0ac194439d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java @@ -29,7 +29,9 @@ import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import org.junit.Test; +import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.JournalManager.CorruptionException; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; import org.apache.hadoop.test.GenericTestUtils; import static org.apache.hadoop.hdfs.server.namenode.TestEditLog.setupEdits; @@ -58,8 +60,8 @@ public class TestFileJournalManager { long numJournals = 0; for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) { - FileJournalManager jm = new FileJournalManager(sd); - assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1)); + FileJournalManager jm = new FileJournalManager(sd, storage); + assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true)); numJournals++; } assertEquals(3, numJournals); @@ -78,9 +80,9 @@ public class TestFileJournalManager { 5, new AbortSpec(5, 0)); StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); - FileJournalManager jm = new FileJournalManager(sd); + FileJournalManager jm = new FileJournalManager(sd, storage); assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, - jm.getNumberOfTransactions(1)); + jm.getNumberOfTransactions(1, true)); } /** @@ -101,16 +103,17 @@ public class TestFileJournalManager { 5, new AbortSpec(5, 1)); Iterator dirs = storage.dirIterator(NameNodeDirType.EDITS); StorageDirectory sd = dirs.next(); - FileJournalManager jm = new FileJournalManager(sd); - assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1)); + FileJournalManager jm = new FileJournalManager(sd, storage); + assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true)); sd = dirs.next(); - jm = new FileJournalManager(sd); - assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1)); + jm = new FileJournalManager(sd, storage); + assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1, + true)); sd = dirs.next(); - jm = new FileJournalManager(sd); - assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1)); + jm = new FileJournalManager(sd, 
storage); + assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true)); } /** @@ -133,16 +136,19 @@ public class TestFileJournalManager { new AbortSpec(5, 2)); Iterator dirs = storage.dirIterator(NameNodeDirType.EDITS); StorageDirectory sd = dirs.next(); - FileJournalManager jm = new FileJournalManager(sd); - assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1)); + FileJournalManager jm = new FileJournalManager(sd, storage); + assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1, + true)); sd = dirs.next(); - jm = new FileJournalManager(sd); - assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1)); + jm = new FileJournalManager(sd, storage); + assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1, + true)); sd = dirs.next(); - jm = new FileJournalManager(sd); - assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1)); + jm = new FileJournalManager(sd, storage); + assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1, + true)); } /** @@ -156,6 +162,25 @@ public class TestFileJournalManager { } raf.close(); } + + @Test(expected=IllegalStateException.class) + public void testFinalizeErrorReportedToNNStorage() throws IOException, InterruptedException { + File f = new File(TestEditLog.TEST_DIR + "/filejournaltestError"); + // abort after 10th roll + NNStorage storage = setupEdits(Collections.singletonList(f.toURI()), + 10, new AbortSpec(10, 0)); + StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); + + FileJournalManager jm = new FileJournalManager(sd, storage); + String sdRootPath = sd.getRoot().getAbsolutePath(); + FileUtil.chmod(sdRootPath, "-w", true); + try { + jm.finalizeLogSegment(0, 1); + } finally { + assertTrue(storage.getRemovedStorageDirs().contains(sd)); + FileUtil.chmod(sdRootPath, "+w", true); + } + } /** * Test that we can read from a stream created by FileJournalManager. @@ -171,17 +196,17 @@ public class TestFileJournalManager { 10, new AbortSpec(10, 0)); StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); - FileJournalManager jm = new FileJournalManager(sd); + FileJournalManager jm = new FileJournalManager(sd, storage); long expectedTotalTxnCount = TXNS_PER_ROLL*10 + TXNS_PER_FAIL; - assertEquals(expectedTotalTxnCount, jm.getNumberOfTransactions(1)); + assertEquals(expectedTotalTxnCount, jm.getNumberOfTransactions(1, true)); long skippedTxns = (3*TXNS_PER_ROLL); // skip first 3 files long startingTxId = skippedTxns + 1; - long numTransactionsToLoad = jm.getNumberOfTransactions(startingTxId); + long numTransactionsToLoad = jm.getNumberOfTransactions(startingTxId, true); long numLoaded = 0; while (numLoaded < numTransactionsToLoad) { - EditLogInputStream editIn = jm.getInputStream(startingTxId); + EditLogInputStream editIn = jm.getInputStream(startingTxId, true); FSEditLogLoader.EditLogValidation val = FSEditLogLoader.validateEditLog(editIn); long count = val.getNumTransactions(); @@ -194,20 +219,26 @@ public class TestFileJournalManager { } /** - * Try to make a request with a start transaction id which doesn't - * match the start ID of some log segment. - * This should fail as edit logs must currently be treated as indevisable - * units. + * Make requests with starting transaction ids which don't match the beginning + * txid of some log segments. + * + * This should succeed. 
*/ - @Test(expected=IOException.class) + @Test public void testAskForTransactionsMidfile() throws IOException { File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2"); NNStorage storage = setupEdits(Collections.singletonList(f.toURI()), 10); StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); - FileJournalManager jm = new FileJournalManager(sd); - jm.getNumberOfTransactions(2); + FileJournalManager jm = new FileJournalManager(sd, storage); + + // 10 rolls, so 11 rolled files, 110 txids total. + final int TOTAL_TXIDS = 10 * 11; + for (int txid = 1; txid <= TOTAL_TXIDS; txid++) { + assertEquals((TOTAL_TXIDS - txid) + 1, jm.getNumberOfTransactions(txid, + true)); + } } /** @@ -237,19 +268,20 @@ public class TestFileJournalManager { assertEquals(1, files.length); assertTrue(files[0].delete()); - FileJournalManager jm = new FileJournalManager(sd); - assertEquals(startGapTxId-1, jm.getNumberOfTransactions(1)); + FileJournalManager jm = new FileJournalManager(sd, storage); + assertEquals(startGapTxId-1, jm.getNumberOfTransactions(1, true)); try { - jm.getNumberOfTransactions(startGapTxId); + jm.getNumberOfTransactions(startGapTxId, true); fail("Should have thrown an exception by now"); } catch (IOException ioe) { - assertTrue(true); + GenericTestUtils.assertExceptionContains( + "Gap in transactions, max txnid is 110, 0 txns from 31", ioe); } // rolled 10 times so there should be 11 files. assertEquals(11*TXNS_PER_ROLL - endGapTxId, - jm.getNumberOfTransactions(endGapTxId+1)); + jm.getNumberOfTransactions(endGapTxId + 1, true)); } /** @@ -274,9 +306,9 @@ public class TestFileJournalManager { corruptAfterStartSegment(files[0]); - FileJournalManager jm = new FileJournalManager(sd); + FileJournalManager jm = new FileJournalManager(sd, storage); assertEquals(10*TXNS_PER_ROLL+1, - jm.getNumberOfTransactions(1)); + jm.getNumberOfTransactions(1, true)); } @Test @@ -288,14 +320,15 @@ public class TestFileJournalManager { NNStorage.getInProgressEditsFileName(201), NNStorage.getFinalizedEditsFileName(1001, 1100)); - FileJournalManager fjm = new FileJournalManager(sd); + // passing null for NNStorage because this unit test will not use it + FileJournalManager fjm = new FileJournalManager(sd, null); assertEquals("[1,100],[101,200],[1001,1100]", getLogsAsString(fjm, 1)); assertEquals("[101,200],[1001,1100]", getLogsAsString(fjm, 101)); assertEquals("[1001,1100]", getLogsAsString(fjm, 201)); try { assertEquals("[]", getLogsAsString(fjm, 150)); fail("Did not throw when asking for a txn in the middle of a log"); - } catch (IOException ioe) { + } catch (IllegalStateException ioe) { GenericTestUtils.assertExceptionContains( "150 which is in the middle", ioe); } @@ -303,6 +336,60 @@ public class TestFileJournalManager { "", getLogsAsString(fjm, 9999)); } + /** + * tests that passing an invalid dir to matchEditLogs throws IOException + */ + @Test(expected = IOException.class) + public void testMatchEditLogInvalidDirThrowsIOException() throws IOException { + File badDir = new File("does not exist"); + FileJournalManager.matchEditLogs(badDir); + } + + /** + * Make sure that we starting reading the correct op when we request a stream + * with a txid in the middle of an edit log file. 
+ */ + @Test + public void testReadFromMiddleOfEditLog() throws CorruptionException, + IOException { + File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2"); + NNStorage storage = setupEdits(Collections.singletonList(f.toURI()), + 10); + StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); + + FileJournalManager jm = new FileJournalManager(sd, storage); + + EditLogInputStream elis = jm.getInputStream(5, true); + FSEditLogOp op = elis.readOp(); + assertEquals("read unexpected op", op.getTransactionId(), 5); + } + + /** + * Make sure that in-progress streams aren't counted if we don't ask for + * them. + */ + @Test + public void testExcludeInProgressStreams() throws CorruptionException, + IOException { + File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2"); + + // Don't close the edit log once the files have been set up. + NNStorage storage = setupEdits(Collections.singletonList(f.toURI()), + 10, false); + StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next(); + + FileJournalManager jm = new FileJournalManager(sd, storage); + + // If we exclude the in-progess stream, we should only have 100 tx. + assertEquals(100, jm.getNumberOfTransactions(1, false)); + + EditLogInputStream elis = jm.getInputStream(90, false); + FSEditLogOp lastReadOp = null; + while ((lastReadOp = elis.readOp()) != null) { + assertTrue(lastReadOp.getTransactionId() <= 100); + } + } + private static String getLogsAsString( FileJournalManager fjm, long firstTxId) throws IOException { return Joiner.on(",").join(fjm.getRemoteEditLogs(firstTxId)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java index 00fe43f404..51e49a9237 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java @@ -144,13 +144,13 @@ public class TestGenericJournalConf { } @Override - public EditLogInputStream getInputStream(long fromTxnId) + public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk) throws IOException { return null; } @Override - public long getNumberOfTransactions(long fromTxnId) + public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk) throws IOException { return 0; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java index aad8d7dc0a..e7a9cc1d49 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java @@ -61,6 +61,7 @@ public class TestNNStorageRetentionFunctional { throws IOException { MiniDFSCluster cluster = null; Configuration conf = new HdfsConfiguration(); + conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, 0); File sd0 = new File(TEST_ROOT_DIR, "nn0"); File sd1 = new File(TEST_ROOT_DIR, "nn1"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java index aadca5cc20..4c6334f53a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.Set; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile; import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile; @@ -33,6 +34,7 @@ import static org.apache.hadoop.hdfs.server.namenode.NNStorage.getImageFileName; import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger; import org.junit.Assert; +import org.junit.Before; import org.junit.Test; import org.mockito.ArgumentCaptor; import org.mockito.Mockito; @@ -46,6 +48,17 @@ import com.google.common.collect.Sets; public class TestNNStorageRetentionManager { + Configuration conf = new Configuration(); + + /** + * For the purpose of this test, purge as many edits as we can + * with no extra "safety cushion" + */ + @Before + public void setNoExtraEditRetention() { + conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, 0); + } + /** * Test the "easy case" where we have more images in the * directory than we need to keep. Should purge the @@ -163,9 +176,27 @@ public class TestNNStorageRetentionManager { runTest(tc); } - private void runTest(TestCaseDescription tc) throws IOException { - Configuration conf = new Configuration(); + @Test + public void testRetainExtraLogs() throws IOException { + conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, + 50); + TestCaseDescription tc = new TestCaseDescription(); + tc.addRoot("/foo1", NameNodeDirType.IMAGE); + tc.addRoot("/foo2", NameNodeDirType.EDITS); + tc.addImage("/foo1/current/" + getImageFileName(100), true); + tc.addImage("/foo1/current/" + getImageFileName(200), true); + tc.addImage("/foo1/current/" + getImageFileName(300), false); + tc.addImage("/foo1/current/" + getImageFileName(400), false); + tc.addLog("/foo2/current/" + getFinalizedEditsFileName(101, 200), true); + // Since we need 50 extra edits, *do* retain the 201-300 segment + tc.addLog("/foo2/current/" + getFinalizedEditsFileName(201, 300), false); + tc.addLog("/foo2/current/" + getFinalizedEditsFileName(301, 400), false); + tc.addLog("/foo2/current/" + getInProgressEditsFileName(401), false); + runTest(tc); + } + + private void runTest(TestCaseDescription tc) throws IOException { StoragePurger mockPurger = Mockito.mock(NNStorageRetentionManager.StoragePurger.class); ArgumentCaptor imagesPurgedCaptor = @@ -261,8 +292,9 @@ public class TestNNStorageRetentionManager { for (FakeRoot root : dirRoots.values()) { if (!root.type.isOfType(NameNodeDirType.EDITS)) continue; + // passing null NNStorage for unit test because it does not use it FileJournalManager fjm = new FileJournalManager( - root.mockStorageDir()); + root.mockStorageDir(), null); fjm.purger = purger; jms.add(fjm); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java index 559d165726..49a96e9b66 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java @@ -50,13 +50,7 @@ public class TestNameNodeResourcePolicy { assertFalse(testResourceScenario(4, 0, 3, 0, 2)); assertTrue(testResourceScenario(4, 0, 3, 0, 1)); assertFalse(testResourceScenario(4, 0, 4, 0, 1)); - try { - testResourceScenario(1, 0, 0, 0, 2); - fail("Should fail if there are more minimum redundant resources than " + - "total redundant resources"); - } catch (RuntimeException rte) { - assertTrue(rte.getMessage().startsWith("Need a minimum")); - } + assertFalse(testResourceScenario(1, 0, 0, 0, 2)); } @Test diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java index d4fd72d3b0..596df8d76b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java @@ -24,6 +24,7 @@ import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; @@ -91,6 +92,9 @@ public class TestSecurityTokenEditLog extends TestCase { FileSystem fileSys = null; try { + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES).build(); cluster.waitActive(); fileSys = cluster.getFileSystem(); @@ -106,7 +110,6 @@ public class TestSecurityTokenEditLog extends TestCase { // set small size of flush buffer editLog.setOutputBufferCapacity(2048); - namesystem.getDelegationTokenSecretManager().startThreads(); // Create threads and make them run transactions concurrently. 
Thread threadId[] = new Thread[NUM_THREADS]; @@ -141,7 +144,7 @@ public class TestSecurityTokenEditLog extends TestCase { System.out.println("Verifying file: " + editFile); FSEditLogLoader loader = new FSEditLogLoader(namesystem); - int numEdits = loader.loadFSEdits( + long numEdits = loader.loadFSEdits( new EditLogFileInputStream(editFile), 1); assertEquals("Verification for " + editFile, expectedTransactions, numEdits); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java index 9233009628..74c3cf8f79 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java @@ -512,11 +512,10 @@ public class TestStartup extends TestCase { InetAddress inetAddress = InetAddress.getByAddress(b); list.add(inetAddress.getHostName()); writeConfigFile(localFileSys, hostsFile, list); - int numNameNodes = 1; int numDatanodes = 1; try { - cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes) + cluster = new MiniDFSCluster.Builder(conf) .numDataNodes(numDatanodes).setupHostsFile(true).build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java index 397ad725cc..53f4f966de 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java @@ -19,10 +19,12 @@ package org.apache.hadoop.hdfs.server.namenode; import static org.junit.Assert.*; import org.junit.Test; +import java.io.File; import java.io.IOException; import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.DFSConfigKeys; @@ -71,4 +73,25 @@ public class TestValidateConfigurationSettings { DFSTestUtil.formatNameNode(conf); NameNode nameNode = new NameNode(conf); // should be OK! } + + /** + * HDFS-3013: NameNode format command doesn't pick up + * dfs.namenode.name.dir.NameServiceId configuration. 
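 * The test below sets DFS_FEDERATION_NAMESERVICES to "ns1" and supplies the
 * name directory only through the suffixed key
 * DFS_NAMENODE_NAME_DIR_KEY + ".ns1"; formatting and then starting a
 * NameNode should both resolve that per-nameservice value.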
+ */ + @Test + public void testGenericKeysForNameNodeFormat() + throws IOException { + Configuration conf = new HdfsConfiguration(); + FileSystem.setDefaultUri(conf, "hdfs://localhost:8070"); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "ns1"); + String nameDir = System.getProperty("java.io.tmpdir") + "/test.dfs.name"; + File dir = new File(nameDir); + if (dir.exists()) { + FileUtil.fullyDelete(dir); + } + conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY + ".ns1", nameDir); + DFSTestUtil.formatNameNode(conf); + NameNode nameNode = new NameNode(conf); + FileUtil.fullyDelete(dir); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java new file mode 100644 index 0000000000..39667eddf1 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread; +import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; + +/** + * Utility class to start an HA cluster, and then start threads + * to periodically fail back and forth, accelerate block deletion + * processing, etc. + */ +public class HAStressTestHarness { + Configuration conf; + private MiniDFSCluster cluster; + static final int BLOCK_SIZE = 1024; + TestContext testCtx = new TestContext(); + + public HAStressTestHarness() { + conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + // Increase max streams so that we re-replicate quickly. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000); + } + + /** + * Start and return the MiniDFSCluster. 
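 * The cluster is built with MiniDFSNNTopology.simpleHATopology() (an HA
 * pair of NameNodes in a single nameservice) and three DataNodes.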
+ */ + public MiniDFSCluster startCluster() throws IOException { + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + return cluster; + } + + /** + * Return a filesystem with client-failover configured for the + * cluster. + */ + public FileSystem getFailoverFs() throws IOException, URISyntaxException { + return HATestUtil.configureFailoverFs(cluster, conf); + } + + /** + * Add a thread which periodically triggers deletion reports, + * heartbeats, and NN-side block work. + * @param interval millisecond period on which to run + */ + public void addReplicationTriggerThread(final int interval) { + + testCtx.addThread(new RepeatingTestThread(testCtx) { + + @Override + public void doAnAction() throws Exception { + for (DataNode dn : cluster.getDataNodes()) { + DataNodeAdapter.triggerDeletionReport(dn); + DataNodeAdapter.triggerHeartbeat(dn); + } + for (int i = 0; i < 2; i++) { + NameNode nn = cluster.getNameNode(i); + BlockManagerTestUtil.computeAllPendingWork( + nn.getNamesystem().getBlockManager()); + } + Thread.sleep(interval); + } + }); + } + + /** + * Add a thread which periodically triggers failover back and forth between + * the two namenodes. + */ + public void addFailoverThread(final int msBetweenFailovers) { + testCtx.addThread(new RepeatingTestThread(testCtx) { + + @Override + public void doAnAction() throws Exception { + System.err.println("==============================\n" + + "Failing over from 0->1\n" + + "=================================="); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + Thread.sleep(msBetweenFailovers); + System.err.println("==============================\n" + + "Failing over from 1->0\n" + + "=================================="); + + cluster.transitionToStandby(1); + cluster.transitionToActive(0); + Thread.sleep(msBetweenFailovers); + } + }); + } + + /** + * Start all of the threads which have been added. + */ + public void startThreads() { + this.testCtx.startThreads(); + } + + /** + * Stop threads, propagating any exceptions that might have been thrown. + */ + public void stopThreads() throws Exception { + this.testCtx.stop(); + } + + /** + * Shutdown the minicluster, as well as any of the running threads. + */ + public void shutdown() throws Exception { + this.testCtx.stop(); + if (cluster != null) { + this.cluster.shutdown(); + cluster = null; + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java new file mode 100644 index 0000000000..bf919cea7f --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java @@ -0,0 +1,214 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.List; +import java.util.concurrent.TimeoutException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.test.GenericTestUtils; + +import com.google.common.base.Supplier; + +/** + * Static utility functions useful for testing HA. + */ +public abstract class HATestUtil { + private static Log LOG = LogFactory.getLog(HATestUtil.class); + + private static final String LOGICAL_HOSTNAME = "ha-nn-uri-%d"; + + /** + * Trigger an edits log roll on the active and then wait for the standby to + * catch up to all the edits done by the active. This method will check + * repeatedly for up to NN_LAG_TIMEOUT milliseconds, and then fail throwing + * {@link CouldNotCatchUpException} + * + * @param active active NN + * @param standby standby NN which should catch up to active + * @throws IOException if an error occurs rolling the edit log + * @throws CouldNotCatchUpException if the standby doesn't catch up to the + * active in NN_LAG_TIMEOUT milliseconds + */ + static void waitForStandbyToCatchUp(NameNode active, + NameNode standby) throws InterruptedException, IOException, CouldNotCatchUpException { + + long activeTxId = active.getNamesystem().getFSImage().getEditLog() + .getLastWrittenTxId(); + + active.getRpcServer().rollEditLog(); + + long start = System.currentTimeMillis(); + while (System.currentTimeMillis() - start < TestEditLogTailer.NN_LAG_TIMEOUT) { + long nn2HighestTxId = standby.getNamesystem().getFSImage() + .getLastAppliedTxId(); + if (nn2HighestTxId >= activeTxId) { + return; + } + Thread.sleep(TestEditLogTailer.SLEEP_TIME); + } + throw new CouldNotCatchUpException("Standby did not catch up to txid " + + activeTxId + " (currently at " + + standby.getNamesystem().getFSImage().getLastAppliedTxId() + ")"); + } + + /** + * Wait for the datanodes in the cluster to process any block + * deletions that have already been asynchronously queued. + */ + static void waitForDNDeletions(final MiniDFSCluster cluster) + throws TimeoutException, InterruptedException { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + for (DataNode dn : cluster.getDataNodes()) { + if (DataNodeAdapter.getPendingAsyncDeletions(dn) > 0) { + return false; + } + } + return true; + } + }, 1000, 10000); + + } + + /** + * Wait for the NameNode to issue any deletions that are already + * pending (i.e. 
for the pendingDeletionBlocksCount to go to 0) + */ + static void waitForNNToIssueDeletions(final NameNode nn) + throws Exception { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + LOG.info("Waiting for NN to issue block deletions to DNs"); + return nn.getNamesystem().getBlockManager().getPendingDeletionBlocksCount() == 0; + } + }, 250, 10000); + } + + public static class CouldNotCatchUpException extends IOException { + private static final long serialVersionUID = 1L; + + public CouldNotCatchUpException(String message) { + super(message); + } + } + + /** Gets the filesystem instance by setting the failover configurations */ + public static FileSystem configureFailoverFs(MiniDFSCluster cluster, Configuration conf) + throws IOException, URISyntaxException { + return configureFailoverFs(cluster, conf, 0); + } + + /** + * Gets the filesystem instance by setting the failover configurations + * @param cluster the single process DFS cluster + * @param conf cluster configuration + * @param nsIndex namespace index starting with zero + * @throws IOException if an error occurs rolling the edit log + */ + public static FileSystem configureFailoverFs(MiniDFSCluster cluster, Configuration conf, + int nsIndex) throws IOException, URISyntaxException { + conf = new Configuration(conf); + String logicalName = getLogicalHostname(cluster); + setFailoverConfigurations(cluster, conf, logicalName, nsIndex); + FileSystem fs = FileSystem.get(new URI("hdfs://" + logicalName), conf); + return fs; + } + + public static void setFailoverConfigurations(MiniDFSCluster cluster, + Configuration conf) { + setFailoverConfigurations(cluster, conf, getLogicalHostname(cluster)); + } + + /** Sets the required configurations for performing failover of default namespace. */ + public static void setFailoverConfigurations(MiniDFSCluster cluster, + Configuration conf, String logicalName) { + setFailoverConfigurations(cluster, conf, logicalName, 0); + } + + /** Sets the required configurations for performing failover. */ + public static void setFailoverConfigurations(MiniDFSCluster cluster, + Configuration conf, String logicalName, int nsIndex) { + InetSocketAddress nnAddr1 = cluster.getNameNode(2 * nsIndex).getNameNodeAddress(); + InetSocketAddress nnAddr2 = cluster.getNameNode(2 * nsIndex + 1).getNameNodeAddress(); + String nameNodeId1 = "nn1"; + String nameNodeId2 = "nn2"; + String address1 = "hdfs://" + nnAddr1.getHostName() + ":" + nnAddr1.getPort(); + String address2 = "hdfs://" + nnAddr2.getHostName() + ":" + nnAddr2.getPort(); + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, + logicalName, nameNodeId1), address1); + conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, + logicalName, nameNodeId2), address2); + + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, logicalName); + conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, logicalName), + nameNodeId1 + "," + nameNodeId2); + conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." 
+ logicalName, + ConfiguredFailoverProxyProvider.class.getName()); + conf.set("fs.defaultFS", "hdfs://" + logicalName); + } + + + public static String getLogicalHostname(MiniDFSCluster cluster) { + return String.format(LOGICAL_HOSTNAME, cluster.getInstanceId()); + } + + public static URI getLogicalUri(MiniDFSCluster cluster) + throws URISyntaxException { + return new URI(HdfsConstants.HDFS_URI_SCHEME + "://" + + getLogicalHostname(cluster)); + } + + public static void waitForCheckpoint(MiniDFSCluster cluster, int nnIdx, + List txids) throws InterruptedException { + long start = System.currentTimeMillis(); + while (true) { + try { + FSImageTestUtil.assertNNHasCheckpoints(cluster, nnIdx, txids); + return; + } catch (AssertionError err) { + if (System.currentTimeMillis() - start > 10000) { + throw err; + } else { + Thread.sleep(300); + } + } + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java new file mode 100644 index 0000000000..ccc46a204b --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java @@ -0,0 +1,107 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; + +import com.google.common.collect.Lists; + +/** + * Tests for upgrading with HA enabled. + */ +public class TestDFSUpgradeWithHA { + + private static final Log LOG = LogFactory.getLog(TestDFSUpgradeWithHA.class); + + /** + * Make sure that an HA NN refuses to start if given an upgrade-related + * startup option. 
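 * Each of UPGRADE, FINALIZE and ROLLBACK is tried in turn; every attempt is
 * expected to fail fast with an IllegalArgumentException stating that a DFS
 * upgrade cannot be performed with HA enabled.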
+ */ + @Test + public void testStartingWithUpgradeOptionsFails() throws IOException { + for (StartupOption startOpt : Lists.newArrayList(new StartupOption[] { + StartupOption.UPGRADE, StartupOption.FINALIZE, + StartupOption.ROLLBACK })) { + MiniDFSCluster cluster = null; + try { + cluster = new MiniDFSCluster.Builder(new Configuration()) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .startupOption(startOpt) + .numDataNodes(0) + .build(); + fail("Should not have been able to start an HA NN in upgrade mode"); + } catch (IllegalArgumentException iae) { + GenericTestUtils.assertExceptionContains( + "Cannot perform DFS upgrade with HA enabled.", iae); + LOG.info("Got expected exception", iae); + } finally { + if (cluster != null) { + cluster.shutdown(); + } + } + } + } + + /** + * Make sure that an HA NN won't start if a previous upgrade was in progress. + */ + @Test + public void testStartingWithUpgradeInProgressFails() throws Exception { + MiniDFSCluster cluster = null; + try { + cluster = new MiniDFSCluster.Builder(new Configuration()) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + + // Simulate an upgrade having started. + for (int i = 0; i < 2; i++) { + for (URI uri : cluster.getNameDirs(i)) { + File prevTmp = new File(new File(uri), Storage.STORAGE_TMP_PREVIOUS); + LOG.info("creating previous tmp dir: " + prevTmp); + assertTrue(prevTmp.mkdirs()); + } + } + + cluster.restartNameNodes(); + fail("Should not have been able to start an HA NN with an in-progress upgrade"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "Cannot start an HA namenode with name dirs that need recovery.", + ioe); + LOG.info("Got expected exception", ioe); + } finally { + if (cluster != null) { + cluster.shutdown(); + } + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java new file mode 100644 index 0000000000..ea769c057e --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java @@ -0,0 +1,605 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.CountDownLatch; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.AppendTestUtil; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; +import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; +import org.apache.hadoop.hdfs.server.namenode.FSInodeInfo; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.GenericTestUtils.DelayAnswer; +import org.apache.log4j.Level; +import org.junit.After; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; + +import com.google.common.base.Supplier; +import com.google.common.collect.Lists; + + +public class TestDNFencing { + + protected static final Log LOG = LogFactory.getLog( + TestDNFencing.class); + private static final String TEST_FILE_DATA = "hello highly available world"; + private static final String TEST_FILE = "/testStandbyIsHot"; + private static final Path TEST_FILE_PATH = new Path(TEST_FILE); + private static final int SMALL_BLOCK = 1024; + + private Configuration conf; + private MiniDFSCluster cluster; + private NameNode nn1, nn2; + private FileSystem fs; + + static { + ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL); + } + + @Before + public void setupCluster() throws Exception { + conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, SMALL_BLOCK); + // Bump up replication interval so that we only run replication + // checks explicitly. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 600); + // Increase max streams so that we re-replicate quickly. 
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000); + // See RandomDeleterPolicy javadoc. + conf.setClass("dfs.block.replicator.classname", RandomDeleterPolicy.class, + BlockPlacementPolicy.class); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + nn1 = cluster.getNameNode(0); + nn2 = cluster.getNameNode(1); + + cluster.waitActive(); + cluster.transitionToActive(0); + // Trigger block reports so that the first NN trusts all + // of the DNs, and will issue deletions + cluster.triggerBlockReports(); + fs = HATestUtil.configureFailoverFs(cluster, conf); + } + + @After + public void shutdownCluster() throws Exception { + if (cluster != null) { + banner("Shutting down cluster. NN1 metadata:"); + doMetasave(nn1); + banner("Shutting down cluster. NN2 metadata:"); + doMetasave(nn2); + cluster.shutdown(); + } + } + + + @Test + public void testDnFencing() throws Exception { + // Create a file with replication level 3. + DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L); + ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH); + + // Drop its replication count to 1, so it becomes over-replicated. + // Then compute the invalidation of the extra blocks and trigger + // heartbeats so the invalidations are flushed to the DNs. + nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); + BlockManagerTestUtil.computeInvalidationWork( + nn1.getNamesystem().getBlockManager()); + cluster.triggerHeartbeats(); + + // Transition nn2 to active even though nn1 still thinks it's active. + banner("Failing to NN2 but let NN1 continue to think it's active"); + NameNodeAdapter.abortEditLogs(nn1); + NameNodeAdapter.enterSafeMode(nn1, false); + cluster.transitionToActive(1); + + // Check that the standby picked up the replication change. + assertEquals(1, + nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); + + // Dump some info for debugging purposes. + banner("NN2 Metadata immediately after failover"); + doMetasave(nn2); + + // Even though NN2 considers the blocks over-replicated, it should + // post-pone the block invalidation because the DNs are still "stale". + assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); + + banner("Triggering heartbeats and block reports so that fencing is completed"); + cluster.triggerHeartbeats(); + cluster.triggerBlockReports(); + + banner("Metadata after nodes have all block-reported"); + doMetasave(nn2); + + // The blocks should no longer be postponed. 
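    // (A datanode is considered stale until it has block-reported to the
    //  new active; once the DNs have reported above, NN2 may safely
    //  recompute and send the invalidations itself, which is what the rest
    //  of this test verifies.)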
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); + + // Wait for NN2 to enact its deletions (replication monitor has to run, etc) + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); + assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); + + banner("Making sure the file is still readable"); + FileSystem fs2 = cluster.getFileSystem(1); + DFSTestUtil.readFile(fs2, TEST_FILE_PATH); + + banner("Waiting for the actual block files to get deleted from DNs."); + waitForTrueReplication(cluster, block, 1); + } + + /** + * Test case which restarts the standby node in such a way that, + * when it exits safemode, it will want to invalidate a bunch + * of over-replicated block replicas. Ensures that if we failover + * at this point it won't lose data. + */ + @Test + public void testNNClearsCommandsOnFailoverAfterStartup() + throws Exception { + // Make lots of blocks to increase chances of triggering a bug. + DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L); + + banner("Shutting down NN2"); + cluster.shutdownNameNode(1); + + banner("Setting replication to 1, rolling edit log."); + nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); + nn1.getRpcServer().rollEditLog(); + + // Start NN2 again. When it starts up, it will see all of the + // blocks as over-replicated, since it has the metadata for + // replication=1, but the DNs haven't yet processed the deletions. + banner("Starting NN2 again."); + cluster.restartNameNode(1); + nn2 = cluster.getNameNode(1); + + banner("triggering BRs"); + cluster.triggerBlockReports(); + + // We expect that both NN1 and NN2 will have some number of + // deletions queued up for the DNs. + banner("computing invalidation on nn1"); + BlockManagerTestUtil.computeInvalidationWork( + nn1.getNamesystem().getBlockManager()); + + banner("computing invalidation on nn2"); + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + + // Dump some info for debugging purposes. + banner("Metadata immediately before failover"); + doMetasave(nn2); + + + // Transition nn2 to active even though nn1 still thinks it's active + banner("Failing to NN2 but let NN1 continue to think it's active"); + NameNodeAdapter.abortEditLogs(nn1); + NameNodeAdapter.enterSafeMode(nn1, false); + + cluster.transitionToActive(1); + + // Check that the standby picked up the replication change. + assertEquals(1, + nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); + + // Dump some info for debugging purposes. + banner("Metadata immediately after failover"); + doMetasave(nn2); + + banner("Triggering heartbeats and block reports so that fencing is completed"); + cluster.triggerHeartbeats(); + cluster.triggerBlockReports(); + + banner("Metadata after nodes have all block-reported"); + doMetasave(nn2); + + // The block should no longer be postponed. 
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); + + // Wait for NN2 to enact its deletions (replication monitor has to run, etc) + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + + HATestUtil.waitForNNToIssueDeletions(nn2); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); + assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); + + banner("Making sure the file is still readable"); + FileSystem fs2 = cluster.getFileSystem(1); + DFSTestUtil.readFile(fs2, TEST_FILE_PATH); + } + + /** + * Test case that reduces replication of a file with a lot of blocks + * and then fails over right after those blocks enter the DN invalidation + * queues on the active. Ensures that fencing is correct and no replicas + * are lost. + */ + @Test + public void testNNClearsCommandsOnFailoverWithReplChanges() + throws Exception { + // Make lots of blocks to increase chances of triggering a bug. + DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)1, 1L); + + banner("rolling NN1's edit log, forcing catch-up"); + HATestUtil.waitForStandbyToCatchUp(nn1, nn2); + + // Get some new replicas reported so that NN2 now considers + // them over-replicated and schedules some more deletions + nn1.getRpcServer().setReplication(TEST_FILE, (short) 2); + while (BlockManagerTestUtil.getComputedDatanodeWork( + nn1.getNamesystem().getBlockManager()) > 0) { + LOG.info("Getting more replication work computed"); + } + BlockManager bm1 = nn1.getNamesystem().getBlockManager(); + while (bm1.getPendingReplicationBlocksCount() > 0) { + BlockManagerTestUtil.updateState(bm1); + cluster.triggerHeartbeats(); + Thread.sleep(1000); + } + + banner("triggering BRs"); + cluster.triggerBlockReports(); + + nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); + + + banner("computing invalidation on nn1"); + + BlockManagerTestUtil.computeInvalidationWork( + nn1.getNamesystem().getBlockManager()); + doMetasave(nn1); + + banner("computing invalidation on nn2"); + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + doMetasave(nn2); + + // Dump some info for debugging purposes. + banner("Metadata immediately before failover"); + doMetasave(nn2); + + + // Transition nn2 to active even though nn1 still thinks it's active + banner("Failing to NN2 but let NN1 continue to think it's active"); + NameNodeAdapter.abortEditLogs(nn1); + NameNodeAdapter.enterSafeMode(nn1, false); + + + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + cluster.transitionToActive(1); + + // Check that the standby picked up the replication change. + assertEquals(1, + nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); + + // Dump some info for debugging purposes. + banner("Metadata immediately after failover"); + doMetasave(nn2); + + banner("Triggering heartbeats and block reports so that fencing is completed"); + cluster.triggerHeartbeats(); + cluster.triggerBlockReports(); + + banner("Metadata after nodes have all block-reported"); + doMetasave(nn2); + + // The block should no longer be postponed. 
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); + + // Wait for NN2 to enact its deletions (replication monitor has to run, etc) + BlockManagerTestUtil.computeInvalidationWork( + nn2.getNamesystem().getBlockManager()); + + HATestUtil.waitForNNToIssueDeletions(nn2); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); + assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); + + banner("Making sure the file is still readable"); + FileSystem fs2 = cluster.getFileSystem(1); + DFSTestUtil.readFile(fs2, TEST_FILE_PATH); + } + + /** + * Regression test for HDFS-2742. The issue in this bug was: + * - DN does a block report while file is open. This BR contains + * the block in RBW state. + * - Standby queues the RBW state in PendingDatanodeMessages + * - Standby processes edit logs during failover. Before fixing + * this bug, it was mistakenly applying the RBW reported state + * after the block had been completed, causing the block to get + * marked corrupt. Instead, we should now be applying the RBW + * message on OP_ADD, and then the FINALIZED message on OP_CLOSE. + */ + @Test + public void testBlockReportsWhileFileBeingWritten() throws Exception { + FSDataOutputStream out = fs.create(TEST_FILE_PATH); + try { + AppendTestUtil.write(out, 0, 10); + out.hflush(); + + // Block report will include the RBW replica, but will be + // queued on the StandbyNode. + cluster.triggerBlockReports(); + + } finally { + IOUtils.closeStream(out); + } + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + // Verify that no replicas are marked corrupt, and that the + // file is readable from the failed-over standby. + BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager()); + assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks()); + assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks()); + + DFSTestUtil.readFile(fs, TEST_FILE_PATH); + } + + /** + * Test that, when a block is re-opened for append, the related + * datanode messages are correctly queued by the SBN because + * they have future states and genstamps. + */ + @Test + public void testQueueingWithAppend() throws Exception { + int numQueued = 0; + int numDN = cluster.getDataNodes().size(); + + FSDataOutputStream out = fs.create(TEST_FILE_PATH); + try { + AppendTestUtil.write(out, 0, 10); + out.hflush(); + + // Opening the file will report RBW replicas, but will be + // queued on the StandbyNode. + numQueued += numDN; // RBW messages + } finally { + IOUtils.closeStream(out); + numQueued += numDN; // blockReceived messages + } + + cluster.triggerBlockReports(); + numQueued += numDN; + + try { + out = fs.append(TEST_FILE_PATH); + AppendTestUtil.write(out, 10, 10); + // RBW replicas once it's opened for append + numQueued += numDN; + + } finally { + IOUtils.closeStream(out); + numQueued += numDN; // blockReceived + } + + cluster.triggerBlockReports(); + numQueued += numDN; + + assertEquals(numQueued, cluster.getNameNode(1).getNamesystem(). + getPendingDataNodeMessageCount()); + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + // Verify that no replicas are marked corrupt, and that the + // file is readable from the failed-over standby. 
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager()); + assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks()); + assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks()); + + AppendTestUtil.check(fs, TEST_FILE_PATH, 20); + } + + /** + * Another regression test for HDFS-2742. This tests the following sequence: + * - DN does a block report while file is open. This BR contains + * the block in RBW state. + * - The block report is delayed in reaching the standby. + * - The file is closed. + * - The standby processes the OP_ADD and OP_CLOSE operations before + * the RBW block report arrives. + * - The standby should not mark the block as corrupt. + */ + @Test + public void testRBWReportArrivesAfterEdits() throws Exception { + final CountDownLatch brFinished = new CountDownLatch(1); + DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG) { + @Override + protected Object passThrough(InvocationOnMock invocation) + throws Throwable { + try { + return super.passThrough(invocation); + } finally { + // inform the test that our block report went through. + brFinished.countDown(); + } + } + }; + + FSDataOutputStream out = fs.create(TEST_FILE_PATH); + try { + AppendTestUtil.write(out, 0, 10); + out.hflush(); + + DataNode dn = cluster.getDataNodes().get(0); + DatanodeProtocolClientSideTranslatorPB spy = + DataNodeAdapter.spyOnBposToNN(dn, nn2); + + Mockito.doAnswer(delayer) + .when(spy).blockReport( + Mockito.anyObject(), + Mockito.anyString(), + Mockito.anyObject()); + dn.scheduleAllBlockReport(0); + delayer.waitForCall(); + + } finally { + IOUtils.closeStream(out); + } + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + delayer.proceed(); + brFinished.await(); + + // Verify that no replicas are marked corrupt, and that the + // file is readable from the failed-over standby. + BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager()); + assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks()); + assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks()); + + DFSTestUtil.readFile(fs, TEST_FILE_PATH); + } + + /** + * Print a big banner in the test log to make debug easier. 
+ */ + private void banner(String string) { + LOG.info("\n\n\n\n================================================\n" + + string + "\n" + + "==================================================\n\n"); + } + + private void doMetasave(NameNode nn2) { + nn2.getNamesystem().writeLock(); + try { + PrintWriter pw = new PrintWriter(System.err); + nn2.getNamesystem().getBlockManager().metaSave(pw); + pw.flush(); + } finally { + nn2.getNamesystem().writeUnlock(); + } + } + + private void waitForTrueReplication(final MiniDFSCluster cluster, + final ExtendedBlock block, final int waitFor) throws Exception { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + try { + return getTrueReplication(cluster, block) == waitFor; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }, 500, 10000); + } + + private int getTrueReplication(MiniDFSCluster cluster, ExtendedBlock block) + throws IOException { + int count = 0; + for (DataNode dn : cluster.getDataNodes()) { + if (DataNodeTestUtils.getFSDataset(dn).getStoredBlock( + block.getBlockPoolId(), block.getBlockId()) != null) { + count++; + } + } + return count; + } + + /** + * A BlockPlacementPolicy which, rather than using space available, makes + * random decisions about which excess replica to delete. This is because, + * in the test cases, the two NNs will usually (but not quite always) + * make the same decision of which replica to delete. The fencing issues + * are exacerbated when the two NNs make different decisions, which can + * happen in "real life" when they have slightly out-of-sync heartbeat + * information regarding disk usage. + */ + public static class RandomDeleterPolicy extends BlockPlacementPolicyDefault { + + public RandomDeleterPolicy() { + super(); + } + + @Override + public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode, + Block block, short replicationFactor, + Collection first, + Collection second) { + + Collection chooseFrom = + !first.isEmpty() ? first : second; + + List l = Lists.newArrayList(chooseFrom); + return l.get(DFSUtil.getRandom().nextInt(l.size())); + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java new file mode 100644 index 0000000000..95d5eb941e --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; +import java.util.concurrent.TimeoutException; + +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.ipc.Server; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread; +import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; +import org.apache.log4j.Level; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.base.Supplier; + + +/** + * Stress-test for potential bugs when replication is changing + * on blocks during a failover. + */ +public class TestDNFencingWithReplication { + static { + ((Log4JLogger)FSNamesystem.auditLog).getLogger().setLevel(Level.WARN); + ((Log4JLogger)Server.LOG).getLogger().setLevel(Level.FATAL); + ((Log4JLogger)LogFactory.getLog( + "org.apache.hadoop.io.retry.RetryInvocationHandler")) + .getLogger().setLevel(Level.FATAL); + } + + private static final int NUM_THREADS = 20; + // How long should the test try to run for. In practice + // it runs for ~20-30s longer than this constant due to startup/ + // shutdown time. + private static final long RUNTIME = 35000; + private static final int BLOCK_SIZE = 1024; + + private static class ReplicationToggler extends RepeatingTestThread { + private final FileSystem fs; + private final Path path; + + public ReplicationToggler(TestContext ctx, FileSystem fs, Path p) { + super(ctx); + this.fs = fs; + this.path = p; + } + + @Override + public void doAnAction() throws Exception { + fs.setReplication(path, (short)1); + waitForReplicas(1); + fs.setReplication(path, (short)2); + waitForReplicas(2); + } + + private void waitForReplicas(final int replicas) throws Exception { + try { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + try { + BlockLocation[] blocks = fs.getFileBlockLocations(path, 0, 10); + Assert.assertEquals(1, blocks.length); + return blocks[0].getHosts().length == replicas; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }, 100, 60000); + } catch (TimeoutException te) { + throw new IOException("Timed out waiting for " + replicas + " replicas " + + "on path " + path); + } + } + + public String toString() { + return "Toggler for " + path; + } + } + + @Test + public void testFencingStress() throws Exception { + HAStressTestHarness harness = new HAStressTestHarness(); + harness.conf.setInt( + DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000); + + final MiniDFSCluster cluster = harness.startCluster(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + + FileSystem fs = harness.getFailoverFs(); + TestContext togglers = new TestContext(); + for (int i = 0; i < NUM_THREADS; i++) { + Path p = new Path("/test-" + i); + DFSTestUtil.createFile(fs, p, BLOCK_SIZE*10, (short)3, (long)i); + togglers.addThread(new ReplicationToggler(togglers, fs, p)); + } + + // Start a separate thread which will make sure that replication + // happens quickly by triggering deletion reports and replication + // work calculation frequently. 
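      // (The trigger thread below fires every 500ms, and a failover thread
      //  flips the active role between the two NNs every 5 seconds while
      //  the togglers keep changing replication factors.)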
+ harness.addReplicationTriggerThread(500); + harness.addFailoverThread(5000); + harness.startThreads(); + togglers.startThreads(); + + togglers.waitFor(RUNTIME); + togglers.stop(); + harness.stopThreads(); + + // CHeck that the files can be read without throwing + for (int i = 0; i < NUM_THREADS; i++) { + Path p = new Path("/test-" + i); + DFSTestUtil.readFile(fs, p); + } + } finally { + System.err.println("===========================\n\n\n\n"); + harness.shutdown(); + } + + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java new file mode 100644 index 0000000000..561e4d6103 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.URI; +import java.security.PrivilegedExceptionAction; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSelector; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import com.google.common.base.Joiner; + +/** + * Test case for client support of delegation tokens in an HA cluster. + * See HDFS-2904 for more info. 
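 *
 * A minimal sketch of the client-side pattern exercised by these tests
 * (names taken from the test code below, not a standalone example):
 * <pre>
 *   // obtain a token against the logical HA URI, then clone it for each
 *   // physical NN address so either namenode's RPC address finds a match
 *   Token token = dfs.getDelegationToken("JobTracker");
 *   token.setService(HAUtil.buildTokenServiceForLogicalUri(haUri));
 *   ugi.addToken(token);
 *   HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn0.getNameNodeAddress());
 *   HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn1.getNameNodeAddress());
 * </pre>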
+ **/ +public class TestDelegationTokensWithHA { + private static Configuration conf = new Configuration(); + private static final Log LOG = + LogFactory.getLog(TestDelegationTokensWithHA.class); + private static MiniDFSCluster cluster; + private static NameNode nn0; + private static NameNode nn1; + private static FileSystem fs; + private static DelegationTokenSecretManager dtSecretManager; + private static DistributedFileSystem dfs; + + @BeforeClass + public static void setupCluster() throws Exception { + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + conf.set("hadoop.security.auth_to_local", + "RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT"); + + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + cluster.waitActive(); + + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + fs = HATestUtil.configureFailoverFs(cluster, conf); + dfs = (DistributedFileSystem)fs; + + cluster.transitionToActive(0); + dtSecretManager = NameNodeAdapter.getDtSecretManager( + nn0.getNamesystem()); + } + + @AfterClass + public static void shutdownCluster() throws IOException { + if (cluster != null) { + cluster.shutdown(); + } + } + + + @Test + public void testDelegationTokenDFSApi() throws Exception { + Token token = dfs.getDelegationToken("JobTracker"); + DelegationTokenIdentifier identifier = new DelegationTokenIdentifier(); + byte[] tokenId = token.getIdentifier(); + identifier.readFields(new DataInputStream( + new ByteArrayInputStream(tokenId))); + + // Ensure that it's present in the NN's secret manager and can + // be renewed directly from there. + LOG.info("A valid token should have non-null password, " + + "and should be renewed successfully"); + assertTrue(null != dtSecretManager.retrievePassword(identifier)); + dtSecretManager.renewToken(token, "JobTracker"); + + // Use the client conf with the failover info present to check + // renewal. + Configuration clientConf = dfs.getConf(); + doRenewOrCancel(token, clientConf, TokenTestAction.RENEW); + + // Using a configuration that doesn't have the logical nameservice + // configured should result in a reasonable error message. + Configuration emptyConf = new Configuration(); + try { + doRenewOrCancel(token, emptyConf, TokenTestAction.RENEW); + fail("Did not throw trying to renew with an empty conf!"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "Unable to map logical nameservice URI", ioe); + } + + + // Ensure that the token can be renewed again after a failover. 
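    // (nn0 is transitioned to standby and nn1 to active, then the same
    //  failover-aware client conf is used to renew, and finally cancel,
    //  the token.)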
+ cluster.transitionToStandby(0); + cluster.transitionToActive(1); + doRenewOrCancel(token, clientConf, TokenTestAction.RENEW); + + doRenewOrCancel(token, clientConf, TokenTestAction.CANCEL); + } + + @SuppressWarnings("deprecation") + @Test + public void testDelegationTokenWithDoAs() throws Exception { + final Token token = + dfs.getDelegationToken("JobTracker"); + final UserGroupInformation longUgi = UserGroupInformation + .createRemoteUser("JobTracker/foo.com@FOO.COM"); + final UserGroupInformation shortUgi = UserGroupInformation + .createRemoteUser("JobTracker"); + longUgi.doAs(new PrivilegedExceptionAction() { + public Void run() throws Exception { + DistributedFileSystem dfs = (DistributedFileSystem) + HATestUtil.configureFailoverFs(cluster, conf); + // try renew with long name + dfs.renewDelegationToken(token); + return null; + } + }); + shortUgi.doAs(new PrivilegedExceptionAction() { + public Void run() throws Exception { + DistributedFileSystem dfs = (DistributedFileSystem) + HATestUtil.configureFailoverFs(cluster, conf); + dfs.renewDelegationToken(token); + return null; + } + }); + longUgi.doAs(new PrivilegedExceptionAction() { + public Void run() throws Exception { + DistributedFileSystem dfs = (DistributedFileSystem) + HATestUtil.configureFailoverFs(cluster, conf); + // try cancel with long name + dfs.cancelDelegationToken(token); + return null; + } + }); + } + + @Test + public void testHAUtilClonesDelegationTokens() throws Exception { + final Token token = + dfs.getDelegationToken("test"); + + UserGroupInformation ugi = UserGroupInformation.createRemoteUser("test"); + + URI haUri = new URI("hdfs://my-ha-uri/"); + token.setService(HAUtil.buildTokenServiceForLogicalUri(haUri)); + ugi.addToken(token); + HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn0.getNameNodeAddress()); + HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn1.getNameNodeAddress()); + + Collection> tokens = ugi.getTokens(); + assertEquals(3, tokens.size()); + + LOG.info("Tokens:\n" + Joiner.on("\n").join(tokens)); + + // check that the token selected for one of the physical IPC addresses + // matches the one we received + InetSocketAddress addr = nn0.getNameNodeAddress(); + Text ipcDtService = new Text( + addr.getAddress().getHostAddress() + ":" + addr.getPort()); + Token token2 = + DelegationTokenSelector.selectHdfsDelegationToken(ipcDtService, ugi); + assertNotNull(token2); + assertArrayEquals(token.getIdentifier(), token2.getIdentifier()); + assertArrayEquals(token.getPassword(), token2.getPassword()); + } + + enum TokenTestAction { + RENEW, CANCEL; + } + + private static void doRenewOrCancel( + final Token token, final Configuration conf, + final TokenTestAction action) + throws IOException, InterruptedException { + UserGroupInformation.createRemoteUser("JobTracker").doAs( + new PrivilegedExceptionAction() { + @Override + public Void run() throws Exception { + switch (action) { + case RENEW: + token.renew(conf); + break; + case CANCEL: + token.cancel(conf); + break; + default: + fail("bad action:" + action); + } + return null; + } + }); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java new file mode 100644 index 0000000000..bc5c487a76 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java @@ -0,0 +1,162 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.List; +import java.util.concurrent.TimeoutException; + +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.permission.PermissionStatus; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.NNStorage; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.log4j.Level; +import org.junit.Test; + +import com.google.common.base.Supplier; + +public class TestEditLogTailer { + + private static final String DIR_PREFIX = "/dir"; + private static final int DIRS_TO_MAKE = 20; + static final long SLEEP_TIME = 1000; + static final long NN_LAG_TIMEOUT = 10 * 1000; + + static { + ((Log4JLogger)FSImage.LOG).getLogger().setLevel(Level.ALL); + ((Log4JLogger)EditLogTailer.LOG).getLogger().setLevel(Level.ALL); + } + + @Test + public void testTailer() throws IOException, InterruptedException, + ServiceFailedException { + Configuration conf = new HdfsConfiguration(); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + + HAUtil.setAllowStandbyReads(conf, true); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + cluster.waitActive(); + + cluster.transitionToActive(0); + + NameNode nn1 = cluster.getNameNode(0); + NameNode nn2 = cluster.getNameNode(1); + try { + for (int i = 0; i < DIRS_TO_MAKE / 2; i++) { + NameNodeAdapter.mkdirs(nn1, getDirPath(i), + new PermissionStatus("test","test", new FsPermission((short)00755)), + true); + } + + HATestUtil.waitForStandbyToCatchUp(nn1, nn2); + + for (int i = 0; i < DIRS_TO_MAKE / 2; i++) { + assertTrue(NameNodeAdapter.getFileInfo(nn2, + getDirPath(i), false).isDir()); + } + + for (int i = DIRS_TO_MAKE / 2; i < DIRS_TO_MAKE; i++) { + NameNodeAdapter.mkdirs(nn1, getDirPath(i), + new PermissionStatus("test","test", new FsPermission((short)00755)), + true); + } + + HATestUtil.waitForStandbyToCatchUp(nn1, nn2); + + for (int i = DIRS_TO_MAKE / 2; i < DIRS_TO_MAKE; i++) { + assertTrue(NameNodeAdapter.getFileInfo(nn2, + 
getDirPath(i), false).isDir()); + } + } finally { + cluster.shutdown(); + } + } + + @Test + public void testNN0TriggersLogRolls() throws Exception { + testStandbyTriggersLogRolls(0); + } + + @Test + public void testNN1TriggersLogRolls() throws Exception { + testStandbyTriggersLogRolls(1); + } + + private static void testStandbyTriggersLogRolls(int activeIndex) + throws Exception { + Configuration conf = new Configuration(); + // Roll every 1s + conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + + // Have to specify IPC ports so the NNs can talk to each other. + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("ns1") + .addNN(new MiniDFSNNTopology.NNConf("nn1").setIpcPort(10001)) + .addNN(new MiniDFSNNTopology.NNConf("nn2").setIpcPort(10002))); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(topology) + .numDataNodes(0) + .build(); + try { + cluster.transitionToActive(activeIndex); + waitForLogRollInSharedDir(cluster, 3); + } finally { + cluster.shutdown(); + } + } + + private static String getDirPath(int suffix) { + return DIR_PREFIX + suffix; + } + + private static void waitForLogRollInSharedDir(MiniDFSCluster cluster, + long startTxId) throws Exception { + URI sharedUri = cluster.getSharedEditsDir(0, 1); + File sharedDir = new File(sharedUri.getPath(), "current"); + final File expectedLog = new File(sharedDir, + NNStorage.getInProgressEditsFileName(startTxId)); + + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + return expectedLog.exists(); + } + }, 100, 10000); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java new file mode 100644 index 0000000000..a245301dd9 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java @@ -0,0 +1,182 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil; +import org.apache.hadoop.hdfs.server.namenode.NNStorage; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; + +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; + +/** + * Test cases for the handling of edit logs during failover + * and startup of the standby node. + */ +public class TestEditLogsDuringFailover { + private static final Log LOG = + LogFactory.getLog(TestEditLogsDuringFailover.class); + private static final int NUM_DIRS_IN_LOG = 5; + + @Test + public void testStartup() throws Exception { + Configuration conf = new Configuration(); + HAUtil.setAllowStandbyReads(conf, true); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + try { + // During HA startup, both nodes should be in + // standby and we shouldn't have any edits files + // in any edits directory! + List allDirs = Lists.newArrayList(); + allDirs.addAll(cluster.getNameDirs(0)); + allDirs.addAll(cluster.getNameDirs(1)); + allDirs.add(cluster.getSharedEditsDir(0, 1)); + assertNoEditFiles(allDirs); + + // Set the first NN to active, make sure it creates edits + // in its own dirs and the shared dir. The standby + // should still have no edits! + cluster.getNameNode(0).getRpcServer().transitionToActive(); + + assertEditFiles(cluster.getNameDirs(0), + NNStorage.getInProgressEditsFileName(1)); + assertEditFiles( + Collections.singletonList(cluster.getSharedEditsDir(0, 1)), + NNStorage.getInProgressEditsFileName(1)); + assertNoEditFiles(cluster.getNameDirs(1)); + + cluster.getNameNode(0).getRpcServer().mkdirs("/test", + FsPermission.createImmutable((short)0755), true); + + // Restarting the standby should not finalize any edits files + // in the shared directory when it starts up! + cluster.restartNameNode(1); + + assertEditFiles(cluster.getNameDirs(0), + NNStorage.getInProgressEditsFileName(1)); + assertEditFiles( + Collections.singletonList(cluster.getSharedEditsDir(0, 1)), + NNStorage.getInProgressEditsFileName(1)); + assertNoEditFiles(cluster.getNameDirs(1)); + + // Additionally it should not have applied any in-progress logs + // at start-up -- otherwise, it would have read half-way into + // the current log segment, and on the next roll, it would have to + // either replay starting in the middle of the segment (not allowed) + // or double-replay the edits (incorrect). + assertNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test", true)); + + cluster.getNameNode(0).getRpcServer().mkdirs("/test2", + FsPermission.createImmutable((short)0755), true); + + // If we restart NN0, it'll come back as standby, and we can + // transition NN1 to active and make sure it reads edits correctly at this point. 
+ cluster.restartNameNode(0); + cluster.getNameNode(1).getRpcServer().transitionToActive(); + + // NN1 should have both the edits that came before its restart, and the edits that + // came after its restart. + assertNotNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test", true)); + assertNotNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test2", true)); + } finally { + cluster.shutdown(); + } + } + + @Test + public void testFailoverFinalizesAndReadsInProgress() throws Exception { + Configuration conf = new Configuration(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + try { + // Create a fake in-progress edit-log in the shared directory + URI sharedUri = cluster.getSharedEditsDir(0, 1); + File sharedDir = new File(sharedUri.getPath(), "current"); + FSImageTestUtil.createAbortedLogWithMkdirs(sharedDir, NUM_DIRS_IN_LOG, 1); + assertEditFiles(Collections.singletonList(sharedUri), + NNStorage.getInProgressEditsFileName(1)); + + // Transition one of the NNs to active + cluster.getNameNode(0).getRpcServer().transitionToActive(); + + // In the transition to active, it should have read the log -- and + // hence see one of the dirs we made in the fake log. + String testPath = "/dir" + NUM_DIRS_IN_LOG; + assertNotNull(cluster.getNameNode(0).getRpcServer().getFileInfo(testPath)); + + // It also should have finalized that log in the shared directory and started + // writing to a new one at the next txid. + assertEditFiles(Collections.singletonList(sharedUri), + NNStorage.getFinalizedEditsFileName(1, NUM_DIRS_IN_LOG + 1), + NNStorage.getInProgressEditsFileName(NUM_DIRS_IN_LOG + 2)); + } finally { + cluster.shutdown(); + } + + } + + /** + * Check that no edits files are present in the given storage dirs. + */ + private void assertNoEditFiles(Iterable dirs) throws IOException { + assertEditFiles(dirs, new String[]{}); + } + + /** + * Check that the given list of edits files are present in the given storage + * dirs. + */ + private void assertEditFiles(Iterable dirs, String ... files) + throws IOException { + for (URI u : dirs) { + File editDirRoot = new File(u.getPath()); + File editDir = new File(editDirRoot, "current"); + GenericTestUtils.assertExists(editDir); + if (files.length == 0) { + LOG.info("Checking no edit files exist in " + editDir); + } else { + LOG.info("Checking for following edit files in " + editDir + + ": " + Joiner.on(",").join(files)); + } + + GenericTestUtils.assertGlobEquals(editDir, "edits_.*", files); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java new file mode 100644 index 0000000000..cc9552aec2 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NNStorage; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; +import org.mockito.Mockito; + +import com.google.common.base.Joiner; + +public class TestFailureOfSharedDir { + + private static final Log LOG = LogFactory.getLog(TestFailureOfSharedDir.class); + + /** + * Test that the shared edits dir is automatically added to the list of edits + * dirs that are marked required. + */ + @Test + public void testSharedDirIsAutomaticallyMarkedRequired() + throws URISyntaxException { + URI foo = new URI("file:/foo"); + URI bar = new URI("file:/bar"); + Configuration conf = new Configuration(); + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, Joiner.on(",").join(foo, bar)); + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, foo.toString()); + assertFalse(FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains( + bar)); + conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY, bar.toString()); + Collection requiredEditsDirs = FSNamesystem + .getRequiredNamespaceEditsDirs(conf); + assertTrue(Joiner.on(",").join(requiredEditsDirs) + " does not contain " + bar, + requiredEditsDirs.contains(bar)); + } + + /** + * Multiple shared edits directories is an invalid configuration. + */ + @Test + public void testMultipleSharedDirsFails() throws Exception { + Configuration conf = new Configuration(); + URI sharedA = new URI("file:///shared-A"); + URI sharedB = new URI("file:///shared-B"); + URI localA = new URI("file:///local-A"); + + conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY, + Joiner.on(",").join(sharedA,sharedB)); + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, + localA.toString()); + + try { + FSNamesystem.getNamespaceEditsDirs(conf); + fail("Allowed multiple shared edits directories"); + } catch (IOException ioe) { + assertEquals("Multiple shared edits directories are not yet supported", + ioe.getMessage()); + } + } + + /** + * Make sure that the shared edits dirs are listed before non-shared dirs + * when the configuration is parsed. This ensures that the shared journals + * are synced before the local ones. 
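As an illustrative aside before the test itself: the ordering contract described above, shared directories first, then local directories in configured order, can be seen with a small self-contained sketch. It uses only the JDK; the authoritative behavior lives in FSNamesystem.getNamespaceEditsDirs(), exercised by the test below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class EditsDirOrdering {
  public static void main(String[] args) {
    String shared = "file:///shared-A";
    // Local dirs listed in "reverse" order on purpose, as in the test below.
    String localList = String.join(",",
        "file:///local-C", "file:///local-B", "file:///local-A");

    // Shared dirs are placed ahead of local dirs before the list is consumed,
    // so the shared journal is always synced first.
    List<String> dirs = new ArrayList<>();
    dirs.add(shared);
    dirs.addAll(Arrays.asList(localList.split(",")));

    // Order is exactly: shared, then the locals in their configured order.
    System.out.println(dirs);
    // [file:///shared-A, file:///local-C, file:///local-B, file:///local-A]
  }
}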
+ */ + @Test + public void testSharedDirsComeFirstInEditsList() throws Exception { + Configuration conf = new Configuration(); + URI sharedA = new URI("file:///shared-A"); + URI localA = new URI("file:///local-A"); + URI localB = new URI("file:///local-B"); + URI localC = new URI("file:///local-C"); + + conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY, + sharedA.toString()); + // List them in reverse order, to make sure they show up in + // the order listed, regardless of lexical sort order. + conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, + Joiner.on(",").join(localC, localB, localA)); + List dirs = FSNamesystem.getNamespaceEditsDirs(conf); + assertEquals( + "Shared dirs should come first, then local dirs, in the order " + + "they were listed in the configuration.", + Joiner.on(",").join(sharedA, localC, localB, localA), + Joiner.on(",").join(dirs)); + } + + /** + * Test that marking the shared edits dir as being "required" causes the NN to + * fail if that dir can't be accessed. + */ + @Test + public void testFailureOfSharedDir() throws Exception { + Configuration conf = new Configuration(); + + // The shared edits dir will automatically be marked required. + MiniDFSCluster cluster = null; + File sharedEditsDir = null; + try { + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + + cluster.waitActive(); + cluster.transitionToActive(0); + + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + + assertTrue(fs.mkdirs(new Path("/test1"))); + + // Blow away the shared edits dir. + Runtime mockRuntime = Mockito.mock(Runtime.class); + URI sharedEditsUri = cluster.getSharedEditsDir(0, 1); + sharedEditsDir = new File(sharedEditsUri); + assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", + true)); + + NameNode nn0 = cluster.getNameNode(0); + nn0.getNamesystem().getFSImage().getEditLog().getJournalSet() + .setRuntimeForTesting(mockRuntime); + try { + // Make sure that subsequent operations on the NN fail. + nn0.getRpcServer().rollEditLog(); + fail("Succeeded in rolling edit log despite shared dir being deleted"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "Unable to start log segment 4: too few journals successfully started", + ioe); + // By current policy the NN should exit upon this error. + // exit() should be called once, but since it is mocked, exit gets + // called once during FSEditsLog.endCurrentLogSegment() and then after + // that during FSEditsLog.startLogSegment(). So the check is atLeast(1) + Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit( + Mockito.anyInt()); + LOG.info("Got expected exception", ioe); + } + + // Check that none of the edits dirs rolled, since the shared edits + // dir didn't roll. Regression test for HDFS-2874. 
+ for (URI editsUri : cluster.getNameEditsDirs(0)) { + if (editsUri.equals(sharedEditsUri)) { + continue; + } + File editsDir = new File(editsUri.getPath()); + File curDir = new File(editsDir, "current"); + GenericTestUtils.assertGlobEquals(curDir, + "edits_.*", + NNStorage.getInProgressEditsFileName(1)); + } + } finally { + if (sharedEditsDir != null) { + // without this test cleanup will fail + FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true); + } + if (cluster != null) { + cluster.shutdown(); + } + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java new file mode 100644 index 0000000000..7bc2d8e164 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java @@ -0,0 +1,326 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Matchers.anyBoolean; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Matchers.anyLong; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.IOException; +import java.util.Collection; +import java.util.LinkedList; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.EditLogInputException; +import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream; +import org.apache.hadoop.hdfs.server.namenode.FSEditLog; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + +import com.google.common.collect.ImmutableList; + +public class TestFailureToReadEdits { + + private static final Log LOG = LogFactory.getLog(TestFailureToReadEdits.class); + + private static final String TEST_DIR1 = "/test1"; + private static final String TEST_DIR2 = "/test2"; + private static final String TEST_DIR3 = "/test3"; + + private Configuration conf; + private Runtime mockRuntime = mock(Runtime.class); + private MiniDFSCluster cluster; + private NameNode nn0; + private NameNode nn1; + private FileSystem fs; + + @Before + public void setUpCluster() throws Exception { + conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY, 10); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + HAUtil.setAllowStandbyReads(conf, true); + + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("ns1") + .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001)) + .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002))); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(topology) + .numDataNodes(0) + .build(); + + cluster.waitActive(); + + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + nn1.getNamesystem().getEditLogTailer().setRuntime(mockRuntime); + + cluster.transitionToActive(0); + fs = HATestUtil.configureFailoverFs(cluster, conf); + } + + @After + public void tearDownCluster() throws Exception { + if (fs != null) { + fs.close(); + } + + if (cluster != null) { + cluster.shutdown(); + } + } + + /** + * Test that the standby NN won't double-replay earlier edits if it encounters + * a failure to read a later edit. 
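As an illustrative aside: the tests in this class inject read failures with a Mockito spy whose Answer delegates to the real method until a chosen element is reached (see LimitedEditLogAnswer near the bottom of the file). A minimal, self-contained version of that fault-injection pattern, assuming only Mockito and the JDK, with a toy Reader in place of EditLogInputStream:

import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.spy;

import java.io.IOException;

import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

public class FaultyReaderSketch {

  // Toy reader standing in for EditLogInputStream#readOp().
  public static class Reader {
    private int next = 0;
    public int readOp() throws IOException {
      return next++;
    }
  }

  public static void main(String[] args) throws IOException {
    Reader spyReader = spy(new Reader());

    // Let the real method run, but fail once a chosen "op" is reached,
    // the same shape as LimitedEditLogAnswer further down in this file.
    doAnswer(new Answer<Integer>() {
      @Override
      public Integer answer(InvocationOnMock invocation) throws Throwable {
        int op = (Integer) invocation.callRealMethod();
        if (op == 2) {
          throw new IOException("failed to read op " + op);
        }
        return op;
      }
    }).when(spyReader).readOp();

    System.out.println(spyReader.readOp()); // 0
    System.out.println(spyReader.readOp()); // 1
    try {
      spyReader.readOp();                   // injected failure on op 2
    } catch (IOException expected) {
      System.out.println("injected failure: " + expected.getMessage());
    }
  }
}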
+ */ + @Test + public void testFailuretoReadEdits() throws Exception { + assertTrue(fs.mkdirs(new Path(TEST_DIR1))); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // If these two ops are applied twice, the first op will throw an + // exception the second time its replayed. + fs.setOwner(new Path(TEST_DIR1), "foo", "bar"); + assertTrue(fs.delete(new Path(TEST_DIR1), true)); + + // This op should get applied just fine. + assertTrue(fs.mkdirs(new Path(TEST_DIR2))); + + // This is the op the mocking will cause to fail to be read. + assertTrue(fs.mkdirs(new Path(TEST_DIR3))); + + LimitedEditLogAnswer answer = causeFailureOnEditLogRead(); + + try { + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + fail("Standby fully caught up, but should not have been able to"); + } catch (HATestUtil.CouldNotCatchUpException e) { + verify(mockRuntime, times(0)).exit(anyInt()); + } + + // Null because it was deleted. + assertNull(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR1, false)); + // Should have been successfully created. + assertTrue(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR2, false).isDir()); + // Null because it hasn't been created yet. + assertNull(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR3, false)); + + // Now let the standby read ALL the edits. + answer.setThrowExceptionOnRead(false); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // Null because it was deleted. + assertNull(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR1, false)); + // Should have been successfully created. + assertTrue(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR2, false).isDir()); + // Should now have been successfully created. + assertTrue(NameNodeAdapter.getFileInfo(nn1, + TEST_DIR3, false).isDir()); + } + + /** + * Test the following case: + * 1. SBN is reading a finalized edits file when NFS disappears halfway + * through (or some intermittent error happens) + * 2. SBN performs a checkpoint and uploads it to the NN + * 3. NN receives a checkpoint that doesn't correspond to the end of any log + * segment + * 4. Both NN and SBN should be able to restart at this point. + * + * This is a regression test for HDFS-2766. + */ + @Test + public void testCheckpointStartingMidEditsFile() throws Exception { + assertTrue(fs.mkdirs(new Path(TEST_DIR1))); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // Once the standby catches up, it should notice that it needs to + // do a checkpoint and save one to its local directories. + HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 3)); + + // It should also upload it back to the active. + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3)); + + causeFailureOnEditLogRead(); + + assertTrue(fs.mkdirs(new Path(TEST_DIR2))); + assertTrue(fs.mkdirs(new Path(TEST_DIR3))); + + try { + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + fail("Standby fully caught up, but should not have been able to"); + } catch (HATestUtil.CouldNotCatchUpException e) { + verify(mockRuntime, times(0)).exit(anyInt()); + } + + // 5 because we should get OP_START_LOG_SEGMENT and one successful OP_MKDIR + HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 3, 5)); + + // It should also upload it back to the active. + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3, 5)); + + // Restart the active NN + cluster.restartNameNode(0); + + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3, 5)); + + FileSystem fs0 = null; + try { + // Make sure that when the active restarts, it loads all the edits. 
+ fs0 = FileSystem.get(NameNode.getUri(nn0.getNameNodeAddress()), + conf); + + assertTrue(fs0.exists(new Path(TEST_DIR1))); + assertTrue(fs0.exists(new Path(TEST_DIR2))); + assertTrue(fs0.exists(new Path(TEST_DIR3))); + } finally { + if (fs0 != null) + fs0.close(); + } + } + + /** + * Ensure that the standby fails to become active if it cannot read all + * available edits in the shared edits dir when it is transitioning to active + * state. + */ + @Test + public void testFailureToReadEditsOnTransitionToActive() throws Exception { + assertTrue(fs.mkdirs(new Path(TEST_DIR1))); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // It should also upload it back to the active. + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3)); + + causeFailureOnEditLogRead(); + + assertTrue(fs.mkdirs(new Path(TEST_DIR2))); + assertTrue(fs.mkdirs(new Path(TEST_DIR3))); + + try { + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + fail("Standby fully caught up, but should not have been able to"); + } catch (HATestUtil.CouldNotCatchUpException e) { + verify(mockRuntime, times(0)).exit(anyInt()); + } + + // Shutdown the active NN. + cluster.shutdownNameNode(0); + + try { + // Transition the standby to active. + cluster.transitionToActive(1); + fail("Standby transitioned to active, but should not have been able to"); + } catch (ServiceFailedException sfe) { + LOG.info("got expected exception: " + sfe.toString(), sfe); + assertTrue("Standby failed to catch up for some reason other than " + + "failure to read logs", sfe.toString().contains( + EditLogInputException.class.getName())); + } + } + + private LimitedEditLogAnswer causeFailureOnEditLogRead() throws IOException { + FSEditLog spyEditLog = spy(nn1.getNamesystem().getEditLogTailer() + .getEditLog()); + LimitedEditLogAnswer answer = new LimitedEditLogAnswer(); + doAnswer(answer).when(spyEditLog).selectInputStreams( + anyLong(), anyLong(), anyBoolean()); + nn1.getNamesystem().getEditLogTailer().setEditLog(spyEditLog); + + return answer; + } + + private static class LimitedEditLogAnswer + implements Answer> { + + private boolean throwExceptionOnRead = true; + + @SuppressWarnings("unchecked") + @Override + public Collection answer(InvocationOnMock invocation) + throws Throwable { + Collection streams = (Collection) + invocation.callRealMethod(); + + if (!throwExceptionOnRead) { + return streams; + } else { + Collection ret = new LinkedList(); + for (EditLogInputStream stream : streams) { + EditLogInputStream spyStream = spy(stream); + doAnswer(new Answer() { + + @Override + public FSEditLogOp answer(InvocationOnMock invocation) + throws Throwable { + FSEditLogOp op = (FSEditLogOp) invocation.callRealMethod(); + if (throwExceptionOnRead && + TEST_DIR3.equals(NameNodeAdapter.getMkdirOpPath(op))) { + throw new IOException("failed to read op creating " + TEST_DIR3); + } else { + return op; + } + } + + }).when(spyStream).readOp(); + ret.add(spyStream); + } + return ret; + } + } + + public void setThrowExceptionOnRead(boolean throwExceptionOnRead) { + this.throwExceptionOnRead = throwExceptionOnRead; + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java new file mode 100644 index 0000000000..e548817b6a --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java @@ -0,0 +1,57 @@ +/** + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import java.io.IOException; +import java.io.PrintStream; + +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.tools.GetGroups; +import org.apache.hadoop.tools.GetGroupsTestBase; +import org.apache.hadoop.util.Tool; +import org.junit.After; +import org.junit.Before; + +public class TestGetGroupsWithHA extends GetGroupsTestBase { + + private MiniDFSCluster cluster; + + @Before + public void setUpNameNode() throws IOException { + conf = new HdfsConfiguration(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0).build(); + HATestUtil.setFailoverConfigurations(cluster, conf); + } + + @After + public void tearDownNameNode() { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Override + protected Tool getTool(PrintStream o) { + return new GetGroups(conf, o); + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java new file mode 100644 index 0000000000..9cd6ab7089 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases that the HA configuration is reasonably validated and + * interpreted in various places. These should be proper unit tests + * which don't start daemons. + */ +public class TestHAConfiguration { + + private FSNamesystem fsn = Mockito.mock(FSNamesystem.class); + + @Test + public void testCheckpointerValidityChecks() throws Exception { + try { + Configuration conf = new Configuration(); + new StandbyCheckpointer(conf, fsn); + fail("Bad config did not throw an error"); + } catch (IllegalArgumentException iae) { + GenericTestUtils.assertExceptionContains( + "Invalid URI for NameNode address", iae); + } + } + + private Configuration getHAConf(String nsId, String host1, String host2) { + Configuration conf = new Configuration(); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, nsId); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX, nsId), + "nn1,nn2"); + conf.set(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, "nn1"); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, "nn1"), + host1 + ":12345"); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, "nn2"), + host2 + ":12345"); + return conf; + } + + @Test + public void testGetOtherNNHttpAddress() { + // Use non-local addresses to avoid host address matching + Configuration conf = getHAConf("ns1", "1.2.3.1", "1.2.3.2"); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID, "ns1"); + + // This is done by the NN before the StandbyCheckpointer is created + NameNode.initializeGenericKeys(conf, "ns1", "nn1"); + + // Since we didn't configure the HTTP address, and the default is + // 0.0.0.0, it should substitute the address from the RPC configuration + // above. + StandbyCheckpointer checkpointer = new StandbyCheckpointer(conf, fsn); + assertEquals("1.2.3.2:" + DFSConfigKeys.DFS_NAMENODE_HTTP_PORT_DEFAULT, + checkpointer.getActiveNNAddress()); + } + + /** + * Tests that the namenode edits dirs and shared edits dirs are gotten with + * duplicates removed + */ + @Test + public void testHAUniqueEditDirs() throws IOException { + Configuration conf = new Configuration(); + + conf.set(DFS_NAMENODE_EDITS_DIR_KEY, "file://edits/dir, " + + "file://edits/shared/dir"); // overlapping + conf.set(DFS_NAMENODE_SHARED_EDITS_DIR_KEY, "file://edits/shared/dir"); + + // getNamespaceEditsDirs removes duplicates across edits and shared.edits + Collection editsDirs = FSNamesystem.getNamespaceEditsDirs(conf); + assertEquals(2, editsDirs.size()); + } + + /** + * Test that the 2NN does not start if given a config with HA NNs. 
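As an illustrative aside on the getHAConf() helper above: HA settings are keyed per nameservice and per namenode by suffixing the base key, which is what DFSUtil.addKeySuffixes() does for the test. The sketch below only mimics that key layout with plain string concatenation; the literal key strings are shown for illustration and the real constants come from DFSConfigKeys.

public class HAConfKeys {
  // Illustrative only: mirrors the suffixing done by DFSUtil.addKeySuffixes().
  static String addKeySuffixes(String key, String... suffixes) {
    StringBuilder sb = new StringBuilder(key);
    for (String suffix : suffixes) {
      sb.append('.').append(suffix);
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    // e.g. dfs.ha.namenodes.ns1 -> "nn1,nn2"
    System.out.println(addKeySuffixes("dfs.ha.namenodes", "ns1"));
    // e.g. dfs.namenode.rpc-address.ns1.nn1 -> "1.2.3.1:12345"
    System.out.println(addKeySuffixes("dfs.namenode.rpc-address", "ns1", "nn1"));
  }
}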
+ */ + @Test + public void testSecondaryNameNodeDoesNotStart() throws IOException { + // Note we're not explicitly setting the nameservice Id in the + // config as it is not required to be set and we want to test + // that we can determine if HA is enabled when the nameservice Id + // is not explicitly defined. + Configuration conf = getHAConf("ns1", "1.2.3.1", "1.2.3.2"); + try { + new SecondaryNameNode(conf); + fail("Created a 2NN with an HA config"); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "Cannot use SecondaryNameNode in an HA cluster", ioe); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java new file mode 100644 index 0000000000..10218f218e --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; + +import junit.framework.Assert; + +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.tools.DFSck; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.junit.Test; + +public class TestHAFsck { + + static { + ((Log4JLogger)LogFactory.getLog(DFSUtil.class)).getLogger().setLevel(Level.ALL); + } + + /** + * Test that fsck still works with HA enabled. + */ + @Test + public void testHaFsck() throws Exception { + Configuration conf = new Configuration(); + + // need some HTTP ports + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("ha-nn-uri-0") + .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001)) + .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002))); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(topology) + .numDataNodes(0) + .build(); + FileSystem fs = null; + try { + cluster.waitActive(); + + cluster.transitionToActive(0); + + // Make sure conf has the relevant HA configs. 
+ HATestUtil.setFailoverConfigurations(cluster, conf, "ha-nn-uri-0", 0); + + fs = HATestUtil.configureFailoverFs(cluster, conf); + fs.mkdirs(new Path("/test1")); + fs.mkdirs(new Path("/test2")); + + runFsck(conf); + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + runFsck(conf); + } finally { + if (fs != null) { + fs.close(); + } + if (cluster != null) { + cluster.shutdown(); + } + } + } + + static void runFsck(Configuration conf) throws Exception { + ByteArrayOutputStream bStream = new ByteArrayOutputStream(); + PrintStream out = new PrintStream(bStream, true); + int errCode = ToolRunner.run(new DFSck(conf, out), + new String[]{"/", "-files"}); + String result = bStream.toString(); + System.out.println("output from fsck:\n" + result); + Assert.assertEquals(0, errCode); + assertTrue(result.contains("/test1")); + assertTrue(result.contains("/test2")); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java new file mode 100644 index 0000000000..cc85c83b3d --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.io.IOUtils; +import org.junit.Test; + +/** + * Make sure HA-related metrics are updated and reported appropriately. 
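As an illustrative aside: the assertions in the metrics test below revolve around getMillisSinceLastLoadedEdits(), how long it has been since a NameNode last applied edits (the test expects 0 from the node that is currently active). The following is a simplified, JDK-only analogue of such a metric, purely for illustration and not the FSNamesystem implementation.

public class LastLoadedEditsTracker {
  // Simplified analogue of a "millis since last loaded edits" metric.
  private volatile long lastLoadTimeMs = System.currentTimeMillis();

  public void onEditsLoaded() {
    lastLoadTimeMs = System.currentTimeMillis();
  }

  public long getMillisSinceLastLoadedEdits() {
    return System.currentTimeMillis() - lastLoadTimeMs;
  }

  public static void main(String[] args) throws InterruptedException {
    LastLoadedEditsTracker tracker = new LastLoadedEditsTracker();
    Thread.sleep(50);
    System.out.println(tracker.getMillisSinceLastLoadedEdits() >= 50); // true
    tracker.onEditsLoaded();
    System.out.println(tracker.getMillisSinceLastLoadedEdits() < 50);  // true, just reset
  }
}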
+ */ +public class TestHAMetrics { + + private static final Log LOG = LogFactory.getLog(TestHAMetrics.class); + + @Test + public void testHAMetrics() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, Integer.MAX_VALUE); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(1) + .build(); + FileSystem fs = null; + try { + cluster.waitActive(); + + FSNamesystem nn0 = cluster.getNamesystem(0); + FSNamesystem nn1 = cluster.getNamesystem(1); + + assertEquals(nn0.getHAState(), "standby"); + assertTrue(0 < nn0.getMillisSinceLastLoadedEdits()); + assertEquals(nn1.getHAState(), "standby"); + assertTrue(0 < nn1.getMillisSinceLastLoadedEdits()); + + cluster.transitionToActive(0); + + assertEquals("active", nn0.getHAState()); + assertEquals(0, nn0.getMillisSinceLastLoadedEdits()); + assertEquals("standby", nn1.getHAState()); + assertTrue(0 < nn1.getMillisSinceLastLoadedEdits()); + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + assertEquals("standby", nn0.getHAState()); + assertTrue(0 < nn0.getMillisSinceLastLoadedEdits()); + assertEquals("active", nn1.getHAState()); + assertEquals(0, nn1.getMillisSinceLastLoadedEdits()); + + Thread.sleep(2000); // make sure standby gets a little out-of-date + assertTrue(2000 <= nn0.getMillisSinceLastLoadedEdits()); + + assertEquals(0, nn0.getPendingDataNodeMessageCount()); + assertEquals(0, nn1.getPendingDataNodeMessageCount()); + + fs = HATestUtil.configureFailoverFs(cluster, conf); + DFSTestUtil.createFile(fs, new Path("/foo"), + 10, (short)1, 1L); + + assertTrue(0 < nn0.getPendingDataNodeMessageCount()); + assertEquals(0, nn1.getPendingDataNodeMessageCount()); + long millisSinceLastLoadedEdits = nn0.getMillisSinceLastLoadedEdits(); + + HATestUtil.waitForStandbyToCatchUp(cluster.getNameNode(1), + cluster.getNameNode(0)); + + assertEquals(0, nn0.getPendingDataNodeMessageCount()); + assertEquals(0, nn1.getPendingDataNodeMessageCount()); + long newMillisSinceLastLoadedEdits = nn0.getMillisSinceLastLoadedEdits(); + // Since we just waited for the standby to catch up, the time since we + // last loaded edits should be very low. + assertTrue("expected " + millisSinceLastLoadedEdits + " > " + + newMillisSinceLastLoadedEdits, + millisSinceLastLoadedEdits > newMillisSinceLastLoadedEdits); + } finally { + IOUtils.cleanup(LOG, fs); + cluster.shutdown(); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java new file mode 100644 index 0000000000..8790d0f331 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java @@ -0,0 +1,648 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; +import static org.junit.Assert.assertTrue; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.log4j.Level; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.base.Supplier; +import com.google.common.collect.Lists; + +/** + * Tests that exercise safemode in an HA cluster. 
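As an illustrative aside: many assertions in this class parse a safemode status string of the form "The reported blocks N has reached the threshold 0.9990 of total blocks M". A small sketch of that threshold arithmetic follows, under the assumption that safe mode waits until reported (safe) blocks reach threshold * total blocks; it is not the actual FSNamesystem implementation.

public class SafeModeThresholdSketch {
  // Illustrative block-threshold check: 0.999 matches the "0.9990" threshold
  // quoted in the status strings asserted by the tests below.
  static boolean reachedThreshold(long safeBlocks, long totalBlocks, double threshold) {
    long needed = (long) (threshold * totalBlocks);
    return safeBlocks >= needed;
  }

  public static void main(String[] args) {
    System.out.println(reachedThreshold(3, 3, 0.999));   // true  -> may leave safe mode
    System.out.println(reachedThreshold(0, 5, 0.999));   // false -> stays in safe mode
    System.out.println(reachedThreshold(10, 10, 0.999)); // true
  }
}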
+ */ +public class TestHASafeMode { + private static final Log LOG = LogFactory.getLog(TestHASafeMode.class); + private static final int BLOCK_SIZE = 1024; + private NameNode nn0; + private NameNode nn1; + private FileSystem fs; + private MiniDFSCluster cluster; + private Runtime mockRuntime = mock(Runtime.class); + + static { + ((Log4JLogger)LogFactory.getLog(FSImage.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL); + } + + @Before + public void setupCluster() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .waitSafeMode(false) + .build(); + cluster.waitActive(); + + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + fs = HATestUtil.configureFailoverFs(cluster, conf); + + nn0.getNamesystem().getEditLogTailer().setRuntime(mockRuntime); + + cluster.transitionToActive(0); + } + + @After + public void shutdownCluster() throws IOException { + if (cluster != null) { + verify(mockRuntime, times(0)).exit(anyInt()); + cluster.shutdown(); + } + } + + private void restartStandby() throws IOException { + cluster.shutdownNameNode(1); + // Set the safemode extension to be lengthy, so that the tests + // can check the safemode message after the safemode conditions + // have been achieved, without being racy. + cluster.getConfiguration(1).setInt( + DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); + cluster.getConfiguration(1).setInt( + DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + + cluster.restartNameNode(1); + nn1 = cluster.getNameNode(1); + assertEquals(nn1.getNamesystem().getTransactionsSinceLastLogRoll(), 0L); + } + + /** + * Test case for enter safemode in active namenode, when it is already in startup safemode. + * It is a regression test for HDFS-2747. + */ + @Test + public void testEnterSafeModeInANNShouldNotThrowNPE() throws Exception { + banner("Restarting active"); + DFSTestUtil + .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L); + restartActive(); + nn0.getRpcServer().transitionToActive(); + + FSNamesystem namesystem = nn0.getNamesystem(); + String status = namesystem.getSafemode(); + assertTrue("Bad safemode status: '" + status + "'", status + .startsWith("Safe mode is ON.")); + NameNodeAdapter.enterSafeMode(nn0, false); + assertTrue("Failed to enter into safemode in active", namesystem + .isInSafeMode()); + NameNodeAdapter.enterSafeMode(nn0, false); + assertTrue("Failed to enter into safemode in active", namesystem + .isInSafeMode()); + } + + /** + * Test case for enter safemode in standby namenode, when it is already in startup safemode. + * It is a regression test for HDFS-2747. + */ + @Test + public void testEnterSafeModeInSBNShouldNotThrowNPE() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil + .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L); + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup and enter safemode. 
+ nn0.getRpcServer().rollEditLog(); + banner("Creating some blocks that won't be in the edit log"); + DFSTestUtil.createFile(fs, new Path("/test2"), 5 * BLOCK_SIZE, (short) 3, + 1L); + banner("Deleting the original blocks"); + fs.delete(new Path("/test"), true); + banner("Restarting standby"); + restartStandby(); + FSNamesystem namesystem = nn1.getNamesystem(); + String status = namesystem.getSafemode(); + assertTrue("Bad safemode status: '" + status + "'", status + .startsWith("Safe mode is ON.")); + NameNodeAdapter.enterSafeMode(nn1, false); + assertTrue("Failed to enter into safemode in standby", namesystem + .isInSafeMode()); + NameNodeAdapter.enterSafeMode(nn1, false); + assertTrue("Failed to enter into safemode in standby", namesystem + .isInSafeMode()); + } + + private void restartActive() throws IOException { + cluster.shutdownNameNode(0); + // Set the safemode extension to be lengthy, so that the tests + // can check the safemode message after the safemode conditions + // have been achieved, without being racy. + cluster.getConfiguration(0).setInt( + DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); + cluster.restartNameNode(0); + nn0 = cluster.getNameNode(0); + } + + /** + * Tests the case where, while a standby is down, more blocks are + * added to the namespace, but not rolled. So, when it starts up, + * it receives notification about the new blocks during + * the safemode extension period. + */ + @Test + public void testBlocksAddedBeforeStandbyRestart() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + banner("Creating some blocks that won't be in the edit log"); + DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); + + banner("Restarting standby"); + restartStandby(); + + // We expect it not to be stuck in safemode, since those blocks + // that are already visible to the SBN should be processed + // in the initial block reports. + assertSafeMode(nn1, 3, 3); + + banner("Waiting for standby to catch up to active namespace"); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + assertSafeMode(nn1, 8, 8); + } + + /** + * Similar to {@link #testBlocksAddedBeforeStandbyRestart()} except that + * the new blocks are allocated after the SBN has restarted. So, the + * blocks were not present in the original block reports at startup + * but are reported separately by blockReceived calls. + */ + @Test + public void testBlocksAddedWhileInSafeMode() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + banner("Restarting standby"); + restartStandby(); + + assertSafeMode(nn1, 3, 3); + + // Create a few blocks which will send blockReceived calls to the + // SBN. + banner("Creating some blocks while SBN is in safe mode"); + DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); + + + banner("Waiting for standby to catch up to active namespace"); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + assertSafeMode(nn1, 8, 8); + } + + /** + * Test for the following case proposed by ATM: + * 1. Both NNs are up, one is active. 
There are 100 blocks. Both are + * out of safemode. + * 2. 10 block deletions get processed by NN1. NN2 enqueues these DN messages + * until it next reads from a checkpointed edits file. + * 3. NN2 gets restarted. Its queues are lost. + * 4. NN2 comes up, reads from all the finalized edits files. Concludes there + * should still be 100 blocks. + * 5. NN2 receives a block report from all the DNs, which only accounts for + * 90 blocks. It doesn't leave safemode. + * 6. NN1 dies or is transitioned to standby. + * 7. NN2 is transitioned to active. It reads all the edits from NN1. It now + * knows there should only be 90 blocks, but it's still in safemode. + * 8. NN2 doesn't ever recheck whether it should leave safemode. + * + * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()} + */ + @Test + public void testBlocksRemovedBeforeStandbyRestart() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 5*BLOCK_SIZE, (short) 3, 1L); + + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + // Delete those blocks again, so they won't get reported to the SBN + // once it starts up + banner("Removing the blocks without rolling the edit log"); + fs.delete(new Path("/test"), true); + BlockManagerTestUtil.computeAllPendingWork( + nn0.getNamesystem().getBlockManager()); + cluster.triggerHeartbeats(); + + banner("Restarting standby"); + restartStandby(); + assertSafeMode(nn1, 0, 5); + + banner("Waiting for standby to catch up to active namespace"); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + assertSafeMode(nn1, 0, 0); + } + + /** + * Similar to {@link #testBlocksRemovedBeforeStandbyRestart()} except that + * the blocks are removed after the SBN has restarted. So, the + * blocks were present in the original block reports at startup + * but are deleted separately later by deletion reports. + */ + @Test + public void testBlocksRemovedWhileInSafeMode() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L); + + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + banner("Restarting standby"); + restartStandby(); + + // It will initially have all of the blocks necessary. + assertSafeMode(nn1, 10, 10); + + // Delete those blocks while the SBN is in safe mode. + // This doesn't affect the SBN, since deletions are not + // ACKed when due to block removals. + banner("Removing the blocks without rolling the edit log"); + fs.delete(new Path("/test"), true); + BlockManagerTestUtil.computeAllPendingWork( + nn0.getNamesystem().getBlockManager()); + + banner("Triggering deletions on DNs and Deletion Reports"); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + + assertSafeMode(nn1, 10, 10); + + // When we catch up to active namespace, it will restore back + // to 0 blocks. + banner("Waiting for standby to catch up to active namespace"); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + assertSafeMode(nn1, 0, 0); + } + + /** + * Tests that the standby node properly tracks the number of total + * and safe blocks while it is in safe mode. 
Since safe-mode only + * counts completed blocks, append needs to decrement the total + * number of blocks and then re-increment when the file is closed + * again. + */ + @Test + public void testAppendWhileInSafeMode() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + // Make 4.5 blocks so that append() will re-open an existing block + // instead of just adding a new one + DFSTestUtil.createFile(fs, new Path("/test"), + 4*BLOCK_SIZE + BLOCK_SIZE/2, (short) 3, 1L); + + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + banner("Restarting standby"); + restartStandby(); + + // It will initially have all of the blocks necessary. + assertSafeMode(nn1, 5, 5); + + // Append to a block while SBN is in safe mode. This should + // not affect safemode initially, since the DN message + // will get queued. + FSDataOutputStream stm = fs.append(new Path("/test")); + try { + assertSafeMode(nn1, 5, 5); + + // if we roll edits now, the SBN should see that it's under construction + // and change its total count and safe count down by one, since UC + // blocks are not counted by safe mode. + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + assertSafeMode(nn1, 4, 4); + } finally { + IOUtils.closeStream(stm); + } + + // Delete those blocks while the SBN is in safe mode. + // This will not ACK the deletions to the SBN, so it won't + // notice until we roll the edit log. + banner("Removing the blocks without rolling the edit log"); + fs.delete(new Path("/test"), true); + BlockManagerTestUtil.computeAllPendingWork( + nn0.getNamesystem().getBlockManager()); + + banner("Triggering deletions on DNs and Deletion Reports"); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + + assertSafeMode(nn1, 4, 4); + + // When we roll the edit log, the deletions will go through. + banner("Waiting for standby to catch up to active namespace"); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + assertSafeMode(nn1, 0, 0); + } + + /** + * Regression test for a bug experienced while developing + * HDFS-2742. The scenario here is: + * - image contains some blocks + * - edits log contains at least one block addition, followed + * by deletion of more blocks than were added. + * - When node starts up, some incorrect accounting of block + * totals caused an assertion failure. + */ + @Test + public void testBlocksDeletedInEditLog() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + // Make 4 blocks persisted in the image. + DFSTestUtil.createFile(fs, new Path("/test"), + 4*BLOCK_SIZE, (short) 3, 1L); + NameNodeAdapter.enterSafeMode(nn0, false); + NameNodeAdapter.saveNamespace(nn0); + NameNodeAdapter.leaveSafeMode(nn0, false); + + // OP_ADD for 2 blocks + DFSTestUtil.createFile(fs, new Path("/test2"), + 2*BLOCK_SIZE, (short) 3, 1L); + + // OP_DELETE for 4 blocks + fs.delete(new Path("/test"), true); + + restartActive(); + } + + private void assertSafeMode(NameNode nn, int safe, int total) { + String status = nn1.getNamesystem().getSafemode(); + if (safe == total) { + assertTrue("Bad safemode status: '" + status + "'", + status.startsWith( + "Safe mode is ON." + + "The reported blocks " + safe + " has reached the threshold " + + "0.9990 of total blocks " + total + ". 
Safe mode will be " + + "turned off automatically")); + } else { + int additional = total - safe; + assertTrue("Bad safemode status: '" + status + "'", + status.startsWith( + "Safe mode is ON." + + "The reported blocks " + safe + " needs additional " + + additional + " blocks")); + } + } + + /** + * Set up a namesystem with several edits, both deletions and + * additions, and failover to a new NN while that NN is in + * safemode. Ensure that it will exit safemode. + */ + @Test + public void testComplexFailoverIntoSafemode() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup and enter safemode. + nn0.getRpcServer().rollEditLog(); + + banner("Creating some blocks that won't be in the edit log"); + DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); + + banner("Deleting the original blocks"); + fs.delete(new Path("/test"), true); + + banner("Restarting standby"); + restartStandby(); + + // We expect it to be on its way out of safemode, since all of the blocks + // from the edit log have been reported. + assertSafeMode(nn1, 3, 3); + + // Initiate a failover into it while it's in safemode + banner("Initiating a failover into NN1 in safemode"); + NameNodeAdapter.abortEditLogs(nn0); + cluster.transitionToActive(1); + + assertSafeMode(nn1, 5, 5); + } + + /** + * Similar to {@link #testBlocksRemovedWhileInSafeMode()} except that + * the OP_DELETE edits arrive at the SBN before the block deletion reports. + * The tracking of safe blocks needs to properly account for the removal + * of the blocks as well as the safe count. This is a regression test for + * HDFS-2742. + */ + @Test + public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception { + banner("Starting with NN0 active and NN1 standby, creating some blocks"); + DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L); + + // Roll edit log so that, when the SBN restarts, it will load + // the namespace during startup. + nn0.getRpcServer().rollEditLog(); + + banner("Restarting standby"); + restartStandby(); + + // It will initially have all of the blocks necessary. + String status = nn1.getNamesystem().getSafemode(); + assertTrue("Bad safemode status: '" + status + "'", + status.startsWith( + "Safe mode is ON." + + "The reported blocks 10 has reached the threshold 0.9990 of " + + "total blocks 10. Safe mode will be turned off automatically")); + + // Delete those blocks while the SBN is in safe mode. + // Immediately roll the edit log before the actual deletions are sent + // to the DNs. + banner("Removing the blocks without rolling the edit log"); + fs.delete(new Path("/test"), true); + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // Should see removal of the blocks as well as their contribution to safe block count. + assertSafeMode(nn1, 0, 0); + + + banner("Triggering sending deletions to DNs and Deletion Reports"); + BlockManagerTestUtil.computeAllPendingWork( + nn0.getNamesystem().getBlockManager()); + cluster.triggerHeartbeats(); + HATestUtil.waitForDNDeletions(cluster); + cluster.triggerDeletionReports(); + + // No change in assertion status here, but some of the consistency checks + // in safemode will fire here if we accidentally decrement safe block count + // below 0. 
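+    // (The safe mode bookkeeping lives in FSNamesystem's SafeModeInfo, which
+    // sanity-checks its block counts as they are adjusted.)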
+    assertSafeMode(nn1, 0, 0);
+  }
+
+
+  /**
+   * Test that the number of safe blocks is accounted correctly even when
+   * blocks move between under-construction state and completed state.
+   * If a FINALIZED report arrives at the SBN before the block is marked
+   * COMPLETE, then when we get the OP_CLOSE we need to count it as "safe"
+   * at that point. This is a regression test for HDFS-2742.
+   */
+  @Test
+  public void testSafeBlockTracking() throws Exception {
+    banner("Starting with NN0 active and NN1 standby, creating some " +
+        "UC blocks plus some other blocks to force safemode");
+    DFSTestUtil.createFile(fs, new Path("/other-blocks"), 10*BLOCK_SIZE, (short) 3, 1L);
+
+    List<FSDataOutputStream> stms = Lists.newArrayList();
+    try {
+      for (int i = 0; i < 5; i++) {
+        FSDataOutputStream stm = fs.create(new Path("/test-uc-" + i));
+        stms.add(stm);
+        stm.write(1);
+        stm.hflush();
+      }
+      // Roll edit log so that, when the SBN restarts, it will load
+      // the namespace during startup and enter safemode.
+      nn0.getRpcServer().rollEditLog();
+    } finally {
+      for (FSDataOutputStream stm : stms) {
+        IOUtils.closeStream(stm);
+      }
+    }
+
+    banner("Restarting SBN");
+    restartStandby();
+    assertSafeMode(nn1, 10, 10);
+
+    banner("Allowing SBN to catch up");
+    HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+    assertSafeMode(nn1, 15, 15);
+  }
+
+  /**
+   * Regression test for HDFS-2753. In this bug, the following sequence was
+   * observed:
+   * - Some blocks are written to DNs while the SBN was down. This causes
+   *   the blockReceived messages to get queued in the BPServiceActor on the
+   *   DN.
+   * - When the SBN returns, the DN re-registers with the SBN, and then
+   *   flushes its blockReceived queue to the SBN before it sends its
+   *   first block report. This caused the first block report to be
+   *   incorrectly ignored.
+   * - The SBN would become stuck in safemode.
+   */
+  @Test
+  public void testBlocksAddedWhileStandbyIsDown() throws Exception {
+    DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+    banner("Stopping standby");
+    cluster.shutdownNameNode(1);
+
+    DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+    banner("Rolling edit log so standby gets all edits on restart");
+    nn0.getRpcServer().rollEditLog();
+
+    restartStandby();
+    assertSafeMode(nn1, 6, 6);
+  }
+
+  /**
+   * Regression test for HDFS-2804: standby should not populate replication
+   * queues when exiting safe mode.
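+   * (The standby defers building the replication queues until it actually
+   * becomes active, so exiting safe mode alone should leave them empty.)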
+ */ + @Test + public void testNoPopulatingReplQueuesWhenExitingSafemode() throws Exception { + DFSTestUtil.createFile(fs, new Path("/test"), 15*BLOCK_SIZE, (short)3, 1L); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // get some blocks in the SBN's image + nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_ENTER); + NameNodeAdapter.saveNamespace(nn1); + nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_LEAVE); + + // and some blocks in the edit logs + DFSTestUtil.createFile(fs, new Path("/test2"), 15*BLOCK_SIZE, (short)3, 1L); + nn0.getRpcServer().rollEditLog(); + + cluster.stopDataNode(1); + cluster.shutdownNameNode(1); + + //Configuration sbConf = cluster.getConfiguration(1); + //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1); + cluster.restartNameNode(1, false); + nn1 = cluster.getNameNode(1); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + return !nn1.isInSafeMode(); + } + }, 100, 10000); + + BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + assertEquals(0L, nn1.getNamesystem().getUnderReplicatedBlocks()); + assertEquals(0L, nn1.getNamesystem().getPendingReplicationBlocks()); + } + + /** + * Print a big banner in the test log to make debug easier. + */ + static void banner(String string) { + LOG.info("\n\n\n\n================================================\n" + + string + "\n" + + "==================================================\n\n"); + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java new file mode 100644 index 0000000000..092bb5af4a --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java @@ -0,0 +1,545 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread; +import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; +import org.apache.log4j.Level; +import org.junit.Assert; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Tests state transition from active->standby, and manual failover + * and failback between two namenodes. + */ +public class TestHAStateTransitions { + protected static final Log LOG = LogFactory.getLog( + TestStandbyIsHot.class); + private static final Path TEST_DIR = new Path("/test"); + private static final Path TEST_FILE_PATH = new Path(TEST_DIR, "foo"); + private static final String TEST_FILE_STR = TEST_FILE_PATH.toUri().getPath(); + private static final String TEST_FILE_DATA = + "Hello state transitioning world"; + + static { + ((Log4JLogger)EditLogTailer.LOG).getLogger().setLevel(Level.ALL); + } + + /** + * Test which takes a single node and flip flops between + * active and standby mode, making sure it doesn't + * double-play any edits. + */ + @Test + public void testTransitionActiveToStandby() throws Exception { + Configuration conf = new Configuration(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + FileSystem fs = cluster.getFileSystem(0); + + fs.mkdirs(TEST_DIR); + cluster.transitionToStandby(0); + try { + fs.mkdirs(new Path("/x")); + fail("Didn't throw trying to mutate FS in standby state"); + } catch (Throwable t) { + GenericTestUtils.assertExceptionContains( + "Operation category WRITE is not supported", t); + } + cluster.transitionToActive(0); + + // Create a file, then delete the whole directory recursively. 
+ DFSTestUtil.createFile(fs, new Path(TEST_DIR, "foo"), + 10, (short)1, 1L); + fs.delete(TEST_DIR, true); + + // Now if the standby tries to replay the last segment that it just + // wrote as active, it would fail since it's trying to create a file + // in a non-existent directory. + cluster.transitionToStandby(0); + cluster.transitionToActive(0); + + assertFalse(fs.exists(TEST_DIR)); + + } finally { + cluster.shutdown(); + } + } + + /** + * Test that transitioning a service to the state that it is already + * in is a nop, specifically, an exception is not thrown. + */ + @Test + public void testTransitionToCurrentStateIsANop() throws Exception { + Configuration conf = new Configuration(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + cluster.transitionToActive(0); + cluster.transitionToStandby(0); + cluster.transitionToStandby(0); + } finally { + cluster.shutdown(); + } + } + + /** + * Test manual failover failback for one namespace + * @param cluster single process test cluster + * @param conf cluster configuration + * @param nsIndex namespace index starting from zero + * @throws Exception + */ + private void testManualFailoverFailback(MiniDFSCluster cluster, + Configuration conf, int nsIndex) throws Exception { + int nn0 = 2 * nsIndex, nn1 = 2 * nsIndex + 1; + + cluster.transitionToActive(nn0); + + LOG.info("Starting with NN 0 active in namespace " + nsIndex); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + fs.mkdirs(TEST_DIR); + + LOG.info("Failing over to NN 1 in namespace " + nsIndex); + cluster.transitionToStandby(nn0); + cluster.transitionToActive(nn1); + assertTrue(fs.exists(TEST_DIR)); + DFSTestUtil.writeFile(fs, TEST_FILE_PATH, TEST_FILE_DATA); + + LOG.info("Failing over to NN 0 in namespace " + nsIndex); + cluster.transitionToStandby(nn1); + cluster.transitionToActive(nn0); + assertTrue(fs.exists(TEST_DIR)); + assertEquals(TEST_FILE_DATA, + DFSTestUtil.readFile(fs, TEST_FILE_PATH)); + + LOG.info("Removing test file"); + fs.delete(TEST_DIR, true); + assertFalse(fs.exists(TEST_DIR)); + + LOG.info("Failing over to NN 1 in namespace " + nsIndex); + cluster.transitionToStandby(nn0); + cluster.transitionToActive(nn1); + assertFalse(fs.exists(TEST_DIR)); + } + + /** + * Tests manual failover back and forth between two NameNodes. + */ + @Test + public void testManualFailoverAndFailback() throws Exception { + Configuration conf = new Configuration(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build(); + try { + cluster.waitActive(); + // test the only namespace + testManualFailoverFailback(cluster, conf, 0); + } finally { + cluster.shutdown(); + } + } + + /** + * Regression test for HDFS-2693: when doing state transitions, we need to + * lock the FSNamesystem so that we don't end up doing any writes while it's + * "in between" states. + * This test case starts up several client threads which do mutation operations + * while flipping a NN back and forth from active to standby. 
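+   * A Mockito spy wraps the namesystem's write lock and sleeps briefly
+   * before each writeLock() call, widening any window in which a mutation
+   * could race with a state transition.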
+ */ + @Test(timeout=120000) + public void testTransitionSynchronization() throws Exception { + Configuration conf = new Configuration(); + final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + try { + cluster.waitActive(); + ReentrantReadWriteLock spyLock = NameNodeAdapter.spyOnFsLock( + cluster.getNameNode(0).getNamesystem()); + Mockito.doAnswer(new GenericTestUtils.SleepAnswer(50)) + .when(spyLock).writeLock(); + + final FileSystem fs = HATestUtil.configureFailoverFs( + cluster, conf); + + TestContext ctx = new TestContext(); + for (int i = 0; i < 50; i++) { + final int finalI = i; + ctx.addThread(new RepeatingTestThread(ctx) { + @Override + public void doAnAction() throws Exception { + Path p = new Path("/test-" + finalI); + fs.mkdirs(p); + fs.delete(p, true); + } + }); + } + + ctx.addThread(new RepeatingTestThread(ctx) { + @Override + public void doAnAction() throws Exception { + cluster.transitionToStandby(0); + Thread.sleep(50); + cluster.transitionToActive(0); + } + }); + ctx.startThreads(); + ctx.waitFor(20000); + ctx.stop(); + } finally { + cluster.shutdown(); + } + } + + /** + * Test for HDFS-2812. Since lease renewals go from the client + * only to the active NN, the SBN will have out-of-date lease + * info when it becomes active. We need to make sure we don't + * accidentally mark the leases as expired when the failover + * proceeds. + */ + @Test(timeout=120000) + public void testLeasesRenewedOnTransition() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build(); + FSDataOutputStream stm = null; + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + NameNode nn0 = cluster.getNameNode(0); + NameNode nn1 = cluster.getNameNode(1); + + try { + cluster.waitActive(); + cluster.transitionToActive(0); + + LOG.info("Starting with NN 0 active"); + + stm = fs.create(TEST_FILE_PATH); + long nn0t0 = NameNodeAdapter.getLeaseRenewalTime(nn0, TEST_FILE_STR); + assertTrue(nn0t0 > 0); + long nn1t0 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR); + assertEquals("Lease should not yet exist on nn1", + -1, nn1t0); + + Thread.sleep(5); // make sure time advances! + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + long nn1t1 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR); + assertTrue("Lease should have been created on standby. Time was: " + + nn1t1, nn1t1 > nn0t0); + + Thread.sleep(5); // make sure time advances! + + LOG.info("Failing over to NN 1"); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + long nn1t2 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR); + assertTrue("Lease should have been renewed by failover process", + nn1t2 > nn1t1); + } finally { + IOUtils.closeStream(stm); + cluster.shutdown(); + } + } + + /** + * Test that delegation tokens continue to work after the failover. 
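+   * The token obtained from the first NN is renewed and cancelled against
+   * the second NN after the transition, and a fresh token is issued there
+   * as well.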
+ */ + @Test + public void testDelegationTokensAfterFailover() throws IOException, + URISyntaxException { + Configuration conf = new Configuration(); + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + NameNode nn1 = cluster.getNameNode(0); + NameNode nn2 = cluster.getNameNode(1); + + String renewer = UserGroupInformation.getLoginUser().getUserName(); + Token token = nn1.getRpcServer() + .getDelegationToken(new Text(renewer)); + + LOG.info("Failing over to NN 1"); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + nn2.getRpcServer().renewDelegationToken(token); + nn2.getRpcServer().cancelDelegationToken(token); + token = nn2.getRpcServer().getDelegationToken(new Text(renewer)); + Assert.assertTrue(token != null); + } finally { + cluster.shutdown(); + } + } + + /** + * Tests manual failover back and forth between two NameNodes + * for federation cluster with two namespaces. + */ + @Test + public void testManualFailoverFailbackFederationHA() throws Exception { + Configuration conf = new Configuration(); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHAFederatedTopology(2)) + .numDataNodes(1) + .build(); + try { + cluster.waitActive(); + + // test for namespace 0 + testManualFailoverFailback(cluster, conf, 0); + + // test for namespace 1 + testManualFailoverFailback(cluster, conf, 1); + } finally { + cluster.shutdown(); + } + } + + @Test + public void testFailoverWithEmptyInProgressEditLog() throws Exception { + testFailoverAfterCrashDuringLogRoll(false); + } + + @Test + public void testFailoverWithEmptyInProgressEditLogWithHeader() + throws Exception { + testFailoverAfterCrashDuringLogRoll(true); + } + + private static void testFailoverAfterCrashDuringLogRoll(boolean writeHeader) + throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, Integer.MAX_VALUE); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + try { + cluster.transitionToActive(0); + NameNode nn0 = cluster.getNameNode(0); + nn0.getRpcServer().rollEditLog(); + cluster.shutdownNameNode(0); + createEmptyInProgressEditLog(cluster, nn0, writeHeader); + cluster.transitionToActive(1); + } finally { + IOUtils.cleanup(LOG, fs); + cluster.shutdown(); + } + } + + private static void createEmptyInProgressEditLog(MiniDFSCluster cluster, + NameNode nn, boolean writeHeader) throws IOException { + long txid = nn.getNamesystem().getEditLog().getLastWrittenTxId(); + URI sharedEditsUri = cluster.getSharedEditsDir(0, 1); + File sharedEditsDir = new File(sharedEditsUri.getPath()); + StorageDirectory storageDir = new StorageDirectory(sharedEditsDir); + File inProgressFile = NameNodeAdapter.getInProgressEditsFile(storageDir, + txid + 1); + assertTrue("Failed to create in-progress edits file", + inProgressFile.createNewFile()); + + if (writeHeader) { + DataOutputStream out = new DataOutputStream(new FileOutputStream( + inProgressFile)); + EditLogFileOutputStream.writeHeader(out); + out.close(); + } + } + + + /** + * The secret manager needs to start/stop - the invariant should be that + * the secret manager runs if and 
only if the NN is active and not in + * safe mode. As a state diagram, we need to test all of the following + * transitions to make sure the secret manager is started when we transition + * into state 4, but none of the others. + *
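+   * (Here 1 = standby in safe mode, 2 = standby out of safe mode,
+   *  3 = active in safe mode, and 4 = active out of safe mode.)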
+   *         SafeMode     Not SafeMode
+   * Standby   1 <------> 2
+   *           ^          ^
+   *           |          |
+   *           v          v
+   * Active    3 <------> 4
+   *
      + */ + @Test(timeout=60000) + public void testSecretManagerState() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true); + conf.setInt( + DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, 50); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .waitSafeMode(false) + .build(); + try { + cluster.transitionToActive(0); + DFSTestUtil.createFile(cluster.getFileSystem(0), + TEST_FILE_PATH, 6000, (short)1, 1L); + + cluster.getConfiguration(0).setInt( + DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 60000); + + cluster.restartNameNode(0); + NameNode nn = cluster.getNameNode(0); + + banner("Started in state 1."); + assertTrue(nn.isStandbyState()); + assertTrue(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 1->2. Should not start secret manager"); + NameNodeAdapter.leaveSafeMode(nn, false); + assertTrue(nn.isStandbyState()); + assertFalse(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 2->1. Should not start secret manager."); + NameNodeAdapter.enterSafeMode(nn, false); + assertTrue(nn.isStandbyState()); + assertTrue(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 1->3. Should not start secret manager."); + nn.getRpcServer().transitionToActive(); + assertFalse(nn.isStandbyState()); + assertTrue(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 3->1. Should not start secret manager."); + nn.getRpcServer().transitionToStandby(); + assertTrue(nn.isStandbyState()); + assertTrue(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 1->3->4. Should start secret manager."); + nn.getRpcServer().transitionToActive(); + NameNodeAdapter.leaveSafeMode(nn, false); + assertFalse(nn.isStandbyState()); + assertFalse(nn.isInSafeMode()); + assertTrue(isDTRunning(nn)); + + banner("Transition 4->3. Should stop secret manager"); + NameNodeAdapter.enterSafeMode(nn, false); + assertFalse(nn.isStandbyState()); + assertTrue(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 3->4. Should start secret manager"); + NameNodeAdapter.leaveSafeMode(nn, false); + assertFalse(nn.isStandbyState()); + assertFalse(nn.isInSafeMode()); + assertTrue(isDTRunning(nn)); + + for (int i = 0; i < 20; i++) { + // Loop the last check to suss out races. + banner("Transition 4->2. Should stop secret manager."); + nn.getRpcServer().transitionToStandby(); + assertTrue(nn.isStandbyState()); + assertFalse(nn.isInSafeMode()); + assertFalse(isDTRunning(nn)); + + banner("Transition 2->4. Should start secret manager"); + nn.getRpcServer().transitionToActive(); + assertFalse(nn.isStandbyState()); + assertFalse(nn.isInSafeMode()); + assertTrue(isDTRunning(nn)); + } + } finally { + cluster.shutdown(); + } + } + + private boolean isDTRunning(NameNode nn) { + return NameNodeAdapter.getDtSecretManager(nn.getNamesystem()).isRunning(); + } + + /** + * Print a big banner in the test log to make debug easier. 
+ */ + static void banner(String string) { + LOG.info("\n\n\n\n================================================\n" + + string + "\n" + + "==================================================\n\n"); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java new file mode 100644 index 0000000000..be01430117 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.junit.Test; + +public class TestHAWebUI { + + /** + * Tests that the web UI of the name node provides a link to browse the file + * system and summary of under-replicated blocks only in active state + * + */ + @Test + public void testLinkAndClusterSummary() throws Exception { + Configuration conf = new Configuration(); + + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(0) + .build(); + try { + cluster.waitActive(); + + cluster.transitionToActive(0); + String pageContents = DFSTestUtil.urlGet(new URL("http://localhost:" + + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort() + + "/dfshealth.jsp")); + assertTrue(pageContents.contains("Browse the filesystem")); + assertTrue(pageContents.contains("Number of Under-Replicated Blocks")); + + cluster.transitionToStandby(0); + pageContents = DFSTestUtil.urlGet(new URL("http://localhost:" + + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort() + + "/dfshealth.jsp")); + assertFalse(pageContents.contains("Browse the filesystem")); + assertFalse(pageContents.contains("Number of Under-Replicated Blocks")); + + cluster.transitionToActive(0); + pageContents = DFSTestUtil.urlGet(new URL("http://localhost:" + + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort() + + "/dfshealth.jsp")); + assertTrue(pageContents.contains("Browse the filesystem")); + assertTrue(pageContents.contains("Number of Under-Replicated Blocks")); + + } finally { + cluster.shutdown(); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java new file mode 100644 index 0000000000..ab2a8dd061 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.fail; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ha.HealthCheckFailedException; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.NameNodeResourceChecker; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Test; +import org.mockito.Mockito; + +public class TestNNHealthCheck { + + @Test + public void testNNHealthCheck() throws IOException { + MiniDFSCluster cluster = null; + try { + Configuration conf = new Configuration(); + cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(0) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .build(); + + NameNodeResourceChecker mockResourceChecker = Mockito.mock( + NameNodeResourceChecker.class); + Mockito.doReturn(true).when(mockResourceChecker).hasAvailableDiskSpace(); + cluster.getNameNode(0).getNamesystem() + .setNNResourceChecker(mockResourceChecker); + + NamenodeProtocols rpc = cluster.getNameNodeRpc(0); + + // Should not throw error, which indicates healthy. + rpc.monitorHealth(); + + Mockito.doReturn(false).when(mockResourceChecker).hasAvailableDiskSpace(); + + try { + // Should throw error - NN is unhealthy. + rpc.monitorHealth(); + fail("Should not have succeeded in calling monitorHealth"); + } catch (HealthCheckFailedException hcfe) { + GenericTestUtils.assertExceptionContains( + "The NameNode has no resources available", hcfe); + } + } finally { + if (cluster != null) { + cluster.shutdown(); + } + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java new file mode 100644 index 0000000000..547ba72e49 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java @@ -0,0 +1,506 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.security.PrivilegedExceptionAction; +import java.util.concurrent.TimeoutException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.AppendTestUtil; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.GenericTestUtils.DelayAnswer; +import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread; +import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; + +import org.apache.log4j.Level; + +import org.junit.Test; +import org.mockito.Mockito; + +import com.google.common.base.Supplier; + +/** + * Test cases regarding pipeline recovery during NN failover. 
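+ * Covers writes that span a failover, pipeline updates after a DN failure,
+ * lease recovery for a crashed client, a failover racing with
+ * commitBlockSynchronization, and a multi-threaded stress test.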
+ */ +public class TestPipelinesFailover { + static { + ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)LogFactory.getLog( + "org.apache.hadoop.io.retry.RetryInvocationHandler")).getLogger().setLevel(Level.ALL); + + ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL); + } + + protected static final Log LOG = LogFactory.getLog( + TestPipelinesFailover.class); + private static final Path TEST_PATH = + new Path("/test-file"); + private static final int BLOCK_SIZE = 4096; + private static final int BLOCK_AND_A_HALF = BLOCK_SIZE * 3 / 2; + + private static final int STRESS_NUM_THREADS = 25; + private static final int STRESS_RUNTIME = 40000; + + /** + * Tests continuing a write pipeline over a failover. + */ + @Test(timeout=30000) + public void testWriteOverFailover() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + // Don't check replication periodically. + conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1000); + + FSDataOutputStream stm = null; + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + Thread.sleep(500); + + LOG.info("Starting with NN 0 active"); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + stm = fs.create(TEST_PATH); + + // write a block and a half + AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF); + + // Make sure all of the blocks are written out before failover. + stm.hflush(); + + LOG.info("Failing over to NN 1"); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + assertTrue(fs.exists(TEST_PATH)); + FSNamesystem ns1 = cluster.getNameNode(1).getNamesystem(); + BlockManagerTestUtil.updateState(ns1.getBlockManager()); + assertEquals(0, ns1.getPendingReplicationBlocks()); + assertEquals(0, ns1.getCorruptReplicaBlocks()); + assertEquals(0, ns1.getMissingBlocksCount()); + + // write another block and a half + AppendTestUtil.write(stm, BLOCK_AND_A_HALF, BLOCK_AND_A_HALF); + + stm.close(); + stm = null; + + AppendTestUtil.check(fs, TEST_PATH, BLOCK_SIZE * 3); + } finally { + IOUtils.closeStream(stm); + cluster.shutdown(); + } + } + + /** + * Tests continuing a write pipeline over a failover when a DN fails + * after the failover - ensures that updating the pipeline succeeds + * even when the pipeline was constructed on a different NN. 
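+   * A DN in the pipeline is stopped after each failover, so the client must
+   * update the pipeline through whichever NN is currently active.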
+ */ + @Test(timeout=30000) + public void testWriteOverFailoverWithDnFail() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + + FSDataOutputStream stm = null; + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(5) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + Thread.sleep(500); + + LOG.info("Starting with NN 0 active"); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + stm = fs.create(TEST_PATH); + + // write a block and a half + AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF); + + // Make sure all the blocks are written before failover + stm.hflush(); + + LOG.info("Failing over to NN 1"); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + assertTrue(fs.exists(TEST_PATH)); + + cluster.stopDataNode(0); + + // write another block and a half + AppendTestUtil.write(stm, BLOCK_AND_A_HALF, BLOCK_AND_A_HALF); + stm.hflush(); + + LOG.info("Failing back to NN 0"); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + cluster.stopDataNode(1); + + AppendTestUtil.write(stm, BLOCK_AND_A_HALF*2, BLOCK_AND_A_HALF); + stm.hflush(); + + + stm.close(); + stm = null; + + AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF * 3); + } finally { + IOUtils.closeStream(stm); + cluster.shutdown(); + } + } + + /** + * Tests lease recovery if a client crashes. This approximates the + * use case of HBase WALs being recovered after a NN failover. + */ + @Test(timeout=30000) + public void testLeaseRecoveryAfterFailover() throws Exception { + final Configuration conf = new Configuration(); + // Disable permissions so that another user can recover the lease. + conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + + FSDataOutputStream stm = null; + final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + Thread.sleep(500); + + LOG.info("Starting with NN 0 active"); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + stm = fs.create(TEST_PATH); + + // write a block and a half + AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF); + stm.hflush(); + + LOG.info("Failing over to NN 1"); + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + assertTrue(fs.exists(TEST_PATH)); + + FileSystem fsOtherUser = createFsAsOtherUser(cluster, conf); + loopRecoverLease(fsOtherUser, TEST_PATH); + + AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF); + + // Fail back to ensure that the block locations weren't lost on the + // original node. + cluster.transitionToStandby(1); + cluster.transitionToActive(0); + AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF); + } finally { + IOUtils.closeStream(stm); + cluster.shutdown(); + } + } + + /** + * Test the scenario where the NN fails over after issuing a block + * synchronization request, but before it is committed. The + * DN running the recovery should then fail to commit the synchronization + * and a later retry will succeed. + */ + @Test(timeout=30000) + public void testFailoverRightBeforeCommitSynchronization() throws Exception { + final Configuration conf = new Configuration(); + // Disable permissions so that another user can recover the lease. 
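+    // (The lease is recovered below by a second test user created with
+    // UserGroupInformation.createUserForTesting.)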
+ conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + + FSDataOutputStream stm = null; + final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + Thread.sleep(500); + + LOG.info("Starting with NN 0 active"); + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + stm = fs.create(TEST_PATH); + + // write a half block + AppendTestUtil.write(stm, 0, BLOCK_SIZE / 2); + stm.hflush(); + + // Look into the block manager on the active node for the block + // under construction. + + NameNode nn0 = cluster.getNameNode(0); + ExtendedBlock blk = DFSTestUtil.getFirstBlock(fs, TEST_PATH); + DatanodeDescriptor expectedPrimary = getExpectedPrimaryNode(nn0, blk); + LOG.info("Expecting block recovery to be triggered on DN " + + expectedPrimary); + + // Find the corresponding DN daemon, and spy on its connection to the + // active. + DataNode primaryDN = cluster.getDataNode(expectedPrimary.getIpcPort()); + DatanodeProtocolClientSideTranslatorPB nnSpy = + DataNodeAdapter.spyOnBposToNN(primaryDN, nn0); + + // Delay the commitBlockSynchronization call + DelayAnswer delayer = new DelayAnswer(LOG); + Mockito.doAnswer(delayer).when(nnSpy).commitBlockSynchronization( + Mockito.eq(blk), + Mockito.anyInt(), // new genstamp + Mockito.anyLong(), // new length + Mockito.eq(true), // close file + Mockito.eq(false), // delete block + (DatanodeID[]) Mockito.anyObject()); // new targets + + DistributedFileSystem fsOtherUser = createFsAsOtherUser(cluster, conf); + assertFalse(fsOtherUser.recoverLease(TEST_PATH)); + + LOG.info("Waiting for commitBlockSynchronization call from primary"); + delayer.waitForCall(); + + LOG.info("Failing over to NN 1"); + + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + + // Let the commitBlockSynchronization call go through, and check that + // it failed with the correct exception. + delayer.proceed(); + delayer.waitForResult(); + Throwable t = delayer.getThrown(); + if (t == null) { + fail("commitBlockSynchronization call did not fail on standby"); + } + GenericTestUtils.assertExceptionContains( + "Operation category WRITE is not supported", + t); + + // Now, if we try again to recover the block, it should succeed on the new + // active. + loopRecoverLease(fsOtherUser, TEST_PATH); + + AppendTestUtil.check(fs, TEST_PATH, BLOCK_SIZE/2); + } finally { + IOUtils.closeStream(stm); + cluster.shutdown(); + } + } + + /** + * Stress test for pipeline/lease recovery. Starts a number of + * threads, each of which creates a file and has another client + * break the lease. While these threads run, failover proceeds + * back and forth between two namenodes. + */ + @Test(timeout=STRESS_RUNTIME*3) + public void testPipelineRecoveryStress() throws Exception { + HAStressTestHarness harness = new HAStressTestHarness(); + // Disable permissions so that another user can recover the lease. 
+    harness.conf.setBoolean(
+        DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
+
+    final MiniDFSCluster cluster = harness.startCluster();
+    try {
+      cluster.waitActive();
+      cluster.transitionToActive(0);
+
+      FileSystem fs = harness.getFailoverFs();
+      DistributedFileSystem fsAsOtherUser = createFsAsOtherUser(
+          cluster, harness.conf);
+
+      TestContext testers = new TestContext();
+      for (int i = 0; i < STRESS_NUM_THREADS; i++) {
+        Path p = new Path("/test-" + i);
+        testers.addThread(new PipelineTestThread(
+            testers, fs, fsAsOtherUser, p));
+      }
+
+      // Start a separate thread which will make sure that replication
+      // happens quickly by triggering deletion reports and replication
+      // work calculation frequently.
+      harness.addReplicationTriggerThread(500);
+      harness.addFailoverThread(5000);
+      harness.startThreads();
+      testers.startThreads();
+
+      testers.waitFor(STRESS_RUNTIME);
+      testers.stop();
+      harness.stopThreads();
+    } finally {
+      System.err.println("===========================\n\n\n\n");
+      harness.shutdown();
+    }
+  }
+
+  /**
+   * Test thread which creates a file, has another fake user recover
+   * the lease on the file, and then ensures that the file's contents
+   * are properly readable. If any of these steps fails, propagates
+   * an exception back to the test context, causing the test case
+   * to fail.
+   */
+  private static class PipelineTestThread extends RepeatingTestThread {
+    private final FileSystem fs;
+    private final FileSystem fsOtherUser;
+    private final Path path;
+
+
+    public PipelineTestThread(TestContext ctx,
+        FileSystem fs, FileSystem fsOtherUser, Path p) {
+      super(ctx);
+      this.fs = fs;
+      this.fsOtherUser = fsOtherUser;
+      this.path = p;
+    }
+
+    @Override
+    public void doAnAction() throws Exception {
+      FSDataOutputStream stm = fs.create(path, true);
+      try {
+        AppendTestUtil.write(stm, 0, 100);
+        stm.hflush();
+        loopRecoverLease(fsOtherUser, path);
+        AppendTestUtil.check(fs, path, 100);
+      } finally {
+        try {
+          stm.close();
+        } catch (IOException e) {
+          // should expect this since we lost the lease
+        }
+      }
+    }
+
+    @Override
+    public String toString() {
+      return "Pipeline test thread for " + path;
+    }
+  }
+
+
+
+  /**
+   * @return the node which is expected to run the recovery of the
+   * given block, which is known to be under construction inside the
+   * given NameNode.
+   */
+  private DatanodeDescriptor getExpectedPrimaryNode(NameNode nn,
+      ExtendedBlock blk) {
+    BlockManager bm0 = nn.getNamesystem().getBlockManager();
+    BlockInfo storedBlock = bm0.getStoredBlock(blk.getLocalBlock());
+    assertTrue("Block " + blk + " should be under construction, " +
+        "got: " + storedBlock,
+        storedBlock instanceof BlockInfoUnderConstruction);
+    BlockInfoUnderConstruction ucBlock =
+      (BlockInfoUnderConstruction)storedBlock;
+    // We expect that the first indexed replica will be the one
+    // to be in charge of the synchronization / recovery protocol.
+    DatanodeDescriptor expectedPrimary = ucBlock.getExpectedLocations()[0];
+    return expectedPrimary;
+  }
+
+  private DistributedFileSystem createFsAsOtherUser(
+      final MiniDFSCluster cluster, final Configuration conf)
+      throws IOException, InterruptedException {
+    return (DistributedFileSystem) UserGroupInformation.createUserForTesting(
+        "otheruser", new String[] { "othergroup"})
+        .doAs(new PrivilegedExceptionAction<FileSystem>() {
+          @Override
+          public FileSystem run() throws Exception {
+            return HATestUtil.configureFailoverFs(
+                cluster, conf);
+          }
+        });
+  }
+
+  /**
+   * Try to recover the lease on the given file for up to 30
+   * seconds.
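+   * The recoverLease call is retried once a second, since the first
+   * attempts may return false while block recovery is still in progress.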
+ * @param fsOtherUser the filesystem to use for the recoverLease call + * @param testPath the path on which to run lease recovery + * @throws TimeoutException if lease recover does not succeed within 30 + * seconds + * @throws InterruptedException if the thread is interrupted + */ + private static void loopRecoverLease( + final FileSystem fsOtherUser, final Path testPath) + throws TimeoutException, InterruptedException { + try { + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + boolean success; + try { + success = ((DistributedFileSystem)fsOtherUser) + .recoverLease(testPath); + } catch (IOException e) { + throw new RuntimeException(e); + } + if (!success) { + LOG.info("Waiting to recover lease successfully"); + } + return success; + } + }, 1000, 30000); + } catch (TimeoutException e) { + throw new TimeoutException("Timed out recovering lease for " + + testPath); + } + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java new file mode 100644 index 0000000000..5800d3a351 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.io.IOUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestQuotasWithHA { + private static final Path TEST_DIR = new Path("/test"); + private static final Path TEST_FILE = new Path(TEST_DIR, "file"); + private static final String TEST_DIR_STR = TEST_DIR.toUri().getPath(); + + private static final long NS_QUOTA = 10000; + private static final long DS_QUOTA = 10000; + private static final long BLOCK_SIZE = 1024; // 1KB blocks + + private MiniDFSCluster cluster; + private NameNode nn0; + private NameNode nn1; + private FileSystem fs; + + @Before + public void setupCluster() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + HAUtil.setAllowStandbyReads(conf, true); + + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .waitSafeMode(false) + .build(); + cluster.waitActive(); + + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + fs = HATestUtil.configureFailoverFs(cluster, conf); + + cluster.transitionToActive(0); + } + + @After + public void shutdownCluster() throws IOException { + if (cluster != null) { + cluster.shutdown(); + } + } + + /** + * Test that quotas are properly tracked by the standby through + * create, append, delete. + */ + @Test(timeout=60000) + public void testQuotasTrackedOnStandby() throws Exception { + fs.mkdirs(TEST_DIR); + DistributedFileSystem dfs = (DistributedFileSystem)fs; + dfs.setQuota(TEST_DIR, NS_QUOTA, DS_QUOTA); + long expectedSize = 3 * BLOCK_SIZE + BLOCK_SIZE/2; + DFSTestUtil.createFile(fs, TEST_FILE, expectedSize, (short)1, 1L); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + ContentSummary cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR); + assertEquals(NS_QUOTA, cs.getQuota()); + assertEquals(DS_QUOTA, cs.getSpaceQuota()); + assertEquals(expectedSize, cs.getSpaceConsumed()); + assertEquals(1, cs.getDirectoryCount()); + assertEquals(1, cs.getFileCount()); + + // Append to the file and make sure quota is updated correctly. 
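+      // The append below adds another 1.5 blocks, so the consumed-space total changes by a non-block-aligned amount before the standby is re-checked.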
+ FSDataOutputStream stm = fs.append(TEST_FILE); + try { + byte[] data = new byte[(int) (BLOCK_SIZE * 3 / 2)]; + stm.write(data); + expectedSize += data.length; + } finally { + IOUtils.closeStream(stm); + } + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR); + assertEquals(NS_QUOTA, cs.getQuota()); + assertEquals(DS_QUOTA, cs.getSpaceQuota()); + assertEquals(expectedSize, cs.getSpaceConsumed()); + assertEquals(1, cs.getDirectoryCount()); + assertEquals(1, cs.getFileCount()); + + + fs.delete(TEST_FILE, true); + expectedSize = 0; + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR); + assertEquals(NS_QUOTA, cs.getQuota()); + assertEquals(DS_QUOTA, cs.getSpaceQuota()); + assertEquals(expectedSize, cs.getSpaceConsumed()); + assertEquals(1, cs.getDirectoryCount()); + assertEquals(0, cs.getFileCount()); + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java new file mode 100644 index 0000000000..5440c38cc2 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.FSImage; +import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NNStorage; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; + + +public class TestStandbyCheckpoints { + private static final int NUM_DIRS_IN_LOG = 200000; + private MiniDFSCluster cluster; + private NameNode nn0, nn1; + private FileSystem fs; + + @Before + public void setupCluster() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 1); + conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 5); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + + MiniDFSNNTopology topology = new MiniDFSNNTopology() + .addNameservice(new MiniDFSNNTopology.NSConf("ns1") + .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001)) + .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002))); + + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(topology) + .numDataNodes(0) + .build(); + cluster.waitActive(); + + nn0 = cluster.getNameNode(0); + nn1 = cluster.getNameNode(1); + fs = HATestUtil.configureFailoverFs(cluster, conf); + + cluster.transitionToActive(0); + } + + @After + public void shutdownCluster() throws IOException { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Test + public void testSBNCheckpoints() throws Exception { + doEdits(0, 10); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + // Once the standby catches up, it should notice that it needs to + // do a checkpoint and save one to its local directories. + HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 12)); + + // It should also upload it back to the active. + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 12)); + } + + /** + * Test for the case when both of the NNs in the cluster are + * in the standby state, and thus are both creating checkpoints + * and uploading them to each other. + * In this circumstance, they should receive the error from the + * other node indicating that the other node already has a + * checkpoint for the given txid, but this should not cause + * an abort, etc. + */ + @Test + public void testBothNodesInStandbyState() throws Exception { + doEdits(0, 10); + + cluster.transitionToStandby(0); + + // Transitioning to standby closed the edit log on the active, + // so the standby will catch up. Then, both will be in standby mode + // with enough uncheckpointed txns to cause a checkpoint, and they + // will each try to take a checkpoint and upload to each other. 
+ HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 12)); + HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 12)); + + assertEquals(12, nn0.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + assertEquals(12, nn1.getNamesystem().getFSImage() + .getMostRecentCheckpointTxId()); + + List dirs = Lists.newArrayList(); + dirs.addAll(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0)); + dirs.addAll(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 1)); + FSImageTestUtil.assertParallelFilesAreIdentical(dirs, ImmutableSet.of()); + } + + /** + * Test for the case when the SBN is configured to checkpoint based + * on a time period, but no transactions are happening on the + * active. Thus, it would want to save a second checkpoint at the + * same txid, which is a no-op. This test makes sure this doesn't + * cause any problem. + */ + @Test + public void testCheckpointWhenNoNewTransactionsHappened() + throws Exception { + // Checkpoint as fast as we can, in a tight loop. + cluster.getConfiguration(1).setInt( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); + cluster.restartNameNode(1); + nn1 = cluster.getNameNode(1); + + FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1); + + // We shouldn't save any checkpoints at txid=0 + Thread.sleep(1000); + Mockito.verify(spyImage1, Mockito.never()) + .saveNamespace((FSNamesystem) Mockito.anyObject()); + + // Roll the primary and wait for the standby to catch up + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + Thread.sleep(2000); + + // We should make exactly one checkpoint at this new txid. + Mockito.verify(spyImage1, Mockito.times(1)) + .saveNamespace((FSNamesystem) Mockito.anyObject()); + } + + /** + * Test cancellation of ongoing checkpoints when failover happens + * mid-checkpoint. + */ + @Test + public void testCheckpointCancellation() throws Exception { + cluster.transitionToStandby(0); + + // Create an edit log in the shared edits dir with a lot + // of mkdirs operations. This is solely so that the image is + // large enough to take a non-trivial amount of time to load. + // (only ~15MB) + URI sharedUri = cluster.getSharedEditsDir(0, 1); + File sharedDir = new File(sharedUri.getPath(), "current"); + File tmpDir = new File(MiniDFSCluster.getBaseDirectory(), + "testCheckpointCancellation-tmp"); + FSImageTestUtil.createAbortedLogWithMkdirs(tmpDir, NUM_DIRS_IN_LOG, + 3); + String fname = NNStorage.getInProgressEditsFileName(3); + new File(tmpDir, fname).renameTo(new File(sharedDir, fname)); + + // Checkpoint as fast as we can, in a tight loop. 
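+    // With a zero checkpoint period and a large image to load and save, the repeated
+    // failovers below should land while a checkpoint is still in progress, which is
+    // what the StandbyCheckpointer.getCanceledCount() assertion at the end verifies.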
+ cluster.getConfiguration(1).setInt( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); + cluster.restartNameNode(1); + nn1 = cluster.getNameNode(1); + + cluster.transitionToActive(0); + + for (int i = 0; i < 10; i++) { + + doEdits(i*10, i*10 + 10); + cluster.transitionToStandby(0); + cluster.transitionToActive(1); + cluster.transitionToStandby(1); + cluster.transitionToActive(0); + } + + assertTrue(StandbyCheckpointer.getCanceledCount() > 0); + } + + private void doEdits(int start, int stop) throws IOException { + for (int i = start; i < stop; i++) { + Path p = new Path("/test" + i); + fs.mkdirs(p); + } + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java new file mode 100644 index 0000000000..ce5814b0dd --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java @@ -0,0 +1,240 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.junit.Assert.assertEquals; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.impl.Log4JLogger; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.LocatedBlocks; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.log4j.Level; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.base.Supplier; + +/** + * The hotornot.com of unit tests: makes sure that the standby not only + * has namespace information, but also has the correct block reports, etc. + */ +public class TestStandbyIsHot { + protected static final Log LOG = LogFactory.getLog( + TestStandbyIsHot.class); + private static final String TEST_FILE_DATA = "hello highly available world"; + private static final String TEST_FILE = "/testStandbyIsHot"; + private static final Path TEST_FILE_PATH = new Path(TEST_FILE); + + static { + ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL); + ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL); + } + + @Test + public void testStandbyIsHot() throws Exception { + Configuration conf = new Configuration(); + // We read from the standby to watch block locations + HAUtil.setAllowStandbyReads(conf, true); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(3) + .build(); + Runtime mockRuntime = mock(Runtime.class); + try { + cluster.waitActive(); + cluster.transitionToActive(0); + + NameNode nn1 = cluster.getNameNode(0); + NameNode nn2 = cluster.getNameNode(1); + + nn2.getNamesystem().getEditLogTailer().setRuntime(mockRuntime); + + FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf); + + Thread.sleep(1000); + System.err.println("=================================="); + DFSTestUtil.writeFile(fs, TEST_FILE_PATH, TEST_FILE_DATA); + // Have to force an edit log roll so that the standby catches up + nn1.getRpcServer().rollEditLog(); + System.err.println("=================================="); + + // Block locations should show up on standby. 
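+      // The standby learns block locations from the reports and heartbeats the
+      // DataNodes send to both NameNodes, not from the edit log roll above.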
+ LOG.info("Waiting for block locations to appear on standby node"); + waitForBlockLocations(cluster, nn2, TEST_FILE, 3); + + // Trigger immediate heartbeats and block reports so + // that the active "trusts" all of the DNs + cluster.triggerHeartbeats(); + cluster.triggerBlockReports(); + + // Change replication + LOG.info("Changing replication to 1"); + fs.setReplication(TEST_FILE_PATH, (short)1); + waitForBlockLocations(cluster, nn1, TEST_FILE, 1); + + nn1.getRpcServer().rollEditLog(); + + LOG.info("Waiting for lowered replication to show up on standby"); + waitForBlockLocations(cluster, nn2, TEST_FILE, 1); + + // Change back to 3 + LOG.info("Changing replication to 3"); + fs.setReplication(TEST_FILE_PATH, (short)3); + nn1.getRpcServer().rollEditLog(); + + LOG.info("Waiting for higher replication to show up on standby"); + waitForBlockLocations(cluster, nn2, TEST_FILE, 3); + + } finally { + verify(mockRuntime, times(0)).exit(anyInt()); + cluster.shutdown(); + } + } + + /** + * Regression test for HDFS-2795: + * - Start an HA cluster with a DN. + * - Write several blocks to the FS with replication 1. + * - Shutdown the DN + * - Wait for the NNs to declare the DN dead. All blocks will be under-replicated. + * - Restart the DN. + * In the bug, the standby node would only very slowly notice the blocks returning + * to the cluster. + */ + @Test + public void testDatanodeRestarts() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024); + // We read from the standby to watch block locations + HAUtil.setAllowStandbyReads(conf, true); + conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build(); + try { + NameNode nn0 = cluster.getNameNode(0); + NameNode nn1 = cluster.getNameNode(1); + + cluster.transitionToActive(0); + + // Create 5 blocks. + DFSTestUtil.createFile(cluster.getFileSystem(0), + TEST_FILE_PATH, 5*1024, (short)1, 1L); + + HATestUtil.waitForStandbyToCatchUp(nn0, nn1); + + // Stop the DN. + DataNode dn = cluster.getDataNodes().get(0); + String dnName = dn.getDatanodeId().getName(); + DataNodeProperties dnProps = cluster.stopDataNode(0); + + // Make sure both NNs register it as dead. + BlockManagerTestUtil.noticeDeadDatanode(nn0, dnName); + BlockManagerTestUtil.noticeDeadDatanode(nn1, dnName); + + BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager()); + BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + assertEquals(5, nn0.getNamesystem().getUnderReplicatedBlocks()); + + // The SBN will not have any blocks in its neededReplication queue + // since the SBN doesn't process replication. + assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks()); + + LocatedBlocks locs = nn1.getRpcServer().getBlockLocations( + TEST_FILE, 0, 1); + assertEquals("Standby should have registered that the block has no replicas", + 0, locs.get(0).getLocations().length); + + cluster.restartDataNode(dnProps); + // Wait for both NNs to re-register the DN. 
+ cluster.waitActive(0); + cluster.waitActive(1); + + BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager()); + BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); + assertEquals(0, nn0.getNamesystem().getUnderReplicatedBlocks()); + assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks()); + + locs = nn1.getRpcServer().getBlockLocations( + TEST_FILE, 0, 1); + assertEquals("Standby should have registered that the block has replicas again", + 1, locs.get(0).getLocations().length); + } finally { + cluster.shutdown(); + } + } + + static void waitForBlockLocations(final MiniDFSCluster cluster, + final NameNode nn, + final String path, final int expectedReplicas) + throws Exception { + GenericTestUtils.waitFor(new Supplier() { + + @Override + public Boolean get() { + try { + LocatedBlocks locs = NameNodeAdapter.getBlockLocations(nn, path, 0, 1000); + DatanodeInfo[] dnis = locs.getLastLocatedBlock().getLocations(); + for (DatanodeInfo dni : dnis) { + Assert.assertNotNull(dni); + } + int numReplicas = dnis.length; + + LOG.info("Got " + numReplicas + " locs: " + locs); + if (numReplicas > expectedReplicas) { + for (DataNode dn : cluster.getDataNodes()) { + DataNodeAdapter.triggerDeletionReport(dn); + } + } + return numReplicas == expectedReplicas; + } catch (IOException e) { + LOG.warn("No block locations yet: " + e.getMessage()); + return false; + } + } + }, 500, 10000); + + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index c993f6c9ae..79c7047f5c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -163,8 +163,13 @@ public class TestNameNodeMetrics { // Corrupt first replica of the block LocatedBlock block = NameNodeAdapter.getBlockLocations( cluster.getNameNode(), file.toString(), 0, 1).get(0); - bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0], - "TEST"); + cluster.getNamesystem().writeLock(); + try { + bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0], + "TEST"); + } finally { + cluster.getNamesystem().writeUnlock(); + } Thread.sleep(1000); // Wait for block to be marked corrupt MetricsRecordBuilder rb = getMetrics(NS_METRICS); assertGauge("CorruptBlocks", 1L, rb); @@ -202,8 +207,13 @@ public class TestNameNodeMetrics { // Corrupt the only replica of the block to result in a missing block LocatedBlock block = NameNodeAdapter.getBlockLocations( cluster.getNameNode(), file.toString(), 0, 1).get(0); - bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0], - "TEST"); + cluster.getNamesystem().writeLock(); + try { + bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0], + "TEST"); + } finally { + cluster.getNamesystem().writeUnlock(); + } Thread.sleep(1000); // Wait for block to be marked corrupt MetricsRecordBuilder rb = getMetrics(NS_METRICS); assertGauge("UnderReplicatedBlocks", 1L, rb); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java new file mode 100644 index 0000000000..355009a765 --- /dev/null +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdfs.tools; + +import static org.junit.Assert.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; + +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.Log; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import org.apache.hadoop.ha.HealthCheckFailedException; +import org.apache.hadoop.ha.NodeFencer; + +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; +import static org.mockito.Mockito.when; + +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; + +public class TestDFSHAAdmin { + private static final Log LOG = LogFactory.getLog(TestDFSHAAdmin.class); + + private DFSHAAdmin tool; + private ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream(); + private String errOutput; + private HAServiceProtocol mockProtocol; + + private static final String NSID = "ns1"; + private static String HOST_A = "1.2.3.1"; + private static String HOST_B = "1.2.3.2"; + + private HdfsConfiguration getHAConf() { + HdfsConfiguration conf = new HdfsConfiguration(); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, NSID); + conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID, NSID); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX, NSID), "nn1,nn2"); + conf.set(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, "nn1"); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, NSID, "nn1"), + HOST_A + ":12345"); + conf.set(DFSUtil.addKeySuffixes( + DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, NSID, "nn2"), + HOST_B + ":12345"); + return conf; + } + + @Before + public void setup() throws IOException { + mockProtocol = Mockito.mock(HAServiceProtocol.class); + when(mockProtocol.readyToBecomeActive()).thenReturn(true); + tool = new DFSHAAdmin() { + @Override + protected HAServiceProtocol getProtocol(String serviceId) throws IOException { + getServiceAddr(serviceId); + return mockProtocol; + } + }; + tool.setConf(getHAConf()); + tool.setErrOut(new PrintStream(errOutBytes)); + } + + private void assertOutputContains(String string) { + if (!errOutput.contains(string)) { + fail("Expected output to contain '" + string + "' but was:\n" + + errOutput); + } + } + + @Test + public void testNameserviceOption() throws Exception { + assertEquals(-1, runTool("-ns")); + assertOutputContains("Missing nameservice 
ID"); + assertEquals(-1, runTool("-ns", "ns1")); + assertOutputContains("Missing command"); + // "ns1" isn't defined but we check this lazily and help doesn't use the ns + assertEquals(0, runTool("-ns", "ns1", "-help", "transitionToActive")); + assertOutputContains("Transitions the service into Active"); + } + + @Test + public void testNamenodeResolution() throws Exception { + assertEquals(0, runTool("-getServiceState", "nn1")); + Mockito.verify(mockProtocol).getServiceState(); + assertEquals(-1, runTool("-getServiceState", "undefined")); + assertOutputContains( + "Unable to determine service address for namenode 'undefined'"); + } + + @Test + public void testHelp() throws Exception { + assertEquals(-1, runTool("-help")); + assertEquals(0, runTool("-help", "transitionToActive")); + assertOutputContains("Transitions the service into Active"); + } + + @Test + public void testTransitionToActive() throws Exception { + assertEquals(0, runTool("-transitionToActive", "nn1")); + Mockito.verify(mockProtocol).transitionToActive(); + } + + @Test + public void testTransitionToStandby() throws Exception { + assertEquals(0, runTool("-transitionToStandby", "nn1")); + Mockito.verify(mockProtocol).transitionToStandby(); + } + + @Test + public void testFailoverWithNoFencerConfigured() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + assertEquals(-1, runTool("-failover", "nn1", "nn2")); + } + + @Test + public void testFailoverWithFencerConfigured() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "nn1", "nn2")); + } + + @Test + public void testFailoverWithFencerAndNameservice() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-ns", "ns1", "-failover", "nn1", "nn2")); + } + + @Test + public void testFailoverWithFencerConfiguredAndForce() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "nn1", "nn2", "--forcefence")); + } + + @Test + public void testFailoverWithForceActive() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "nn1", "nn2", "--forceactive")); + } + + @Test + public void testFailoverWithInvalidFenceArg() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(-1, runTool("-failover", "nn1", "nn2", "notforcefence")); + } + + @Test + public void testFailoverWithFenceButNoFencer() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence")); + } + + @Test + public void testFailoverWithFenceAndBadFencer() throws Exception { + 
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "foobar!"); + tool.setConf(conf); + assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence")); + } + + @Test + public void testForceFenceOptionListedBeforeArgs() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + HdfsConfiguration conf = getHAConf(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "--forcefence", "nn1", "nn2")); + } + + @Test + public void testGetServiceState() throws Exception { + assertEquals(0, runTool("-getServiceState", "nn1")); + Mockito.verify(mockProtocol).getServiceState(); + } + + @Test + public void testCheckHealth() throws Exception { + assertEquals(0, runTool("-checkHealth", "nn1")); + Mockito.verify(mockProtocol).monitorHealth(); + + Mockito.doThrow(new HealthCheckFailedException("fake health check failure")) + .when(mockProtocol).monitorHealth(); + assertEquals(-1, runTool("-checkHealth", "nn1")); + assertOutputContains("Health check failed: fake health check failure"); + } + + private Object runTool(String ... args) throws Exception { + errOutBytes.reset(); + LOG.info("Running: DFSHAAdmin " + Joiner.on(" ").join(args)); + int ret = tool.run(args); + errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8); + LOG.info("Output:\n" + errOutput); + return ret; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java new file mode 100644 index 0000000000..0302c8e903 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.tools; + +import static org.junit.Assert.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.apache.commons.logging.LogFactory; +import org.apache.commons.logging.Log; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.ha.NodeFencer; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; + +/** + * Tests for HAAdmin command with {@link MiniDFSCluster} set up in HA mode. 
+ */ +public class TestDFSHAAdminMiniCluster { + private static final Log LOG = LogFactory.getLog(TestDFSHAAdminMiniCluster.class); + + private MiniDFSCluster cluster; + private Configuration conf; + private DFSHAAdmin tool; + + @Before + public void setup() throws IOException { + conf = new Configuration(); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(0) + .build(); + tool = new DFSHAAdmin(); + tool.setConf(conf); + cluster.waitActive(); + } + + @After + public void shutdown() throws Exception { + cluster.shutdown(); + } + + @Test + public void testGetServiceState() throws Exception { + assertEquals(0, runTool("-getServiceState", "nn1")); + assertEquals(0, runTool("-getServiceState", "nn2")); + } + + @Test + public void testStateTransition() throws Exception { + NameNode nnode1 = cluster.getNameNode(0); + assertTrue(nnode1.isStandbyState()); + assertEquals(0, runTool("-transitionToActive", "nn1")); + assertFalse(nnode1.isStandbyState()); + assertEquals(0, runTool("-transitionToStandby", "nn1")); + assertTrue(nnode1.isStandbyState()); + + NameNode nnode2 = cluster.getNameNode(1); + assertTrue(nnode2.isStandbyState()); + assertEquals(0, runTool("-transitionToActive", "nn2")); + assertFalse(nnode2.isStandbyState()); + assertEquals(0, runTool("-transitionToStandby", "nn2")); + assertTrue(nnode2.isStandbyState()); + } + + /** + * Test failover with various options + */ + @Test + public void testFencer() throws Exception { + // Test failover with no fencer + assertEquals(-1, runTool("-failover", "nn1", "nn2")); + + // Test failover with fencer + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-transitionToActive", "nn1")); + assertEquals(0, runTool("-failover", "nn1", "nn2")); + + // Test failover with fencer and nameservice + assertEquals(0, runTool("-ns", "minidfs-ns", "-failover", "nn2", "nn1")); + + // Test failover with fencer and forcefence option + assertEquals(0, runTool("-failover", "nn1", "nn2", "--forcefence")); + + // Test failover with forceactive option + assertEquals(0, runTool("-failover", "nn2", "nn1", "--forceactive")); + + // Test failover with not fencer and forcefence option + conf.unset(NodeFencer.CONF_METHODS_KEY); + tool.setConf(conf); + assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence")); + + // Test failover with bad fencer and forcefence option + conf.set(NodeFencer.CONF_METHODS_KEY, "foobar!"); + tool.setConf(conf); + assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence")); + + // Test failover with force fence listed before the other arguments + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "--forcefence", "nn1", "nn2")); + } + + @Test + public void testCheckHealth() throws Exception { + assertEquals(0, runTool("-checkHealth", "nn1")); + assertEquals(0, runTool("-checkHealth", "nn2")); + } + + private int runTool(String ... 
args) throws Exception { + ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream(); + errOutBytes.reset(); + LOG.info("Running: DFSHAAdmin " + Joiner.on(" ").join(args)); + int ret = tool.run(args); + String errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8); + LOG.info("Output:\n" + errOutput); + return ret; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java index 7152e128d4..97be2b843d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java @@ -24,6 +24,7 @@ import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.StringTokenizer; import static org.junit.Assert.*; @@ -32,6 +33,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.tools.GetConf; import org.apache.hadoop.hdfs.tools.GetConf.Command; @@ -72,7 +74,7 @@ public class TestGetConf { String[] values = new String[nameServiceIdCount]; for (int i = 0; i < nameServiceIdCount; i++, portOffset++) { String nsID = getNameServiceId(i); - String specificKey = DFSUtil.getNameServiceIdKey(key, nsID); + String specificKey = DFSUtil.addKeySuffixes(key, nsID); values[i] = "nn" + i + ":" + portOffset; conf.set(specificKey, values[i]); } @@ -80,13 +82,13 @@ public class TestGetConf { } /* - * Convert list of InetSocketAddress to string array with each address - * represented as "host:port" + * Convert the map returned from DFSUtil functions to an array of + * addresses represented as "host:port" */ - private String[] toStringArray(List list) { + private String[] toStringArray(List list) { String[] ret = new String[list.size()]; for (int i = 0; i < list.size(); i++) { - ret[i] = NetUtils.getHostPortString(list.get(i)); + ret[i] = NetUtils.getHostPortString(list.get(i).getAddress()); } return ret; } @@ -94,8 +96,8 @@ public class TestGetConf { /** * Using DFSUtil methods get the list of given {@code type} of address */ - private List getAddressListFromConf(TestType type, - HdfsConfiguration conf) throws IOException { + private Map> getAddressListFromConf( + TestType type, HdfsConfiguration conf) throws IOException { switch (type) { case NAMENODE: return DFSUtil.getNNServiceRpcAddresses(conf); @@ -161,7 +163,7 @@ public class TestGetConf { * @param expected, expected addresses */ private void getAddressListFromTool(TestType type, HdfsConfiguration conf, - boolean checkPort, List expected) throws Exception { + boolean checkPort, List expected) throws Exception { String out = getAddressListFromTool(type, conf, expected.size() != 0); List values = new ArrayList(); @@ -176,7 +178,8 @@ public class TestGetConf { // Convert expected list to String[] of hosts int i = 0; String[] expectedHosts = new String[expected.size()]; - for (InetSocketAddress addr : expected) { + for (ConfiguredNNAddress cnn : expected) { + InetSocketAddress addr = cnn.getAddress(); if (!checkPort) { expectedHosts[i++] = addr.getHostName(); }else { @@ -191,7 +194,9 @@ public class TestGetConf { private void 
verifyAddresses(HdfsConfiguration conf, TestType type, boolean checkPort, String... expected) throws Exception { // Ensure DFSUtil returned the right set of addresses - List list = getAddressListFromConf(type, conf); + Map> map = + getAddressListFromConf(type, conf); + List list = DFSUtil.flattenAddressMap(map); String[] actual = toStringArray(list); Arrays.sort(actual); Arrays.sort(expected); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java index 966e52f93c..5d3272af7a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.LeaseManager; @@ -79,7 +80,7 @@ public class TestWebHdfsWithMultipleNameNodes { conf.setBoolean(DFSConfigKeys.DFS_WEBHDFS_ENABLED_KEY, true); cluster = new MiniDFSCluster.Builder(conf) - .numNameNodes(nNameNodes) + .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes)) .numDataNodes(nDataNodes) .build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java index 13e9683084..23d1bb13a5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java @@ -20,6 +20,7 @@ package org.apache.hadoop.test; import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.Random; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeoutException; @@ -79,8 +80,8 @@ public abstract class GenericTestUtils { public static void assertExceptionContains(String string, Throwable t) { String msg = t.getMessage(); Assert.assertTrue( - "Unexpected exception:" + StringUtils.stringifyException(t), - msg.contains(string)); + "Expected to find '" + string + "' but got unexpected exception:" + + StringUtils.stringifyException(t), msg.contains(string)); } public static void waitFor(Supplier check, @@ -109,7 +110,11 @@ public abstract class GenericTestUtils { private final CountDownLatch fireLatch = new CountDownLatch(1); private final CountDownLatch waitLatch = new CountDownLatch(1); - + private final CountDownLatch resultLatch = new CountDownLatch(1); + + // Result fields set after proceed() is called. 
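+  // volatile: written on the thread executing the mocked call, read by the
+  // test thread after waitForResult() returns.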
+ private volatile Throwable thrown; + private volatile Object returnValue; public DelayAnswer(Log log) { this.LOG = log; @@ -144,7 +149,40 @@ public abstract class GenericTestUtils { } protected Object passThrough(InvocationOnMock invocation) throws Throwable { - return invocation.callRealMethod(); + try { + Object ret = invocation.callRealMethod(); + returnValue = ret; + return ret; + } catch (Throwable t) { + thrown = t; + throw t; + } finally { + resultLatch.countDown(); + } + } + + /** + * After calling proceed(), this will wait until the call has + * completed and a result has been returned to the caller. + */ + public void waitForResult() throws InterruptedException { + resultLatch.await(); + } + + /** + * After the call has gone through, return any exception that + * was thrown, or null if no exception was thrown. + */ + public Throwable getThrown() { + return thrown; + } + + /** + * After the call has gone through, return the call's return value, + * or null in case it was void or an exception was thrown. + */ + public Object getReturnValue() { + return returnValue; } } @@ -176,4 +214,35 @@ public abstract class GenericTestUtils { } } + /** + * An Answer implementation which sleeps for a random number of milliseconds + * between 0 and a configurable value before delegating to the real + * implementation of the method. This can be useful for drawing out race + * conditions. + */ + public static class SleepAnswer implements Answer { + private final int maxSleepTime; + private static Random r = new Random(); + + public SleepAnswer(int maxSleepTime) { + this.maxSleepTime = maxSleepTime; + } + + @Override + public Object answer(InvocationOnMock invocation) throws Throwable { + boolean interrupted = false; + try { + Thread.sleep(r.nextInt(maxSleepTime)); + } catch (InterruptedException ie) { + interrupted = true; + } + try { + return invocation.callRealMethod(); + } finally { + if (interrupted) { + Thread.currentThread().interrupt(); + } + } + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored index 0101672b9d..5099ce21b3 100644 Binary files a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored and b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored differ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml index 65fe23a022..acc34bb273 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml @@ -1,34 +1,34 @@ - -38 + -40 24 1 - 1504643968 + -2045328303 21 2 1 - 1304751257518 + 1331096884634 3 - 2FhO + o0v1 - -174778556 + -1521490291 21 3 2 - 1304751257521 + 1331096884637 3 - 77-r + 3WMF - 1565957291 + 65546244 10 @@ -42,11 +42,10 @@ 0 5 - 5 /file_create 1 - 1304060057562 - 1304060057562 + 1330405685834 + 1330405685834 512 0 @@ -54,20 +53,19 @@ supergroup 420 - DFSClient_NONMAPREDUCE_-66857152_1 + DFSClient_NONMAPREDUCE_-2143415023_1 127.0.0.1 - -1854451489 + 179250704 9 6 - 5 /file_create 1 - 1304060057572 - 1304060057562 + 1330405685848 + 1330405685834 512 0 @@ -76,44 +74,41 @@ 420 - 617592855 + -584136658 1 7 - 3 /file_create /file_moved - 1304060057575 + 1330405685852 - 367100554 + -1983534581 2 8 - 2 /file_moved - 1304060057577 + 1330405685857 - 1048346698 + -97648053 3 9 - 3 /directory_mkdir - 1304060057581 - 0 + 1330405685861 + 1330405685861 todd supergroup 493 - 
1207240248 + -146811985 10 @@ -127,11 +122,10 @@ 0 11 - 5 /file_create 1 - 1304060057584 - 1304060057584 + 1330405685866 + 1330405685866 512 0 @@ -139,20 +133,19 @@ supergroup 420 - DFSClient_NONMAPREDUCE_-66857152_1 + DFSClient_NONMAPREDUCE_-2143415023_1 127.0.0.1 - 1796314473 + 806955943 9 12 - 5 /file_create 1 - 1304060057588 - 1304060057584 + 1330405685868 + 1330405685866 512 0 @@ -161,7 +154,7 @@ 420 - 1017626905 + 641893387 4 @@ -170,7 +163,7 @@ /file_create 1 - 1842610087 + 24198146 7 @@ -195,12 +188,11 @@ 13 16 - 3 /file_create 1285195527000 1285195527000 - 1428793678 + 1853168961 14 @@ -216,13 +208,12 @@ 15 18 - 3 /file_create /file_moved - 1304060057605 + 1330405685882 AA - -1155144192 + -1235158297 10 @@ -236,11 +227,10 @@ 0 20 - 5 /file_concat_target 1 - 1304060057613 - 1304060057613 + 1330405685889 + 1330405685889 512 0 @@ -248,125 +238,141 @@ supergroup 420 - DFSClient_NONMAPREDUCE_-66857152_1 + DFSClient_NONMAPREDUCE_-2143415023_1 127.0.0.1 - -428545606 - - - 9 - - 21 - 5 - /file_concat_target - 1 - 1304060057694 - 1304060057613 - 512 - 3 - - 3459038074990663911 - 512 - 1003 - - - -5555244278278879146 - 512 - 1003 - - - -6344128791846831740 - 512 - 1003 - - - todd - supergroup - 420 - - - 707995174 + -981119572 10 - 22 + 21 1004 - -1500977009 + -1627007926 - 0 + 25 + + 22 + /file_concat_target + 1 + + -7144805496741076283 + 0 + 1004 + + + -1131701615 + + + 10 23 - 5 - /file_concat_0 - 1 - 1304060057701 - 1304060057701 - 512 - 0 - - todd - supergroup - 420 - - DFSClient_NONMAPREDUCE_-66857152_1 - 127.0.0.1 + 1005 - -119850856 + -957035430 - 9 + 25 24 - 5 - /file_concat_0 - 1 - 1304060057737 - 1304060057701 - 512 - 3 + /file_concat_target + 2 - 4671949296381030428 + -7144805496741076283 512 1004 - -844362243522407159 - 512 - 1004 + -4125931756867080767 + -512 + 1 - - 3476886462779656950 - 512 - 1004 - - - todd - supergroup - 420 - - -766805874 + -932985519 10 25 - 1005 + 1006 - 238426056 + -1757460878 + + + 25 + + 26 + /file_concat_target + 3 + + -7144805496741076283 + 512 + 1004 + + + -4125931756867080767 + 0 + 1 + + + 1562413691487277050 + -512 + 1 + + + -154090859 + + + 9 + + 27 + /file_concat_target + 1 + 1330405685978 + 1330405685889 + 512 + 3 + + -7144805496741076283 + 512 + 1004 + + + -4125931756867080767 + 512 + 1005 + + + 1562413691487277050 + 512 + 1006 + + + todd + supergroup + 420 + + + -292633850 + + + 10 + + 28 + 1007 + + -1431358549 0 - 26 - 5 - /file_concat_1 + 29 + /file_concat_0 1 - 1304060057742 - 1304060057742 + 1330405685983 + 1330405685983 512 0 @@ -374,36 +380,116 @@ supergroup 420 - DFSClient_NONMAPREDUCE_-66857152_1 + DFSClient_NONMAPREDUCE_-2143415023_1 127.0.0.1 - 1156254705 + -318194869 + + + 10 + + 30 + 1008 + + 156309208 + + + 25 + + 31 + /file_concat_0 + 1 + + 6084289468290363112 + 0 + 1008 + + + -596016492 + + + 10 + + 32 + 1009 + + -1734001394 + + + 25 + + 33 + /file_concat_0 + 2 + + 6084289468290363112 + 512 + 1008 + + + -4219431127125026105 + -512 + 1 + + + 1352178323 + + + 10 + + 34 + 1010 + + 794444850 + + + 25 + + 35 + /file_concat_0 + 3 + + 6084289468290363112 + 512 + 1008 + + + -4219431127125026105 + 0 + 1 + + + -1765119074945211374 + -512 + 1 + + + -1530696539 9 - 27 - 5 - /file_concat_1 + 36 + /file_concat_0 1 - 1304060057764 - 1304060057742 + 1330405686013 + 1330405685983 512 3 - -754893470864399741 + 6084289468290363112 512 - 1005 + 1008 - 1820875380010181049 + -4219431127125026105 512 - 1005 + 1009 - 8266387560744259971 + -1765119074945211374 512 - 1005 + 1010 todd @@ -411,121 +497,336 @@ 420 - -654780301 + -2043978220 + + + 10 + + 
37 + 1011 + + 1010571629 + + + 0 + + 38 + /file_concat_1 + 1 + 1330405686017 + 1330405686017 + 512 + 0 + + todd + supergroup + 420 + + DFSClient_NONMAPREDUCE_-2143415023_1 + 127.0.0.1 + + -501297097 + + + 10 + + 39 + 1012 + + -1934711736 + + + 25 + + 40 + /file_concat_1 + 1 + + -7448471719302683860 + 0 + 1012 + + + -1853122907 + + + 10 + + 41 + 1013 + + 862670668 + + + 25 + + 42 + /file_concat_1 + 2 + + -7448471719302683860 + 512 + 1012 + + + -8051065559769974521 + -512 + 1 + + + -1169706939 + + + 10 + + 43 + 1014 + + -2070661520 + + + 25 + + 44 + /file_concat_1 + 3 + + -7448471719302683860 + 512 + 1012 + + + -8051065559769974521 + 0 + 1 + + + 3808670437711973616 + -512 + 1 + + + -1568093815 + + + 9 + + 45 + /file_concat_1 + 1 + 1330405686042 + 1330405686017 + 512 + 3 + + -7448471719302683860 + 512 + 1012 + + + -8051065559769974521 + 512 + 1013 + + + 3808670437711973616 + 512 + 1014 + + + todd + supergroup + 420 + + + -1640101896 16 - 28 - 4 + 46 /file_concat_target + 2 /file_concat_0 /file_concat_1 - 1304060057767 + 1330405686046 - 1273279541 + 2122891157 17 - 29 - 4 + 47 /file_symlink /file_concat_target - 1304060057770 - 1304060057770 + 1330405686051 + 1330405686051 todd supergroup 511 - 1385678569 + -585385283 18 - 30 + 48 0 todd JobTracker - 1304060057773 - 1304664857773 + 1330405686056 + 1331010486056 1 2 - 1304146457773 + 1330492086056 - 913145699 + 791321007 19 - 31 + 49 0 todd JobTracker - 1304060057773 - 1304664857773 + 1330405686056 + 1331010486056 1 2 - 1304146457785 + 1330492086075 - -1772039941 + 649714969 20 - 32 + 50 0 todd JobTracker - 1304060057773 - 1304664857773 + 1330405686056 + 1331010486056 1 2 - 1382094146 + 1190872628 + + + 10 + + 51 + 1015 + + -460593521 0 - 33 - 5 - /reassign-lease-test + 52 + /hard-lease-recovery-test 1 - 1286491964741 - 1286491964741 + 1330405686084 + 1330405686084 512 0 - atm + todd supergroup 420 - DFSClient_871171074 + DFSClient_NONMAPREDUCE_-2143415023_1 127.0.0.1 - 1975140107 + 2093219037 + + + 10 + + 53 + 1016 + + 120488596 + + + 25 + + 54 + /hard-lease-recovery-test + 1 + + -357061736603024522 + 0 + 1016 + + + 2098840974 + + + 25 + + 55 + /hard-lease-recovery-test + 1 + + -357061736603024522 + 0 + 1016 + + + -1794222801 + + + 10 + + 56 + 1017 + + -2123999915 22 - 34 - DFSClient_871171074 - /reassign-lease-test + 57 + DFSClient_NONMAPREDUCE_-2143415023_1 + /hard-lease-recovery-test HDFS_NameNode - 1975140107 + -1841690515 + + + 9 + + 58 + /hard-lease-recovery-test + 1 + 1330405688726 + 1330405686084 + 512 + 1 + + -357061736603024522 + 11 + 1017 + + + todd + supergroup + 420 + + + -218102037 23 - 35 + 59 - 1975140107 + -1616653774 -1 diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz new file mode 100644 index 0000000000..8e327c2f3c Binary files /dev/null and b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz differ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml index 0f5310c76f..eb3f4bd744 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml @@ -109,5 +109,12 @@ group list is separated by a blank. For e.g. "alice,bob users,wheel". A special value of "*" means all users are allowed. 
- + + + security.ha.service.protocol.acl + * + ACL for HAService protocol used by HAAdmin to manage the + active and stand-by states of namenode. + + diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh index 11d7022940..e644bbff50 100755 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh @@ -34,7 +34,7 @@ if git rev-parse HEAD 2>/dev/null > /dev/null ; then url="git://${hostname}${cwd}" elif [ -d .svn ]; then revision=`svn info ../ | sed -n -e 's/Last Changed Rev: \(.*\)/\1/p'` - url=`svn info ../ | sed -n -e 's/URL: \(.*\)/\1/p'` + url=`svn info ../ | sed -n -e 's/^URL: \(.*\)/\1/p'` # Get canonical branch (branches/X, tags/X, or trunk) branch=`echo $url | sed -n -e 's,.*\(branches/.*\)$,\1,p' \ -e 's,.*\(tags/.*\)$,\1,p' \ diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm new file mode 100644 index 0000000000..c66506734e --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm @@ -0,0 +1,434 @@ +~~ Licensed under the Apache License, Version 2.0 (the "License"); +~~ you may not use this file except in compliance with the License. +~~ You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, software +~~ distributed under the License is distributed on an "AS IS" BASIS, +~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +~~ See the License for the specific language governing permissions and +~~ limitations under the License. See accompanying LICENSE file. + + --- + Hadoop Distributed File System-${project.version} - High Availability + --- + --- + ${maven.build.timestamp} + +HDFS High Availability + + \[ {{{./index.html}Go Back}} \] + +%{toc|section=1|fromDepth=0} + +* {Purpose} + + This guide provides an overview of the HDFS High Availability (HA) feature and + how to configure and manage an HA HDFS cluster. + + This document assumes that the reader has a general understanding of + general components and node types in an HDFS cluster. Please refer to the + HDFS Architecture guide for details. + +* {Background} + + Prior to Hadoop 0.23.2, the NameNode was a single point of failure (SPOF) in + an HDFS cluster. Each cluster had a single NameNode, and if that machine or + process became unavailable, the cluster as a whole would be unavailable + until the NameNode was either restarted or brought up on a separate machine. + + This impacted the total availability of the HDFS cluster in two major ways: + + * In the case of an unplanned event such as a machine crash, the cluster would + be unavailable until an operator restarted the NameNode. + + * Planned maintenance events such as software or hardware upgrades on the + NameNode machine would result in windows of cluster downtime. + + The HDFS High Availability feature addresses the above problems by providing + the option of running two redundant NameNodes in the same cluster in an + Active/Passive configuration with a hot standby. 
+ This allows a fast failover to a new NameNode in the case that a machine
+ crashes, or a graceful administrator-initiated failover for the purpose of
+ planned maintenance.
+
+* {Architecture}
+
+ In a typical HA cluster, two separate machines are configured as NameNodes.
+ At any point in time, exactly one of the NameNodes is in an <<Active>> state,
+ and the other is in a <<Standby>> state. The Active NameNode is responsible
+ for all client operations in the cluster, while the Standby is simply acting
+ as a slave, maintaining enough state to provide a fast failover if necessary.
+
+ In order for the Standby node to keep its state synchronized with the Active
+ node, the current implementation requires that the two nodes both have access
+ to a directory on a shared storage device (e.g. an NFS mount from a NAS). This
+ restriction will likely be relaxed in future versions.
+
+ When any namespace modification is performed by the Active node, it durably
+ logs a record of the modification to an edit log file stored in the shared
+ directory. The Standby node is constantly watching this directory for edits,
+ and as it sees the edits, it applies them to its own namespace. In the event of
+ a failover, the Standby will ensure that it has read all of the edits from the
+ shared storage before promoting itself to the Active state. This ensures that
+ the namespace state is fully synchronized before a failover occurs.
+
+ In order to provide a fast failover, it is also necessary that the Standby node
+ have up-to-date information regarding the location of blocks in the cluster.
+ In order to achieve this, the DataNodes are configured with the location of
+ both NameNodes, and send block location information and heartbeats to both.
+
+ It is vital for the correct operation of an HA cluster that only one of the
+ NameNodes be Active at a time. Otherwise, the namespace state would quickly
+ diverge between the two, risking data loss or other incorrect results. In
+ order to ensure this property and prevent the so-called "split-brain scenario,"
+ the administrator must configure at least one <<fencing method>> for the shared
+ storage. During a failover, if it cannot be verified that the previous Active
+ node has relinquished its Active state, the fencing process is responsible for
+ cutting off the previous Active's access to the shared edits storage. This
+ prevents it from making any further edits to the namespace, allowing the new
+ Active to safely proceed with failover.
+
+ <<Note:>> Currently, only manual failover is supported. This means the HA
+ NameNodes are incapable of automatically detecting a failure of the Active
+ NameNode, and instead rely on the operator to manually initiate a failover.
+ Automatic failure detection and initiation of a failover will be implemented in
+ future versions.
+
+* {Hardware resources}
+
+ In order to deploy an HA cluster, you should prepare the following:
+
+ * <<NameNode machines>> - the machines on which you run the Active and
+   Standby NameNodes should have equivalent hardware to each other, and
+   equivalent hardware to what would be used in a non-HA cluster.
+
+ * <<Shared storage>> - you will need to have a shared directory which both
+   NameNode machines can have read/write access to. Typically this is a remote
+   filer which supports NFS and is mounted on each of the NameNode machines.
+   Currently only a single shared edits directory is supported.
+   Thus, the availability of the system is limited by the availability of this
+   shared edits directory, and therefore in order to remove all single points of
+   failure there needs to be redundancy for the shared edits directory.
+   Specifically, multiple network paths to the storage, and redundancy in the
+   storage itself (disk, network, and power). Because of this, it is recommended
+   that the shared storage server be a high-quality dedicated NAS appliance
+   rather than a simple Linux server.
+
+ Note that, in an HA cluster, the Standby NameNode also performs checkpoints of
+ the namespace state, and thus it is not necessary to run a Secondary NameNode,
+ CheckpointNode, or BackupNode in an HA cluster. In fact, to do so would be an
+ error. This also allows one who is reconfiguring a non-HA-enabled HDFS cluster
+ to be HA-enabled to reuse the hardware which they had previously dedicated to
+ the Secondary NameNode.
+
+* {Deployment}
+
+** Configuration overview
+
+ Similar to Federation configuration, HA configuration is backward compatible
+ and allows existing single NameNode configurations to work without change.
+ The new configuration is designed such that all the nodes in the cluster may
+ have the same configuration without the need for deploying different
+ configuration files to different machines based on the type of the node.
+
+ Like HDFS Federation, HA clusters reuse the <<nameservice ID>> to identify a
+ single HDFS instance that may in fact consist of multiple HA NameNodes. In
+ addition, a new abstraction called <<NameNode ID>> is added with HA. Each
+ distinct NameNode in the cluster has a different NameNode ID to distinguish it.
+ To support a single configuration file for all of the NameNodes, the relevant
+ configuration parameters are suffixed with the <nameservice ID> as well as
+ the <NameNode ID>.
+
+** Configuration details
+
+ To configure HA NameNodes, you must add several configuration options to your
+ <<hdfs-site.xml>> configuration file.
+
+ The order in which you set these configurations is unimportant, but the values
+ you choose for <<dfs.federation.nameservices>> and
+ <<dfs.ha.namenodes.[nameservice ID]>> will determine the keys of those that
+ follow. Thus, you should decide on these values before setting the rest of the
+ configuration options.
+
+ * <<dfs.federation.nameservices>> - the logical name for this new nameservice
+
+   Choose a logical name for this nameservice, for example "mycluster", and use
+   this logical name for the value of this config option. The name you choose is
+   arbitrary. It will be used both for configuration and as the authority
+   component of absolute HDFS paths in the cluster.
+
+   <<Note:>> If you are also using HDFS Federation, this configuration setting
+   should also include the list of other nameservices, HA or otherwise, as a
+   comma-separated list.
+
+----
+<property>
+  <name>dfs.federation.nameservices</name>
+  <value>mycluster</value>
+</property>
+----
+
+ * <<dfs.ha.namenodes.[nameservice ID]>> - unique identifiers for each NameNode in the nameservice
+
+   Configure with a list of comma-separated NameNode IDs. This will be used by
+   DataNodes to determine all the NameNodes in the cluster. For example, if you
+   used "mycluster" as the nameservice ID previously, and you wanted to use "nn1"
+   and "nn2" as the individual IDs of the NameNodes, you would configure this as
+   such:
+
+----
+<property>
+  <name>dfs.ha.namenodes.mycluster</name>
+  <value>nn1,nn2</value>
+</property>
+----
+
+   <<Note:>> Currently, only a maximum of two NameNodes may be configured per
+   nameservice.
+
+ * <<dfs.namenode.rpc-address.[nameservice ID].[name node ID]>> - the fully-qualified RPC address for each NameNode to listen on
+
+   For both of the previously-configured NameNode IDs, set the full address and
+   IPC port of the NameNode process.
+   Note that this results in two separate configuration options. For example:
+
+----
+<property>
+  <name>dfs.namenode.rpc-address.mycluster.nn1</name>
+  <value>machine1.example.com:8020</value>
+</property>
+
+<property>
+  <name>dfs.namenode.rpc-address.mycluster.nn2</name>
+  <value>machine2.example.com:8020</value>
+</property>
+----
+
+   <<Note:>> You may similarly configure the "<<servicerpc-address>>" setting if
+   you so desire.
+
+ * <<dfs.namenode.http-address.[nameservice ID].[name node ID]>> - the fully-qualified HTTP address for each NameNode to listen on
+
+   Similarly to above, set the addresses for both NameNodes' HTTP servers to
+   listen on. For example:
+
+----
+<property>
+  <name>dfs.namenode.http-address.mycluster.nn1</name>
+  <value>machine1.example.com:50070</value>
+</property>
+
+<property>
+  <name>dfs.namenode.http-address.mycluster.nn2</name>
+  <value>machine2.example.com:50070</value>
+</property>
+----
+
+   <<Note:>> If you have Hadoop's security features enabled, you should also set
+   the <<https-address>> similarly for each NameNode.
+
+ * <<dfs.namenode.shared.edits.dir>> - the location of the shared storage directory
+
+   This is where one configures the path to the remote shared edits directory
+   which the Standby NameNode uses to stay up-to-date with all the file system
+   changes the Active NameNode makes. This directory should be mounted r/w on
+   both NameNode machines. The value of this setting should be the absolute path
+   to this directory on the NameNode machines. For example:
+
+----
+<property>
+  <name>dfs.namenode.shared.edits.dir</name>
+  <value>file:///mnt/filer1/dfs/ha-name-dir-shared</value>
+</property>
+----
+
+ * <<dfs.client.failover.proxy.provider.[nameservice ID]>> - the Java class that HDFS clients use to contact the Active NameNode
+
+   Configure the name of the Java class which will be used by the DFS Client to
+   determine which NameNode is the current Active, and therefore which NameNode is
+   currently serving client requests. The only implementation which currently
+   ships with Hadoop is the <<ConfiguredFailoverProxyProvider>>, so use this
+   unless you are using a custom one. For example:
+
+----
+<property>
+  <name>dfs.client.failover.proxy.provider.mycluster</name>
+  <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+</property>
+----
+
+ * <<dfs.ha.fencing.methods>> - a list of scripts or Java classes which will be used to fence the Active NameNode during a failover
+
+   It is critical for correctness of the system that only one NameNode be in the
+   Active state at any given time. Thus, during a failover, we first ensure that
+   the Active NameNode is either in the Standby state, or the process has
+   terminated, before transitioning the other NameNode to the Active state. In
+   order to do this, you must configure at least one <<fencing method>>. These are
+   configured as a carriage-return-separated list, which will be attempted in
+   order until one indicates that fencing has succeeded. There are two methods
+   which ship with Hadoop: <<shell>> and <<sshfence>>. For information on
+   implementing your own custom fencing method, see the
+   <org.apache.hadoop.ha.NodeFencer> class.
+
+ * <<sshfence>> - SSH to the Active NameNode and kill the process
+
+   The <<sshfence>> option SSHes to the target node and uses <<fuser>> to kill
+   the process listening on the service's TCP port. In order for this fencing
+   option to work, it must be able to SSH to the target node without providing a
+   passphrase. Thus, one must also configure the
+   <<dfs.ha.fencing.ssh.private-key-files>> option, which is a
+   comma-separated list of SSH private key files. For example:
+
+---
+<property>
+  <name>dfs.ha.fencing.methods</name>
+  <value>sshfence</value>
+</property>
+
+<property>
+  <name>dfs.ha.fencing.ssh.private-key-files</name>
+  <value>/home/exampleuser/.ssh/id_rsa</value>
+</property>
+---
+
+   Optionally, one may configure a non-standard username or port to perform the
+   SSH. One may also configure a timeout, in milliseconds, for the SSH, after
+   which this fencing method will be considered to have failed.
+   It may be configured like so:
+
+---
+<property>
+  <name>dfs.ha.fencing.methods</name>
+  <value>sshfence([[username][:port]])</value>
+</property>
+
+<property>
+  <name>dfs.ha.fencing.ssh.connect-timeout</name>
+  <value></value>
+</property>
+---
+
+ * <<shell>> - run an arbitrary shell command to fence the Active NameNode
+
+   The <<shell>> fencing method runs an arbitrary shell command. It may be
+   configured like so:
+
+---
+<property>
+  <name>dfs.ha.fencing.methods</name>
+  <value>shell(/path/to/my/script.sh arg1 arg2 ...)</value>
+</property>
+---
+
+   The string between '(' and ')' is passed directly to a bash shell and may not
+   include any closing parentheses.
+
+   When executed, the first argument to the configured script will be the address
+   of the NameNode to be fenced, followed by all arguments specified in the
+   configuration.
+
+   The shell command will be run with an environment set up to contain all of the
+   current Hadoop configuration variables, with the '_' character replacing any
+   '.' characters in the configuration keys. If the shell command returns an exit
+   code of 0, the fencing is determined to be successful. If it returns any other
+   exit code, the fencing was not successful and the next fencing method in the
+   list will be attempted.
+
+   <<Note:>> This fencing method does not implement any timeout. If timeouts are
+   necessary, they should be implemented in the shell script itself (e.g. by
+   forking a subshell to kill its parent in some number of seconds).
+
+ * <<fs.defaultFS>> - the default path prefix used by the Hadoop FS client when none is given
+
+   Optionally, you may now configure the default path for Hadoop clients to use
+   the new HA-enabled logical URI. If you used "mycluster" as the nameservice ID
+   earlier, this will be the value of the authority portion of all of your HDFS
+   paths. This may be configured like so, in your <<core-site.xml>> file:
+
+---
+<property>
+  <name>fs.defaultFS</name>
+  <value>hdfs://mycluster</value>
+</property>
+---
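+
+ As an editor's aside (not part of the original guide), the sketch below shows
+ how the nameservice- and NameNode-suffixed keys configured above can be
+ resolved programmatically with the standard <<Configuration>> API. The
+ nameservice "mycluster" and the NameNode IDs "nn1"/"nn2" are the example
+ values used in this guide, and the class name <<ListHaRpcAddresses>> is
+ hypothetical.
+
+----
+// Editor's sketch: resolve the per-NameNode RPC addresses configured above.
+// Assumes an hdfs-site.xml containing the example properties is on the classpath.
+import org.apache.hadoop.conf.Configuration;
+
+public class ListHaRpcAddresses {
+  public static void main(String[] args) {
+    Configuration conf = new Configuration();
+    conf.addResource("hdfs-site.xml");
+
+    String nameservice = "mycluster";                        // example nameservice ID
+    String nnIds = conf.get("dfs.ha.namenodes." + nameservice, "");
+    for (String nnId : nnIds.split(",")) {
+      String key = "dfs.namenode.rpc-address." + nameservice + "." + nnId;
+      System.out.println(nnId + " -> " + conf.get(key));     // e.g. nn1 -> machine1.example.com:8020
+    }
+  }
+}
+----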
+** Deployment details
+
+ After all of the necessary configuration options have been set, one must
+ initially synchronize the two HA NameNodes' on-disk metadata. If you are
+ setting up a fresh HDFS cluster, you should first run the format command
+ (<hdfs namenode -format>) on one of the NameNodes. If you have already
+ formatted the NameNode, or are converting a non-HA-enabled cluster to be
+ HA-enabled, you should now copy over the contents of your NameNode metadata
+ directories to the other, unformatted NameNode using <scp> or a similar
+ utility. The location of the directories containing the NameNode metadata are
+ configured via the configuration options <<dfs.namenode.name.dir>> and/or
+ <<dfs.namenode.edits.dir>>. At this time, you should also ensure that the
+ shared edits dir (as configured by <<dfs.namenode.shared.edits.dir>>) includes
+ all recent edits files which are in your NameNode metadata directories.
+
+ At this point you may start both of your HA NameNodes as you normally would
+ start a NameNode.
+
+ You can visit each of the NameNodes' web pages separately by browsing to their
+ configured HTTP addresses. You should notice that next to the configured
+ address will be the HA state of the NameNode (either "standby" or "active").
+ Whenever an HA NameNode starts, it is initially in the Standby state.
+
+** Administrative commands
+
+ Now that your HA NameNodes are configured and started, you will have access
+ to some additional commands to administer your HA HDFS cluster. Specifically,
+ you should familiarize yourself with all of the subcommands of the
+ "<hdfs haadmin>" command. Running this command without any additional
+ arguments will display the following usage information:
+
+---
+Usage: DFSHAAdmin [-ns <nameserviceId>]
+    [-transitionToActive <serviceId>]
+    [-transitionToStandby <serviceId>]
+    [-failover [--forcefence] [--forceactive] <serviceId> <serviceId>]
+    [-getServiceState <serviceId>]
+    [-checkHealth <serviceId>]
+    [-help <command>]
+---
+
+ This guide describes high-level uses of each of these subcommands. For
+ specific usage information of each subcommand, you should run
+ "<hdfs haadmin -help [subcommand]>".
+
+ * <<transitionToActive>> and <<transitionToStandby>> - transition the state of the given NameNode to Active or Standby
+
+   These subcommands cause a given NameNode to transition to the Active or
+   Standby state, respectively. These commands do not attempt to perform any
+   fencing, and thus should rarely be used. Instead, one should almost always
+   prefer to use the "<hdfs haadmin -failover>" subcommand.
+
+ * <<failover>> - initiate a failover between two NameNodes
+
+   This subcommand causes a failover from the first provided NameNode to the
+   second. If the first NameNode is in the Standby state, this command simply
+   transitions the second to the Active state without error. If the first
+   NameNode is in the Active state, an attempt will be made to gracefully
+   transition it to the Standby state. If this fails, the fencing methods (as
+   configured by <<dfs.ha.fencing.methods>>) will be attempted in order until
+   one succeeds. Only after this process will the second NameNode be
+   transitioned to the Active state. If no fencing method succeeds, the second
+   NameNode will not be transitioned to the Active state, and an error will be
+   returned.
+
+ * <<getServiceState>> - determine whether the given NameNode is Active or Standby
+
+   Connect to the provided NameNode to determine its current state, printing
+   either "standby" or "active" to STDOUT appropriately. This subcommand might be
+   used by cron jobs or monitoring scripts which need to behave differently based
+   on whether the NameNode is currently Active or Standby.
+
+ * <<checkHealth>> - check the health of the given NameNode
+
+   Connect to the provided NameNode to check its health. The NameNode is capable
+   of performing some diagnostics on itself, including checking if internal
+   services are running as expected. This command will return 0 if the NameNode
+   is healthy, non-zero otherwise. One might use this command for monitoring
+   purposes.
+
+   <<Note:>> This is not yet implemented, and at present will always return
+   success, unless the given NameNode is completely down.
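+
+ As an editor's aside (not part of the original guide), the sketch below shows
+ one way a monitoring hook might use these subcommands: <<getServiceState>>
+ prints the state on STDOUT and <<checkHealth>> reports health through its exit
+ code. The NameNode ID "nn1" is the example value used earlier, and the class
+ name <<HaStateProbe>> is hypothetical.
+
+----
+// Editor's sketch: poll a NameNode's HA state and health via the haadmin CLI.
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+public class HaStateProbe {
+  public static void main(String[] args) throws Exception {
+    String nnId = args.length > 0 ? args[0] : "nn1";   // example NameNode ID
+
+    // getServiceState prints "active" or "standby" on STDOUT.
+    Process state = new ProcessBuilder("hdfs", "haadmin", "-getServiceState", nnId).start();
+    try (BufferedReader r = new BufferedReader(new InputStreamReader(state.getInputStream()))) {
+      System.out.println(nnId + " state: " + r.readLine());
+    }
+    state.waitFor();
+
+    // checkHealth exits 0 if the NameNode considers itself healthy.
+    Process health = new ProcessBuilder("hdfs", "haadmin", "-checkHealth", nnId).start();
+    System.out.println(nnId + " healthy: " + (health.waitFor() == 0));
+  }
+}
+----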
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 93e9742483..1cf5a7262d 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -592,6 +592,11 @@ commons-daemon ${commons-daemon.version} + + com.jcraft + jsch + 0.1.42 + org.jdom jdom diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml index 27f9b7b983..f992a17b63 100644 --- a/hadoop-project/src/site/site.xml +++ b/hadoop-project/src/site/site.xml @@ -53,6 +53,7 @@ + diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java index 1d248f082a..563372e009 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java @@ -22,7 +22,9 @@ package org.apache.hadoop.tools.util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.retry.RetryPolicy; +import org.apache.hadoop.io.retry.RetryPolicy.RetryAction; import org.apache.hadoop.io.retry.RetryPolicies; +import org.apache.hadoop.util.ThreadUtil; import java.io.IOException; import java.util.concurrent.TimeUnit; @@ -80,7 +82,7 @@ public abstract class RetriableCommand { public Object execute(Object... arguments) throws Exception { Exception latestException; int counter = 0; - do { + while (true) { try { return doExecute(arguments); } catch(Exception exception) { @@ -88,7 +90,13 @@ public abstract class RetriableCommand { latestException = exception; } counter++; - } while (retryPolicy.shouldRetry(latestException, counter, 0, true).equals(RetryPolicy.RetryAction.RETRY)); + RetryAction action = retryPolicy.shouldRetry(latestException, counter, 0, true); + if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY) { + ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis); + } else { + break; + } + } throw new IOException("Couldn't run retriable-command: " + description, latestException); diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java index e5ab0595c3..5ba5eb8867 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java @@ -545,7 +545,12 @@ public class TestCopyMapper { Assert.fail("Didn't expect the file to be copied"); } catch (AccessControlException ignore) { } catch (Exception e) { - if (e.getCause() == null || !(e.getCause() instanceof AccessControlException)) { + // We want to make sure the underlying cause of the exception is + // due to permissions error. The exception we're interested in is + // wrapped twice - once in RetriableCommand and again in CopyMapper + // itself. + if (e.getCause() == null || e.getCause().getCause() == null || + !(e.getCause().getCause() instanceof AccessControlException)) { throw new RuntimeException(e); } }
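
Editor's note on the RetriableCommand change above: the old do/while asked the
RetryPolicy whether to retry but never honored the delay in its answer. The
corrected loop retries only while the policy returns a RETRY decision and sleeps
for the delay it specifies. The sketch below illustrates that loop shape with a
simplified stand-in policy interface (RetrySketch, Decider, and Verdict are
hypothetical names, not Hadoop's actual RetryPolicy API):

----
// Editor's sketch of the retry-loop shape used in RetriableCommand after the fix.
import java.util.concurrent.Callable;

public class RetrySketch {
  enum Decision { RETRY, FAIL }

  static class Verdict {
    final Decision decision;
    final long delayMillis;
    Verdict(Decision decision, long delayMillis) {
      this.decision = decision;
      this.delayMillis = delayMillis;
    }
  }

  interface Decider {
    Verdict shouldRetry(Exception latest, int retries);
  }

  static <T> T execute(Callable<T> command, Decider policy) throws Exception {
    Exception latest;
    int counter = 0;
    while (true) {
      try {
        return command.call();              // success: return immediately
      } catch (Exception e) {
        latest = e;
      }
      counter++;
      Verdict verdict = policy.shouldRetry(latest, counter);
      if (verdict.decision == Decision.RETRY) {
        Thread.sleep(verdict.delayMillis);  // honor the policy's backoff delay
      } else {
        break;                              // any non-RETRY decision stops retrying
      }
    }
    throw new Exception("Couldn't run retriable-command", latest);
  }
}
----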