HDFS-17055 Export HAState as a metric from Namenode for monitoring (#5764)

This commit is contained in:
Xing Lin 2023-06-26 15:52:59 -07:00 committed by GitHub
parent a85272c33d
commit 03902f5ef0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 88 additions and 4 deletions

View File

@ -48,6 +48,7 @@
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
@ -68,6 +69,7 @@
* </ol> * </ol>
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
@Metrics(context="dfs")
public class BackupNode extends NameNode { public class BackupNode extends NameNode {
private static final String BN_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY; private static final String BN_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY;
private static final String BN_ADDRESS_DEFAULT = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT; private static final String BN_ADDRESS_DEFAULT = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT;

View File

@ -78,6 +78,8 @@
import org.apache.hadoop.ipc.RetriableException; import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.ipc.Server; import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.util.MBeans; import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetUtils;
@ -252,6 +254,7 @@
* NameNode state, for example partial blocksMap etc. * NameNode state, for example partial blocksMap etc.
**********************************************************/ **********************************************************/
@InterfaceAudience.Private @InterfaceAudience.Private
@Metrics(context="dfs")
public class NameNode extends ReconfigurableBase implements public class NameNode extends ReconfigurableBase implements
NameNodeStatusMXBean, TokenVerifier<DelegationTokenIdentifier> { NameNodeStatusMXBean, TokenVerifier<DelegationTokenIdentifier> {
static{ static{
@ -1146,6 +1149,7 @@ protected NameNode(Configuration conf, NamenodeRole role)
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE,
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT); DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT);
this.started.set(true); this.started.set(true);
DefaultMetricsSystem.instance().register(this);
} }
private void stopAtException(Exception e){ private void stopAtException(Exception e){
@ -1216,6 +1220,7 @@ public void stop() {
levelDBAliasMapServer.close(); levelDBAliasMapServer.close();
} }
} }
started.set(false);
tracer.close(); tracer.close();
} }
@ -2051,6 +2056,26 @@ synchronized HAServiceState getServiceState() {
return state.getServiceState(); return state.getServiceState();
} }
/**
 * Expose the NameNode HA service state as an integer metric so external
 * monitoring systems can track NN HA state.
 *
 * The value mirrors the ordinal of {@link HAServiceState}:
 * 0 when the NameNode has not fully started (INITIALIZING),
 * 1 for an active or standalone (non-HA) NameNode,
 * 2 for a standby NameNode,
 * 3 for an observer NameNode.
 *
 * @return the ordinal of the current HA service state, or
 *         INITIALIZING's ordinal (0) before startup completes.
 */
@Metric({"NameNodeState", "Namenode HA service state"})
public int getNameNodeState() {
  // Before the NN is fully started (or after stop), report INITIALIZING
  // rather than dereferencing a possibly-null state object.
  final boolean ready = isStarted() && state != null;
  HAServiceState current =
      ready ? state.getServiceState() : HAServiceState.INITIALIZING;
  return current.ordinal();
}
/** /**
* Register NameNodeStatusMXBean * Register NameNodeStatusMXBean
*/ */

View File

@ -153,7 +153,7 @@ public void testFinalize() throws Exception {
UpgradeUtilities.createEmptyDirs(dataNodeDirs); UpgradeUtilities.createEmptyDirs(dataNodeDirs);
log("Finalize NN & BP with existing previous dir", numDirs); log("Finalize NN & BP with existing previous dir", numDirs);
String bpid = UpgradeUtilities.getCurrentBlockPoolID(cluster); String bpid = UpgradeUtilities.getCurrentBlockPoolID(null);
UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current"); UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "previous"); UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "previous");
UpgradeUtilities.createDataNodeStorageDirs(dataNodeDirs, "current"); UpgradeUtilities.createDataNodeStorageDirs(dataNodeDirs, "current");

View File

@ -328,7 +328,7 @@ public void testRollback() throws Exception {
UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE); UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs,
storageInfo, UpgradeUtilities.getCurrentBlockPoolID(cluster)); storageInfo, UpgradeUtilities.getCurrentBlockPoolID(null));
startNameNodeShouldFail("Cannot rollback to storage version 1 using this version"); startNameNodeShouldFail("Cannot rollback to storage version 1 using this version");
UpgradeUtilities.createEmptyDirs(nameNodeDirs); UpgradeUtilities.createEmptyDirs(nameNodeDirs);
} // end numDir loop } // end numDir loop

View File

@ -349,7 +349,7 @@ public void testUpgrade() throws Exception {
UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE); UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo, UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo,
UpgradeUtilities.getCurrentBlockPoolID(cluster)); UpgradeUtilities.getCurrentBlockPoolID(null));
startNameNodeShouldFail(StartupOption.UPGRADE); startNameNodeShouldFail(StartupOption.UPGRADE);
UpgradeUtilities.createEmptyDirs(nameNodeDirs); UpgradeUtilities.createEmptyDirs(nameNodeDirs);
@ -362,7 +362,7 @@ public void testUpgrade() throws Exception {
UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE); UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo, UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo,
UpgradeUtilities.getCurrentBlockPoolID(cluster)); UpgradeUtilities.getCurrentBlockPoolID(null));
startNameNodeShouldFail(StartupOption.UPGRADE); startNameNodeShouldFail(StartupOption.UPGRADE);
UpgradeUtilities.createEmptyDirs(nameNodeDirs); UpgradeUtilities.createEmptyDirs(nameNodeDirs);

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.hdfs.server.namenode; package org.apache.hadoop.hdfs.server.namenode;
import java.util.function.Supplier; import java.util.function.Supplier;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -116,6 +117,7 @@ private NameNode makeNameNode(boolean enableMetricsLogging)
/** /**
* A NameNode that stubs out the NameSystem for testing. * A NameNode that stubs out the NameSystem for testing.
*/ */
@Metrics(context="dfs")
private static class TestNameNode extends NameNode { private static class TestNameNode extends NameNode {
@Override @Override
protected void loadNamesystem(Configuration conf) throws IOException { protected void loadNamesystem(Configuration conf) throws IOException {

View File

@ -17,6 +17,8 @@
*/ */
package org.apache.hadoop.hdfs.server.namenode.ha; package org.apache.hadoop.hdfs.server.namenode.ha;
import java.io.IOException;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -29,6 +31,7 @@
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.junit.Test; import org.junit.Test;
@ -176,4 +179,56 @@ public void testHAInodeCount() throws Exception {
} }
} }
/**
 * Test the getNameNodeState() API added to NameNode.java.
 *
 * Verifies that the reported integer tracks HA transitions
 * (standby -> active -> observer) and falls back to the INITIALIZING
 * value once a NameNode is shut down.
 *
 * @throws IOException if cluster setup or an HA transition fails
 */
@Test
public void testGetNameNodeState() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, Integer.MAX_VALUE);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(
      MiniDFSNNTopology.simpleHATopology(3)).numDataNodes(1).build();
  try {
    cluster.waitActive();

    NameNode nn0 = cluster.getNameNode(0);
    NameNode nn1 = cluster.getNameNode(1);
    NameNode nn2 = cluster.getNameNode(2);

    // All namenodes are in standby by default
    assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
        nn0.getNameNodeState());
    assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
        nn1.getNameNodeState());
    assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
        nn2.getNameNodeState());

    // Transition nn0 to be active
    cluster.transitionToActive(0);
    assertEquals(HAServiceProtocol.HAServiceState.ACTIVE.ordinal(),
        nn0.getNameNodeState());

    // Transition nn1 to be active
    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);
    assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
        nn0.getNameNodeState());
    assertEquals(HAServiceProtocol.HAServiceState.ACTIVE.ordinal(),
        nn1.getNameNodeState());

    // Transition nn2 to observer
    cluster.transitionToObserver(2);
    assertEquals(HAServiceProtocol.HAServiceState.OBSERVER.ordinal(),
        nn2.getNameNodeState());

    // Shutdown nn2. Now getNameNodeState should return the INITIALIZING state.
    cluster.shutdownNameNode(2);
    assertEquals(HAServiceProtocol.HAServiceState.INITIALIZING.ordinal(),
        nn2.getNameNodeState());
  } finally {
    // Always tear down the mini cluster, even on assertion failure, so
    // threads and ports are not leaked into subsequent tests.
    cluster.shutdown();
  }
}
} }