diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java index e457b13e22..4464ed1704 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java @@ -102,7 +102,6 @@ public class SCMNodeManager implements NodeManager { public SCMNodeManager(OzoneConfiguration conf, String clusterID, StorageContainerManager scmManager, EventPublisher eventPublisher) throws IOException { - this.metrics = SCMNodeMetrics.create(); this.nodeStateManager = new NodeStateManager(conf, eventPublisher); this.clusterID = clusterID; this.version = VersionInfo.getLatestVersion(); @@ -110,6 +109,7 @@ public SCMNodeManager(OzoneConfiguration conf, String clusterID, this.scmManager = scmManager; LOG.info("Entering startup chill mode."); registerMXBean(); + this.metrics = SCMNodeMetrics.create(this); } private void registerMXBean() { @@ -118,7 +118,7 @@ private void registerMXBean() { } private void unregisterMXBean() { - if(this.nmInfoBean != null) { + if (this.nmInfoBean != null) { MBeans.unregister(this.nmInfoBean); this.nmInfoBean = null; } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java index 30b10795de..1596523bbc 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java @@ -18,11 +18,24 @@ package org.apache.hadoop.hdds.scm.node; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONED; +import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONING; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE; + +import java.util.Map; + import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsSource; import org.apache.hadoop.metrics2.MetricsSystem; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.Interns; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MutableCounterLong; /** @@ -30,7 +43,7 @@ */ @InterfaceAudience.Private @Metrics(about = "SCM NodeManager Metrics", context = "ozone") -public final class SCMNodeMetrics { +public final class SCMNodeMetrics implements MetricsSource { private static final String SOURCE_NAME = SCMNodeMetrics.class.getSimpleName(); @@ -40,18 +53,26 @@ public final class SCMNodeMetrics { private @Metric MutableCounterLong numNodeReportProcessed; private @Metric MutableCounterLong numNodeReportProcessingFailed; + private final MetricsRegistry registry; + private final NodeManagerMXBean managerMXBean; + private final MetricsInfo recordInfo = Interns.info("SCMNodeManager", + "SCM NodeManager metrics"); + /** Private constructor. */ - private SCMNodeMetrics() { } + private SCMNodeMetrics(NodeManagerMXBean managerMXBean) { + this.managerMXBean = managerMXBean; + this.registry = new MetricsRegistry(recordInfo); + } /** * Create and returns SCMNodeMetrics instance. 
* * @return SCMNodeMetrics */ - public static SCMNodeMetrics create() { + public static SCMNodeMetrics create(NodeManagerMXBean managerMXBean) { MetricsSystem ms = DefaultMetricsSystem.instance(); return ms.register(SOURCE_NAME, "SCM NodeManager Metrics", - new SCMNodeMetrics()); + new SCMNodeMetrics(managerMXBean)); } /** @@ -90,4 +111,51 @@ void incNumNodeReportProcessingFailed() { numNodeReportProcessingFailed.incr(); } + /** + * Get aggregated counter and gauge metrics. + */ + @Override + @SuppressWarnings("SuspiciousMethodCalls") + public void getMetrics(MetricsCollector collector, boolean all) { + Map nodeCount = managerMXBean.getNodeCount(); + Map nodeInfo = managerMXBean.getNodeInfo(); + + registry.snapshot( + collector.addRecord(registry.info()) // Add annotated ones first + .addGauge(Interns.info( + "HealthyNodes", + "Number of healthy datanodes"), + nodeCount.get(HEALTHY.toString())) + .addGauge(Interns.info("StaleNodes", + "Number of stale datanodes"), + nodeCount.get(STALE.toString())) + .addGauge(Interns.info("DeadNodes", + "Number of dead datanodes"), + nodeCount.get(DEAD.toString())) + .addGauge(Interns.info("DecommissioningNodes", + "Number of decommissioning datanodes"), + nodeCount.get(DECOMMISSIONING.toString())) + .addGauge(Interns.info("DecommissionedNodes", + "Number of decommissioned datanodes"), + nodeCount.get(DECOMMISSIONED.toString())) + .addGauge(Interns.info("DiskCapacity", + "Total disk capacity"), + nodeInfo.get("DISKCapacity")) + .addGauge(Interns.info("DiskUsed", + "Total disk capacity used"), + nodeInfo.get("DISKUsed")) + .addGauge(Interns.info("DiskRemaining", + "Total disk capacity remaining"), + nodeInfo.get("DISKRemaining")) + .addGauge(Interns.info("SSDCapacity", + "Total ssd capacity"), + nodeInfo.get("SSDCapacity")) + .addGauge(Interns.info("SSDUsed", + "Total ssd capacity used"), + nodeInfo.get("SSDUsed")) + .addGauge(Interns.info("SSDRemaining", + "Total ssd capacity remaining"), + nodeInfo.get("SSDRemaining")), + all); 
+ } } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java index c18ae5fe3c..d19be936b0 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java @@ -35,6 +35,7 @@ import org.junit.Test; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; +import static org.apache.hadoop.test.MetricsAsserts.assertGauge; import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; @@ -128,6 +129,45 @@ public void testNodeReportProcessingFailure() { getMetrics(SCMNodeMetrics.class.getSimpleName())); } + /** + * Verify that datanode aggregated state and capacity metrics are reported. + */ + @Test + public void testNodeCountAndInfoMetricsReported() throws Exception { + HddsDatanodeService datanode = cluster.getHddsDatanodes().get(0); + StorageReportProto storageReport = TestUtils.createStorageReport( + datanode.getDatanodeDetails().getUuid(), "/tmp", 100, 10, 90, null); + NodeReportProto nodeReport = NodeReportProto.newBuilder() + .addStorageReport(storageReport).build(); + datanode.getDatanodeStateMachine().getContext().addReport(nodeReport); + datanode.getDatanodeStateMachine().triggerHeartbeat(); + // Give some time so that SCM receives and processes the heartbeat. 
+ Thread.sleep(300L); + + assertGauge("HealthyNodes", 1, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("StaleNodes", 0, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DeadNodes", 0, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DecommissioningNodes", 0, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DecommissionedNodes", 0, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DiskCapacity", 100L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DiskUsed", 10L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("DiskRemaining", 90L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("SSDCapacity", 0L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("SSDUsed", 0L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + assertGauge("SSDRemaining", 0L, + getMetrics(SCMNodeMetrics.class.getSimpleName())); + } + @After public void teardown() { cluster.shutdown();