HDDS-917. Expose NodeManagerMXBean as a MetricsSource. Contributed by Siddharth Wagle.

This commit is contained in:
Bharat Viswanadham 2019-03-14 12:30:06 -07:00
parent 688b177fc6
commit 091a664977
No known key found for this signature in database
GPG Key ID: 6A6F3FB121D2F77C
3 changed files with 114 additions and 6 deletions

View File

@@ -102,7 +102,6 @@ public class SCMNodeManager implements NodeManager {
public SCMNodeManager(OzoneConfiguration conf, String clusterID, public SCMNodeManager(OzoneConfiguration conf, String clusterID,
StorageContainerManager scmManager, EventPublisher eventPublisher) StorageContainerManager scmManager, EventPublisher eventPublisher)
throws IOException { throws IOException {
this.metrics = SCMNodeMetrics.create();
this.nodeStateManager = new NodeStateManager(conf, eventPublisher); this.nodeStateManager = new NodeStateManager(conf, eventPublisher);
this.clusterID = clusterID; this.clusterID = clusterID;
this.version = VersionInfo.getLatestVersion(); this.version = VersionInfo.getLatestVersion();
@@ -110,6 +109,7 @@ public SCMNodeManager(OzoneConfiguration conf, String clusterID,
this.scmManager = scmManager; this.scmManager = scmManager;
LOG.info("Entering startup chill mode."); LOG.info("Entering startup chill mode.");
registerMXBean(); registerMXBean();
this.metrics = SCMNodeMetrics.create(this);
} }
private void registerMXBean() { private void registerMXBean() {

View File

@@ -18,11 +18,24 @@
package org.apache.hadoop.hdds.scm.node; package org.apache.hadoop.hdds.scm.node;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONED;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONING;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsInfo;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem; import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.Interns;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong; import org.apache.hadoop.metrics2.lib.MutableCounterLong;
/** /**
@@ -30,7 +43,7 @@
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
@Metrics(about = "SCM NodeManager Metrics", context = "ozone") @Metrics(about = "SCM NodeManager Metrics", context = "ozone")
public final class SCMNodeMetrics { public final class SCMNodeMetrics implements MetricsSource {
private static final String SOURCE_NAME = private static final String SOURCE_NAME =
SCMNodeMetrics.class.getSimpleName(); SCMNodeMetrics.class.getSimpleName();
@@ -40,18 +53,26 @@ public final class SCMNodeMetrics {
private @Metric MutableCounterLong numNodeReportProcessed; private @Metric MutableCounterLong numNodeReportProcessed;
private @Metric MutableCounterLong numNodeReportProcessingFailed; private @Metric MutableCounterLong numNodeReportProcessingFailed;
private final MetricsRegistry registry;
private final NodeManagerMXBean managerMXBean;
private final MetricsInfo recordInfo = Interns.info("SCMNodeManager",
"SCM NodeManager metrics");
/** Private constructor. */ /** Private constructor. */
private SCMNodeMetrics() { } private SCMNodeMetrics(NodeManagerMXBean managerMXBean) {
this.managerMXBean = managerMXBean;
this.registry = new MetricsRegistry(recordInfo);
}
/** /**
* Create and returns SCMNodeMetrics instance. * Create and returns SCMNodeMetrics instance.
* *
* @return SCMNodeMetrics * @return SCMNodeMetrics
*/ */
public static SCMNodeMetrics create() { public static SCMNodeMetrics create(NodeManagerMXBean managerMXBean) {
MetricsSystem ms = DefaultMetricsSystem.instance(); MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM NodeManager Metrics", return ms.register(SOURCE_NAME, "SCM NodeManager Metrics",
new SCMNodeMetrics()); new SCMNodeMetrics(managerMXBean));
} }
/** /**
@@ -90,4 +111,51 @@ void incNumNodeReportProcessingFailed() {
numNodeReportProcessingFailed.incr(); numNodeReportProcessingFailed.incr();
} }
/**
 * Get aggregated counter and gauge metrics.
 *
 * <p>Snapshots the annotated {@code @Metric} counters from the registry and
 * additionally publishes per-state node counts and aggregated capacity
 * figures pulled from {@link NodeManagerMXBean}.
 *
 * @param collector the sink that receives the metrics record
 * @param all whether to emit metrics even if unchanged since last snapshot
 */
@Override
@SuppressWarnings("SuspiciousMethodCalls")
public void getMetrics(MetricsCollector collector, boolean all) {
  // NOTE(review): the gauges below unbox Integer/Long map values — assumes
  // the MXBean maps always contain every node-state and capacity key;
  // a missing key would NPE here. Confirm against NodeManagerMXBean impl.
  Map<String, Integer> nodeCount = managerMXBean.getNodeCount();
  Map<String, Long> nodeInfo = managerMXBean.getNodeInfo();
  registry.snapshot(
      collector.addRecord(registry.info()) // Add annotated ones first
          .addGauge(Interns.info(
              "HealthyNodes",
              "Number of healthy datanodes"),
              nodeCount.get(HEALTHY.toString()))
          .addGauge(Interns.info("StaleNodes",
              "Number of stale datanodes"),
              nodeCount.get(STALE.toString()))
          .addGauge(Interns.info("DeadNodes",
              "Number of dead datanodes"),
              nodeCount.get(DEAD.toString()))
          .addGauge(Interns.info("DecommissioningNodes",
              "Number of decommissioning datanodes"),
              nodeCount.get(DECOMMISSIONING.toString()))
          .addGauge(Interns.info("DecommissionedNodes",
              "Number of decommissioned datanodes"),
              nodeCount.get(DECOMMISSIONED.toString()))
          .addGauge(Interns.info("DiskCapacity",
              "Total disk capacity"),
              nodeInfo.get("DISKCapacity"))
          .addGauge(Interns.info("DiskUsed",
              "Total disk capacity used"),
              nodeInfo.get("DISKUsed"))
          .addGauge(Interns.info("DiskRemaining",
              "Total disk capacity remaining"),
              nodeInfo.get("DISKRemaining"))
          .addGauge(Interns.info("SSDCapacity",
              "Total ssd capacity"),
              nodeInfo.get("SSDCapacity"))
          .addGauge(Interns.info("SSDUsed",
              "Total ssd capacity used"),
              nodeInfo.get("SSDUsed"))
          // Fixed copy-paste defect: description previously read
          // "Total disk capacity remaining" for the SSD gauge.
          .addGauge(Interns.info("SSDRemaining",
              "Total ssd capacity remaining"),
              nodeInfo.get("SSDRemaining")),
      all);
}
} }

View File

@@ -35,6 +35,7 @@
import org.junit.Test; import org.junit.Test;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
@@ -128,6 +129,45 @@ public void testNodeReportProcessingFailure() {
getMetrics(SCMNodeMetrics.class.getSimpleName())); getMetrics(SCMNodeMetrics.class.getSimpleName()));
} }
/**
 * Verify that datanode aggregated state and capacity metrics are reported.
 */
@Test
public void testNodeCountAndInfoMetricsReported() throws Exception {
  HddsDatanodeService dn = cluster.getHddsDatanodes().get(0);
  // Single volume: capacity 100, used 10, remaining 90.
  StorageReportProto storage = TestUtils.createStorageReport(
      dn.getDatanodeDetails().getUuid(), "/tmp", 100, 10, 90, null);
  NodeReportProto report =
      NodeReportProto.newBuilder().addStorageReport(storage).build();
  dn.getDatanodeStateMachine().getContext().addReport(report);
  dn.getDatanodeStateMachine().triggerHeartbeat();
  // Give some time so that SCM receives and processes the heartbeat.
  Thread.sleep(300L);

  // Node-state gauges: the single datanode is healthy, nothing else.
  assertGauge("HealthyNodes", 1,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("StaleNodes", 0,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("DeadNodes", 0,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("DecommissioningNodes", 0,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("DecommissionedNodes", 0,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));

  // Capacity gauges: disk figures match the storage report; no SSD present.
  assertGauge("DiskCapacity", 100L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("DiskUsed", 10L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("DiskRemaining", 90L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("SSDCapacity", 0L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("SSDUsed", 0L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
  assertGauge("SSDRemaining", 0L,
      getMetrics(SCMNodeMetrics.class.getSimpleName()));
}
@After @After
public void teardown() { public void teardown() {
cluster.shutdown(); cluster.shutdown();