HDFS-16678. RBF should supports disable getNodeUsage() in RBFMetrics (#4606)

Author: xuzq
Date:   2022-08-13 03:01:58 +08:00 (committed by GitHub)
Parent: 521e65acfe
Commit: e0c8c6eed4
4 changed files with 69 additions and 16 deletions

@@ -50,6 +50,7 @@
 import javax.management.ObjectName;
 import javax.management.StandardMBean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;

@@ -113,6 +114,8 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
   /** Prevent holding the page from load too long. */
   private final long timeOut;
+  /** Enable/Disable getNodeUsage. **/
+  private boolean enableGetDNUsage;

   /** Router interface. */
   private final Router router;

@@ -175,6 +178,8 @@ public RBFMetrics(Router router) throws IOException {
     Configuration conf = router.getConfig();
     this.timeOut = conf.getTimeDuration(RBFConfigKeys.DN_REPORT_TIME_OUT,
         RBFConfigKeys.DN_REPORT_TIME_OUT_MS_DEFAULT, TimeUnit.MILLISECONDS);
+    this.enableGetDNUsage = conf.getBoolean(RBFConfigKeys.DFS_ROUTER_ENABLE_GET_DN_USAGE_KEY,
+        RBFConfigKeys.DFS_ROUTER_ENABLE_GET_DN_USAGE_DEFAULT);
     this.topTokenRealOwners = conf.getInt(
         RBFConfigKeys.DFS_ROUTER_METRICS_TOP_NUM_TOKEN_OWNERS_KEY,
         RBFConfigKeys.DFS_ROUTER_METRICS_TOP_NUM_TOKEN_OWNERS_KEY_DEFAULT);

@@ -184,6 +189,11 @@ public RBFMetrics(Router router) throws IOException {
     ms.register(RBFMetrics.class.getName(), "RBFActivity Metrics", this);
   }

+  @VisibleForTesting
+  public void setEnableGetDNUsage(boolean enableGetDNUsage) {
+    this.enableGetDNUsage = enableGetDNUsage;
+  }
+
   /**
    * Unregister the JMX beans.
    */

@@ -537,35 +547,34 @@ public int getNumEnteringMaintenanceDataNodes() {
   @Override // NameNodeMXBean
   public String getNodeUsage() {
-    float median = 0;
-    float max = 0;
-    float min = 0;
-    float dev = 0;
+    double median = 0;
+    double max = 0;
+    double min = 0;
+    double dev = 0;

     final Map<String, Map<String, Object>> info = new HashMap<>();
     try {
-      RouterRpcServer rpcServer = this.router.getRpcServer();
-      DatanodeInfo[] live = rpcServer.getDatanodeReport(
-          DatanodeReportType.LIVE, false, timeOut);
+      DatanodeInfo[] live = null;
+      if (this.enableGetDNUsage) {
+        RouterRpcServer rpcServer = this.router.getRpcServer();
+        live = rpcServer.getDatanodeReport(DatanodeReportType.LIVE, false, timeOut);
+      } else {
+        LOG.debug("Getting node usage is disabled.");
+      }

-      if (live.length > 0) {
-        float totalDfsUsed = 0;
-        float[] usages = new float[live.length];
+      if (live != null && live.length > 0) {
+        double[] usages = new double[live.length];
         int i = 0;
         for (DatanodeInfo dn : live) {
           usages[i++] = dn.getDfsUsedPercent();
-          totalDfsUsed += dn.getDfsUsedPercent();
         }
-        totalDfsUsed /= live.length;
         Arrays.sort(usages);
         median = usages[usages.length / 2];
         max = usages[usages.length - 1];
         min = usages[0];
-        for (i = 0; i < usages.length; i++) {
-          dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
-        }
-        dev = (float) Math.sqrt(dev / usages.length);
+
+        StandardDeviation deviation = new StandardDeviation();
+        dev = deviation.evaluate(usages);
       }
     } catch (IOException e) {
       LOG.error("Cannot get the live nodes: {}", e.getMessage());

@@ -321,6 +321,9 @@ public class RBFConfigKeys extends CommonConfigurationKeysPublic {
       FEDERATION_ROUTER_PREFIX + "dn-report.cache-expire";
   public static final long DN_REPORT_CACHE_EXPIRE_MS_DEFAULT =
       TimeUnit.SECONDS.toMillis(10);
+  public static final String DFS_ROUTER_ENABLE_GET_DN_USAGE_KEY =
+      FEDERATION_ROUTER_PREFIX + "enable.get.dn.usage";
+  public static final boolean DFS_ROUTER_ENABLE_GET_DN_USAGE_DEFAULT = true;

   // HDFS Router-based federation quota
   public static final String DFS_ROUTER_QUOTA_ENABLE =

@@ -195,6 +195,16 @@
     </description>
   </property>

+  <property>
+    <name>dfs.federation.router.enable.get.dn.usage</name>
+    <value>true</value>
+    <description>
+      If true, the getNodeUsage method in RBFMetrics returns an up-to-date
+      result collected from the downstream nameservices, which takes time and
+      occupies handler threads. If false, it returns a mock result of all 0s.
+    </description>
+  </property>
+
   <property>
     <name>dfs.federation.router.metrics.class</name>
     <value>org.apache.hadoop.hdfs.server.federation.metrics.FederationRPCPerformanceMonitor</value>
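As a usage note, a minimal sketch of toggling the new key programmatically; the key and default constants come from this patch, while the standalone class and the assumption that the hadoop-hdfs-rbf module is on the classpath are mine. Setting the property in hdfs-rbf-site.xml achieves the same effect.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.federation.router.RBFConfigKeys;

public class DisableDnUsageSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Same effect as dfs.federation.router.enable.get.dn.usage=false in
    // hdfs-rbf-site.xml: getNodeUsage() skips the datanode report RPC and
    // reports 0.00% for min/median/max/stdDev.
    conf.setBoolean(RBFConfigKeys.DFS_ROUTER_ENABLE_GET_DN_USAGE_KEY, false);

    boolean enabled = conf.getBoolean(
        RBFConfigKeys.DFS_ROUTER_ENABLE_GET_DN_USAGE_KEY,
        RBFConfigKeys.DFS_ROUTER_ENABLE_GET_DN_USAGE_DEFAULT);
    System.out.println("getNodeUsage collection enabled: " + enabled);
  }
}
```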

@@ -131,6 +131,7 @@
 import org.apache.hadoop.service.Service.STATE;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.LambdaTestUtils;
+import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
 import org.junit.AfterClass;
 import org.junit.Before;

@@ -2181,4 +2182,34 @@ public void testContentSummaryWithSnapshot() throws Exception {
       routerDFS.delete(dirPath, true);
     }
   }
+
+  @Test
+  public void testDisableNodeUsageInRBFMetrics() throws JSONException {
+    RBFMetrics rbfMetrics = router.getRouter().getMetrics();
+    FederationRPCMetrics federationRPCMetrics =
+        router.getRouter().getRpcServer().getRPCMetrics();
+
+    long proxyOpBefore = federationRPCMetrics.getProxyOps();
+    String nodeUsageEnable = router.getRouter().getMetrics().getNodeUsage();
+    assertNotNull(nodeUsageEnable);
+    long proxyOpAfterWithEnable = federationRPCMetrics.getProxyOps();
+    assertEquals(proxyOpBefore + 2, proxyOpAfterWithEnable);
+
+    rbfMetrics.setEnableGetDNUsage(false);
+    String nodeUsageDisable = rbfMetrics.getNodeUsage();
+    assertNotNull(nodeUsageDisable);
+    long proxyOpAfterWithDisable = federationRPCMetrics.getProxyOps();
+    assertEquals(proxyOpAfterWithEnable, proxyOpAfterWithDisable);
+
+    JSONObject jsonObject = new JSONObject(nodeUsageDisable);
+    JSONObject json = jsonObject.getJSONObject("nodeUsage");
+    assertEquals("0.00%", json.get("min"));
+    assertEquals("0.00%", json.get("median"));
+    assertEquals("0.00%", json.get("max"));
+    assertEquals("0.00%", json.get("stdDev"));
+
+    rbfMetrics.setEnableGetDNUsage(true);
+    String nodeUsageWithReEnable = rbfMetrics.getNodeUsage();
+    assertNotNull(nodeUsageWithReEnable);
+    long proxyOpAfterWithReEnable = federationRPCMetrics.getProxyOps();
+    assertEquals(proxyOpAfterWithDisable + 2, proxyOpAfterWithReEnable);
+  }
 }