From 43fac739bb30aeea8aff959efcc40e575565f035 Mon Sep 17 00:00:00 2001 From: lfengnan Date: Sun, 2 May 2021 03:18:47 -0700 Subject: [PATCH] HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910) Reviewed-by: Inigo Goiri Signed-off-by: Akira Ajisaka (cherry picked from commit 6e525ab81cc2d93eb1e4aeea6b13769635076c06) --- .../src/site/markdown/Metrics.md | 9 +++-- .../federation/metrics/FederationMBean.java | 26 ++++++++++++++ .../server/federation/metrics/RBFMetrics.java | 36 +++++++++++++++++-- .../main/webapps/router/federationhealth.html | 6 ++-- .../federation/metrics/TestMetricsBase.java | 11 ++++++ .../federation/metrics/TestRBFMetrics.java | 36 ++++++++++++++++++- 6 files changed, 115 insertions(+), 9 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index 5260db19ab..4a078d7328 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -507,9 +507,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in | `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state | | `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state | | `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state | -| `TotalCapacity` | Current raw capacity of DataNodes in bytes | -| `UsedCapacity` | Current used capacity across all DataNodes in bytes | -| `RemainingCapacity` | Current remaining capacity in bytes | +| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) | +| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) | +| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) | +| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) | +| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) | +| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) | | `NumOfMissingBlocks` | Current number of missing blocks | | `NumLiveNodes` | Number of datanodes which are currently live | | `NumDeadNodes` | Number of datanodes which are currently dead | diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java index 5fa4755868..e78ae4c4fa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hdfs.server.federation.metrics; +import java.math.BigInteger; + import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -54,22 +56,46 @@ public interface FederationMBean { /** * Get the total capacity of the federated cluster. + * The number could overflow if too big. In that case use + * {@link #getTotalCapacityBigInt()} instead. * @return Total capacity of the federated cluster. */ long getTotalCapacity(); /** * Get the used capacity of the federated cluster. + * The number could overflow if too big. In that case use + * {@link #getUsedCapacityBigInt()} instead. * @return Used capacity of the federated cluster. */ long getUsedCapacity(); /** * Get the remaining capacity of the federated cluster. + * The number could overflow if too big. In that case use + * {@link #getRemainingCapacityBigInt()} instead. * @return Remaining capacity of the federated cluster. */ long getRemainingCapacity(); + /** + * Get the total capacity (big integer) of the federated cluster. + * @return Total capacity of the federated cluster. + */ + BigInteger getTotalCapacityBigInt(); + + /** + * Get the used capacity (big integer) of the federated cluster. + * @return Used capacity of the federated cluster. + */ + BigInteger getUsedCapacityBigInt(); + + /** + * Get the remaining capacity (big integer) of the federated cluster. + * @return Remaining capacity of the federated cluster. + */ + BigInteger getRemainingCapacityBigInt(); + /** * Get the total remote storage capacity mounted in the federated cluster. * @return Remote capacity of the federated cluster. diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java index d626c23c9d..1eae105b82 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.lang.reflect.Method; +import java.math.BigInteger; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; @@ -377,14 +378,29 @@ public long getRemainingCapacity() { return getNameserviceAggregatedLong(MembershipStats::getAvailableSpace); } + @Override + public long getUsedCapacity() { + return getTotalCapacity() - getRemainingCapacity(); + } + + @Override + public BigInteger getTotalCapacityBigInt() { + return getNameserviceAggregatedBigInt(MembershipStats::getTotalSpace); + } + + @Override + public BigInteger getRemainingCapacityBigInt() { + return getNameserviceAggregatedBigInt(MembershipStats::getAvailableSpace); + } + @Override public long getProvidedSpace() { return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace); } @Override - public long getUsedCapacity() { - return getTotalCapacity() - getRemainingCapacity(); + public BigInteger getUsedCapacityBigInt() { + return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt()); } @Override @@ -730,6 +746,22 @@ private long getNameserviceAggregatedLong(ToLongFunction f) { } } + private BigInteger getNameserviceAggregatedBigInt( + ToLongFunction f) { + try { + List states = getActiveNamenodeRegistrations(); + BigInteger sum = BigInteger.valueOf(0); + for (MembershipState state : states) { + long lvalue = f.applyAsLong(state.getStats()); + sum = sum.add(BigInteger.valueOf(lvalue)); + } + return sum; + } catch (IOException e) { + LOG.error("Unable to extract metrics: {}", e.getMessage()); + return new BigInteger("0"); + } + } + /** * Fetches the most active namenode memberships for all known nameservices. * The fetched membership may not or may not be active. Excludes expired diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html index 389491c573..dff89fa4c0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html @@ -113,9 +113,9 @@

Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Commited Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}<unbounded>{:else}{max|fmt_bytes}{/eq}.

{/mem.NonHeapMemoryUsage} - - - + + + diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java index 4759d05f82..b01e22006d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java @@ -259,4 +259,15 @@ private MembershipState createRegistration(String ns, String nn, assertTrue(response.getResult()); return record; } + + // refresh namenode registration for new attributes + public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request) + throws IOException { + boolean result = membershipStore.namenodeHeartbeat(request).getResult(); + membershipStore.loadCache(true); + MembershipNamenodeResolver resolver = + (MembershipNamenodeResolver) router.getNamenodeResolver(); + resolver.loadCache(true); + return result; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java index e1d1d8ec28..eed41c7ba3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java @@ -19,11 +19,13 @@ import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertFalse; import java.io.IOException; +import java.math.BigInteger; import java.util.Iterator; import java.util.List; @@ -31,6 +33,7 @@ import org.apache.commons.collections.ListUtils; import org.apache.hadoop.hdfs.server.federation.router.Router; +import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest; import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState; import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats; import org.apache.hadoop.hdfs.server.federation.store.records.MountTable; @@ -58,6 +61,7 @@ public void testClusterStatsJMX() FederationMBean federationBean = getBean(FEDERATION_BEAN, FederationMBean.class); validateClusterStatsFederationBean(federationBean); + testCapacity(federationBean); RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class); validateClusterStatsRouterBean(routerBean); } @@ -326,4 +330,34 @@ private void validateClusterStatsRouterBean(RouterMBean bean) { assertTrue(bean.getHostAndPort().length() > 0); assertFalse(bean.isSecurityEnabled()); } + + private void testCapacity(FederationMBean bean) throws IOException { + List memberships = getActiveMemberships(); + assertTrue(memberships.size() > 1); + + BigInteger availableCapacity = BigInteger.valueOf(0); + BigInteger totalCapacity = BigInteger.valueOf(0); + BigInteger unitCapacity = BigInteger.valueOf(Long.MAX_VALUE); + for (MembershipState mock : memberships) { + MembershipStats stats = mock.getStats(); + stats.setTotalSpace(Long.MAX_VALUE); + stats.setAvailableSpace(Long.MAX_VALUE); + // reset stats to make the new value persistent + mock.setStats(stats); + // write back the new namenode information to state store + assertTrue(refreshNamenodeRegistration( + NamenodeHeartbeatRequest.newInstance(mock))); + totalCapacity = totalCapacity.add(unitCapacity); + availableCapacity = availableCapacity.add(unitCapacity); + } + + // for local cache update + assertEquals(totalCapacity, bean.getTotalCapacityBigInt()); + // not equal since overflow happened. + assertNotEquals(totalCapacity, BigInteger.valueOf(bean.getTotalCapacity())); + assertEquals(availableCapacity, bean.getRemainingCapacityBigInt()); + // not equal since overflow happened. + assertNotEquals(availableCapacity, + BigInteger.valueOf(bean.getRemainingCapacity())); + } }
Total capacity{TotalCapacity|fmt_bytes}
Used capacity{UsedCapacity|fmt_bytes}
Remaining capacity{RemainingCapacity|fmt_bytes}
Total capacity{TotalCapacityBigInt|fmt_bytes}
Used capacity{UsedCapacityBigInt|fmt_bytes}
Remaining capacity{RemainingCapacityBigInt|fmt_bytes}
Nameservices{NumNameservices}
Namenodes{NumNamenodes}