HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)
Reviewed-by: Inigo Goiri <inigoiri@apache.org> Signed-off-by: Akira Ajisaka <aajisaka@apache.org>
This commit is contained in:
parent
f1e1809029
commit
6e525ab81c
@ -527,9 +527,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in
|
|||||||
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
|
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
|
||||||
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
|
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
|
||||||
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
|
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
|
||||||
| `TotalCapacity` | Current raw capacity of DataNodes in bytes |
|
| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) |
|
||||||
| `UsedCapacity` | Current used capacity across all DataNodes in bytes |
|
| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) |
|
||||||
| `RemainingCapacity` | Current remaining capacity in bytes |
|
| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) |
|
||||||
|
| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) |
|
||||||
|
| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) |
|
||||||
|
| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) |
|
||||||
| `NumOfMissingBlocks` | Current number of missing blocks |
|
| `NumOfMissingBlocks` | Current number of missing blocks |
|
||||||
| `NumLiveNodes` | Number of datanodes which are currently live |
|
| `NumLiveNodes` | Number of datanodes which are currently live |
|
||||||
| `NumDeadNodes` | Number of datanodes which are currently dead |
|
| `NumDeadNodes` | Number of datanodes which are currently dead |
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.hadoop.hdfs.server.federation.metrics;
|
package org.apache.hadoop.hdfs.server.federation.metrics;
|
||||||
|
|
||||||
|
import java.math.BigInteger;
|
||||||
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.classification.InterfaceStability;
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
|
|
||||||
@ -54,22 +56,46 @@ public interface FederationMBean {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the total capacity of the federated cluster.
|
* Get the total capacity of the federated cluster.
|
||||||
|
* The number could overflow if too big. In that case use
|
||||||
|
* {@link #getTotalCapacityBigInt()} instead.
|
||||||
* @return Total capacity of the federated cluster.
|
* @return Total capacity of the federated cluster.
|
||||||
*/
|
*/
|
||||||
long getTotalCapacity();
|
long getTotalCapacity();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the used capacity of the federated cluster.
|
* Get the used capacity of the federated cluster.
|
||||||
|
* The number could overflow if too big. In that case use
|
||||||
|
* {@link #getUsedCapacityBigInt()} instead.
|
||||||
* @return Used capacity of the federated cluster.
|
* @return Used capacity of the federated cluster.
|
||||||
*/
|
*/
|
||||||
long getUsedCapacity();
|
long getUsedCapacity();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the remaining capacity of the federated cluster.
|
* Get the remaining capacity of the federated cluster.
|
||||||
|
* The number could overflow if too big. In that case use
|
||||||
|
* {@link #getRemainingCapacityBigInt()} instead.
|
||||||
* @return Remaining capacity of the federated cluster.
|
* @return Remaining capacity of the federated cluster.
|
||||||
*/
|
*/
|
||||||
long getRemainingCapacity();
|
long getRemainingCapacity();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the total capacity of the federated cluster as a {@link BigInteger}.
|
||||||
|
* @return Total capacity of the federated cluster, not limited to a long.
|
||||||
|
*/
|
||||||
|
BigInteger getTotalCapacityBigInt();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the used capacity of the federated cluster as a {@link BigInteger}.
|
||||||
|
* @return Used capacity of the federated cluster, not limited to a long.
|
||||||
|
*/
|
||||||
|
BigInteger getUsedCapacityBigInt();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the remaining capacity of the federated cluster as a {@link BigInteger}.
|
||||||
|
* @return Remaining capacity of the federated cluster, not limited to a long.
|
||||||
|
*/
|
||||||
|
BigInteger getRemainingCapacityBigInt();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the total remote storage capacity mounted in the federated cluster.
|
* Get the total remote storage capacity mounted in the federated cluster.
|
||||||
* @return Remote capacity of the federated cluster.
|
* @return Remote capacity of the federated cluster.
|
||||||
|
@ -21,6 +21,7 @@ import static org.apache.hadoop.util.Time.now;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
import java.math.BigInteger;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
@ -380,14 +381,29 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
|
|||||||
return getNameserviceAggregatedLong(MembershipStats::getAvailableSpace);
|
return getNameserviceAggregatedLong(MembershipStats::getAvailableSpace);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getUsedCapacity() {
|
||||||
|
return getTotalCapacity() - getRemainingCapacity();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BigInteger getTotalCapacityBigInt() {
|
||||||
|
return getNameserviceAggregatedBigInt(MembershipStats::getTotalSpace);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BigInteger getRemainingCapacityBigInt() {
|
||||||
|
return getNameserviceAggregatedBigInt(MembershipStats::getAvailableSpace);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getProvidedSpace() {
|
public long getProvidedSpace() {
|
||||||
return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace);
|
return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getUsedCapacity() {
|
public BigInteger getUsedCapacityBigInt() {
|
||||||
return getTotalCapacity() - getRemainingCapacity();
|
return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -783,6 +799,22 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private BigInteger getNameserviceAggregatedBigInt(
|
||||||
|
ToLongFunction<MembershipStats> f) {
|
||||||
|
try {
|
||||||
|
List<MembershipState> states = getActiveNamenodeRegistrations();
|
||||||
|
BigInteger sum = BigInteger.valueOf(0);
|
||||||
|
for (MembershipState state : states) {
|
||||||
|
long lvalue = f.applyAsLong(state.getStats());
|
||||||
|
sum = sum.add(BigInteger.valueOf(lvalue));
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.error("Unable to extract metrics: {}", e.getMessage());
|
||||||
|
return new BigInteger("0");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches the most active namenode memberships for all known nameservices.
|
* Fetches the most active namenode memberships for all known nameservices.
|
||||||
* The fetched membership may or may not be active. Excludes expired
|
* The fetched membership may or may not be active. Excludes expired
|
||||||
|
@ -113,9 +113,9 @@
|
|||||||
<p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Committed Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}&lt;unbounded&gt;{:else}{max|fmt_bytes}{/eq}.</p>
|
<p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Committed Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}&lt;unbounded&gt;{:else}{max|fmt_bytes}{/eq}.</p>
|
||||||
{/mem.NonHeapMemoryUsage}
|
{/mem.NonHeapMemoryUsage}
|
||||||
<table class="table table-bordered table-striped">
|
<table class="table table-bordered table-striped">
|
||||||
<tr><th>Total capacity</th><td>{TotalCapacity|fmt_bytes}</td></tr>
|
<tr><th>Total capacity</th><td>{TotalCapacityBigInt|fmt_bytes}</td></tr>
|
||||||
<tr><th>Used capacity</th><td>{UsedCapacity|fmt_bytes}</td></tr>
|
<tr><th>Used capacity</th><td>{UsedCapacityBigInt|fmt_bytes}</td></tr>
|
||||||
<tr><th>Remaining capacity</th><td>{RemainingCapacity|fmt_bytes}</td></tr>
|
<tr><th>Remaining capacity</th><td>{RemainingCapacityBigInt|fmt_bytes}</td></tr>
|
||||||
<tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
|
<tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
|
||||||
<tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
|
<tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
|
||||||
<tr>
|
<tr>
|
||||||
|
@ -259,4 +259,15 @@ public class TestMetricsBase {
|
|||||||
assertTrue(response.getResult());
|
assertTrue(response.getResult());
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// refresh namenode registration for new attributes
|
||||||
|
public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request)
|
||||||
|
throws IOException {
|
||||||
|
boolean result = membershipStore.namenodeHeartbeat(request).getResult();
|
||||||
|
membershipStore.loadCache(true);
|
||||||
|
MembershipNamenodeResolver resolver =
|
||||||
|
(MembershipNamenodeResolver) router.getNamenodeResolver();
|
||||||
|
resolver.loadCache(true);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,11 +19,13 @@ package org.apache.hadoop.hdfs.server.federation.metrics;
|
|||||||
|
|
||||||
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
|
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotEquals;
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.assertFalse;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.math.BigInteger;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -31,6 +33,7 @@ import javax.management.MalformedObjectNameException;
|
|||||||
|
|
||||||
import org.apache.commons.collections.ListUtils;
|
import org.apache.commons.collections.ListUtils;
|
||||||
import org.apache.hadoop.hdfs.server.federation.router.Router;
|
import org.apache.hadoop.hdfs.server.federation.router.Router;
|
||||||
|
import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest;
|
||||||
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
|
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
|
||||||
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
|
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
|
||||||
import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
|
import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
|
||||||
@ -58,6 +61,7 @@ public class TestRBFMetrics extends TestMetricsBase {
|
|||||||
FederationMBean federationBean = getBean(FEDERATION_BEAN,
|
FederationMBean federationBean = getBean(FEDERATION_BEAN,
|
||||||
FederationMBean.class);
|
FederationMBean.class);
|
||||||
validateClusterStatsFederationBean(federationBean);
|
validateClusterStatsFederationBean(federationBean);
|
||||||
|
testCapacity(federationBean);
|
||||||
RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
|
RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
|
||||||
validateClusterStatsRouterBean(routerBean);
|
validateClusterStatsRouterBean(routerBean);
|
||||||
}
|
}
|
||||||
@ -348,4 +352,34 @@ public class TestRBFMetrics extends TestMetricsBase {
|
|||||||
assertTrue(bean.getHostAndPort().length() > 0);
|
assertTrue(bean.getHostAndPort().length() > 0);
|
||||||
assertFalse(bean.isSecurityEnabled());
|
assertFalse(bean.isSecurityEnabled());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void testCapacity(FederationMBean bean) throws IOException {
|
||||||
|
List<MembershipState> memberships = getActiveMemberships();
|
||||||
|
assertTrue(memberships.size() > 1);
|
||||||
|
|
||||||
|
BigInteger availableCapacity = BigInteger.valueOf(0);
|
||||||
|
BigInteger totalCapacity = BigInteger.valueOf(0);
|
||||||
|
BigInteger unitCapacity = BigInteger.valueOf(Long.MAX_VALUE);
|
||||||
|
for (MembershipState mock : memberships) {
|
||||||
|
MembershipStats stats = mock.getStats();
|
||||||
|
stats.setTotalSpace(Long.MAX_VALUE);
|
||||||
|
stats.setAvailableSpace(Long.MAX_VALUE);
|
||||||
|
// reset stats to make the new value persistent
|
||||||
|
mock.setStats(stats);
|
||||||
|
// write back the new namenode information to state store
|
||||||
|
assertTrue(refreshNamenodeRegistration(
|
||||||
|
NamenodeHeartbeatRequest.newInstance(mock)));
|
||||||
|
totalCapacity = totalCapacity.add(unitCapacity);
|
||||||
|
availableCapacity = availableCapacity.add(unitCapacity);
|
||||||
|
}
|
||||||
|
|
||||||
|
// for local cache update
|
||||||
|
assertEquals(totalCapacity, bean.getTotalCapacityBigInt());
|
||||||
|
// not equal since overflow happened.
|
||||||
|
assertNotEquals(totalCapacity, BigInteger.valueOf(bean.getTotalCapacity()));
|
||||||
|
assertEquals(availableCapacity, bean.getRemainingCapacityBigInt());
|
||||||
|
// not equal since overflow happened.
|
||||||
|
assertNotEquals(availableCapacity,
|
||||||
|
BigInteger.valueOf(bean.getRemainingCapacity()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user