HDFS-16959. RBF: State store cache loading metrics (#5497)
This commit is contained in:
parent
5bc8f25327
commit
b4bcbb9515
@ -592,17 +592,19 @@ StateStoreMetrics
|
||||
-----------------
|
||||
StateStoreMetrics shows the statistics of the State Store component in Router-based federation.
|
||||
|
||||
| Name | Description |
|
||||
|:---- |:---- |
|
||||
| `ReadsNumOps` | Number of GET transactions for State Store within an interval time of metric |
|
||||
| `ReadsAvgTime` | Average time of GET transactions for State Store in milliseconds |
|
||||
| `WritesNumOps` | Number of PUT transactions for State Store within an interval time of metric |
|
||||
| `WritesAvgTime` | Average time of PUT transactions for State Store in milliseconds |
|
||||
| `RemovesNumOps` | Number of REMOVE transactions for State Store within an interval time of metric |
|
||||
| `RemovesAvgTime` | Average time of REMOVE transactions for State Store in milliseconds |
|
||||
| `FailuresNumOps` | Number of failed transactions for State Store within an interval time of metric |
|
||||
| `FailuresAvgTime` | Average time of failed transactions for State Store in milliseconds |
|
||||
| `Cache`*BaseRecord*`Size` | Number of store records to cache in State Store |
|
||||
| Name | Description |
|
||||
|:------------------------------------------|:-----------------------------------------------------------------------------------|
|
||||
| `ReadsNumOps` | Number of GET transactions for State Store within an interval time of metric |
|
||||
| `ReadsAvgTime` | Average time of GET transactions for State Store in milliseconds |
|
||||
| `WritesNumOps` | Number of PUT transactions for State Store within an interval time of metric |
|
||||
| `WritesAvgTime` | Average time of PUT transactions for State Store in milliseconds |
|
||||
| `RemovesNumOps` | Number of REMOVE transactions for State Store within an interval time of metric |
|
||||
| `RemovesAvgTime` | Average time of REMOVE transactions for State Store in milliseconds |
|
||||
| `FailuresNumOps` | Number of failed transactions for State Store within an interval time of metric |
|
||||
| `FailuresAvgTime` | Average time of failed transactions for State Store in milliseconds |
|
||||
| `Cache`*BaseRecord*`Size` | Number of store records to cache in State Store |
|
||||
| `Cache`*BaseRecord*`LoadNumOps` | Number of times store records are loaded in the State Store Cache from State Store |
|
||||
| `Cache`*BaseRecord*`LoadAvgTime` | Average time of loading State Store Cache from State Store in milliseconds |
|
||||
|
||||
yarn context
|
||||
============
|
||||
|
@ -20,6 +20,7 @@ package org.apache.hadoop.hdfs.server.federation.metrics;
|
||||
import static org.apache.hadoop.metrics2.impl.MsInfo.ProcessName;
|
||||
import static org.apache.hadoop.metrics2.impl.MsInfo.SessionId;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
@ -54,6 +55,7 @@ public class StateStoreMetrics implements StateStoreMBean {
|
||||
private MutableRate failures;
|
||||
|
||||
private Map<String, MutableGaugeInt> cacheSizes;
|
||||
private final Map<String, MutableRate> cacheLoadMetrics = new HashMap<>();
|
||||
|
||||
protected StateStoreMetrics() {}
|
||||
|
||||
@ -150,6 +152,32 @@ public class StateStoreMetrics implements StateStoreMBean {
|
||||
counter.set(count);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the cache loading metrics for the state store interface.
|
||||
*
|
||||
* @param name Name of the record of the cache.
|
||||
* @param value The time duration interval as the cache value.
|
||||
*/
|
||||
public void setCacheLoading(String name, long value) {
|
||||
String cacheLoad = "Cache" + name + "Load";
|
||||
MutableRate cacheLoadMetric = cacheLoadMetrics.get(cacheLoad);
|
||||
if (cacheLoadMetric == null) {
|
||||
cacheLoadMetric = registry.newRate(cacheLoad, name, false);
|
||||
cacheLoadMetrics.put(cacheLoad, cacheLoadMetric);
|
||||
}
|
||||
cacheLoadMetrics.get(cacheLoad).add(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve unmodifiable map of cache loading metrics.
|
||||
*
|
||||
* @return unmodifiable map of cache loading metrics.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
public Map<String, MutableRate> getCacheLoadMetrics() {
|
||||
return Collections.unmodifiableMap(cacheLoadMetrics);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public void reset() {
|
||||
reads.resetMinMax();
|
||||
|
@ -113,6 +113,7 @@ public abstract class CachedRecordStore<R extends BaseRecord>
|
||||
if (force || isUpdateTime()) {
|
||||
List<R> newRecords = null;
|
||||
long t = -1;
|
||||
long startTime = Time.monotonicNow();
|
||||
try {
|
||||
QueryResult<R> result = getDriver().get(getRecordClass());
|
||||
newRecords = result.getRecords();
|
||||
@ -143,6 +144,7 @@ public abstract class CachedRecordStore<R extends BaseRecord>
|
||||
StateStoreMetrics metrics = getDriver().getMetrics();
|
||||
if (metrics != null) {
|
||||
String recordName = getRecordClass().getSimpleName();
|
||||
metrics.setCacheLoading(recordName, Time.monotonicNow() - startTime);
|
||||
metrics.setCacheSize(recordName, this.records.size());
|
||||
}
|
||||
|
||||
|
@ -48,6 +48,8 @@ import org.apache.hadoop.hdfs.server.federation.store.records.Query;
|
||||
import org.apache.hadoop.hdfs.server.federation.store.records.QueryResult;
|
||||
import org.apache.hadoop.hdfs.server.federation.store.records.RouterState;
|
||||
import org.apache.hadoop.hdfs.server.federation.store.records.StateStoreVersion;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRate;
|
||||
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.slf4j.Logger;
|
||||
@ -76,6 +78,10 @@ public class TestStateStoreDriverBase {
|
||||
return stateStore.getDriver();
|
||||
}
|
||||
|
||||
protected StateStoreService getStateStoreService() {
|
||||
return stateStore;
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanMetrics() {
|
||||
if (stateStore != null) {
|
||||
@ -574,6 +580,36 @@ public class TestStateStoreDriverBase {
|
||||
return getters;
|
||||
}
|
||||
|
||||
public long getMountTableCacheLoadSamples(StateStoreDriver driver) throws IOException {
|
||||
final MutableRate mountTableCache = getMountTableCache(driver);
|
||||
return mountTableCache.lastStat().numSamples();
|
||||
}
|
||||
|
||||
private static MutableRate getMountTableCache(StateStoreDriver driver) throws IOException {
|
||||
StateStoreMetrics metrics = stateStore.getMetrics();
|
||||
final Query<MountTable> query = new Query<>(MountTable.newInstance());
|
||||
driver.getMultiple(MountTable.class, query);
|
||||
final Map<String, MutableRate> cacheLoadMetrics = metrics.getCacheLoadMetrics();
|
||||
final MutableRate mountTableCache = cacheLoadMetrics.get("CacheMountTableLoad");
|
||||
assertNotNull("CacheMountTableLoad should be present in the state store metrics",
|
||||
mountTableCache);
|
||||
return mountTableCache;
|
||||
}
|
||||
|
||||
public void testCacheLoadMetrics(StateStoreDriver driver, long numRefresh,
|
||||
double expectedHigherThan) throws IOException, IllegalArgumentException {
|
||||
final MutableRate mountTableCache = getMountTableCache(driver);
|
||||
// CacheMountTableLoadNumOps
|
||||
final long mountTableCacheLoadNumOps = getMountTableCacheLoadSamples(driver);
|
||||
assertEquals("Num of samples collected should match", numRefresh, mountTableCacheLoadNumOps);
|
||||
// CacheMountTableLoadAvgTime ms
|
||||
final double mountTableCacheLoadAvgTimeMs = mountTableCache.lastStat().mean();
|
||||
assertTrue(
|
||||
"Mean time duration for cache load is expected to be higher than " + expectedHigherThan
|
||||
+ " ms." + " Actual value: " + mountTableCacheLoadAvgTimeMs,
|
||||
mountTableCacheLoadAvgTimeMs > expectedHigherThan);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the type of field.
|
||||
*
|
||||
|
@ -73,4 +73,16 @@ public class TestStateStoreFile extends TestStateStoreDriverBase {
|
||||
throws IllegalArgumentException, IllegalAccessException, IOException {
|
||||
testMetrics(getStateStoreDriver());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCacheLoadMetrics() throws IOException {
|
||||
// inject value of CacheMountTableLoad as -1 initially, if tests get CacheMountTableLoadAvgTime
|
||||
// value as -1 ms, that would mean no other sample with value >= 0 would have been received and
|
||||
// hence this would be failure to assert that mount table avg load time is higher than -1
|
||||
getStateStoreService().getMetrics().setCacheLoading("MountTable", -1);
|
||||
long curMountTableLoadNum = getMountTableCacheLoadSamples(getStateStoreDriver());
|
||||
getStateStoreService().refreshCaches(true);
|
||||
testCacheLoadMetrics(getStateStoreDriver(), curMountTableLoadNum + 1, -1);
|
||||
}
|
||||
|
||||
}
|
@ -115,4 +115,16 @@ public class TestStateStoreFileSystem extends TestStateStoreDriverBase {
|
||||
|
||||
testInsertWithErrorDuringWrite(driver, MembershipState.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCacheLoadMetrics() throws IOException {
|
||||
// inject value of CacheMountTableLoad as -1 initially, if tests get CacheMountTableLoadAvgTime
|
||||
// value as -1 ms, that would mean no other sample with value >= 0 would have been received and
|
||||
// hence this would be failure to assert that mount table avg load time is higher than -1
|
||||
getStateStoreService().getMetrics().setCacheLoading("MountTable", -1);
|
||||
long curMountTableLoadNum = getMountTableCacheLoadSamples(getStateStoreDriver());
|
||||
getStateStoreService().refreshCaches(true);
|
||||
getStateStoreService().refreshCaches(true);
|
||||
testCacheLoadMetrics(getStateStoreDriver(), curMountTableLoadNum + 2, -1);
|
||||
}
|
||||
}
|
@ -206,4 +206,18 @@ public class TestStateStoreZK extends TestStateStoreDriverBase {
|
||||
stateStoreDriver.setEnableConcurrent(true);
|
||||
testFetchErrors(stateStoreDriver);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCacheLoadMetrics() throws IOException {
|
||||
// inject value of CacheMountTableLoad as -1 initially, if tests get CacheMountTableLoadAvgTime
|
||||
// value as -1 ms, that would mean no other sample with value >= 0 would have been received and
|
||||
// hence this would be failure to assert that mount table avg load time is higher than -1
|
||||
getStateStoreService().getMetrics().setCacheLoading("MountTable", -1);
|
||||
long curMountTableLoadNum = getMountTableCacheLoadSamples(getStateStoreDriver());
|
||||
getStateStoreService().refreshCaches(true);
|
||||
getStateStoreService().refreshCaches(true);
|
||||
getStateStoreService().refreshCaches(true);
|
||||
testCacheLoadMetrics(getStateStoreDriver(), curMountTableLoadNum + 3, -1);
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user