From af8e9842d2ca566528e09d905b609f1cf160d367 Mon Sep 17 00:00:00 2001 From: Chris Douglas Date: Tue, 18 Apr 2017 10:28:50 -0700 Subject: [PATCH] YARN-6451. Add RM monitor validating metrics invariants. Contributed by Carlo Curino --- .../pom.xml | 1 + .../InvariantViolationException.java | 35 ++++ .../monitor/invariants/InvariantsChecker.java | 96 +++++++++ .../invariants/MetricsInvariantChecker.java | 195 ++++++++++++++++++ .../monitor/invariants/package-info.java | 22 ++ .../TestMetricsInvariantChecker.java | 99 +++++++++ .../src/test/resources/invariants.txt | 54 +++++ 7 files changed, 502 insertions(+) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/TestMetricsInvariantChecker.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/invariants.txt diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml index 533be9ef61..8505df8c91 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml @@ -347,6 +347,7 @@ src/test/resources/submit-reservation.json src/test/resources/delete-reservation.json src/test/resources/update-reservation.json + src/test/resources/invariants.txt diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java new file mode 100644 index 0000000000..0491756b96 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants; + + +import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; + +/** + * This exception represents the violation of an internal invariant. + */ +public class InvariantViolationException extends YarnRuntimeException { + + public InvariantViolationException(String s) { + super(s); + } + + public InvariantViolationException(String s, Exception e) { + super(s, e); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java new file mode 100644 index 0000000000..5800162bed --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Abstract invariant checker, that setup common context for invariants + * checkers. + */ +public abstract class InvariantsChecker implements SchedulingEditPolicy { + + private static final Logger LOG = + LoggerFactory.getLogger(InvariantsChecker.class); + public static final String THROW_ON_VIOLATION = + "yarn.resourcemanager.invariant-checker.throw-on-violation"; + public static final String INVARIANT_MONITOR_INTERVAL = + "yarn.resourcemanager.invariant-checker.monitor-interval"; + + private Configuration conf; + private RMContext context; + private PreemptableResourceScheduler scheduler; + private boolean throwOnInvariantViolation; + private long monitoringInterval; + + @Override + public void init(Configuration config, RMContext rmContext, + PreemptableResourceScheduler preemptableResourceScheduler) { + this.conf = config; + this.context = rmContext; + this.scheduler = preemptableResourceScheduler; + this.throwOnInvariantViolation = + conf.getBoolean(InvariantsChecker.THROW_ON_VIOLATION, false); + this.monitoringInterval = + conf.getLong(InvariantsChecker.INVARIANT_MONITOR_INTERVAL, 1000L); + + LOG.info("Invariant checker " + this.getPolicyName() + + " enabled. Monitoring every " + monitoringInterval + + "ms, throwOnViolation=" + throwOnInvariantViolation); + } + + @Override + public long getMonitoringInterval() { + return monitoringInterval; + } + + @Override + public String getPolicyName() { + return this.getClass().getSimpleName(); + } + + public void logOrThrow(String message) throws InvariantViolationException { + if (getThrowOnInvariantViolation()) { + throw new InvariantViolationException(message); + } else { + LOG.warn(message); + } + } + + public boolean getThrowOnInvariantViolation() { + return throwOnInvariantViolation; + } + + public Configuration getConf() { + return conf; + } + + public RMContext getContext() { + return context; + } + + public PreemptableResourceScheduler getScheduler() { + return scheduler; + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java new file mode 100644 index 0000000000..9fee2bd2ab --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java @@ -0,0 +1,195 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.metrics2.AbstractMetric; +import org.apache.hadoop.metrics2.MetricsRecord; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.source.JvmMetrics; +import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.script.Compilable; +import javax.script.CompiledScript; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; +import javax.script.SimpleBindings; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * This policy checks at every invocation that a given set of invariants + * (specified in a file) are respected over QueueMetrics and JvmMetrics. The + * file may contain arbitrary (Javascrip) boolean expression over the metrics + * variables. + * + * The right set of invariants depends on the deployment environment, a large + * number of complex invariant can make this check expensive. + * + * The MetricsInvariantChecker can be configured to throw a RuntimeException or + * simlpy warn in the logs if an invariant is not respected. + */ +public class MetricsInvariantChecker extends InvariantsChecker { + + private static final Logger LOG = + LoggerFactory.getLogger(MetricsInvariantChecker.class); + public static final String INVARIANTS_FILE = + "yarn.resourcemanager.invariant-checker.file"; + + private MetricsSystem metricsSystem; + private MetricsCollectorImpl collector; + private SimpleBindings bindings; + private ScriptEngineManager manager; + private Compilable scriptEngine; + private String invariantFile; + private Map invariants; + private CompiledScript combinedInvariants; + + // set of metrics we monitor + private QueueMetrics queueMetrics; + private JvmMetrics jvmMetrics; + + @Override + public void init(Configuration config, RMContext rmContext, + PreemptableResourceScheduler preemptableResourceScheduler) { + + super.init(config, rmContext, preemptableResourceScheduler); + + this.metricsSystem = DefaultMetricsSystem.instance(); + this.queueMetrics = + QueueMetrics.forQueue(metricsSystem, "root", null, false, getConf()); + this.jvmMetrics = (JvmMetrics) metricsSystem.getSource("JvmMetrics"); + + // at first collect all metrics + collector = new MetricsCollectorImpl(); + queueMetrics.getMetrics(collector, true); + jvmMetrics.getMetrics(collector, true); + + // prepare bindings and evaluation engine + this.bindings = new SimpleBindings(); + this.manager = new ScriptEngineManager(); + this.scriptEngine = (Compilable) manager.getEngineByName("JavaScript"); + + // load metrics invariant from file + this.invariantFile = getConf().get(MetricsInvariantChecker.INVARIANTS_FILE); + + this.invariants = new HashMap<>(); + + // preload all bindings + queueMetrics.getMetrics(collector, true); + jvmMetrics.getMetrics(collector, true); + for (MetricsRecord record : collector.getRecords()) { + for (AbstractMetric am : record.metrics()) { + bindings.put(am.name().replace(' ', '_'), am.value()); + } + } + + StringBuilder sb = new StringBuilder(); + try { + List tempInv = + Files.readLines(new File(invariantFile), Charsets.UTF_8); + + + boolean first = true; + // precompile individual invariants + for (String inv : tempInv) { + + if(first) { + first = false; + } else { + sb.append("&&"); + } + + invariants.put(inv, scriptEngine.compile(inv)); + sb.append(" ("); + sb.append(inv); + sb.append(") "); + } + + // create a single large combined invariant for speed of checking + combinedInvariants = scriptEngine.compile(sb.toString()); + + } catch (IOException e) { + throw new RuntimeException( + "Error loading invariant file: " + e.getMessage()); + } catch (ScriptException e) { + throw new RuntimeException("Error compiling invariant " + e.getMessage()); + } + + } + + @Override + public void editSchedule() { + // grab all changed metrics and update bindings + collector.clear(); + queueMetrics.getMetrics(collector, false); + jvmMetrics.getMetrics(collector, false); + + for (MetricsRecord record : collector.getRecords()) { + for (AbstractMetric am : record.metrics()) { + bindings.put(am.name().replace(' ', '_'), am.value()); + } + } + + // evaluate all invariants with new bindings + try { + + // fastpath check all invariants at once (much faster) + boolean allInvHold = (boolean) combinedInvariants.eval(bindings); + + // if any fails, check individually to produce more insightful log + if (!allInvHold) { + for (Map.Entry e : invariants.entrySet()) { + boolean invariantsHold = (boolean) e.getValue().eval(bindings); + if (!invariantsHold) { + // filter bindings to produce minimal set + Map matchingBindings = + extractMatchingBindings(e.getKey(), bindings); + logOrThrow("Invariant \"" + e.getKey() + + "\" is NOT holding, with bindings: " + matchingBindings); + } + } + } + } catch (ScriptException e) { + logOrThrow(e.getMessage()); + } + } + + private static Map extractMatchingBindings(String inv, + SimpleBindings allBindings) { + Map matchingBindings = new HashMap<>(); + for (Map.Entry s : allBindings.entrySet()) { + if (inv.contains(s.getKey())) { + matchingBindings.put(s.getKey(), s.getValue()); + } + } + return matchingBindings; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java new file mode 100644 index 0000000000..d9931d6777 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Monitoring policies, used to check invariants. + */ +package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants; \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/TestMetricsInvariantChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/TestMetricsInvariantChecker.java new file mode 100644 index 0000000000..35cf1e4e53 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/TestMetricsInvariantChecker.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.source.JvmMetrics; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.apache.log4j.Logger; +import org.junit.Before; +import org.junit.Test; + +import static junit.framework.TestCase.fail; + +/** + * This class tests the {@code MetricsInvariantChecker} by running it multiple + * time and reporting the time it takes to execute, as well as verifying that + * the invariant throws in case the invariants are not respected. + */ +public class TestMetricsInvariantChecker { + public final static Logger LOG = + Logger.getLogger(TestMetricsInvariantChecker.class); + + private MetricsSystem metricsSystem; + private MetricsInvariantChecker ic; + private Configuration conf; + + @Before + public void setup() { + this.metricsSystem = DefaultMetricsSystem.instance(); + JvmMetrics.initSingleton("ResourceManager", null); + this.ic = new MetricsInvariantChecker(); + this.conf = new Configuration(); + conf.set(MetricsInvariantChecker.INVARIANTS_FILE, + "src/test/resources/invariants.txt"); + conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true); + ic.init(conf, null, null); + } + + @Test(timeout = 5000) + public void testManyRuns() { + + QueueMetrics qm = + QueueMetrics.forQueue(metricsSystem, "root", null, false, conf); + qm.setAvailableResourcesToQueue(Resource.newInstance(1, 1)); + + int numIterations = 1000; + long start = System.currentTimeMillis(); + for (int i = 0; i < numIterations; i++) { + ic.editSchedule(); + } + long end = System.currentTimeMillis(); + + System.out.println("Runtime per iteration (avg of " + numIterations + + " iterations): " + (end - start) + " tot time"); + + } + + @Test + public void testViolation() { + + // create a "wrong" condition in which the invariants are not respected + QueueMetrics qm = + QueueMetrics.forQueue(metricsSystem, "root", null, false, conf); + qm.setAvailableResourcesToQueue(Resource.newInstance(-1, -1)); + + // test with throwing exception turned on + try { + ic.editSchedule(); + fail(); + } catch (InvariantViolationException i) { + // expected + } + + // test log-only mode + conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, false); + ic.init(conf, null, null); + ic.editSchedule(); + + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/invariants.txt b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/invariants.txt new file mode 100644 index 0000000000..363ed0d996 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/invariants.txt @@ -0,0 +1,54 @@ +running_0 >= 0 +running_60 >= 0 +running_300 >= 0 +running_1440 >= 0 +AppsSubmitted >= 0 +AppsRunning >= 0 +AppsPending >= 0 +AppsCompleted >= 0 +AppsKilled >= 0 +AppsFailed >= 0 +AllocatedMB >= 0 +AllocatedVCores >= 0 +AllocatedContainers >= 0 +AggregateContainersAllocated >= 0 +AggregateNodeLocalContainersAllocated >= 0 +AggregateRackLocalContainersAllocated >= 0 +AggregateOffSwitchContainersAllocated >= 0 +AggregateContainersReleased >= 0 +AggregateContainersPreempted >= 0 +AvailableMB >= 0 +AvailableVCores >= 0 +PendingMB >= 0 +PendingVCores >= 0 +PendingContainers >= 0 +ReservedMB >= 0 +ReservedVCores >= 0 +ReservedContainers >= 0 +ActiveUsers >= 0 +ActiveApplications >= 0 +AppAttemptFirstContainerAllocationDelayNumOps >= 0 +AppAttemptFirstContainerAllocationDelayAvgTime >= 0 +MemNonHeapUsedM >= 0 +MemNonHeapCommittedM >= 0 +MemNonHeapMaxM >= 0 || MemNonHeapMaxM == -1 +MemHeapUsedM >= 0 +MemHeapCommittedM >= 0 +MemHeapMaxM >= 0 +MemMaxM >= 0 +GcCountPS_Scavenge >= 0 +GcTimeMillisPS_Scavenge >= 0 +GcCountPS_MarkSweep >= 0 +GcTimeMillisPS_MarkSweep >= 0 +GcCount >= 0 +GcTimeMillis >= 0 +ThreadsNew >= 0 +ThreadsRunnable >= 0 +ThreadsBlocked >= 0 +ThreadsWaiting >= 0 +ThreadsTimedWaiting >= 0 +ThreadsTerminated >= 0 +LogFatal >= 0 +LogError >= 0 +LogWarn >= 0 +LogInfo >= 0