http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java index e152696..154bcc9 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java @@ -18,7 +18,6 @@ package org.apache.hadoop.yarn.sls.web; -import java.io.File; import java.io.IOException; import java.io.ObjectInputStream; import java.text.MessageFormat; @@ -26,11 +25,12 @@ import java.util.HashMap; import java.util.Map; import java.util.Set; +import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; -import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; @@ -38,12 +38,12 @@ import org.apache.hadoop.yarn.sls.SLSRunner; import org.apache.hadoop.yarn.sls.scheduler.FairSchedulerMetrics; import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics; import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper; + import org.mortbay.jetty.Handler; import org.mortbay.jetty.Request; import org.mortbay.jetty.Server; import org.mortbay.jetty.handler.AbstractHandler; import org.mortbay.jetty.handler.ResourceHandler; - import com.codahale.metrics.Counter; import com.codahale.metrics.Gauge; import com.codahale.metrics.Histogram; @@ -84,12 +84,12 @@ public class SLSWebApp extends HttpServlet { // load templates ClassLoader cl = Thread.currentThread().getContextClassLoader(); try { - simulateInfoTemplate = FileUtils.readFileToString(new File( - cl.getResource("simulate.info.html.template").getFile())); - simulateTemplate = FileUtils.readFileToString(new File( - cl.getResource("simulate.html.template").getFile())); - trackTemplate = FileUtils.readFileToString(new File( - cl.getResource("track.html.template").getFile())); + simulateInfoTemplate = IOUtils.toString( + cl.getResourceAsStream("html/simulate.info.html.template")); + simulateTemplate = IOUtils.toString( + cl.getResourceAsStream("html/simulate.html.template")); + trackTemplate = IOUtils.toString( + cl.getResourceAsStream("html/track.html.template")); } catch (IOException e) { e.printStackTrace(); } @@ -105,24 +105,23 @@ public class SLSWebApp extends HttpServlet { public SLSWebApp(SchedulerWrapper wrapper, int metricsAddressPort) { this.wrapper = wrapper; - metrics = wrapper.getMetrics(); - handleOperTimecostHistogramMap = - new HashMap<SchedulerEventType, Histogram>(); - queueAllocatedMemoryCounterMap = new HashMap<String, Counter>(); - queueAllocatedVCoresCounterMap = new HashMap<String, Counter>(); + handleOperTimecostHistogramMap = new HashMap<>(); + queueAllocatedMemoryCounterMap = new HashMap<>(); + queueAllocatedVCoresCounterMap = new HashMap<>(); schedulerMetrics = wrapper.getSchedulerMetrics(); + metrics = schedulerMetrics.getMetrics(); port = metricsAddressPort; } public void start() throws Exception { - // static files final ResourceHandler staticHandler = new 
ResourceHandler(); staticHandler.setResourceBase("html"); Handler handler = new AbstractHandler() { @Override public void handle(String target, HttpServletRequest request, - HttpServletResponse response, int dispatch) { + HttpServletResponse response, int dispatch) + throws IOException, ServletException { try{ // timeunit int timeunit = 1000; // second, divide millionsecond / 1000 @@ -183,14 +182,14 @@ public class SLSWebApp extends HttpServlet { response.setStatus(HttpServletResponse.SC_OK); String simulateInfo; - if (SLSRunner.simulateInfoMap.isEmpty()) { + if (SLSRunner.getSimulateInfoMap().isEmpty()) { String empty = "<tr><td colspan='2' align='center'>" + "No information available</td></tr>"; simulateInfo = MessageFormat.format(simulateInfoTemplate, empty); } else { StringBuilder info = new StringBuilder(); for (Map.Entry<String, Object> entry : - SLSRunner.simulateInfoMap.entrySet()) { + SLSRunner.getSimulateInfoMap().entrySet()) { info.append("<tr>"); info.append("<td class='td1'>").append(entry.getKey()).append("</td>"); info.append("<td class='td2'>").append(entry.getValue()) @@ -221,7 +220,7 @@ public class SLSWebApp extends HttpServlet { response.setStatus(HttpServletResponse.SC_OK); // queues {0} - Set<String> queues = wrapper.getQueueSet(); + Set<String> queues = wrapper.getTracker().getQueueSet(); StringBuilder queueInfo = new StringBuilder(); int i = 0; @@ -260,7 +259,7 @@ public class SLSWebApp extends HttpServlet { // tracked queues {0} StringBuilder trackedQueueInfo = new StringBuilder(); - Set<String> trackedQueues = wrapper.getQueueSet(); + Set<String> trackedQueues = wrapper.getTracker().getQueueSet(); for(String queue : trackedQueues) { trackedQueueInfo.append("<option value='Queue ").append(queue) .append("'>").append(queue).append("</option>"); @@ -268,7 +267,7 @@ public class SLSWebApp extends HttpServlet { // tracked apps {1} StringBuilder trackedAppInfo = new StringBuilder(); - Set<String> trackedApps = wrapper.getTrackedAppSet(); + Set<String> trackedApps = wrapper.getTracker().getTrackedAppSet(); for(String job : trackedApps) { trackedAppInfo.append("<option value='Job ").append(job) .append("'>").append(job).append("</option>"); @@ -417,7 +416,7 @@ public class SLSWebApp extends HttpServlet { // allocated resource for each queue Map<String, Double> queueAllocatedMemoryMap = new HashMap<String, Double>(); Map<String, Long> queueAllocatedVCoresMap = new HashMap<String, Long>(); - for (String queue : wrapper.getQueueSet()) { + for (String queue : wrapper.getTracker().getQueueSet()) { // memory String key = "counter.queue." + queue + ".allocated.memory"; if (! queueAllocatedMemoryCounterMap.containsKey(queue) && @@ -457,7 +456,7 @@ public class SLSWebApp extends HttpServlet { .append(",\"cluster.available.memory\":").append(availableMemoryGB) .append(",\"cluster.available.vcores\":").append(availableVCoresGB); - for (String queue : wrapper.getQueueSet()) { + for (String queue : wrapper.getTracker().getQueueSet()) { sb.append(",\"queue.").append(queue).append(".allocated.memory\":") .append(queueAllocatedMemoryMap.get(queue)); sb.append(",\"queue.").append(queue).append(".allocated.vcores\":")
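A note on the template-loading change in SLSWebApp.java above: `cl.getResource(...).getFile()` yields a path that `FileUtils.readFileToString` cannot open once the templates are packaged inside a jar, whereas `getResourceAsStream` works both from a directory and from within a jar. A minimal sketch of the stream-based pattern, assuming commons-io is on the classpath (the helper class and its error handling are illustrative, not part of the patch):

    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.commons.io.IOUtils;

    public final class TemplateLoader {
      // Load a classpath resource (e.g., "html/simulate.html.template") as a
      // String, whether it sits on disk or inside a jar.
      static String load(String name) throws IOException {
        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        try (InputStream in = cl.getResourceAsStream(name)) {
          if (in == null) {
            throw new IOException("resource not found: " + name);
          }
          // uses the platform default charset; newer commons-io versions
          // prefer the overload that takes an explicit Charset
          return IOUtils.toString(in);
        }
      }
    }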
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md b/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md index 2cffc86..d1848e8 100644 --- a/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md +++ b/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md @@ -27,9 +27,11 @@ Yarn Scheduler Load Simulator (SLS) * [Metrics](#Metrics) * [Real-time Tracking](#Real-time_Tracking) * [Offline Analysis](#Offline_Analysis) + * [Synthetic Load Generator](#SynthGen) * [Appendix](#Appendix) * [Resources](#Resources) * [SLS JSON input file format](#SLS_JSON_input_file_format) + * [SYNTH JSON input file format](#SYNTH_JSON_input_file_format) * [Simulator input topology file format](#Simulator_input_topology_file_format) Overview @@ -72,7 +74,7 @@ The following figure illustrates the implementation architecture of the simulato ![The architecture of the simulator](images/sls_arch.png) -The simulator takes input of workload traces, and fetches the cluster and applications information. For each NM and AM, the simulator builds a simulator to simulate their running. All NM/AM simulators run in a thread pool. The simulator reuses Yarn Resource Manager, and builds a wrapper out of the scheduler. The Scheduler Wrapper can track the scheduler behaviors and generates several logs, which are the outputs of the simulator and can be further analyzed. +The simulator takes as input either workload traces or synthetic load distributions, and generates the cluster and application information. For each NM and AM, the simulator builds a simulator to simulate their running. All NM/AM simulators run in a thread pool. The simulator reuses Yarn Resource Manager, and builds a wrapper out of the scheduler. The Scheduler Wrapper can track the scheduler behaviors and generates several logs, which are the outputs of the simulator and can be further analyzed. ### Usecases @@ -97,7 +99,7 @@ This section will show how to use the simulator. Here let `$HADOOP_ROOT` represe * `bin`: contains running scripts for the simulator. -* `html`: contains several html/css/js files we needed for real-time tracking. +* `html`: contains the html/css/js files needed for real-time tracking. The same charts can be reproduced offline: open `$HADOOP_ROOT/share/hadoop/tools/sls/html/showSimulationTrace.html` and upload the `realtimetrack.json`. Due to browser security restrictions, `realtimetrack.json` and `showSimulationTrace.html` must be placed in the same directory. * `sample-conf`: specifies the simulator configurations. @@ -179,17 +181,30 @@ The simulator supports two types of input files: the rumen traces and its own in $ cd $HADOOP_ROOT/share/hadoop/tools/sls $ bin/slsrun.sh - --input-rumen |--input-sls=<TRACE_FILE1,TRACE_FILE2,...> - --output-dir=<SLS_SIMULATION_OUTPUT_DIRECTORY> [--nodes=<SLS_NODES_FILE>] - [--track-jobs=<JOBID1,JOBID2,...>] [--print-simulation] + Usage: slsrun.sh <OPTIONS> + --tracetype=<SYNTH | SLS | RUMEN> + --tracelocation=<FILE1,FILE2,...> + (deprecated --input-rumen=<FILE1,FILE2,...> | --input-sls=<FILE1,FILE2,...>) + --output-dir=<SLS_SIMULATION_OUTPUT_DIRECTORY> + [--nodes=<SLS_NODES_FILE>] + [--track-jobs=<JOBID1,JOBID2,...>] + [--print-simulation] + * `--input-rumen`: The input rumen trace files. Users can input multiple files, separated by comma.
One example trace is provided in `$HADOOP_ROOT/share/hadoop/tools/sls/sample-data/2jobs2min-rumen-jh.json`. + This is equivalent to `--tracetype=RUMEN --tracelocation=<path_to_trace>`. * `--input-sls`: The simulator's own file format. The simulator also provides a tool to convert rumen traces to sls traces (`rumen2sls.sh`). Refer to the appendix for an example of the sls input json file. + This is equivalent to `--tracetype=SLS --tracelocation=<path_to_trace>`. + +* `--tracetype`: The new way to configure trace generation; takes the values RUMEN, SLS, or SYNTH to trigger the three types of load generation. + +* `--tracelocation`: Path to the input file, matching the tracetype above. * `--output-dir`: The output directory for generated running logs and metrics. @@ -281,30 +296,57 @@ After the simulator finishes, all logs are saved in the output directory specifi Users can also reproduce those real-time tracking charts in offline mode: open `$HADOOP_ROOT/share/hadoop/tools/sls/html/showSimulationTrace.html` and upload the `realtimetrack.json`. Due to browser security restrictions, `realtimetrack.json` and `showSimulationTrace.html` must be placed in the same directory. + +Synthetic Load Generator +------------------------ +The Synthetic Load Generator complements the extensive nature of SLS-native and RUMEN traces by providing distribution-driven generation of load. The load generator is organized as a JobStoryProducer (compatible with rumen, and thus with gridmix for later integration). We seed the random number generator so that results are randomized but deterministic, and hence reproducible. We organize the generated jobs around a */workloads/job_class* hierarchy, which allows us to easily group jobs with similar behaviors and categorize them (e.g., jobs with long-running containers, or map-only computations, etc.). The user can control the average and standard deviation of many of the important parameters, such as the number of mappers/reducers, the duration of mappers/reducers, the size (mem/cpu) of containers, the chance of reservation, etc. We use weighted-random sampling (whenever we pick among a small number of options) or LogNormal distributions (to avoid negative values) when we pick from wide ranges of values; see the appendix on LogNormal distributions. + +The SYNTH mode of SLS is very convenient for generating very large loads without the need for extensive input files. This makes it easy to explore a wide range of use cases (e.g., simulating 100k jobs, and in different runs simply tuning the average number of mappers or the average task duration) in an efficient and compact way. +
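To make the sampling machinery above concrete, the sketch below shows weighted-random selection plus a moment-matched LogNormal sampler. It assumes commons-math3 is available; the class name `SamplingSketch` and the moment-matching from (avg, stddev) to the distribution's (scale, shape) parameters are illustrative assumptions, not necessarily the exact internals of the SLS generator.

    import java.util.Random;
    import org.apache.commons.math3.distribution.LogNormalDistribution;

    public class SamplingSketch {
      // Pick an index with probability proportional to its weight
      // (e.g., choosing a workload, then a job class within it).
      static int weightedPick(double[] weights, Random rnd) {
        double total = 0;
        for (double w : weights) {
          total += w;
        }
        double r = rnd.nextDouble() * total;
        for (int i = 0; i < weights.length; i++) {
          r -= weights[i];
          if (r < 0) {
            return i;
          }
        }
        return weights.length - 1;
      }

      // Moment-matched LogNormal: strictly positive samples whose mean and
      // stddev approximate the requested values (e.g., mtasks_avg/stddev).
      static LogNormalDistribution logNormal(double avg, double stddev) {
        double shape2 = Math.log(1 + (stddev * stddev) / (avg * avg));
        double scale = Math.log(avg) - shape2 / 2;
        return new LogNormalDistribution(scale, Math.sqrt(shape2));
      }
    }

Because every LogNormal sample is positive, parameters such as task counts and durations never go negative, which is exactly the property the paragraph above calls out.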
{ - "am.type" : "mapreduce", - "job.start.ms" : 0, - "job.end.ms" : 95375, - "job.queue.name" : "sls_queue_1", - "job.id" : "job_1", - "job.user" : "default", + "num.nodes": 3, // total number of nodes in the cluster + "num.racks": 1 // total number of racks in the cluster, it divides num.nodes into the racks evenly, optional, the default value is 1 + } + { + "am.type" : "mapreduce", // type of AM, optional, the default value is "mapreduce" + "job.start.ms" : 0, // job start time + "job.end.ms" : 95375, // job finish time, optional, the default value is 0 + "job.queue.name" : "sls_queue_1", // the queue job will be submitted to + "job.id" : "job_1", // the job id used to track the job, optional. The default value, an zero-based integer increasing with number of jobs, is used if this is not specified or job.count > 1 + "job.user" : "default", // user, optional, the default value is "default" + "job.count" : 1, // number of jobs, optional, the default value is 1 "job.tasks" : [ { - "container.host" : "/default-rack/node1", - "container.start.ms" : 6664, - "container.end.ms" : 23707, - "container.priority" : 20, - "container.type" : "map" + "count": 1, // number of tasks, optional, the default value is 1 + "container.host" : "/default-rack/node1", // host the container asks for + "container.start.ms" : 6664, // container start time, optional + "container.end.ms" : 23707, // container finish time, optional + "duration.ms": 50000, // duration of the container, optional if start and end time is specified + "container.priority" : 20, // priority of the container, optional, the default value is 20 + "container.type" : "map" // type of the container, could be "map" or "reduce", optional, the default value is "map" }, { "container.host" : "/default-rack/node3", "container.start.ms" : 6665, @@ -341,6 +383,77 @@ Here we provide an example format of the sls json file, which contains 2 jobs. T } ] } + +### SYNTH JSON input file format +Here we provide an example format of the synthetic generator json file. We use *(json-non-conforming)* inline comments to explain the use of each parameter. + + { + "description" : "tiny jobs workload", //description of the meaning of this collection of workloads + "num_nodes" : 10, //total nodes in the simulated cluster + "nodes_per_rack" : 4, //number of nodes in each simulated rack + "num_jobs" : 10, // total number of jobs being simulated + "rand_seed" : 2, //the random seed used for deterministic randomized runs + + // a list of âworkloadsâ, each of which has job classes, and temporal properties + "workloads" : [ + { + "workload_name" : "tiny-test", // name of the workload + "workload_weight": 0.5, // used for weighted random selection of which workload to sample from + "queue_name" : "sls_queue_1", //queue the job will be submitted to + + //different classes of jobs for this workload + "job_classes" : [ + { + "class_name" : "class_1", //name of the class + "class_weight" : 1.0, //used for weighted random selection of class within workload + + //nextr group controls average and standard deviation of a LogNormal distribution that + //determines the number of mappers and reducers for thejob. 
+ "mtasks_avg" : 5, + "mtasks_stddev" : 1, + "rtasks_avg" : 5, + "rtasks_stddev" : 1, + + //averge and stdev input param of LogNormal distribution controlling job duration + "dur_avg" : 60, + "dur_stddev" : 5, + + //averge and stdev input param of LogNormal distribution controlling mappers and reducers durations + "mtime_avg" : 10, + "mtime_stddev" : 2, + "rtime_avg" : 20, + "rtime_stddev" : 4, + + //averge and stdev input param of LogNormal distribution controlling memory and cores for map and reduce + "map_max_memory_avg" : 1024, + "map_max_memory_stddev" : 0.001, + "reduce_max_memory_avg" : 2048, + "reduce_max_memory_stddev" : 0.001, + "map_max_vcores_avg" : 1, + "map_max_vcores_stddev" : 0.001, + "reduce_max_vcores_avg" : 2, + "reduce_max_vcores_stddev" : 0.001, + + //probability of running this job with a reservation + "chance_of_reservation" : 0.5, + //input parameters of LogNormal distribution that determines the deadline slack (as a multiplier of job duration) + "deadline_factor_avg" : 10.0, + "deadline_factor_stddev" : 0.001, + } + ], + // for each workload determines with what probability each time bucket is picked to choose the job starttime. + // In the example below the jobs have twice as much chance to start in the first minute than in the second minute + // of simulation, and then zero chance thereafter. + "time_distribution" : [ + { "time" : 1, "weight" : 66 }, + { "time" : 60, "weight" : 33 }, + { "time" : 120, "jobs" : 0 } + ] + } + ] + } + + ### Simulator input topology file format Here is an example input topology file which has 3 nodes organized in 1 rack. @@ -355,3 +468,9 @@ Here is an example input topology file which has 3 nodes organized in 1 rack. "node" : "node3" }] } + +### Notes on LogNormal distribution: +LogNormal distributions represent well many of the parameters we see in practice (e.g., most jobs have +a small number of mappers, but few might be very large, and few very small, but greater than zero. It is +however worth noticing that it might be tricky to use, as the average is typically on the right side of the +peak (most common value) of the distribution, because the distribution has a one-side tail. http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java new file mode 100644 index 0000000..6b369f2 --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java new file mode 100644 index 0000000..6b369f2 --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.sls; + +import net.jcip.annotations.NotThreadSafe; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.MetricsInvariantChecker; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +/** + * This is a base class to ease the implementation of SLS-based tests. + */ +@RunWith(value = Parameterized.class) +@NotThreadSafe +@SuppressWarnings("VisibilityModifier") +public abstract class BaseSLSRunnerTest { + + @Parameter(value = 0) + public String schedulerType; + + @Parameter(value = 1) + public String traceType; + + @Parameter(value = 2) + public String traceLocation; + + @Parameter(value = 3) + public String nodeFile; + + protected SLSRunner sls; + protected String ongoingInvariantFile; + protected String exitInvariantFile; + + @Before + public abstract void setup(); + + @After + public void tearDown() throws InterruptedException { + sls.stop(); + } + + public void runSLS(Configuration conf, long timeout) throws Exception { + File tempDir = new File("target", UUID.randomUUID().toString()); + final List<Throwable> exceptionList = + Collections.synchronizedList(new ArrayList<Throwable>()); + + Thread.setDefaultUncaughtExceptionHandler( + new Thread.UncaughtExceptionHandler() { + @Override + public void uncaughtException(Thread t, Throwable e) { + e.printStackTrace(); + exceptionList.add(e); + } + }); + + // start the simulator + File slsOutputDir = new File(tempDir.getAbsolutePath() + "/slsoutput/"); + + String[] args; + + switch (traceType) { + case "OLD_SLS": + args = new String[] {"-inputsls", traceLocation, "-output", + slsOutputDir.getAbsolutePath() }; + break; + case "OLD_RUMEN": + args = new String[] {"-inputrumen", traceLocation, "-output", + slsOutputDir.getAbsolutePath() }; + break; + default: + args = new String[] {"-tracetype", traceType, "-tracelocation", + traceLocation, "-output", slsOutputDir.getAbsolutePath() }; + } + + if (nodeFile != null) { + args = ArrayUtils.addAll(args, new String[] {"-nodes", nodeFile }); + } + + // enable continuous invariant checks + conf.set(YarnConfiguration.RM_SCHEDULER, schedulerType); + if (ongoingInvariantFile != null) { + conf.set(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES, + MetricsInvariantChecker.class.getCanonicalName()); + conf.set(MetricsInvariantChecker.INVARIANTS_FILE, ongoingInvariantFile); + conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true); + } + + sls = new SLSRunner(conf); + sls.run(args); + + // wait for timeout seconds before stop, unless there is an uncaught + // exception in which case we fail fast. + while (timeout >= 0) { + Thread.sleep(1000); + + if (!exceptionList.isEmpty()) { + sls.stop(); + Assert.fail("TestSLSRunner caught exception from child thread " + + "(TaskRunner.Task): " + exceptionList); + break; + } + timeout--; + } + shutdownHookInvariantCheck(); + } + + /** + * Checks exit invariants (e.g., number of apps submitted, completed, etc.).
+ */ + private void shutdownHookInvariantCheck() { + + if(exitInvariantFile!=null) { + MetricsInvariantChecker ic = new MetricsInvariantChecker(); + Configuration conf = new Configuration(); + conf.set(MetricsInvariantChecker.INVARIANTS_FILE, exitInvariantFile); + conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true); + ic.init(conf, null, null); + ic.editSchedule(); + } + } + +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java new file mode 100644 index 0000000..22e1e2e --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.sls; + +import java.util.Arrays; +import java.util.Collection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.InvariantsChecker; +import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.ReservationInvariantsChecker; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import net.jcip.annotations.NotThreadSafe; + +/** + * This test performs an SLS run enabling a + * {@code ReservationInvariantsChecker}. 
+ */ +@RunWith(value = Parameterized.class) +@NotThreadSafe +public class TestReservationSystemInvariants extends BaseSLSRunnerTest { + + @Parameters(name = "Testing with: {1}, {0}, (nodeFile {3})") + public static Collection<Object[]> data() { + // Test with both schedulers, using the SYNTH trace type + return Arrays.asList(new Object[][] { + {CapacityScheduler.class.getCanonicalName(), "SYNTH", + "src/test/resources/syn.json", null}, + {FairScheduler.class.getCanonicalName(), "SYNTH", + "src/test/resources/syn.json", null} + }); + } + + @Test(timeout = 120000) + @SuppressWarnings("all") + public void testSimulatorRunning() throws Exception { + + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.RM_SCHEDULER, schedulerType); + conf.setBoolean(YarnConfiguration.RM_SCHEDULER_ENABLE_MONITORS, true); + conf.set(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES, + ReservationInvariantsChecker.class.getCanonicalName()); + conf.setBoolean(InvariantsChecker.THROW_ON_VIOLATION, true); + + + long timeTillShutDownInSec = 90; + runSLS(conf, timeTillShutDownInSec); + + } + + @Override + public void setup() { + + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java index 9da8ef3..5ab893d 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java @@ -18,53 +18,72 @@ package org.apache.hadoop.yarn.sls; -import org.junit.Assert; +import net.jcip.annotations.NotThreadSafe; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; +import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.*; -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.UUID; +import java.util.*; -public class TestSLSRunner { +/** + * This test performs simple runs of the SLS with different trace types and + * schedulers.
+ */ +@RunWith(value = Parameterized.class) +@NotThreadSafe +public class TestSLSRunner extends BaseSLSRunnerTest { - @Test - @SuppressWarnings("all") - public void testSimulatorRunning() throws Exception { - File tempDir = new File("target", UUID.randomUUID().toString()); - final List<Throwable> exceptionList = - Collections.synchronizedList(new ArrayList<Throwable>()); + @Parameters(name = "Testing with: {1}, {0}, (nodeFile {3})") + public static Collection<Object[]> data() { - Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() { - @Override - public void uncaughtException(Thread t, Throwable e) { - exceptionList.add(e); - } - }); + String capScheduler = CapacityScheduler.class.getCanonicalName(); + String fairScheduler = FairScheduler.class.getCanonicalName(); + String slsTraceFile = "src/test/resources/inputsls.json"; + String rumenTraceFile = "src/main/data/2jobs2min-rumen-jh.json"; + String synthTraceFile = "src/test/resources/syn.json"; + String nodeFile = "src/test/resources/nodes.json"; + + // Test with both schedulers, and all three load producers. + return Arrays.asList(new Object[][] { - // start the simulator - File slsOutputDir = new File(tempDir.getAbsolutePath() + "/slsoutput/"); - String args[] = new String[]{ - "-inputrumen", "src/main/data/2jobs2min-rumen-jh.json", - "-output", slsOutputDir.getAbsolutePath()}; - SLSRunner.main(args); + // covering old commandline in tests + {capScheduler, "OLD_RUMEN", rumenTraceFile, nodeFile }, + {capScheduler, "OLD_SLS", slsTraceFile, nodeFile }, - // wait for 20 seconds before stop - int count = 20; - while (count >= 0) { - Thread.sleep(1000); + // covering the no nodeFile case + {capScheduler, "SYNTH", synthTraceFile, null }, + {capScheduler, "RUMEN", rumenTraceFile, null }, + {capScheduler, "SLS", slsTraceFile, null }, - if (! 
exceptionList.isEmpty()) { - SLSRunner.getRunner().stop(); - Assert.fail("TestSLSRunner catched exception from child thread " + - "(TaskRunner.Task): " + exceptionList.get(0).getMessage()); - break; - } - count--; - } + // covering new commandline and CapacityScheduler + {capScheduler, "SYNTH", synthTraceFile, nodeFile }, + {capScheduler, "RUMEN", rumenTraceFile, nodeFile }, + {capScheduler, "SLS", slsTraceFile, nodeFile }, - SLSRunner.getRunner().stop(); + // covering FairScheduler + {fairScheduler, "SYNTH", synthTraceFile, nodeFile }, + {fairScheduler, "RUMEN", rumenTraceFile, nodeFile }, + {fairScheduler, "SLS", slsTraceFile, nodeFile } + }); + } + + @Before + public void setup() { + ongoingInvariantFile = "src/test/resources/ongoing-invariants.txt"; + exitInvariantFile = "src/test/resources/exit-invariants.txt"; + } + + @Test(timeout = 120000) + @SuppressWarnings("all") + public void testSimulatorRunning() throws Exception { + Configuration conf = new Configuration(false); + long timeTillShutdownInsec = 20L; + runSLS(conf, timeTillShutdownInsec); } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java new file mode 100644 index 0000000..2b1971a --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.sls; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.tools.rumen.TaskAttemptInfo; +import org.apache.hadoop.yarn.sls.synthetic.SynthJob; +import org.apache.hadoop.yarn.sls.synthetic.SynthTraceJobProducer; +import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.log4j.Logger; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; + +import static org.junit.Assert.assertTrue; + +/** + * Simple test class driving the {@code SynthTraceJobProducer}, and validating + * that the jobs produced are within the expected ranges.
+ */ +public class TestSynthJobGeneration { + + public final static Logger LOG = + Logger.getLogger(TestSynthJobGeneration.class); + + @Test + public void test() throws IllegalArgumentException, IOException { + + Configuration conf = new Configuration(); + + conf.set(SynthTraceJobProducer.SLS_SYNTHETIC_TRACE_FILE, + "src/test/resources/syn.json"); + + SynthTraceJobProducer stjp = new SynthTraceJobProducer(conf); + + SynthJob js = (SynthJob) stjp.getNextJob(); + + int jobCount = 0; + + while (js != null) { + LOG.info((jobCount++) + " " + js.getQueueName() + " -- " + + js.getJobClass().getClassName() + " (conf: " + + js.getJobConf().get(MRJobConfig.QUEUE_NAME) + ") " + " submission: " + + js.getSubmissionTime() + ", " + " duration: " + js.getDuration() + + " numMaps: " + js.getNumberMaps() + " numReduces: " + + js.getNumberReduces()); + + validateJob(js); + js = (SynthJob) stjp.getNextJob(); + } + + Assert.assertEquals(stjp.getNumJobs(), jobCount); + } + + private void validateJob(SynthJob js) { + + assertTrue(js.getSubmissionTime() > 0); + assertTrue(js.getDuration() > 0); + assertTrue(js.getNumberMaps() >= 0); + assertTrue(js.getNumberReduces() >= 0); + assertTrue(js.getNumberMaps() + js.getNumberReduces() > 0); + assertTrue(js.getTotalSlotTime() >= 0); + + for (int i = 0; i < js.getNumberMaps(); i++) { + TaskAttemptInfo tai = js.getTaskAttemptInfo(TaskType.MAP, i, 0); + assertTrue(tai.getRuntime() > 0); + } + + for (int i = 0; i < js.getNumberReduces(); i++) { + TaskAttemptInfo tai = js.getTaskAttemptInfo(TaskType.REDUCE, i, 0); + assertTrue(tai.getRuntime() > 0); + } + + if (js.hasDeadline()) { + assertTrue(js.getDeadline() > js.getSubmissionTime() + js.getDuration()); + } + + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java index fd1c861..02dc26e 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java @@ -17,32 +17,62 @@ */ package org.apache.hadoop.yarn.sls.appmaster; +import com.codahale.metrics.MetricRegistry; +import org.apache.commons.io.FileUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; import org.apache.hadoop.yarn.sls.conf.SLSConfiguration; -import org.apache.hadoop.yarn.sls.scheduler.ContainerSimulator; +import org.apache.hadoop.yarn.sls.scheduler.*; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; +@RunWith(Parameterized.class) public class TestAMSimulator { private ResourceManager rm; private 
YarnConfiguration conf; + private Path metricOutputDir; + + private Class slsScheduler; + private Class scheduler; + + @Parameterized.Parameters + public static Collection<Object[]> params() { + return Arrays.asList(new Object[][] { + {SLSFairScheduler.class, FairScheduler.class}, + {SLSCapacityScheduler.class, CapacityScheduler.class} + }); + } + + public TestAMSimulator(Class slsScheduler, Class scheduler) { + this.slsScheduler = slsScheduler; + this.scheduler = scheduler; + } @Before public void setup() { + createMetricOutputDir(); + conf = new YarnConfiguration(); - conf.set(YarnConfiguration.RM_SCHEDULER, - "org.apache.hadoop.yarn.sls.scheduler.ResourceSchedulerWrapper"); - conf.set(SLSConfiguration.RM_SCHEDULER, - "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"); - conf.setBoolean(SLSConfiguration.METRICS_SWITCH, false); + conf.set(SLSConfiguration.METRICS_OUTPUT_DIR, metricOutputDir.toString()); + conf.set(YarnConfiguration.RM_SCHEDULER, slsScheduler.getName()); + conf.set(SLSConfiguration.RM_SCHEDULER, scheduler.getName()); + conf.setBoolean(SLSConfiguration.METRICS_SWITCH, true); rm = new ResourceManager(); rm.init(conf); rm.start(); @@ -64,14 +94,51 @@ public class TestAMSimulator { } } + private void verifySchedulerMetrics(String appId) { + if (scheduler.equals(FairScheduler.class)) { + SchedulerMetrics schedulerMetrics = ((SchedulerWrapper) + rm.getResourceScheduler()).getSchedulerMetrics(); + MetricRegistry metricRegistry = schedulerMetrics.getMetrics(); + for (FairSchedulerMetrics.Metric metric : + FairSchedulerMetrics.Metric.values()) { + String key = "variable.app." + appId + "." + metric.getValue() + + ".memory"; + Assert.assertTrue(metricRegistry.getGauges().containsKey(key)); + Assert.assertNotNull(metricRegistry.getGauges().get(key).getValue()); + } + } + } + + private void createMetricOutputDir() { + Path testDir = Paths.get(System.getProperty("test.build.data")); + try { + metricOutputDir = Files.createTempDirectory(testDir, "output"); + } catch (IOException e) { + Assert.fail(e.toString()); + } + } + + private void deleteMetricOutputDir() { + try { + FileUtils.deleteDirectory(metricOutputDir.toFile()); + } catch (IOException e) { + Assert.fail(e.toString()); + } + } + @Test public void testAMSimulator() throws Exception { // Register one app MockAMSimulator app = new MockAMSimulator(); - List<ContainerSimulator> containers = new ArrayList<ContainerSimulator>(); - app.init(1, 1000, containers, rm, null, 0, 1000000l, "user1", "default", - false, "app1"); + String appId = "app1"; + String queue = "default"; + List<ContainerSimulator> containers = new ArrayList<>(); + app.init(1000, containers, rm, null, 0, 1000000L, "user1", queue, true, + appId, null, 0, SLSConfiguration.getAMContainerResource(conf)); app.firstStep(); + + verifySchedulerMetrics(appId); + Assert.assertEquals(1, rm.getRMContext().getRMApps().size()); Assert.assertNotNull(rm.getRMContext().getRMApps().get(app.appId)); @@ -82,5 +149,7 @@ public class TestAMSimulator { @After public void tearDown() { rm.stop(); + + deleteMetricOutputDir(); } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java 
b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java index f9a3932..2f10f7d 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java @@ -21,26 +21,50 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.sls.conf.SLSConfiguration; +import org.apache.hadoop.yarn.sls.scheduler.SLSCapacityScheduler; +import org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler; import org.apache.hadoop.yarn.util.resource.Resources; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import java.util.Arrays; +import java.util.Collection; + +@RunWith(Parameterized.class) public class TestNMSimulator { private final int GB = 1024; private ResourceManager rm; private YarnConfiguration conf; + private Class slsScheduler; + private Class scheduler; + + @Parameterized.Parameters + public static Collection<Object[]> params() { + return Arrays.asList(new Object[][] { + {SLSFairScheduler.class, FairScheduler.class}, + {SLSCapacityScheduler.class, CapacityScheduler.class} + }); + } + + public TestNMSimulator(Class slsScheduler, Class scheduler) { + this.slsScheduler = slsScheduler; + this.scheduler = scheduler; + } + @Before public void setup() { conf = new YarnConfiguration(); - conf.set(YarnConfiguration.RM_SCHEDULER, - "org.apache.hadoop.yarn.sls.scheduler.ResourceSchedulerWrapper"); - conf.set(SLSConfiguration.RM_SCHEDULER, - "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"); + conf.set(YarnConfiguration.RM_SCHEDULER, slsScheduler.getName()); + conf.set(SLSConfiguration.RM_SCHEDULER, scheduler.getName()); conf.setBoolean(SLSConfiguration.METRICS_SWITCH, false); rm = new ResourceManager(); rm.init(conf); http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java index 23f2bb6..ce6c1b3 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java @@ -35,7 +35,7 @@ public class TestTaskRunner { } @After - public void cleanUp() { + public void cleanUp() throws InterruptedException { runner.stop(); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java 
b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java index f4eda67..30964a1 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java @@ -21,6 +21,9 @@ package org.apache.hadoop.yarn.sls.utils; import org.junit.Assert; import org.junit.Test; +import java.util.HashSet; +import java.util.Set; + public class TestSLSUtils { @Test @@ -36,4 +39,31 @@ public class TestSLSUtils { Assert.assertEquals(rackHostname[1], "node1"); } + @Test + public void testGenerateNodes() { + Set<? extends String> nodes = SLSUtils.generateNodes(3, 3); + Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size()); + Assert.assertEquals("Number of racks is wrong.", 3, getNumRack(nodes)); + + nodes = SLSUtils.generateNodes(3, 1); + Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size()); + Assert.assertEquals("Number of racks is wrong.", 1, getNumRack(nodes)); + + nodes = SLSUtils.generateNodes(3, 4); + Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size()); + Assert.assertEquals("Number of racks is wrong.", 3, getNumRack(nodes)); + + nodes = SLSUtils.generateNodes(3, 0); + Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size()); + Assert.assertEquals("Number of racks is wrong.", 1, getNumRack(nodes)); + } + + private int getNumRack(Set<? extends String> nodes) { + Set<String> racks = new HashSet<>(); + for (String node : nodes) { + String[] rackHostname = SLSUtils.getRackHostName(node); + racks.add(rackHostname[0]); + } + return racks.size(); + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java index 1c1e63c..c9be450 100644 --- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java +++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java @@ -20,7 +20,6 @@ package org.apache.hadoop.yarn.sls.web; import org.junit.Assert; import org.apache.commons.io.FileUtils; -import org.apache.hadoop.yarn.sls.SLSRunner; import org.junit.Test; import java.io.File; @@ -28,6 +27,7 @@ import java.text.MessageFormat; import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.HashMap; public class TestSLSWebApp { @@ -36,20 +36,21 @@ public class TestSLSWebApp { String simulateInfoTemplate = FileUtils.readFileToString( new File("src/main/html/simulate.info.html.template")); - SLSRunner.simulateInfoMap.put("Number of racks", 10); - SLSRunner.simulateInfoMap.put("Number of nodes", 100); - SLSRunner.simulateInfoMap.put("Node memory (MB)", 1024); - SLSRunner.simulateInfoMap.put("Node VCores", 1); - SLSRunner.simulateInfoMap.put("Number of applications", 100); - SLSRunner.simulateInfoMap.put("Number of tasks", 1000); - SLSRunner.simulateInfoMap.put("Average tasks per applicaion", 10); - SLSRunner.simulateInfoMap.put("Number of queues", 4); - SLSRunner.simulateInfoMap.put("Average applications per queue", 25); - SLSRunner.simulateInfoMap.put("Estimated simulate time (s)", 10000); + Map<String, Object> simulateInfoMap = new HashMap<>(); + simulateInfoMap.put("Number of racks", 10); + 
simulateInfoMap.put("Number of nodes", 100); + simulateInfoMap.put("Node memory (MB)", 1024); + simulateInfoMap.put("Node VCores", 1); + simulateInfoMap.put("Number of applications", 100); + simulateInfoMap.put("Number of tasks", 1000); + simulateInfoMap.put("Average tasks per applicaion", 10); + simulateInfoMap.put("Number of queues", 4); + simulateInfoMap.put("Average applications per queue", 25); + simulateInfoMap.put("Estimated simulate time (s)", 10000); StringBuilder info = new StringBuilder(); for (Map.Entry<String, Object> entry : - SLSRunner.simulateInfoMap.entrySet()) { + simulateInfoMap.entrySet()) { info.append("<tr>"); info.append("<td class='td1'>" + entry.getKey() + "</td>"); info.append("<td class='td2'>" + entry.getValue() + "</td>"); @@ -60,8 +61,7 @@ public class TestSLSWebApp { MessageFormat.format(simulateInfoTemplate, info.toString()); Assert.assertTrue("The simulate info html page should not be empty", simulateInfo.length() > 0); - for (Map.Entry<String, Object> entry : - SLSRunner.simulateInfoMap.entrySet()) { + for (Map.Entry<String, Object> entry : simulateInfoMap.entrySet()) { Assert.assertTrue("The simulate info html page should have information " + "of " + entry.getKey(), simulateInfo.contains("<td class='td1'>" + entry.getKey() + "</td><td class='td2'>" http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml b/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml index 61be96a..1762265 100644 --- a/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml +++ b/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml @@ -39,6 +39,16 @@ </property> <property> + <name>yarn.scheduler.capacity.root.sls_queue_1.reservable</name> + <value>true</value> + </property> + + <property> + <name>yarn.scheduler.capacity.root.sls_queue_1.show-reservations-as-queues</name> + <value>true</value> + </property> + + <property> <name>yarn.scheduler.capacity.root.sls_queue_2.capacity</name> <value>25</value> </property> http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt b/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt new file mode 100644 index 0000000..b4a3228 --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt @@ -0,0 +1,8 @@ +ActiveApplications >= 0 +AppsCompleted >= 0 +AppsFailed >= 0 +AppsKilled >= 0 +AppsPending >= 0 +AppsRunning >= 0 +AppsSubmitted >= 0 +PendingContainers >= 0 http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml b/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml index fa10359..7c46767 100644 --- a/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml +++ b/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml @@ -21,6 +21,7 @@ --> <allocations> + <defaultQueueSchedulingPolicy>drf</defaultQueueSchedulingPolicy> <user name="jenkins"> <!-- Limit on running jobs for the user across all pools. 
If more jobs than this are submitted, only the first <maxRunningJobs> will @@ -31,20 +32,21 @@ <userMaxAppsDefault>1000</userMaxAppsDefault> <queue name="sls_queue_1"> <minResources>1024 mb, 1 vcores</minResources> - <schedulingMode>fair</schedulingMode> + <schedulingPolicy>drf</schedulingPolicy> <weight>0.25</weight> <minSharePreemptionTimeout>2</minSharePreemptionTimeout> + <reservation>true</reservation> </queue> <queue name="sls_queue_2"> <minResources>1024 mb, 1 vcores</minResources> - <schedulingMode>fair</schedulingMode> + <schedulingMode>drf</schedulingMode> <weight>0.25</weight> <minSharePreemptionTimeout>2</minSharePreemptionTimeout> </queue> <queue name="sls_queue_3"> <minResources>1024 mb, 1 vcores</minResources> <weight>0.5</weight> - <schedulingMode>fair</schedulingMode> + <schedulingMode>drf</schedulingMode> <minSharePreemptionTimeout>2</minSharePreemptionTimeout> </queue> </allocations> http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json b/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json new file mode 100644 index 0000000..b9d46a5 --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json @@ -0,0 +1,55 @@ +{ + "am.type": "mapreduce", + "job.start.ms": 0, + "job.end.ms": 95375, + "job.queue.name": "sls_queue_1", + "job.id": "job_1", + "job.user": "default", + "job.tasks": [ + { + "container.host": "/default-rack/node1", + "container.start.ms": 6664, + "container.end.ms": 23707, + "container.priority": 20, + "container.type": "map" + }, + { + "container.host": "/default-rack/node3", + "container.start.ms": 6665, + "container.end.ms": 21593, + "container.priority": 20, + "container.type": "map" + }, + { + "container.host": "/default-rack/node2", + "container.start.ms": 68770, + "container.end.ms": 86613, + "container.priority": 20, + "container.type": "map" + } + ] +} +{ + "am.type": "mapreduce", + "job.start.ms": 105204, + "job.end.ms": 197256, + "job.queue.name": "sls_queue_2", + "job.id": "job_2", + "job.user": "default", + "job.tasks": [ + { + "container.host": "/default-rack/node1", + "container.start.ms": 111822, + "container.end.ms": 133985, + "container.priority": 20, + "container.type": "map" + }, + { + "container.host": "/default-rack/node2", + "container.start.ms": 111788, + "container.end.ms": 131377, + "container.priority": 20, + "container.type": "map" + } + ] +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties b/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties new file mode 100644 index 0000000..81a3f6a --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties @@ -0,0 +1,19 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# log4j configuration used during build and unit tests + +log4j.rootLogger=info,stdout +log4j.threshold=ALL +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %c{2} (%F:%M(%L)) - %m%n http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/nodes.json ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/nodes.json b/hadoop-tools/hadoop-sls/src/test/resources/nodes.json new file mode 100644 index 0000000..3039554 --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/resources/nodes.json @@ -0,0 +1,84 @@ +{ + "rack": "rack1", + "nodes": [ + { + "node": "node1" + }, + { + "node": "node2" + }, + { + "node": "node3" + }, + { + "node": "node4" + } + ] +} +{ + "rack": "rack2", + "nodes": [ + { + "node": "node5" + }, + { + "node": "node6" + }, + { + "node": "node7" + }, + { + "node": "node8" + } + ] +} +{ + "rack": "rack3", + "nodes": [ + { + "node": "node9" + }, + { + "node": "node10" + }, + { + "node": "node11" + }, + { + "node": "node12" + } + ] +} +{ + "rack": "rack4", + "nodes": [ + { + "node": "node13" + }, + { + "node": "node14" + }, + { + "node": "node15" + }, + { + "node": "node16" + } + ] +} +{ + "rack": "rack5", + "nodes": [ + { + "node": "node17" + }, + { + "node": "node18" + }, + { + "node": "node19" + }, + { + } + ] +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt b/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt new file mode 100644 index 0000000..363ed0d --- /dev/null +++ b/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt @@ -0,0 +1,54 @@ +running_0 >= 0 +running_60 >= 0 +running_300 >= 0 +running_1440 >= 0 +AppsSubmitted >= 0 +AppsRunning >= 0 +AppsPending >= 0 +AppsCompleted >= 0 +AppsKilled >= 0 +AppsFailed >= 0 +AllocatedMB >= 0 +AllocatedVCores >= 0 +AllocatedContainers >= 0 +AggregateContainersAllocated >= 0 +AggregateNodeLocalContainersAllocated >= 0 +AggregateRackLocalContainersAllocated >= 0 +AggregateOffSwitchContainersAllocated >= 0 +AggregateContainersReleased >= 0 +AggregateContainersPreempted >= 0 +AvailableMB >= 0 +AvailableVCores >= 0 +PendingMB >= 0 +PendingVCores >= 0 +PendingContainers >= 0 +ReservedMB >= 0 +ReservedVCores >= 0 +ReservedContainers >= 0 +ActiveUsers >= 0 +ActiveApplications >= 0 +AppAttemptFirstContainerAllocationDelayNumOps >= 0 +AppAttemptFirstContainerAllocationDelayAvgTime >= 0 +MemNonHeapUsedM >= 0 +MemNonHeapCommittedM >= 0 +MemNonHeapMaxM >= 0 || MemNonHeapMaxM == -1 +MemHeapUsedM >= 0 +MemHeapCommittedM >= 0 +MemHeapMaxM >= 0 +MemMaxM >= 0 +GcCountPS_Scavenge >= 0 +GcTimeMillisPS_Scavenge >= 0 +GcCountPS_MarkSweep >= 0 +GcTimeMillisPS_MarkSweep >= 0 +GcCount >= 0 +GcTimeMillis >= 0 +ThreadsNew >= 0 +ThreadsRunnable >= 0 +ThreadsBlocked >= 0 +ThreadsWaiting >= 0 +ThreadsTimedWaiting >= 0 +ThreadsTerminated >= 0 +LogFatal >= 0 +LogError >= 0 +LogWarn >= 0 +LogInfo >= 0 http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml 
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml b/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
index d7acc98..2f076c2 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
@@ -25,11 +25,11 @@
   <!-- Nodes configuration -->
   <property>
     <name>yarn.sls.nm.memory.mb</name>
-    <value>10240</value>
+    <value>100240</value>
   </property>
   <property>
     <name>yarn.sls.nm.vcores</name>
-    <value>10</value>
+    <value>100</value>
   </property>
   <property>
     <name>yarn.sls.nm.heartbeat.interval.ms</name>
@@ -77,5 +77,5 @@
     <name>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</name>
     <value>org.apache.hadoop.yarn.sls.scheduler.CapacitySchedulerMetrics</value>
   </property>
-
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/syn.json
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/syn.json b/hadoop-tools/hadoop-sls/src/test/resources/syn.json
new file mode 100644
index 0000000..8479d23
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/syn.json
@@ -0,0 +1,53 @@
+{
+  "description": "tiny jobs workload",
+  "num_nodes": 20,
+  "nodes_per_rack": 4,
+  "num_jobs": 10,
+  "rand_seed": 2,
+  "workloads": [
+    {
+      "workload_name": "tiny-test",
+      "workload_weight": 0.5,
+      "description": "Sort jobs",
+      "queue_name": "sls_queue_1",
+      "job_classes": [
+        {
+          "class_name": "class_1",
+          "user_name": "foobar",
+          "class_weight": 1.0,
+          "mtasks_avg": 5,
+          "mtasks_stddev": 1,
+          "rtasks_avg": 5,
+          "rtasks_stddev": 1,
+          "dur_avg": 60,
+          "dur_stddev": 5,
+          "mtime_avg": 10,
+          "mtime_stddev": 2,
+          "rtime_avg": 20,
+          "rtime_stddev": 4,
+          "map_max_memory_avg": 1024,
+          "map_max_memory_stddev": 0.001,
+          "reduce_max_memory_avg": 2048,
+          "reduce_max_memory_stddev": 0.001,
+          "map_max_vcores_avg": 1,
+          "map_max_vcores_stddev": 0.001,
+          "reduce_max_vcores_avg": 2,
+          "reduce_max_vcores_stddev": 0.001,
+          "chance_of_reservation": 0.5,
+          "deadline_factor_avg": 10.0,
+          "deadline_factor_stddev": 0.001
+        }
+      ],
+      "time_distribution": [
+        {
+          "time": 1,
+          "weight": 100
+        },
+        {
+          "time": 60,
+          "jobs": 0
+        }
+      ]
+    }
+  ]
+}
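The *_avg/*_stddev pairs in syn.json parameterize the distributions from which the synthetic load generator draws job shapes (task counts, durations, container sizes), with rand_seed fixing the random sequence. A rough, illustrative sketch of such sampling, not the generator's actual code, which may use different distributions and clamping:

  import java.util.Random;

  public class SynthSampleSketch {
    private static final Random RAND = new Random(2); // rand_seed above

    // draw a non-negative sample from N(avg, stddev^2)
    static double sample(double avg, double stddev) {
      return Math.max(0, avg + stddev * RAND.nextGaussian());
    }

    public static void main(String[] args) {
      long maps = Math.round(sample(5, 1)); // mtasks_avg, mtasks_stddev
      double durSec = sample(60, 5);        // dur_avg, dur_stddev
      System.out.println(maps + " maps, ~" + durSec + "s duration");
    }
  }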
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml b/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
index c9f714c..282aef3 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
@@ -17,7 +17,7 @@
 <configuration>
   <property>
     <name>yarn.resourcemanager.scheduler.class</name>
-    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
+    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
     <!-- <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value> -->
   </property>
@@ -69,4 +69,21 @@
     <name>yarn.scheduler.fair.assignmultiple</name>
     <value>true</value>
   </property>
+
+
+  <property>
+    <description>Enable reservation system.</description>
+    <name>yarn.resourcemanager.reservation-system.enable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>1000000</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>320</value>
+  </property>
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
index 85df2c0..7061887 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
@@ -340,6 +340,7 @@
             <exclude>src/test/resources/submit-reservation.json</exclude>
             <exclude>src/test/resources/delete-reservation.json</exclude>
             <exclude>src/test/resources/update-reservation.json</exclude>
+            <exclude>src/test/resources/invariants.txt</exclude>
           </excludes>
         </configuration>
       </plugin>
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
new file mode 100644
index 0000000..0491756
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+
+/**
+ * This exception represents the violation of an internal invariant.
+ */
+public class InvariantViolationException extends YarnRuntimeException {
+
+  public InvariantViolationException(String s) {
+    super(s);
+  }
+
+  public InvariantViolationException(String s, Exception e) {
+    super(s, e);
+  }
+}
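Because InvariantViolationException extends YarnRuntimeException it is unchecked: a checker configured to throw simply lets the violation propagate and abort the run, while callers that prefer to continue can catch it. A hypothetical, self-contained illustration (not part of the patch):

  import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.InvariantViolationException;

  public class ViolationHandlingSketch {
    public static void main(String[] args) {
      try {
        // stand-in for a failed check raised by one of the policies below
        throw new InvariantViolationException("AppsRunning >= 0 is NOT holding");
      } catch (InvariantViolationException e) {
        System.err.println("invariant broken: " + e.getMessage());
      }
    }
  }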
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
new file mode 100644
index 0000000..2c9031f
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract invariant checker that sets up common context for invariant
+ * checkers.
+ */
+public abstract class InvariantsChecker implements SchedulingEditPolicy {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(InvariantsChecker.class);
+  public static final String THROW_ON_VIOLATION =
+      "yarn.resourcemanager.invariant-checker.throw-on-violation";
+  public static final String INVARIANT_MONITOR_INTERVAL =
+      "yarn.resourcemanager.invariant-checker.monitor-interval";
+
+  private Configuration conf;
+  private RMContext context;
+  private ResourceScheduler scheduler;
+  private boolean throwOnInvariantViolation;
+  private long monitoringInterval;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+      ResourceScheduler scheduler) {
+    this.conf = config;
+    this.context = rmContext;
+    this.scheduler = scheduler;
+    this.throwOnInvariantViolation =
+        conf.getBoolean(InvariantsChecker.THROW_ON_VIOLATION, false);
+    this.monitoringInterval =
+        conf.getLong(InvariantsChecker.INVARIANT_MONITOR_INTERVAL, 1000L);
+
+    LOG.info("Invariant checker " + this.getPolicyName()
+        + " enabled. Monitoring every " + monitoringInterval
+        + "ms, throwOnViolation=" + throwOnInvariantViolation);
+  }
+
+  @Override
+  public long getMonitoringInterval() {
+    return monitoringInterval;
+  }
+
+  @Override
+  public String getPolicyName() {
+    return this.getClass().getSimpleName();
+  }
+
+  public void logOrThrow(String message) throws InvariantViolationException {
+    if (getThrowOnInvariantViolation()) {
+      throw new InvariantViolationException(message);
+    } else {
+      LOG.warn(message);
+    }
+  }
+
+  public boolean getThrowOnInvariantViolation() {
+    return throwOnInvariantViolation;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public RMContext getContext() {
+    return context;
+  }
+
+  public ResourceScheduler getScheduler() {
+    return scheduler;
+  }
+
+}
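InvariantsChecker handles configuration, monitoring interval, and reporting, leaving only editSchedule() to subclasses. An illustrative minimal subclass (not part of the patch) showing the extension point and the logOrThrow() contract:

  package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;

  public class AlwaysHealthyChecker extends InvariantsChecker {
    @Override
    public void editSchedule() {
      boolean invariantHolds = true; // a real checker inspects RM state here
      if (!invariantHolds) {
        // warns, or throws InvariantViolationException if so configured
        logOrThrow("example invariant is NOT holding");
      }
    }
  }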
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
new file mode 100644
index 0000000..849cbf9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.AbstractMetric;
+import org.apache.hadoop.metrics2.MetricsRecord;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.source.JvmMetrics;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.script.Compilable;
+import javax.script.CompiledScript;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+import javax.script.SimpleBindings;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This policy checks at every invocation that a given set of invariants
+ * (specified in a file) is respected over QueueMetrics and JvmMetrics. The
+ * file may contain arbitrary (JavaScript) boolean expressions over the
+ * metrics variables.
+ *
+ * The right set of invariants depends on the deployment environment, and a
+ * large number of complex invariants can make this check expensive.
+ *
+ * The MetricsInvariantChecker can be configured to throw a RuntimeException
+ * or simply warn in the logs if an invariant is not respected.
+ */
+public class MetricsInvariantChecker extends InvariantsChecker {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(MetricsInvariantChecker.class);
+  public static final String INVARIANTS_FILE =
+      "yarn.resourcemanager.invariant-checker.file";
+
+  private MetricsSystem metricsSystem;
+  private MetricsCollectorImpl collector;
+  private SimpleBindings bindings;
+  private ScriptEngineManager manager;
+  private Compilable scriptEngine;
+  private String invariantFile;
+  private Map<String, CompiledScript> invariants;
+  private CompiledScript combinedInvariants;
+
+  // set of metrics we monitor
+  private QueueMetrics queueMetrics;
+  private JvmMetrics jvmMetrics;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+      ResourceScheduler preemptableResourceScheduler) {
+
+    super.init(config, rmContext, preemptableResourceScheduler);
+
+    this.metricsSystem = DefaultMetricsSystem.instance();
+    this.queueMetrics =
+        QueueMetrics.forQueue(metricsSystem, "root", null, false, getConf());
+    this.jvmMetrics = (JvmMetrics) metricsSystem.getSource("JvmMetrics");
+
+    // at first collect all metrics
+    collector = new MetricsCollectorImpl();
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+
+    // prepare bindings and evaluation engine
+    this.bindings = new SimpleBindings();
+    this.manager = new ScriptEngineManager();
+    this.scriptEngine = (Compilable) manager.getEngineByName("JavaScript");
+
+    // load metrics invariant from file
+    this.invariantFile = getConf().get(MetricsInvariantChecker.INVARIANTS_FILE);
+
+    this.invariants = new HashMap<>();
+
+    // preload all bindings
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    try {
+      List<String> tempInv =
+          Files.readLines(new File(invariantFile), Charsets.UTF_8);
+
+      boolean first = true;
+      // precompile individual invariants
+      for (String inv : tempInv) {
+
+        if (first) {
+          first = false;
+        } else {
+          sb.append("&&");
+        }
+
+        invariants.put(inv, scriptEngine.compile(inv));
+        sb.append(" (");
+        sb.append(inv);
+        sb.append(") ");
+      }
+
+      // create a single large combined invariant for speed of checking
+      combinedInvariants = scriptEngine.compile(sb.toString());
+
+    } catch (IOException e) {
+      throw new RuntimeException(
+          "Error loading invariant file: " + e.getMessage());
+    } catch (ScriptException e) {
+      throw new RuntimeException("Error compiling invariant " + e.getMessage());
+    }
+
+  }
+
+  @Override
+  public void editSchedule() {
+    // grab all changed metrics and update bindings
+    collector.clear();
+    queueMetrics.getMetrics(collector, false);
+    jvmMetrics.getMetrics(collector, false);
+
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    // evaluate all invariants with new bindings
+    try {
+
+      // fastpath check all invariants at once (much faster)
+      boolean allInvHold = (boolean) combinedInvariants.eval(bindings);
+
+      // if any fails, check individually to produce more insightful log
+      if (!allInvHold) {
+        for (Map.Entry<String, CompiledScript> e : invariants.entrySet()) {
+          boolean invariantsHold = (boolean) e.getValue().eval(bindings);
+          if (!invariantsHold) {
+            // filter bindings to produce minimal set
+            Map<String, Object> matchingBindings =
+                extractMatchingBindings(e.getKey(), bindings);
+            logOrThrow("Invariant \"" + e.getKey()
+                + "\" is NOT holding, with bindings: " + matchingBindings);
+          }
+        }
+      }
+    } catch (ScriptException e) {
+      logOrThrow(e.getMessage());
+    }
+  }
+
+  private static Map<String, Object> extractMatchingBindings(String inv,
+      SimpleBindings allBindings) {
+    Map<String, Object> matchingBindings = new HashMap<>();
+    for (Map.Entry<String, Object> s : allBindings.entrySet()) {
+      if (inv.contains(s.getKey())) {
+        matchingBindings.put(s.getKey(), s.getValue());
+      }
+    }
+    return matchingBindings;
+  }
+}
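Like other SchedulingEditPolicy implementations, the checker is activated through the RM's scheduling monitor. A hedged configuration sketch: the two invariant-checker keys come from this patch, the scheduler.monitor keys are standard YARN configuration, and the file path is illustrative:

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.InvariantsChecker;
  import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.MetricsInvariantChecker;

  public class InvariantCheckerConfigSketch {
    public static Configuration build() {
      Configuration conf = new Configuration();
      conf.setBoolean("yarn.resourcemanager.scheduler.monitor.enable", true);
      conf.set("yarn.resourcemanager.scheduler.monitor.policies",
          MetricsInvariantChecker.class.getCanonicalName());
      conf.set(MetricsInvariantChecker.INVARIANTS_FILE,
          "src/test/resources/ongoing-invariants.txt"); // illustrative path
      conf.setBoolean(InvariantsChecker.THROW_ON_VIOLATION, true);
      return conf;
    }
  }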
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
new file mode 100644
index 0000000..2f9f03e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.yarn.server.resourcemanager.reservation.Plan;
+import org.apache.hadoop.yarn.util.UTCClock;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collection;
+
+/**
+ * Invariant checker that checks that certain reservation invariants are
+ * respected.
+ */
+public class ReservationInvariantsChecker extends InvariantsChecker {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ReservationInvariantsChecker.class);
+
+  private UTCClock clock = new UTCClock();
+
+  @Override
+  public void editSchedule() {
+    Collection<Plan> plans =
+        getContext().getReservationSystem().getAllPlans().values();
+
+    try {
+      for (Plan plan : plans) {
+        long currReservations =
+            plan.getReservationsAtTime(clock.getTime()).size();
+        long numberReservationQueues = getContext().getScheduler()
+            .getQueueInfo(plan.getQueueName(), true, false).getChildQueues()
+            .size();
+        if (currReservations != numberReservationQueues - 1) {
+          logOrThrow("Number of reservations (" + currReservations
+              + ") does NOT match the number of reservationQueues ("
+              + (numberReservationQueues - 1) + "), but it should.");
+        }
+      }
+    } catch (IOException io) {
+      throw new InvariantViolationException("Issue during invariant check: ",
+          io);
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
new file mode 100644
index 0000000..d9931d6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Monitoring policies, used to check invariants.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
\ No newline at end of file

---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org