Repository: hive Updated Branches: refs/heads/master 9249e9984 -> 719125cf1
HIVE-15651: LLAP: llap status tool enhancements (Prasanth Jayachandran reviewed by Siddharth Seth) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/719125cf Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/719125cf Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/719125cf Branch: refs/heads/master Commit: 719125cf183381f94590b619f69723eed348f856 Parents: 9249e99 Author: Prasanth Jayachandran <prasan...@apache.org> Authored: Mon Jan 23 17:42:18 2017 -0800 Committer: Prasanth Jayachandran <prasan...@apache.org> Committed: Mon Jan 23 17:42:18 2017 -0800 ---------------------------------------------------------------------- .../llap/cli/LlapStatusOptionsProcessor.java | 53 ++++++++++------ .../hive/llap/cli/LlapStatusServiceDriver.java | 65 +++++++++++++++----- 2 files changed, 84 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/719125cf/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java index a501b6c..b4aa430 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java @@ -37,6 +37,7 @@ public class LlapStatusOptionsProcessor { private static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // 1 seconds wait until subsequent status private static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode + private static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f; enum OptionConstants { NAME("name", 'n', "LLAP cluster name", true), @@ -44,9 +45,10 @@ public class LlapStatusOptionsProcessor { "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait forever, 0=Do not wait. default=" + TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) + "s", true), OUTPUT_FILE("outputFile", 'o', "File to which output should be written (Default stdout)", true), - WATCH_UNTIL_STATUS_CHANGE("watchUntil", 'w', "Watch until LLAP application status changes to the specified " + - "desired state before printing to console. Accepted values are " + Arrays.toString(LlapStatusServiceDriver.State - .values()), true), + WATCH_MODE("watch", 'w', "Watch mode waits until all LLAP daemons are running or subset of the nodes are " + + "running (threshold can be specified via -r option) (Default wait until all nodes are running)", false), + RUNNING_NODES_THRESHOLD("runningNodesThreshold", 'r', "When watch mode is enabled (-w), wait until the " + + "specified threshold of nodes are running (Default 1.0 which means 100% nodes are running)", true), STATUS_REFRESH_INTERVAL("refreshInterval", 'i', "Amount of time in seconds to wait until subsequent status checks" + " in watch mode. Valid only for watch mode. (Default " + TimeUnit.SECONDS.convert(DEFAULT_STATUS_REFRESH_INTERVAL_MS, TimeUnit.MILLISECONDS) + "s)", true), @@ -102,25 +104,27 @@ public class LlapStatusOptionsProcessor { private final long findAppTimeoutMs; private final String outputFile; private final long refreshIntervalMs; - private final LlapStatusServiceDriver.State watchUntil; + private final boolean watchMode; private final long watchTimeout; + private final float runningNodesThreshold; - public LlapStatusOptions(String name) { - this(name, new Properties(), FIND_YARN_APP_TIMEOUT_MS, null, DEFAULT_STATUS_REFRESH_INTERVAL_MS, null, - DEFAULT_WATCH_MODE_TIMEOUT_MS); + public LlapStatusOptions(final String name) { + this(name, new Properties(), FIND_YARN_APP_TIMEOUT_MS, null, DEFAULT_STATUS_REFRESH_INTERVAL_MS, false, + DEFAULT_WATCH_MODE_TIMEOUT_MS, DEFAULT_RUNNING_NODES_THRESHOLD); } public LlapStatusOptions(String name, Properties hiveProperties, long findAppTimeoutMs, - String outputFile, long refreshIntervalMs, - final LlapStatusServiceDriver.State watchUntil, - final long watchTimeoutMs) { + String outputFile, long refreshIntervalMs, + final boolean watchMode, final long watchTimeoutMs, + final float runningNodesThreshold) { this.name = name; this.conf = hiveProperties; this.findAppTimeoutMs = findAppTimeoutMs; this.outputFile = outputFile; this.refreshIntervalMs = refreshIntervalMs; - this.watchUntil = watchUntil; + this.watchMode = watchMode; this.watchTimeout = watchTimeoutMs; + this.runningNodesThreshold = runningNodesThreshold; } public String getName() { @@ -143,13 +147,17 @@ public class LlapStatusOptionsProcessor { return refreshIntervalMs; } - public LlapStatusServiceDriver.State getWatchUntilState() { - return watchUntil; + public boolean isWatchMode() { + return watchMode; } public long getWatchTimeoutMs() { return watchTimeout; } + + public float getRunningNodesThreshold() { + return runningNodesThreshold; + } } private final Options options = new Options(); @@ -208,12 +216,7 @@ public class LlapStatusOptionsProcessor { refreshIntervalMs = TimeUnit.MILLISECONDS.convert(refreshIntervalSec, TimeUnit.SECONDS); } - LlapStatusServiceDriver.State watchUntil = null; - if (commandLine.hasOption(OptionConstants.WATCH_UNTIL_STATUS_CHANGE.getLongOpt())) { - String watchUntilStr = commandLine.getOptionValue(OptionConstants.WATCH_UNTIL_STATUS_CHANGE.getLongOpt()); - watchUntil = LlapStatusServiceDriver.State.valueOf(watchUntilStr); - } - + boolean watchMode = commandLine.hasOption(OptionConstants.WATCH_MODE.getLongOpt()) ? true : false; long watchTimeoutMs = DEFAULT_WATCH_MODE_TIMEOUT_MS; if (commandLine.hasOption(OptionConstants.WATCH_MODE_TIMEOUT.getLongOpt())) { long watchTimeoutSec = Long.parseLong(commandLine.getOptionValue(OptionConstants.WATCH_MODE_TIMEOUT.getLongOpt())); @@ -222,7 +225,17 @@ public class LlapStatusOptionsProcessor { } watchTimeoutMs = TimeUnit.MILLISECONDS.convert(watchTimeoutSec, TimeUnit.SECONDS); } - return new LlapStatusOptions(name, hiveConf, findAppTimeoutMs, outputFile, refreshIntervalMs, watchUntil, watchTimeoutMs); + + float runningNodesThreshold = DEFAULT_RUNNING_NODES_THRESHOLD; + if (commandLine.hasOption(OptionConstants.RUNNING_NODES_THRESHOLD.getLongOpt())) { + runningNodesThreshold = Float.parseFloat(commandLine.getOptionValue(OptionConstants.RUNNING_NODES_THRESHOLD + .getLongOpt())); + if (runningNodesThreshold < 0.0f || runningNodesThreshold > 1.0f) { + throw new IllegalArgumentException("Running nodes threshold value should be between 0.0 and 1.0 (inclusive)"); + } + } + return new LlapStatusOptions(name, hiveConf, findAppTimeoutMs, outputFile, refreshIntervalMs, watchMode, + watchTimeoutMs, runningNodesThreshold); } http://git-wip-us.apache.org/repos/asf/hive/blob/719125cf/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java index 97a131e..39d542b 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java @@ -24,6 +24,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.PrintWriter; +import java.text.DecimalFormat; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; @@ -945,40 +946,72 @@ public class LlapStatusServiceDriver { } final long refreshInterval = options.getRefreshIntervalMs(); - final State watchUntilState = options.getWatchUntilState(); + final boolean watchMode = options.isWatchMode(); final long watchTimeout = options.getWatchTimeoutMs(); long numAttempts = watchTimeout / refreshInterval; + State launchingState = null; State currentState = null; + boolean desiredStateAttained = false; + final float runningNodesThreshold = options.getRunningNodesThreshold(); try (OutputStream os = options.getOutputFile() == null ? System.out : new BufferedOutputStream(new FileOutputStream(options.getOutputFile())); PrintWriter pw = new PrintWriter(os)) { - LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}", + LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." + + " Watch mode: {}. Running nodes threshold: {}.", TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS), TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), - numAttempts); + numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold)); while (numAttempts > 0) { try { ret = statusServiceDriver.run(options); if (ret == ExitCode.SUCCESS.getInt()) { - if (watchUntilState != null) { + if (watchMode) { currentState = statusServiceDriver.appStatusBuilder.state; - if (!currentState.equals(watchUntilState)) { - LOG.warn("Current state: {}. Desired state: {}. {}/{} instances.", currentState, watchUntilState, + + // slider has started llap application, now if for some reason state changes to COMPLETE then fail fast + if (launchingState == null && + (currentState.equals(State.LAUNCHING) || currentState.equals(State.RUNNING_PARTIAL))) { + launchingState = currentState; + } + + if (launchingState != null && currentState.equals(State.COMPLETE)) { + LOG.warn("Application stopped while launching. COMPLETE state reached while waiting for RUNNING state." + + " Failing " + "fast.."); + break; + } + + if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) { + LOG.warn("Current state: {}. Desired state: {}. {}/{} instances.", currentState, + runningNodesThreshold == 1.0f ? State.RUNNING_ALL : State.RUNNING_PARTIAL, statusServiceDriver.appStatusBuilder.getLiveInstances(), statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } + + // we have reached RUNNING state, now check if running nodes threshold is met + final int liveInstances = statusServiceDriver.appStatusBuilder.getLiveInstances(); + final int desiredInstances = statusServiceDriver.appStatusBuilder.getDesiredInstances(); + if (liveInstances > 0 && desiredInstances > 0) { + final float ratio = (float) liveInstances / (float) desiredInstances; + if (ratio < runningNodesThreshold) { + LOG.warn("Waiting until running nodes threshold is reached. Current: {} Desired: {}." + + " {}/{} instances.", new DecimalFormat("#.###").format(ratio), + new DecimalFormat("#.###").format(runningNodesThreshold), + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); + numAttempts--; + continue; + } else { + desiredStateAttained = true; + } + } } - // desired state attained. print and break out of loop - statusServiceDriver.outputJson(pw); - os.flush(); - pw.flush(); } break; } finally { - if (watchUntilState != null) { + if (watchMode) { try { Thread.sleep(refreshInterval); } catch (InterruptedException e) { @@ -990,9 +1023,13 @@ public class LlapStatusServiceDriver { } } } - if (numAttempts == 0 && watchUntilState != null && currentState!= null && !currentState.equals(watchUntilState)) { - LOG.info("Watch timeout {}s exhausted before desired state {} is attained.", - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), watchUntilState); + // print current state before exiting + statusServiceDriver.outputJson(pw); + os.flush(); + pw.flush(); + if (numAttempts == 0 && watchMode && !desiredStateAttained) { + LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.", + TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS)); } } catch (Throwable t) { logError(t);