This is an automated email from the ASF dual-hosted git repository. sankarh pushed a commit to branch branch-3 in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push: new 7be7e176014 HIVE-27842: Backport of HIVE-20752, HIVE-20807 and HIVE-21866 to branch-3 (#4848) 7be7e176014 is described below commit 7be7e1760146d9da63a9f83b11291c7bc53edff1 Author: Sruthi Mooriyathvariam <warriersru...@gmail.com> AuthorDate: Sat Nov 25 20:35:06 2023 +0530 HIVE-27842: Backport of HIVE-20752, HIVE-20807 and HIVE-21866 to branch-3 (#4848) * HIVE-20752: In case of LLAP start failure add info how to find YARN logs (Miklos Gergely via Ashutosh Chauhan) * HIVE-20807: Refactor LlapStatusServiceDriver (Miklos Gergely via Sergey Shelukhin) * HIVE-21866: LLAP status service driver may get stuck with wrong Yarn app ID (Adam Szita, reviewed by Marta Kuczoram) --------- Co-authored-by: Miklos Gergely <mgerg...@hortonworks.com> Co-authored-by: Adam Szita <sz...@cloudera.com> --------- Signed-off-by: Sankar Hariappan <sank...@apache.org> Closes (#4848) --- bin/ext/llapstatus.sh | 4 +- .../hadoop/hive/llap/cli/LlapSliderUtils.java | 55 +- .../hive/llap/cli/LlapStatusOptionsProcessor.java | 272 -------- .../apache/hadoop/hive/llap/cli/status/AmInfo.java | 93 +++ .../hive/llap/cli/status/AppStatusBuilder.java | 231 +++++++ .../hadoop/hive/llap/cli/status/ExitCode.java | 44 ++ .../hadoop/hive/llap/cli/status/LlapInstance.java | 134 ++++ .../llap/cli/status/LlapStatusCliException.java | 40 ++ .../hive/llap/cli/status/LlapStatusHelpers.java | 449 ------------- .../cli/status/LlapStatusServiceCommandLine.java | 302 +++++++++ .../cli/{ => status}/LlapStatusServiceDriver.java | 735 +++++++++------------ .../apache/hadoop/hive/llap/cli/status/State.java | 31 + .../hadoop/hive/llap/cli/status/package-info.java | 24 + .../status/TestLlapStatusServiceCommandLine.java | 91 +++ .../hadoop/hive/llap/cli/status/package-info.java | 23 + .../src/java/org/apache/hive/http/LlapServlet.java | 11 +- 16 files changed, 1344 insertions(+), 1195 deletions(-) diff --git a/bin/ext/llapstatus.sh b/bin/ext/llapstatus.sh 
index 2d2c8f4c09c..23e6be6f10e 100644 --- a/bin/ext/llapstatus.sh +++ b/bin/ext/llapstatus.sh @@ -17,7 +17,7 @@ THISSERVICE=llapstatus export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " llapstatus () { - CLASS=org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver; + CLASS=org.apache.hadoop.hive.llap.cli.status.LlapStatusServiceDriver; if [ ! -f ${HIVE_LIB}/hive-cli-*.jar ]; then echo "Missing Hive CLI Jar" exit 3; @@ -36,7 +36,7 @@ llapstatus () { } llapstatus_help () { - CLASS=org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver; + CLASS=org.apache.hadoop.hive.llap.cli.status.LlapStatusServiceDriver; execHiveCmd $CLASS "--help" } diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapSliderUtils.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapSliderUtils.java index af47b26e566..5ec9e1d91bc 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapSliderUtils.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapSliderUtils.java @@ -24,69 +24,24 @@ import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.yarn.api.records.ApplicationId; -import org.apache.hadoop.yarn.api.records.ApplicationReport; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.service.api.records.Service; import org.apache.hadoop.yarn.service.client.ServiceClient; import org.apache.hadoop.yarn.service.utils.CoreFileSystem; -import org.apache.hadoop.yarn.util.Clock; -import org.apache.hadoop.yarn.util.SystemClock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class LlapSliderUtils { - private static final Logger LOG = LoggerFactory - .getLogger(LlapSliderUtils.class); + private static final Logger LOG = LoggerFactory.getLogger(LlapSliderUtils.class); private static final String LLAP_PACKAGE_DIR = ".yarn/package/LLAP/"; - public static ServiceClient 
createServiceClient( - Configuration conf) throws Exception { + public static ServiceClient createServiceClient(Configuration conf) throws Exception { ServiceClient serviceClient = new ServiceClient(); serviceClient.init(conf); serviceClient.start(); return serviceClient; } - public static ApplicationReport getAppReport(String appName, ServiceClient serviceClient, - long timeoutMs) throws - LlapStatusServiceDriver.LlapStatusCliException { - Clock clock = SystemClock.getInstance(); - long startTime = clock.getTime(); - long timeoutTime = timeoutMs < 0 ? Long.MAX_VALUE : (startTime + timeoutMs); - ApplicationReport appReport = null; - ApplicationId appId; - try { - appId = serviceClient.getAppId(appName); - } catch (YarnException | IOException e) { - return null; - } - - while (appReport == null) { - try { - appReport = serviceClient.getYarnClient().getApplicationReport(appId); - if (timeoutMs == 0) { - // break immediately if timeout is 0 - break; - } - // Otherwise sleep, and try again. 
- if (appReport == null) { - long remainingTime = Math.min(timeoutTime - clock.getTime(), 500l); - if (remainingTime > 0) { - Thread.sleep(remainingTime); - } else { - break; - } - } - } catch (Exception e) { // No point separating IOException vs YarnException vs others - throw new LlapStatusServiceDriver.LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.YARN_ERROR, - "Failed to get Yarn AppReport", e); - } - } - return appReport; - } - public static Service getService(Configuration conf, String name) { LOG.info("Get service details for " + name); ServiceClient sc; @@ -112,10 +67,8 @@ public class LlapSliderUtils { return service; } - public static void startCluster(Configuration conf, String name, - String packageName, Path packageDir, String queue) { - LOG.info("Starting cluster with " + name + ", " - + packageName + ", " + queue + ", " + packageDir); + public static void startCluster(Configuration conf, String name, String packageName, Path packageDir, String queue) { + LOG.info("Starting cluster with " + name + ", " + packageName + ", " + queue + ", " + packageDir); ServiceClient sc; try { sc = createServiceClient(conf); diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java deleted file mode 100644 index e88c819b2c2..00000000000 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * <p> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.llap.cli; - -import java.util.Arrays; -import java.util.Properties; -import java.util.concurrent.TimeUnit; - -import jline.TerminalFactory; -import org.apache.commons.cli.GnuParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.OptionBuilder; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; - -public class LlapStatusOptionsProcessor { - - private static final String LLAPSTATUS_CONSTANT = "llapstatus"; - - private static final long FIND_YARN_APP_TIMEOUT_MS = 20 * 1000l; // 20seconds to wait for app to be visible - - private static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // 1 seconds wait until subsequent status - private static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode - private static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f; - - // TODO: why doesn't this use one of the existing options implementations?! - enum OptionConstants { - - NAME("name", 'n', "LLAP cluster name", true), - FIND_APP_TIMEOUT("findAppTimeout", 'f', - "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait forever, 0=Do not wait. 
default=" + - TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) + "s", true), - OUTPUT_FILE("outputFile", 'o', "File to which output should be written (Default stdout)", true), - WATCH_MODE("watch", 'w', "Watch mode waits until all LLAP daemons are running or subset of the nodes are " + - "running (threshold can be specified via -r option) (Default wait until all nodes are running)", false), - // This is a negative because we want the positive to be the default when nothing is specified. - NOT_LAUNCHED("notlaunched", 'l', "In watch mode, do not assume that the application was " - + "already launched if there's doubt (e.g. if the last application instance has failed).", - false), - RUNNING_NODES_THRESHOLD("runningNodesThreshold", 'r', "When watch mode is enabled (-w), wait until the " + - "specified threshold of nodes are running (Default 1.0 which means 100% nodes are running)", true), - STATUS_REFRESH_INTERVAL("refreshInterval", 'i', "Amount of time in seconds to wait until subsequent status checks" + - " in watch mode. Valid only for watch mode. (Default " + - TimeUnit.SECONDS.convert(DEFAULT_STATUS_REFRESH_INTERVAL_MS, TimeUnit.MILLISECONDS) + "s)", true), - WATCH_MODE_TIMEOUT("watchTimeout", 't', "Exit watch mode if the desired state is not attained until the specified" + - " timeout. (Default " + TimeUnit.SECONDS.convert(DEFAULT_WATCH_MODE_TIMEOUT_MS, TimeUnit.MILLISECONDS) +"s)", true), - HIVECONF("hiveconf", null, "Use value for given property. Overridden by explicit parameters", "property=value", 2), - HELP("help", 'H', "Print help information", false); - - - private final String longOpt; - private final Character shortOpt; - private final String description; - private final String argName; - private final int numArgs; - - OptionConstants(String longOpt, char shortOpt, String description, boolean hasArgs) { - this(longOpt, shortOpt, description, longOpt, hasArgs ? 
1 : 0); - } - - OptionConstants(String longOpt, Character shortOpt, String description, String argName, int numArgs) { - this.longOpt = longOpt; - this.shortOpt = shortOpt; - this.description = description; - this.argName = argName; - this.numArgs = numArgs; - } - - public String getLongOpt() { - return longOpt; - } - - public Character getShortOpt() { - return shortOpt; - } - - public String getDescription() { - return description; - } - - public String getArgName() { - return argName; - } - - public int getNumArgs() { - return numArgs; - } - } - - - public static class LlapStatusOptions { - private final String name; - private final Properties conf; - private final long findAppTimeoutMs; - private final String outputFile; - private final long refreshIntervalMs; - private final boolean watchMode; - private final long watchTimeout; - private final float runningNodesThreshold; - private final boolean isLaunched; - - public LlapStatusOptions(final String name) { - this(name, new Properties(), FIND_YARN_APP_TIMEOUT_MS, null, DEFAULT_STATUS_REFRESH_INTERVAL_MS, false, - DEFAULT_WATCH_MODE_TIMEOUT_MS, DEFAULT_RUNNING_NODES_THRESHOLD, true); - } - - public LlapStatusOptions(String name, Properties hiveProperties, long findAppTimeoutMs, - String outputFile, long refreshIntervalMs, - final boolean watchMode, final long watchTimeoutMs, - final float runningNodesThreshold, final boolean isLaunched) { - this.name = name; - this.conf = hiveProperties; - this.findAppTimeoutMs = findAppTimeoutMs; - this.outputFile = outputFile; - this.refreshIntervalMs = refreshIntervalMs; - this.watchMode = watchMode; - this.watchTimeout = watchTimeoutMs; - this.runningNodesThreshold = runningNodesThreshold; - this.isLaunched = isLaunched; - } - - public String getName() { - return name; - } - - public Properties getConf() { - return conf; - } - - public long getFindAppTimeoutMs() { - return findAppTimeoutMs; - } - - public String getOutputFile() { - return outputFile; - } - - public long 
getRefreshIntervalMs() { - return refreshIntervalMs; - } - - public boolean isWatchMode() { - return watchMode; - } - - public boolean isLaunched() { - return isLaunched; - } - - public long getWatchTimeoutMs() { - return watchTimeout; - } - - public float getRunningNodesThreshold() { - return runningNodesThreshold; - } - } - - private final Options options = new Options(); - private org.apache.commons.cli.CommandLine commandLine; - - public LlapStatusOptionsProcessor() { - - for (OptionConstants optionConstant : OptionConstants.values()) { - - OptionBuilder optionBuilder = OptionBuilder.hasArgs(optionConstant.getNumArgs()) - .withArgName(optionConstant.getArgName()).withLongOpt(optionConstant.getLongOpt()) - .withDescription(optionConstant.getDescription()); - if (optionConstant.getShortOpt() == null) { - options.addOption(optionBuilder.create()); - } else { - options.addOption(optionBuilder.create(optionConstant.getShortOpt())); - } - } - } - - public LlapStatusOptions processOptions(String[] args) throws ParseException { - commandLine = new GnuParser().parse(options, args); - if (commandLine.hasOption(OptionConstants.HELP.getShortOpt())) { - printUsage(); - return null; - } - - String name = commandLine.getOptionValue(OptionConstants.NAME.getLongOpt()); - - long findAppTimeoutMs = FIND_YARN_APP_TIMEOUT_MS; - if (commandLine.hasOption(OptionConstants.FIND_APP_TIMEOUT.getLongOpt())) { - findAppTimeoutMs = TimeUnit.MILLISECONDS.convert(Long.parseLong( - commandLine.getOptionValue(OptionConstants.FIND_APP_TIMEOUT.getLongOpt())), - TimeUnit.SECONDS); - } - - Properties hiveConf; - if (commandLine.hasOption(OptionConstants.HIVECONF.getLongOpt())) { - hiveConf = commandLine.getOptionProperties(OptionConstants.HIVECONF.getLongOpt()); - } else { - hiveConf = new Properties(); - } - - String outputFile = null; - if (commandLine.hasOption(OptionConstants.OUTPUT_FILE.getLongOpt())) { - outputFile = commandLine.getOptionValue(OptionConstants.OUTPUT_FILE.getLongOpt()); - } - 
- long refreshIntervalMs = DEFAULT_STATUS_REFRESH_INTERVAL_MS; - if (commandLine.hasOption(OptionConstants.STATUS_REFRESH_INTERVAL.getLongOpt())) { - long refreshIntervalSec = Long.parseLong(commandLine.getOptionValue(OptionConstants.STATUS_REFRESH_INTERVAL - .getLongOpt())); - if (refreshIntervalSec <= 0) { - throw new IllegalArgumentException("Refresh interval should be >0"); - } - refreshIntervalMs = TimeUnit.MILLISECONDS.convert(refreshIntervalSec, TimeUnit.SECONDS); - } - - boolean watchMode = commandLine.hasOption(OptionConstants.WATCH_MODE.getLongOpt()); - long watchTimeoutMs = DEFAULT_WATCH_MODE_TIMEOUT_MS; - if (commandLine.hasOption(OptionConstants.WATCH_MODE_TIMEOUT.getLongOpt())) { - long watchTimeoutSec = Long.parseLong(commandLine.getOptionValue( - OptionConstants.WATCH_MODE_TIMEOUT.getLongOpt())); - if (watchTimeoutSec <= 0) { - throw new IllegalArgumentException("Watch timeout should be >0"); - } - watchTimeoutMs = TimeUnit.MILLISECONDS.convert(watchTimeoutSec, TimeUnit.SECONDS); - } - - boolean isLaunched = !commandLine.hasOption(OptionConstants.NOT_LAUNCHED.getLongOpt()); - - float runningNodesThreshold = DEFAULT_RUNNING_NODES_THRESHOLD; - if (commandLine.hasOption(OptionConstants.RUNNING_NODES_THRESHOLD.getLongOpt())) { - runningNodesThreshold = Float.parseFloat(commandLine.getOptionValue( - OptionConstants.RUNNING_NODES_THRESHOLD.getLongOpt())); - if (runningNodesThreshold < 0.0f || runningNodesThreshold > 1.0f) { - throw new IllegalArgumentException( - "Running nodes threshold value should be between 0.0 and 1.0 (inclusive)"); - } - } - return new LlapStatusOptions(name, hiveConf, findAppTimeoutMs, outputFile, refreshIntervalMs, - watchMode, watchTimeoutMs, runningNodesThreshold, isLaunched); - } - - - public static void printUsage() { - HelpFormatter hf = new HelpFormatter(); - try { - int width = hf.getWidth(); - int jlineWidth = TerminalFactory.get().getWidth(); - width = Math.min(160, Math.max(jlineWidth, width)); // Ignore potentially 
incorrect values - hf.setWidth(width); - } catch (Throwable t) { // Ignore - } - - LlapStatusOptionsProcessor optionsProcessor = new LlapStatusOptionsProcessor(); - hf.printHelp(LLAPSTATUS_CONSTANT, optionsProcessor.options); - } - -} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AmInfo.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AmInfo.java new file mode 100644 index 00000000000..ac2ff65789b --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AmInfo.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +/** + * Represents the state of the yarn application. 
+ */ +class AmInfo { + private String appName; + private String appType; + private String appId; + private String containerId; + private String hostname; + private String amWebUrl; + + AmInfo setAppName(String appName) { + this.appName = appName; + return this; + } + + AmInfo setAppType(String appType) { + this.appType = appType; + return this; + } + + AmInfo setAppId(String appId) { + this.appId = appId; + return this; + } + + AmInfo setContainerId(String containerId) { + this.containerId = containerId; + return this; + } + + AmInfo setHostname(String hostname) { + this.hostname = hostname; + return this; + } + + AmInfo setAmWebUrl(String amWebUrl) { + this.amWebUrl = amWebUrl; + return this; + } + + String getAppName() { + return appName; + } + + String getAppType() { + return appType; + } + + String getAppId() { + return appId; + } + + String getContainerId() { + return containerId; + } + + String getHostname() { + return hostname; + } + + String getAmWebUrl() { + return amWebUrl; + } + + @Override + public String toString() { + return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE); + } +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AppStatusBuilder.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AppStatusBuilder.java new file mode 100644 index 00000000000..c2ba4dba0c5 --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/AppStatusBuilder.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.codehaus.jackson.annotate.JsonIgnore; + +/** + * Represents the status of the Llap application. + */ +class AppStatusBuilder { + + private AmInfo amInfo; + private State state = State.UNKNOWN; + private String diagnostics; + private String originalConfigurationPath; + private String generatedConfigurationPath; + + private Long appStartTime; + private Long appFinishTime; + + private boolean runningThresholdAchieved = false; + + private Integer desiredInstances = null; + private Integer liveInstances = null; + private Integer launchingInstances = null; + + private final List<LlapInstance> runningInstances = new LinkedList<>(); + private final List<LlapInstance> completedInstances = new LinkedList<>(); + + private final transient Map<String, LlapInstance> containerToRunningInstanceMap = new HashMap<>(); + private final transient Map<String, LlapInstance> containerToCompletedInstanceMap = new HashMap<>(); + + void setAmInfo(AmInfo amInfo) { + this.amInfo = amInfo; + } + + AppStatusBuilder setState(State state) { + this.state = state; + return this; + } + + AppStatusBuilder setDiagnostics(String diagnostics) { + this.diagnostics = diagnostics; + return this; + } + + AppStatusBuilder setOriginalConfigurationPath(String originalConfigurationPath) { + 
this.originalConfigurationPath = originalConfigurationPath; + return this; + } + + AppStatusBuilder setGeneratedConfigurationPath(String generatedConfigurationPath) { + this.generatedConfigurationPath = generatedConfigurationPath; + return this; + } + + AppStatusBuilder setAppStartTime(long appStartTime) { + this.appStartTime = appStartTime; + return this; + } + + AppStatusBuilder setAppFinishTime(long finishTime) { + this.appFinishTime = finishTime; + return this; + } + + void setRunningThresholdAchieved(boolean runningThresholdAchieved) { + this.runningThresholdAchieved = runningThresholdAchieved; + } + + AppStatusBuilder setDesiredInstances(int desiredInstances) { + this.desiredInstances = desiredInstances; + return this; + } + + AppStatusBuilder setLiveInstances(int liveInstances) { + this.liveInstances = liveInstances; + return this; + } + + AppStatusBuilder setLaunchingInstances(int launchingInstances) { + this.launchingInstances = launchingInstances; + return this; + } + + AppStatusBuilder addNewRunningLlapInstance(LlapInstance llapInstance) { + this.runningInstances.add(llapInstance); + this.containerToRunningInstanceMap + .put(llapInstance.getContainerId(), llapInstance); + return this; + } + + LlapInstance removeAndGetRunningLlapInstanceForContainer(String containerIdString) { + return containerToRunningInstanceMap.remove(containerIdString); + } + + void clearRunningLlapInstances() { + this.runningInstances.clear(); + this.containerToRunningInstanceMap.clear(); + } + + AppStatusBuilder clearAndAddPreviouslyKnownRunningInstances(List<LlapInstance> llapInstances) { + clearRunningLlapInstances(); + for (LlapInstance llapInstance : llapInstances) { + addNewRunningLlapInstance(llapInstance); + } + return this; + } + + @JsonIgnore + List<LlapInstance> allRunningInstances() { + return this.runningInstances; + } + + AppStatusBuilder addNewCompleteLlapInstance(LlapInstance llapInstance) { + this.completedInstances.add(llapInstance); + 
this.containerToCompletedInstanceMap + .put(llapInstance.getContainerId(), llapInstance); + return this; + } + + LlapInstance removeAndGetCompletedLlapInstanceForContainer(String containerIdString) { + return containerToCompletedInstanceMap.remove(containerIdString); + } + + void clearCompletedLlapInstances() { + this.completedInstances.clear(); + this.containerToCompletedInstanceMap.clear(); + } + + AppStatusBuilder clearAndAddPreviouslyKnownCompletedInstances(List<LlapInstance> llapInstances) { + clearCompletedLlapInstances(); + for (LlapInstance llapInstance : llapInstances) { + addNewCompleteLlapInstance(llapInstance); + } + return this; + } + + @JsonIgnore + List<LlapInstance> allCompletedInstances() { + return this.completedInstances; + } + + AmInfo getAmInfo() { + return amInfo; + } + + State getState() { + return state; + } + + String getDiagnostics() { + return diagnostics; + } + + String getOriginalConfigurationPath() { + return originalConfigurationPath; + } + + String getGeneratedConfigurationPath() { + return generatedConfigurationPath; + } + + Long getAppStartTime() { + return appStartTime; + } + + Long getAppFinishTime() { + return appFinishTime; + } + + boolean isRunningThresholdAchieved() { + return runningThresholdAchieved; + } + + Integer getDesiredInstances() { + return desiredInstances; + } + + Integer getLiveInstances() { + return liveInstances; + } + + Integer getLaunchingInstances() { + return launchingInstances; + } + + List<LlapInstance> getRunningInstances() { + return runningInstances; + } + + List<LlapInstance> getCompletedInstances() { + return completedInstances; + } + + @JsonIgnore + AmInfo maybeCreateAndGetAmInfo() { + if (amInfo == null) { + amInfo = new AmInfo(); + } + return amInfo; + } + + @Override + public String toString() { + return ToStringBuilder.reflectionToString(this, ToStringStyle.MULTI_LINE_STYLE); + } +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/ExitCode.java 
b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/ExitCode.java new file mode 100644 index 00000000000..16c71d27ff3 --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/ExitCode.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +/** + * Enumeration of the potential outcomes of the Llap state checking. + */ +public enum ExitCode { + SUCCESS(0), + INCORRECT_USAGE(10), + YARN_ERROR(20), + SERVICE_CLIENT_ERROR_CREATE_FAILED(30), + SERVICE_CLIENT_ERROR_OTHER(31), + LLAP_REGISTRY_ERROR(40), + LLAP_JSON_GENERATION_ERROR(50), + // Error in the script itself - likely caused by an incompatible change, or new functionality / states added. 
+ INTERNAL_ERROR(100); + + private final int code; + + ExitCode(int code) { + this.code = code; + } + + public int getCode() { + return code; + } +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapInstance.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapInstance.java new file mode 100644 index 00000000000..5b979fd59b6 --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapInstance.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +/** + * Representing the state of an Llap instance monitored. + */ +class LlapInstance { + private final String hostname; + private final String containerId; + private String logUrl; + + // Only for live instances. 
+ private String statusUrl; + private String webUrl; + private Integer rpcPort; + private Integer mgmtPort; + private Integer shufflePort; + + // For completed instances + private String diagnostics; + private int yarnContainerExitStatus; + + // TODO HIVE-13454 Add additional information such as #executors, container size, etc + + LlapInstance(String hostname, String containerId) { + this.hostname = hostname; + this.containerId = containerId; + } + + LlapInstance setLogUrl(String logUrl) { + this.logUrl = logUrl; + return this; + } + + LlapInstance setStatusUrl(String statusUrl) { + this.statusUrl = statusUrl; + return this; + } + + LlapInstance setWebUrl(String webUrl) { + this.webUrl = webUrl; + return this; + } + + LlapInstance setRpcPort(int rpcPort) { + this.rpcPort = rpcPort; + return this; + } + + LlapInstance setMgmtPort(int mgmtPort) { + this.mgmtPort = mgmtPort; + return this; + } + + LlapInstance setShufflePort(int shufflePort) { + this.shufflePort = shufflePort; + return this; + } + + LlapInstance setDiagnostics(String diagnostics) { + this.diagnostics = diagnostics; + return this; + } + + LlapInstance setYarnContainerExitStatus(int yarnContainerExitStatus) { + this.yarnContainerExitStatus = yarnContainerExitStatus; + return this; + } + + String getHostname() { + return hostname; + } + + String getContainerId() { + return containerId; + } + + String getLogUrl() { + return logUrl; + } + + String getStatusUrl() { + return statusUrl; + } + + String getWebUrl() { + return webUrl; + } + + Integer getRpcPort() { + return rpcPort; + } + + Integer getMgmtPort() { + return mgmtPort; + } + + Integer getShufflePort() { + return shufflePort; + } + + String getDiagnostics() { + return diagnostics; + } + + int getYarnContainerExitStatus() { + return yarnContainerExitStatus; + } + + @Override + public String toString() { + return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE); + } +} diff --git 
a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusCliException.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusCliException.java new file mode 100644 index 00000000000..7ebf404818f --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusCliException.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +/** + * Representing the exceptions that may occur during the Llap state checking. 
+ */ +class LlapStatusCliException extends Exception { + private final ExitCode exitCode; + + LlapStatusCliException(ExitCode exitCode, String message) { + super(exitCode.getCode() +": " + message); + this.exitCode = exitCode; + } + + LlapStatusCliException(ExitCode exitCode, String message, Throwable cause) { + super(exitCode.getCode() +": " + message, cause); + this.exitCode = exitCode; + } + + ExitCode getExitCode() { + return exitCode; + } +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusHelpers.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusHelpers.java deleted file mode 100644 index 5c8aeb07249..00000000000 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusHelpers.java +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.llap.cli.status; - -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver; -import org.codehaus.jackson.annotate.JsonIgnore; - -public class LlapStatusHelpers { - public enum State { - APP_NOT_FOUND, LAUNCHING, - RUNNING_PARTIAL, - RUNNING_ALL, COMPLETE, UNKNOWN - } - - public static class AmInfo { - private String appName; - private String appType; - private String appId; - private String containerId; - private String hostname; - private String amWebUrl; - - public AmInfo setAppName(String appName) { - this.appName = appName; - return this; - } - - public AmInfo setAppType(String appType) { - this.appType = appType; - return this; - } - - public AmInfo setAppId(String appId) { - this.appId = appId; - return this; - } - - public AmInfo setContainerId(String containerId) { - this.containerId = containerId; - return this; - } - - public AmInfo setHostname(String hostname) { - this.hostname = hostname; - return this; - } - - public AmInfo setAmWebUrl(String amWebUrl) { - this.amWebUrl = amWebUrl; - return this; - } - - public String getAppName() { - return appName; - } - - public String getAppType() { - return appType; - } - - public String getAppId() { - return appId; - } - - public String getContainerId() { - return containerId; - } - - public String getHostname() { - return hostname; - } - - public String getAmWebUrl() { - return amWebUrl; - } - - @Override - public String toString() { - return "AmInfo{" + - "appName='" + appName + '\'' + - ", appType='" + appType + '\'' + - ", appId='" + appId + '\'' + - ", containerId='" + containerId + '\'' + - ", hostname='" + hostname + '\'' + - ", amWebUrl='" + amWebUrl + '\'' + - '}'; - } - } - - public static class LlapInstance { - private final String hostname; - private final String containerId; - private String logUrl; - - // Only for live instances. 
- private String statusUrl; - private String webUrl; - private Integer rpcPort; - private Integer mgmtPort; - private Integer shufflePort; - - // For completed instances - private String diagnostics; - private int yarnContainerExitStatus; - - // TODO HIVE-13454 Add additional information such as #executors, container size, etc - - public LlapInstance(String hostname, String containerId) { - this.hostname = hostname; - this.containerId = containerId; - } - - public LlapInstance setLogUrl(String logUrl) { - this.logUrl = logUrl; - return this; - } - - public LlapInstance setWebUrl(String webUrl) { - this.webUrl = webUrl; - return this; - } - - public LlapInstance setStatusUrl(String statusUrl) { - this.statusUrl = statusUrl; - return this; - } - - public LlapInstance setRpcPort(int rpcPort) { - this.rpcPort = rpcPort; - return this; - } - - public LlapInstance setMgmtPort(int mgmtPort) { - this.mgmtPort = mgmtPort; - return this; - } - - public LlapInstance setShufflePort(int shufflePort) { - this.shufflePort = shufflePort; - return this; - } - - public LlapInstance setDiagnostics(String diagnostics) { - this.diagnostics = diagnostics; - return this; - } - - public LlapInstance setYarnContainerExitStatus(int yarnContainerExitStatus) { - this.yarnContainerExitStatus = yarnContainerExitStatus; - return this; - } - - public String getHostname() { - return hostname; - } - - public String getLogUrl() { - return logUrl; - } - - public String getStatusUrl() { - return statusUrl; - } - - public String getContainerId() { - return containerId; - } - - public String getWebUrl() { - return webUrl; - } - - public Integer getRpcPort() { - return rpcPort; - } - - public Integer getMgmtPort() { - return mgmtPort; - } - - public Integer getShufflePort() { - return shufflePort; - } - - public String getDiagnostics() { - return diagnostics; - } - - public int getYarnContainerExitStatus() { - return yarnContainerExitStatus; - } - - @Override - public String toString() { - return 
"LlapInstance{" + - "hostname='" + hostname + '\'' + - "logUrl=" + logUrl + '\'' + - ", containerId='" + containerId + '\'' + - ", statusUrl='" + statusUrl + '\'' + - ", webUrl='" + webUrl + '\'' + - ", rpcPort=" + rpcPort + - ", mgmtPort=" + mgmtPort + - ", shufflePort=" + shufflePort + - ", diagnostics=" + diagnostics + - ", yarnContainerExitStatus=" + yarnContainerExitStatus + - '}'; - } - } - - public static final class AppStatusBuilder { - - private AmInfo amInfo; - private State state = State.UNKNOWN; - private String diagnostics; - private String originalConfigurationPath; - private String generatedConfigurationPath; - - private Integer desiredInstances = null; - private Integer liveInstances = null; - private Integer launchingInstances = null; - - - private Long appStartTime; - private Long appFinishTime; - - private boolean runningThresholdAchieved = false; - - private final List<LlapInstance> runningInstances = new LinkedList<>(); - private final List<LlapInstance> completedInstances = new LinkedList<>(); - - private transient final Map<String, LlapInstance> - containerToRunningInstanceMap = new HashMap<>(); - private transient final Map<String, LlapInstance> - containerToCompletedInstanceMap = new HashMap<>(); - - public void setAmInfo(AmInfo amInfo) { - this.amInfo = amInfo; - } - - public AppStatusBuilder setState( - State state) { - this.state = state; - return this; - } - - public AppStatusBuilder setDiagnostics(String diagnostics) { - this.diagnostics = diagnostics; - return this; - } - - public AppStatusBuilder setOriginalConfigurationPath(String originalConfigurationPath) { - this.originalConfigurationPath = originalConfigurationPath; - return this; - } - - public AppStatusBuilder setGeneratedConfigurationPath(String generatedConfigurationPath) { - this.generatedConfigurationPath = generatedConfigurationPath; - return this; - } - - public AppStatusBuilder setAppStartTime(long appStartTime) { - this.appStartTime = appStartTime; - return this; - } - 
- public AppStatusBuilder setAppFinishTime(long finishTime) { - this.appFinishTime = finishTime; - return this; - } - - public void setRunningThresholdAchieved(boolean runningThresholdAchieved) { - this.runningThresholdAchieved = runningThresholdAchieved; - } - - public AppStatusBuilder setDesiredInstances(int desiredInstances) { - this.desiredInstances = desiredInstances; - return this; - } - - public AppStatusBuilder setLiveInstances(int liveInstances) { - this.liveInstances = liveInstances; - return this; - } - - public AppStatusBuilder setLaunchingInstances(int launchingInstances) { - this.launchingInstances = launchingInstances; - return this; - } - - public AppStatusBuilder addNewRunningLlapInstance(LlapInstance llapInstance) { - this.runningInstances.add(llapInstance); - this.containerToRunningInstanceMap - .put(llapInstance.getContainerId(), llapInstance); - return this; - } - - public LlapInstance removeAndGetRunningLlapInstanceForContainer(String containerIdString) { - return containerToRunningInstanceMap.remove(containerIdString); - } - - public void clearRunningLlapInstances() { - this.runningInstances.clear(); - this.containerToRunningInstanceMap.clear(); - } - - public AppStatusBuilder clearAndAddPreviouslyKnownRunningInstances(List<LlapInstance> llapInstances) { - clearRunningLlapInstances(); - for (LlapInstance llapInstance : llapInstances) { - addNewRunningLlapInstance(llapInstance); - } - return this; - } - - @JsonIgnore - public List<LlapInstance> allRunningInstances() { - return this.runningInstances; - } - - public AppStatusBuilder addNewCompleteLlapInstance(LlapInstance llapInstance) { - this.completedInstances.add(llapInstance); - this.containerToCompletedInstanceMap - .put(llapInstance.getContainerId(), llapInstance); - return this; - } - - public LlapInstance removeAndGetCompletedLlapInstanceForContainer(String containerIdString) { - return containerToCompletedInstanceMap.remove(containerIdString); - } - - public void 
clearCompletedLlapInstances() { - this.completedInstances.clear(); - this.containerToCompletedInstanceMap.clear(); - } - - public AppStatusBuilder clearAndAddPreviouslyKnownCompletedInstances(List<LlapInstance> llapInstances) { - clearCompletedLlapInstances(); - for (LlapInstance llapInstance : llapInstances) { - addNewCompleteLlapInstance(llapInstance); - } - return this; - } - - @JsonIgnore - public List<LlapInstance> allCompletedInstances() { - return this.completedInstances; - } - - public AmInfo getAmInfo() { - return amInfo; - } - - public State getState() { - return state; - } - - public String getDiagnostics() { - return diagnostics; - } - - public String getOriginalConfigurationPath() { - return originalConfigurationPath; - } - - public String getGeneratedConfigurationPath() { - return generatedConfigurationPath; - } - - public Integer getDesiredInstances() { - return desiredInstances; - } - - public Integer getLiveInstances() { - return liveInstances; - } - - public Integer getLaunchingInstances() { - return launchingInstances; - } - - public Long getAppStartTime() { - return appStartTime; - } - - public Long getAppFinishTime() { - return appFinishTime; - } - - public boolean isRunningThresholdAchieved() { - return runningThresholdAchieved; - } - - public List<LlapInstance> getRunningInstances() { - return runningInstances; - } - - public List<LlapInstance> getCompletedInstances() { - return completedInstances; - } - - @JsonIgnore - public AmInfo maybeCreateAndGetAmInfo() { - if (amInfo == null) { - amInfo = new AmInfo(); - } - return amInfo; - } - - @Override - public String toString() { - return "AppStatusBuilder{" + - "amInfo=" + amInfo + - ", state=" + state + - ", diagnostics=" + diagnostics + - ", originalConfigurationPath='" + originalConfigurationPath + '\'' + - ", generatedConfigurationPath='" + generatedConfigurationPath + '\'' + - ", desiredInstances=" + desiredInstances + - ", liveInstances=" + liveInstances + - ", launchingInstances=" + 
launchingInstances + - ", appStartTime=" + appStartTime + - ", appFinishTime=" + appFinishTime + - ", runningThresholdAchieved=" + runningThresholdAchieved + - ", runningInstances=" + runningInstances + - ", completedInstances=" + completedInstances + - ", containerToRunningInstanceMap=" + containerToRunningInstanceMap + - '}'; - } - } -} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceCommandLine.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceCommandLine.java new file mode 100644 index 00000000000..bee50791a3a --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceCommandLine.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.llap.cli.status; + +import java.util.Arrays; +import java.util.Properties; + +import jline.TerminalFactory; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; + +/** + * Parses, verifies, prints and provides the command line arguments of the Llap Status program. + */ +public class LlapStatusServiceCommandLine { + private static final Logger LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); + + @VisibleForTesting + static final long DEFAULT_FIND_YARN_APP_TIMEOUT_MS = 20 * 1000L; + @VisibleForTesting + static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000L; + @VisibleForTesting + static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000L; + @VisibleForTesting + static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f; + + @SuppressWarnings("static-access") + private static final Option NAME = OptionBuilder + .withLongOpt("name") + .withDescription("LLAP cluster name") + .withArgName("name") + .hasArg() + .create('n'); + + @SuppressWarnings("static-access") + private static final Option FIND_APP_TIMEOUT = OptionBuilder + .withLongOpt("findAppTimeout") + .withDescription("Amount of time(s) that the tool will sleep to wait for the YARN application to start." + + "negative values=wait forever, 0=Do not wait. 
default=" + (DEFAULT_FIND_YARN_APP_TIMEOUT_MS / 1000) + "s") + .withArgName("findAppTimeout") + .hasArg() + .create('f'); + + @SuppressWarnings("static-access") + private static final Option OUTPUT_FILE = OptionBuilder + .withLongOpt("outputFile") + .withDescription("File to which output should be written (Default stdout)") + .withArgName("outputFile") + .hasArg() + .create('o'); + + @SuppressWarnings("static-access") + private static final Option WATCH_MODE = OptionBuilder + .withLongOpt("watch") + .withDescription("Watch mode waits until all LLAP daemons are running or subset of the nodes are running " + + "(threshold can be specified via -r option) (Default wait until all nodes are running)") + .withArgName("watch") + .create('w'); + + @SuppressWarnings("static-access") + private static final Option NOT_LAUNCHED = OptionBuilder + .withLongOpt("notLaunched") + .withDescription("In watch mode, do not assume that the application was already launched if there's doubt " + + "(e.g. if the last application instance has failed).") + .withArgName("notLaunched") + .create('l'); + + @SuppressWarnings("static-access") + private static final Option RUNNING_NODES_THRESHOLD = OptionBuilder + .withLongOpt("runningNodesThreshold") + .withDescription("When watch mode is enabled (-w), wait until the specified threshold of nodes are running " + + "(Default 1.0 which means 100% nodes are running)") + .withArgName("runningNodesThreshold") + .hasArg() + .create('r'); + + @SuppressWarnings("static-access") + private static final Option REFRESH_INTERVAL = OptionBuilder + .withLongOpt("refreshInterval") + .withDescription("Amount of time in seconds to wait until subsequent status checks in watch mode. Valid only " + + "for watch mode. 
(Default " + (DEFAULT_STATUS_REFRESH_INTERVAL_MS / 1000) + "s)") + .withArgName("refreshInterval") + .hasArg() + .create('i'); + + @SuppressWarnings("static-access") + private static final Option WATCH_TIMEOUT = OptionBuilder + .withLongOpt("watchTimeout") + .withDescription("Exit watch mode if the desired state is not attained until the specified timeout. (Default " + + (DEFAULT_WATCH_MODE_TIMEOUT_MS / 1000) + "s)") + .withArgName("watchTimeout") + .hasArg() + .create('t'); + + @SuppressWarnings("static-access") + private static final Option HIVECONF = OptionBuilder + .withLongOpt("hiveconf") + .withDescription("Use value for given property. Overridden by explicit parameters") + .withArgName("property=value") + .hasArgs(2) + .create(); + + @SuppressWarnings("static-access") + private static final Option HELP = OptionBuilder + .withLongOpt("help") + .withDescription("Print help information") + .withArgName("help") + .create('h'); + + private static final Options OPTIONS = new Options(); + static { + OPTIONS.addOption(NAME); + OPTIONS.addOption(FIND_APP_TIMEOUT); + OPTIONS.addOption(OUTPUT_FILE); + OPTIONS.addOption(WATCH_MODE); + OPTIONS.addOption(NOT_LAUNCHED); + OPTIONS.addOption(RUNNING_NODES_THRESHOLD); + OPTIONS.addOption(REFRESH_INTERVAL); + OPTIONS.addOption(WATCH_TIMEOUT); + OPTIONS.addOption(HIVECONF); + OPTIONS.addOption(HELP); + } + + private String name; + private long findAppTimeoutMs; + private String outputFile; + private boolean watchMode; + private boolean isLaunched; + private float runningNodesThreshold; + private long refreshIntervalMs; + private long watchTimeoutMs; + private Properties hiveConf; + private boolean isHelp; + + public static LlapStatusServiceCommandLine parseArguments(String[] args) { + LlapStatusServiceCommandLine cl = null; + try { + cl = new LlapStatusServiceCommandLine(args); + } catch (Exception e) { + LOGGER.error("Parsing the command line arguments failed", e); + printUsage(); + 
System.exit(ExitCode.INCORRECT_USAGE.getCode()); + } + + if (cl.isHelp()) { + printUsage(); + System.exit(0); + } + + return cl; + } + + LlapStatusServiceCommandLine(String[] args) throws ParseException { + LOGGER.info("LLAP status invoked with arguments = {}", Arrays.toString(args)); + parseCommandLine(args); + printArguments(); + } + + private void parseCommandLine(String[] args) throws ParseException { + CommandLine cl = new GnuParser().parse(OPTIONS, args); + + name = cl.getOptionValue(NAME.getLongOpt()); + + findAppTimeoutMs = DEFAULT_FIND_YARN_APP_TIMEOUT_MS; + if (cl.hasOption(FIND_APP_TIMEOUT.getLongOpt())) { + findAppTimeoutMs = Long.parseLong(cl.getOptionValue(FIND_APP_TIMEOUT.getLongOpt())) * 1000; + } + + if (cl.hasOption(OUTPUT_FILE.getLongOpt())) { + outputFile = cl.getOptionValue(OUTPUT_FILE.getLongOpt()); + } + + watchMode = cl.hasOption(WATCH_MODE.getLongOpt()); + + isLaunched = !cl.hasOption(NOT_LAUNCHED.getLongOpt()); + + runningNodesThreshold = DEFAULT_RUNNING_NODES_THRESHOLD; + if (cl.hasOption(RUNNING_NODES_THRESHOLD.getLongOpt())) { + runningNodesThreshold = Float.parseFloat(cl.getOptionValue(RUNNING_NODES_THRESHOLD.getLongOpt())); + if (runningNodesThreshold < 0.0f || runningNodesThreshold > 1.0f) { + throw new IllegalArgumentException("Running nodes threshold value should be between 0.0 and 1.0 (inclusive)"); + } + } + + refreshIntervalMs = DEFAULT_STATUS_REFRESH_INTERVAL_MS; + if (cl.hasOption(REFRESH_INTERVAL.getLongOpt())) { + long refreshIntervalSec = Long.parseLong(cl.getOptionValue(REFRESH_INTERVAL.getLongOpt())); + if (refreshIntervalSec <= 0) { + throw new IllegalArgumentException("Refresh interval should be >0"); + } + refreshIntervalMs = refreshIntervalSec * 1000; + } + + watchTimeoutMs = DEFAULT_WATCH_MODE_TIMEOUT_MS; + if (cl.hasOption(WATCH_TIMEOUT.getLongOpt())) { + long watchTimeoutSec = Long.parseLong(cl.getOptionValue(WATCH_TIMEOUT.getLongOpt())); + if (watchTimeoutSec <= 0) { + throw new IllegalArgumentException("Watch 
timeout should be >0"); + } + watchTimeoutMs = watchTimeoutSec * 1000; + } + + hiveConf = new Properties(); + if (cl.hasOption(HIVECONF.getLongOpt())) { + hiveConf = cl.getOptionProperties(HIVECONF.getLongOpt()); + } + + isHelp = cl.hasOption(HELP.getOpt()); + } + + private static void printUsage() { + HelpFormatter hf = new HelpFormatter(); + try { + int width = hf.getWidth(); + int jlineWidth = TerminalFactory.get().getWidth(); + width = Math.min(160, Math.max(jlineWidth, width)); + hf.setWidth(width); + } catch (Throwable t) { // Ignore + } + + hf.printHelp("llapstatus", OPTIONS); + } + + private void printArguments() { + LOGGER.info("LLAP status running with the following parsed arguments: \n" + + "\tname : " + name + "\n" + + "\tfindAppTimeoutMs : " + findAppTimeoutMs + "\n" + + "\toutputFile : " + outputFile + "\n" + + "\twatchMode : " + watchMode + "\n" + + "\tisLaunched : " + isLaunched + "\n" + + "\trunningNodesThreshold: " + runningNodesThreshold + "\n" + + "\trefreshIntervalMs : " + refreshIntervalMs + "\n" + + "\twatchTimeoutMs : " + watchTimeoutMs + "\n" + + "\thiveConf : " + hiveConf); + } + + String getName() { + return name; + } + + long getFindAppTimeoutMs() { + return findAppTimeoutMs; + } + + String getOutputFile() { + return outputFile; + } + + boolean isWatchMode() { + return watchMode; + } + + boolean isLaunched() { + return isLaunched; + } + + float getRunningNodesThreshold() { + return runningNodesThreshold; + } + + long getRefreshIntervalMs() { + return refreshIntervalMs; + } + + long getWatchTimeoutMs() { + return watchTimeoutMs; + } + + Properties getHiveConf() { + return hiveConf; + } + + boolean isHelp() { + return isHelp; + } +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceDriver.java similarity index 59% rename from llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java rename to 
llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceDriver.java index c1bae653479..e3302b71f8e 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/LlapStatusServiceDriver.java @@ -16,40 +16,36 @@ * limitations under the License. */ -package org.apache.hadoop.hive.llap.cli; +package org.apache.hadoop.hive.llap.cli.status; -import com.google.common.annotations.VisibleForTesting; -import java.io.BufferedOutputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.io.PrintWriter; +import java.io.Writer; +import java.nio.charset.Charset; import java.text.DecimalFormat; -import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; -import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor.LlapStatusOptions; -import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers; -import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers.AppStatusBuilder; -import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers.LlapInstance; -import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers.State; +import org.apache.hadoop.hive.llap.cli.LlapSliderUtils; import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration; import org.apache.hadoop.hive.llap.registry.LlapServiceInstance; import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService; import org.apache.hadoop.hive.ql.session.SessionState; +import 
org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationReport; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.service.api.records.Container; import org.apache.hadoop.yarn.service.api.records.Service; @@ -57,20 +53,25 @@ import org.apache.hadoop.yarn.service.api.records.ServiceState; import org.apache.hadoop.yarn.service.client.ServiceClient; import org.apache.hadoop.yarn.util.Clock; import org.apache.hadoop.yarn.util.SystemClock; +import org.codehaus.jackson.annotate.JsonAutoDetect.Visibility; +import org.codehaus.jackson.annotate.JsonMethod; import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.SerializationConfig; import org.codehaus.jackson.map.annotate.JsonSerialize; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * Checks the status of the Llap. + */ public class LlapStatusServiceDriver { + private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); + private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); private static final EnumSet<State> NO_YARN_SERVICE_INFO_STATES = EnumSet.of( State.APP_NOT_FOUND, State.COMPLETE, State.LAUNCHING); private static final EnumSet<State> LAUNCHING_STATES = EnumSet.of( State.LAUNCHING, State.RUNNING_PARTIAL, State.RUNNING_ALL); - private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); - private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); // Defining a bunch of configs here instead of in HiveConf. 
These are experimental, and mainly // for use when retry handling is fixed in Yarn/Hadoop @@ -79,50 +80,51 @@ public class LlapStatusServiceDriver { // The following two keys should ideally be used to control RM connect timeouts. However, // they don't seem to work. The IPC timeout needs to be set instead. - @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = - CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; - private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000l; - @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = - CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; - private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000l; + private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; + private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000L; + private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; + private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000L; // As of Hadoop 2.7 - this is what controls the RM timeout. - @InterfaceAudience.Private - private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = - CONF_PREFIX + "ipc.client.max-retries"; + private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = CONF_PREFIX + "ipc.client.max-retries"; private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2; - @InterfaceAudience.Private private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS = CONF_PREFIX + "ipc.client.connect.retry-interval-ms"; - private static final long CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT = 1500l; + private static final long CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT = 1500L; // As of Hadoop 2.8 - this timeout spec behaves in a strnage manner. "2000,1" means 2000s with 1 retry. 
// However it does this - but does it thrice. Essentially - #retries+2 is the number of times the entire config // is retried. "2000,1" means 3 retries - each with 1 retry with a random 2000ms sleep. - @InterfaceAudience.Private private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC = CONF_PREFIX + "timeline.service.fs-store.retry.policy.spec"; private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT = "2000, 1"; - private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = - CONF_PREFIX + "zk-registry.timeout-ms"; - private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20000l; + private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = CONF_PREFIX + "zk-registry.timeout-ms"; + private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20000L; private static final long LOG_SUMMARY_INTERVAL = 15000L; // Log summary every ~15 seconds. - private static final String LLAP_KEY = "llap"; + private final Configuration conf; - private final Clock clock = new SystemClock(); private String appName = null; + private String applicationId = null; private ServiceClient serviceClient = null; private Configuration llapRegistryConf = null; private LlapRegistryService llapRegistry = null; - @VisibleForTesting - AppStatusBuilder appStatusBuilder; + private AppStatusBuilder appStatusBuilder; + + private static LlapStatusServiceDriver createServiceDriver() { + LlapStatusServiceDriver statusServiceDriver = null; + try { + statusServiceDriver = new LlapStatusServiceDriver(); + } catch (Throwable t) { + logError(t); + System.exit(ExitCode.INTERNAL_ERROR.getCode()); + } + return statusServiceDriver; + } public LlapStatusServiceDriver() { SessionState ss = SessionState.get(); @@ -141,59 +143,35 @@ public class LlapStatusServiceDriver { // Once we move to a Hadoop-2.8 dependency, the following paramteer can be used. 
// conf.set(YarnConfiguration.TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC); conf.set("yarn.timeline-service.entity-group-fs-store.retry-policy-spec", - conf.get(CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC, - CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT)); + conf.get(CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC, + CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, - conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, - CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); + conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, - conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT)); + conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT)); conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, - conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, - CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); + conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY, - conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, - CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); + conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); - HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (conf - .getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + - "ms")); + HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, ( + conf.getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + "ms")); llapRegistryConf = new Configuration(conf); } - 
/** - * Parse command line options. - * - * @param args - * @return command line options. - */ - public LlapStatusOptions parseOptions(String[] args) throws - LlapStatusCliException { - - LlapStatusOptionsProcessor optionsProcessor = new LlapStatusOptionsProcessor(); - LlapStatusOptions options; - try { - options = optionsProcessor.processOptions(args); - return options; - } catch (Exception e) { - LOG.info("Failed to parse arguments", e); - throw new LlapStatusCliException(ExitCode.INCORRECT_USAGE, "Incorrect usage"); - } - } - - public int run(LlapStatusOptions options, long watchTimeoutMs) { + public ExitCode run(LlapStatusServiceCommandLine cl, long watchTimeoutMs) { appStatusBuilder = new AppStatusBuilder(); try { if (appName == null) { // user provided configs - for (Map.Entry<Object, Object> props : options.getConf().entrySet()) { + for (Map.Entry<Object, Object> props : cl.getHiveConf().entrySet()) { conf.set((String) props.getKey(), (String) props.getValue()); } - appName = options.getName(); + appName = cl.getName(); if (StringUtils.isEmpty(appName)) { appName = HiveConf.getVar(conf, HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS); if (appName.startsWith("@") && appName.length() > 1) { @@ -205,15 +183,11 @@ public class LlapStatusServiceDriver { } } if (StringUtils.isEmpty(appName)) { - String message = - "Invalid app name. This must be setup via config or passed in as a parameter." + - " This tool works with clusters deployed by YARN Service"; - LOG.info(message); - return ExitCode.INCORRECT_USAGE.getInt(); - } - if (LOG.isDebugEnabled()) { - LOG.debug("Using appName: {}", appName); + LOG.error("Invalid app name. This must be setup via config or passed in as a parameter." 
+ + " This tool works with clusters deployed by YARN Service"); + return ExitCode.INCORRECT_USAGE; } + LOG.debug("Using appName: {}", appName); llapRegistryConf.set(HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname, "@" + appName); } @@ -224,20 +198,18 @@ public class LlapStatusServiceDriver { } } catch (Exception e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, - "Failed to create service client", e); + ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, "Failed to create service client", e); logError(le); - return le.getExitCode().getInt(); + return le.getExitCode(); } // Get the App report from YARN ApplicationReport appReport; try { - appReport = LlapSliderUtils.getAppReport(appName, serviceClient, - options.getFindAppTimeoutMs()); + appReport = getAppReport(appName, cl.getFindAppTimeoutMs()); } catch (LlapStatusCliException e) { logError(e); - return e.getExitCode().getInt(); + return e.getExitCode(); } // Process the report @@ -246,139 +218,165 @@ public class LlapStatusServiceDriver { ret = processAppReport(appReport, appStatusBuilder); } catch (LlapStatusCliException e) { logError(e); - return e.getExitCode().getInt(); + return e.getExitCode(); } if (ret != ExitCode.SUCCESS) { - return ret.getInt(); + return ret; } else if (NO_YARN_SERVICE_INFO_STATES.contains(appStatusBuilder.getState())) { - return ExitCode.SUCCESS.getInt(); + return ExitCode.SUCCESS; } else { // Get information from YARN Service try { - ret = populateAppStatusFromServiceStatus(appName, serviceClient, - appStatusBuilder); + ret = populateAppStatusFromServiceStatus(appName, serviceClient, appStatusBuilder); } catch (LlapStatusCliException e) { - // In case of failure, send back whatever is constructed so far - - // which would be from the AppReport + // In case of failure, send back whatever is constructed so far - which would be from the AppReport logError(e); - return e.getExitCode().getInt(); + return 
e.getExitCode(); } } if (ret != ExitCode.SUCCESS) { - return ret.getInt(); + return ret; } else { try { ret = populateAppStatusFromLlapRegistry(appStatusBuilder, watchTimeoutMs); } catch (LlapStatusCliException e) { logError(e); - return e.getExitCode().getInt(); + return e.getExitCode(); } } - return ret.getInt(); + return ret; } finally { - if (LOG.isDebugEnabled()) { - LOG.debug("Final AppState: " + appStatusBuilder.toString()); + LOG.debug("Final AppState: " + appStatusBuilder.toString()); + } + } + + private ApplicationReport getAppReport(String appName, long timeoutMs) + throws LlapStatusCliException { + Clock clock = SystemClock.getInstance(); + long startTime = clock.getTime(); + long timeoutTime = timeoutMs < 0 ? Long.MAX_VALUE : (startTime + timeoutMs); + ApplicationReport appReport = null; + ApplicationId appId; + try { + appId = serviceClient.getAppId(appName); + } catch (YarnException | IOException e) { + return null; + } + + while (appReport == null) { + try { + appReport = serviceClient.getYarnClient().getApplicationReport(appId); + if (timeoutMs == 0) { + // break immediately if timeout is 0 + break; + } + // Otherwise sleep, and try again. + if (appReport == null) { + long remainingTime = Math.min(timeoutTime - clock.getTime(), 500L); + if (remainingTime > 0) { + Thread.sleep(remainingTime); + } else { + break; + } + } + } catch (Exception e) { + if (e instanceof ApplicationNotFoundException) { + //This might happen when serviceClient caches an appId from the past which is now not + // valid (i.e. Yarn RM restart). This will force re-creation of service client in the + // next check (if watch mode is on..) which effectively invalidates such cache. 
+ serviceClient = null; + } + throw new LlapStatusCliException(ExitCode.YARN_ERROR, "Failed to get Yarn AppReport", e); } } + return appReport; } - public void outputJson(PrintWriter writer) throws - LlapStatusCliException { + public void outputJson(PrintWriter writer) throws LlapStatusCliException { ObjectMapper mapper = new ObjectMapper(); mapper.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); mapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_NULL); mapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_EMPTY); + mapper.setVisibility(JsonMethod.ALL, Visibility.NON_PRIVATE); try { writer.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(appStatusBuilder)); } catch (IOException e) { LOG.warn("Failed to create JSON", e); - throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", - e); + throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", e); } } /** - * Populates parts of the AppStatus + * Populates parts of the AppStatus. * - * @param appReport - * @param appStatusBuilder * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode processAppReport(ApplicationReport appReport, - AppStatusBuilder appStatusBuilder) throws - LlapStatusCliException { + private ExitCode processAppReport(ApplicationReport appReport, AppStatusBuilder appStatusBuilder) + throws LlapStatusCliException { if (appReport == null) { appStatusBuilder.setState(State.APP_NOT_FOUND); LOG.info("No Application Found"); return ExitCode.SUCCESS; } + applicationId = appReport.getApplicationId().toString(); + // TODO Maybe add the YARN URL for the app. 
appStatusBuilder.setAmInfo( - new LlapStatusHelpers.AmInfo().setAppName(appReport.getName()).setAppType(appReport.getApplicationType())); + new AmInfo().setAppName(appReport.getName()).setAppType(appReport.getApplicationType())); appStatusBuilder.setAppStartTime(appReport.getStartTime()); switch (appReport.getYarnApplicationState()) { - case NEW: - case NEW_SAVING: - case SUBMITTED: - appStatusBuilder.setState(State.LAUNCHING); - return ExitCode.SUCCESS; - case ACCEPTED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); - appStatusBuilder.setState(State.LAUNCHING); - return ExitCode.SUCCESS; - case RUNNING: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); - // If the app state is running, get additional information from YARN Service - return ExitCode.SUCCESS; - case FINISHED: - case FAILED: - case KILLED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); - appStatusBuilder.setAppFinishTime(appReport.getFinishTime()); - appStatusBuilder.setState(State.COMPLETE); - // add log links and other diagnostics from YARN Service - return ExitCode.SUCCESS; - default: - throw new LlapStatusCliException(ExitCode.INTERNAL_ERROR, - "Unknown Yarn Application State: " + appReport.getYarnApplicationState()); + case NEW: + case NEW_SAVING: + case SUBMITTED: + appStatusBuilder.setState(State.LAUNCHING); + return ExitCode.SUCCESS; + case ACCEPTED: + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); + appStatusBuilder.setState(State.LAUNCHING); + return ExitCode.SUCCESS; + case RUNNING: + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); + // If the app state is running, get additional information from YARN Service + return ExitCode.SUCCESS; + case FINISHED: + case FAILED: + case KILLED: + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); + 
appStatusBuilder.setAppFinishTime(appReport.getFinishTime()); + appStatusBuilder.setState(State.COMPLETE); + // add log links and other diagnostics from YARN Service + return ExitCode.SUCCESS; + default: + throw new LlapStatusCliException(ExitCode.INTERNAL_ERROR, + "Unknown Yarn Application State: " + appReport.getYarnApplicationState()); } } /** * Populates information from YARN Service Status. * - * @param appName - * @param serviceClient - * @param appStatusBuilder - * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future - * progress not possible + * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromServiceStatus(String appName, - ServiceClient serviceClient, AppStatusBuilder appStatusBuilder) - throws LlapStatusCliException { + private ExitCode populateAppStatusFromServiceStatus(String appName, ServiceClient serviceClient, + AppStatusBuilder appStatusBuilder) throws LlapStatusCliException { ExitCode exitCode = ExitCode.YARN_ERROR; try { Service service = serviceClient.getStatus(appName); if (service != null) { // How to get config paths and AmInfo ServiceState state = service.getState(); - appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 - : service.getLaunchTime().getTime()); - appStatusBuilder.setDesiredInstances( - service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0 - : service.getComponent(LLAP_KEY).getNumberOfContainers() - .intValue()); - appStatusBuilder.setLiveInstances( - service.getComponent(LLAP_KEY).getContainers().size()); + appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 : service.getLaunchTime().getTime()); + appStatusBuilder.setDesiredInstances(service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 
0 + : service.getComponent(LLAP_KEY).getNumberOfContainers().intValue()); + appStatusBuilder.setLiveInstances(service.getComponent(LLAP_KEY).getContainers().size()); for (Container cont : service.getComponent(LLAP_KEY).getContainers()) { - LlapInstance llapInstance = new LlapInstance(cont.getHostname(), - cont.getId()); + LlapInstance llapInstance = new LlapInstance(cont.getHostname(), cont.getId()); appStatusBuilder.addNewRunningLlapInstance(llapInstance); } if (state == ServiceState.STARTED || state == ServiceState.STABLE || state == ServiceState.FLEX) { @@ -389,8 +387,7 @@ public class LlapStatusServiceDriver { } } catch (IOException | YarnException e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, - "Failed to get service status", e); + ExitCode.SERVICE_CLIENT_ERROR_OTHER, "Failed to get service status", e); logError(le); exitCode = le.getExitCode(); } @@ -400,13 +397,11 @@ public class LlapStatusServiceDriver { /** * Populate additional information for containers from the LLAP registry. Must be invoked * after YARN Service status and diagnostics. - * @param appStatusBuilder * @return an ExitCode. 
An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromLlapRegistry( - AppStatusBuilder appStatusBuilder, long watchTimeoutMs) throws - LlapStatusCliException { + private ExitCode populateAppStatusFromLlapRegistry(AppStatusBuilder appStatusBuilder, long watchTimeoutMs) + throws LlapStatusCliException { if (llapRegistry == null) { try { @@ -425,9 +420,7 @@ public class LlapStatusServiceDriver { } if (serviceInstances == null || serviceInstances.isEmpty()) { - if (LOG.isDebugEnabled()) { - LOG.debug("No information found in the LLAP registry"); - } + LOG.debug("No information found in the LLAP registry"); appStatusBuilder.setLiveInstances(0); appStatusBuilder.setState(State.LAUNCHING); appStatusBuilder.clearRunningLlapInstances(); @@ -439,10 +432,9 @@ public class LlapStatusServiceDriver { for (LlapServiceInstance serviceInstance : serviceInstances) { String containerIdString = serviceInstance.getProperties().get( - HiveConf.ConfVars.LLAP_DAEMON_CONTAINER_ID.varname); + HiveConf.ConfVars.LLAP_DAEMON_CONTAINER_ID.varname); - LlapInstance llapInstance = appStatusBuilder.removeAndGetRunningLlapInstanceForContainer( - containerIdString); + LlapInstance llapInstance = appStatusBuilder.removeAndGetRunningLlapInstanceForContainer(containerIdString); if (llapInstance != null) { llapInstance.setMgmtPort(serviceInstance.getManagementPort()); llapInstance.setRpcPort(serviceInstance.getRpcPort()); @@ -461,8 +453,8 @@ public class LlapStatusServiceDriver { appStatusBuilder.setLiveInstances(validatedInstances.size()); appStatusBuilder.setLaunchingInstances(llapExtraInstances.size()); - if (appStatusBuilder.getDesiredInstances() != null && validatedInstances - .size() >= appStatusBuilder.getDesiredInstances()) { + if (appStatusBuilder.getDesiredInstances() != null && + validatedInstances.size() >= appStatusBuilder.getDesiredInstances()) { appStatusBuilder.setState(State.RUNNING_ALL); 
if (validatedInstances.size() > appStatusBuilder.getDesiredInstances()) { LOG.warn("Found more entries in LLAP registry, as compared to desired entries"); @@ -493,207 +485,40 @@ public class LlapStatusServiceDriver { return ExitCode.SUCCESS; } - private static String constructCompletedContainerDiagnostics(List<LlapInstance> completedInstances) { - StringBuilder sb = new StringBuilder(); - if (completedInstances == null || completedInstances.size() == 0) { - return ""; - } else { - // TODO HIVE-15865 Ideally sort these by completion time, once that is available. - boolean isFirst = true; - for (LlapInstance instance : completedInstances) { - if (!isFirst) { - sb.append("\n"); - } else { - isFirst = false; - } - - if (instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_PMEM || - instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_VMEM) { - sb.append("\tKILLED container (by YARN for exceeding memory limits): "); - } else { - // TODO HIVE-15865 Handle additional reasons like OS launch failed - sb.append("\tFAILED container: "); - } - sb.append(" ").append(instance.getContainerId()); - sb.append(", Logs at: ").append(instance.getLogUrl()); - } - } - return sb.toString(); - } - - /** - * Helper method to construct a diagnostic message from a complete - * AppStatusBuilder. - * - * @return - */ - private static String constructDiagnostics( - AppStatusBuilder appStatusBuilder) { - StringBuilder sb = new StringBuilder(); - - switch (appStatusBuilder.getState()) { - case APP_NOT_FOUND: - sb.append("LLAP status unknown. Awaiting app launch"); - break; - case LAUNCHING: - // This is a catch all state - when containers have not started yet, or LLAP has not started yet. 
- if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { - sb.append("LLAP Starting up with AppId=") - .append(appStatusBuilder.getAmInfo().getAppId()).append("."); - if (appStatusBuilder.getDesiredInstances() != null) { - sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); - } - - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - } else { - sb.append("Awaiting LLAP startup"); - } - break; - case RUNNING_PARTIAL: - sb.append("LLAP Starting up with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - sb.append(" Started").append(appStatusBuilder.getLiveInstances()) - .append("/").append(appStatusBuilder.getDesiredInstances()) - .append(" instances"); - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - // TODO HIVE-15865: Include information about pending requests, and last - // allocation time once YARN Service provides this information. - break; - case RUNNING_ALL: - sb.append("LLAP Application running with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - break; - case COMPLETE: - - sb.append("LLAP Application already complete. 
ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - break; - case UNKNOWN: - sb.append("LLAP status unknown"); - break; - } - if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { - sb.append("\n").append(appStatusBuilder.getDiagnostics()); - } - - return sb.toString(); - } - - public enum ExitCode { - SUCCESS(0), - INCORRECT_USAGE(10), - YARN_ERROR(20), - SERVICE_CLIENT_ERROR_CREATE_FAILED(30), - SERVICE_CLIENT_ERROR_OTHER(31), - LLAP_REGISTRY_ERROR(40), - LLAP_JSON_GENERATION_ERROR(50), - // Error in the script itself - likely caused by an incompatible change, or new functionality / states added. - INTERNAL_ERROR(100); - - private final int exitCode; - - ExitCode(int exitCode) { - this.exitCode = exitCode; - } - - public int getInt() { - return exitCode; - } - } - - - public static class LlapStatusCliException extends Exception { - final LlapStatusServiceDriver.ExitCode exitCode; - - - public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message) { - super(exitCode.getInt() +": " + message); - this.exitCode = exitCode; - } - - public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message, Throwable cause) { - super(message, cause); - this.exitCode = exitCode; + private void close() { + if (serviceClient != null) { + serviceClient.stop(); } - - public LlapStatusServiceDriver.ExitCode getExitCode() { - return exitCode; + if (llapRegistry != null) { + llapRegistry.stop(); } } - - private static void logError(Throwable t) { - LOG.error("FAILED: " + t.getMessage(), t); - System.err.println("FAILED: " + t.getMessage()); - } - - public static void main(String[] args) { - LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args)); - int ret = 
ExitCode.SUCCESS.getInt(); + LlapStatusServiceCommandLine cl = LlapStatusServiceCommandLine.parseArguments(args); + LlapStatusServiceDriver statusServiceDriver = createServiceDriver(); + + ExitCode ret = ExitCode.SUCCESS; Clock clock = SystemClock.getInstance(); - long startTime = clock.getTime(); long lastSummaryLogTime = -1; - LlapStatusServiceDriver statusServiceDriver = null; - LlapStatusOptions options = null; - try { - statusServiceDriver = new LlapStatusServiceDriver(); - options = statusServiceDriver.parseOptions(args); - } catch (Throwable t) { - statusServiceDriver.close(); - logError(t); - if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; - ret = ce.getExitCode().getInt(); - } else { - ret = ExitCode.INTERNAL_ERROR.getInt(); - } - } - if (ret != 0 || options == null) { // Failure / help - if (statusServiceDriver != null) { - statusServiceDriver.close(); - } - System.exit(ret); - } - boolean firstAttempt = true; - final long refreshInterval = options.getRefreshIntervalMs(); - final boolean watchMode = options.isWatchMode(); - final long watchTimeout = options.getWatchTimeoutMs(); + final long refreshInterval = cl.getRefreshIntervalMs(); + final boolean watchMode = cl.isWatchMode(); + final long watchTimeout = cl.getWatchTimeoutMs(); long numAttempts = watchTimeout / refreshInterval; numAttempts = watchMode ? numAttempts : 1; // Break out of the loop fast if watchMode is disabled. - LlapStatusHelpers.State launchingState = null; - LlapStatusHelpers.State currentState = null; + State launchingState = null; + State currentState = null; boolean desiredStateAttained = false; - final float runningNodesThreshold = options.getRunningNodesThreshold(); - try (OutputStream os = options.getOutputFile() == null ? 
System.out : - new BufferedOutputStream(new FileOutputStream(options.getOutputFile())); - PrintWriter pw = new PrintWriter(os)) { + final float runningNodesThreshold = cl.getRunningNodesThreshold(); + try (OutputStream os = cl.getOutputFile() == null ? System.out : new FileOutputStream(cl.getOutputFile()); + Writer w = new OutputStreamWriter(os, Charset.defaultCharset()); + PrintWriter pw = new PrintWriter(w)) { LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." + - " Watch mode: {}. Running nodes threshold: {}.", - TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS), - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), - numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold)); + " Watch mode: {}. Running nodes threshold: {}.", refreshInterval/1000, watchTimeout/1000, + numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold)); while (numAttempts > 0) { if (!firstAttempt) { if (watchMode) { @@ -709,17 +534,16 @@ public class LlapStatusServiceDriver { } else { firstAttempt = false; } - ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0); + ret = statusServiceDriver.run(cl, watchMode ? 
watchTimeout : 0); currentState = statusServiceDriver.appStatusBuilder.getState(); try { - lastSummaryLogTime = LlapStatusServiceDriver - .maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver, - watchMode, watchTimeout, launchingState); + lastSummaryLogTime = LlapStatusServiceDriver.maybeLogSummary(clock, lastSummaryLogTime, + statusServiceDriver, watchMode, watchTimeout, launchingState); } catch (Exception e) { LOG.warn("Failed to log summary", e); } - if (ret == ExitCode.SUCCESS.getInt()) { + if (ret == ExitCode.SUCCESS) { if (watchMode) { // YARN Service has started llap application, now if for some reason @@ -729,10 +553,9 @@ public class LlapStatusServiceDriver { } if (currentState.equals(State.COMPLETE)) { - if (launchingState != null || options.isLaunched()) { + if (launchingState != null || cl.isLaunched()) { LOG.warn("COMPLETE state reached while waiting for RUNNING state. Failing."); - System.err.println("Final diagnostics: " + - statusServiceDriver.appStatusBuilder.getDiagnostics()); + System.err.println("Final diagnostics: " + statusServiceDriver.appStatusBuilder.getDiagnostics()); break; } else { LOG.info("Found a stopped application; assuming it was a previous attempt " @@ -740,19 +563,15 @@ public class LlapStatusServiceDriver { } } - if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals( - State.RUNNING_ALL))) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Current state: {}. Desired state: {}. {}/{} instances.", - currentState, - runningNodesThreshold == 1.0f ? - State.RUNNING_ALL : - State.RUNNING_PARTIAL, - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) { + LOG.debug( + "Current state: {}. Desired state: {}. {}/{} instances.", + currentState, + runningNodesThreshold == 1.0f ? 
+ State.RUNNING_ALL : + State.RUNNING_PARTIAL, + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } @@ -763,17 +582,13 @@ public class LlapStatusServiceDriver { if (desiredInstances > 0) { final float ratio = (float) liveInstances / (float) desiredInstances; if (ratio < runningNodesThreshold) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + - " {}/{} instances.", - new DecimalFormat("#.###").format(ratio), - new DecimalFormat("#.###") - .format(runningNodesThreshold), - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + LOG.debug( + "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + + " {}/{} instances.", + new DecimalFormat("#.###").format(ratio), + new DecimalFormat("#.###").format(runningNodesThreshold), + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } else { @@ -785,19 +600,19 @@ public class LlapStatusServiceDriver { continue; } } - } else if (ret == ExitCode.YARN_ERROR.getInt() && watchMode) { + } else if (ret == ExitCode.YARN_ERROR && watchMode) { LOG.warn("Watch mode enabled and got YARN error. Retrying.."); numAttempts--; continue; - } else if (ret == ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED.getInt() && watchMode) { + } else if (ret == ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED && watchMode) { LOG.warn("Watch mode enabled and YARN Service client creation failed. Retrying.."); numAttempts--; continue; - } else if (ret == ExitCode.SERVICE_CLIENT_ERROR_OTHER.getInt() && watchMode) { + } else if (ret == ExitCode.SERVICE_CLIENT_ERROR_OTHER && watchMode) { LOG.warn("Watch mode enabled and got YARN Service client error. 
Retrying.."); numAttempts--; continue; - } else if (ret == ExitCode.LLAP_REGISTRY_ERROR.getInt() && watchMode) { + } else if (ret == ExitCode.LLAP_REGISTRY_ERROR && watchMode) { LOG.warn("Watch mode enabled and got LLAP registry error. Retrying.."); numAttempts--; continue; @@ -805,45 +620,47 @@ public class LlapStatusServiceDriver { break; } // Log final state to CONSOLE_LOGGER - LlapStatusServiceDriver - .maybeLogSummary(clock, 0L, statusServiceDriver, - watchMode, watchTimeout, launchingState); + maybeLogSummary(clock, 0L, statusServiceDriver, watchMode, watchTimeout, launchingState); CONSOLE_LOGGER.info("\n\n\n"); - // print current state before exiting - statusServiceDriver.outputJson(pw); - os.flush(); + + statusServiceDriver.outputJson(pw); // print current state before exiting pw.flush(); if (numAttempts == 0 && watchMode && !desiredStateAttained) { - LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.", - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS)); + LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.", watchTimeout/1000); } } catch (Throwable t) { logError(t); if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; - ret = ce.getExitCode().getInt(); + LlapStatusCliException ce = (LlapStatusCliException) t; + ret = ce.getExitCode(); } else { - ret = ExitCode.INTERNAL_ERROR.getInt(); + ret = ExitCode.INTERNAL_ERROR; } } finally { LOG.info("LLAP status finished"); + if (ret != ExitCode.SUCCESS) { + LOG.error("LLAP did not start. Check the application log for more info:\n" + + "\tyarn logs --applicationId {} -out <path>", statusServiceDriver.applicationId); + } statusServiceDriver.close(); } - if (LOG.isDebugEnabled()) { - LOG.debug("Completed processing - exiting with " + ret); + LOG.debug("Completed processing - exiting with " + ret); + + // HACK: due to the System.exit some log messages may not be present. 
+ try { + Thread.sleep(1000); + } catch (Exception e) { + // ignore } - System.exit(ret); + System.exit(ret.getCode()); } - private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, - LlapStatusServiceDriver statusServiceDriver, - boolean watchMode, long watchTimeout, LlapStatusHelpers.State launchingState) { + private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, LlapStatusServiceDriver statusServiceDriver, + boolean watchMode, long watchTimeout, State launchingState) { long currentTime = clock.getTime(); if (lastSummaryLogTime < currentTime - LOG_SUMMARY_INTERVAL) { String diagString = null; - if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == - State.COMPLETE && watchMode) { + if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == State.COMPLETE && watchMode) { // First known state was COMPLETED. Wait for the app launch to start. diagString = "Awaiting LLAP launch"; // Clear completed instances in this case. Don't want to provide information from the previous run. 
@@ -854,28 +671,112 @@ public class LlapStatusServiceDriver { if (lastSummaryLogTime == -1) { if (watchMode) { - CONSOLE_LOGGER.info("\nLLAPSTATUS WatchMode with timeout={} s", - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS)); + CONSOLE_LOGGER.info("\nLLAPSTATUS WatchMode with timeout={} s", watchTimeout/1000); } else { CONSOLE_LOGGER.info("\nLLAPSTATUS"); } - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); } CONSOLE_LOGGER.info(diagString); - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); lastSummaryLogTime = currentTime; } return lastSummaryLogTime; } - private void close() { - if (serviceClient != null) { - serviceClient.stop(); + /** + * Helper method to construct a diagnostic message from a complete AppStatusBuilder. + */ + private static String constructDiagnostics(AppStatusBuilder appStatusBuilder) { + StringBuilder sb = new StringBuilder(); + + switch (appStatusBuilder.getState()) { + case APP_NOT_FOUND: + sb.append("LLAP status unknown. Awaiting app launch"); + break; + case LAUNCHING: + // This is a catch all state - when containers have not started yet, or LLAP has not started yet. 
+ if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { + sb.append("LLAP Starting up with AppId=").append(appStatusBuilder.getAmInfo().getAppId()).append("."); + if (appStatusBuilder.getDesiredInstances() != null) { + sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); + } + + String containerDiagnostics = constructCompletedContainerDiagnostics( + appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + } else { + sb.append("Awaiting LLAP startup"); + } + break; + case RUNNING_PARTIAL: + sb.append("LLAP Starting up with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + sb.append(" Started").append(appStatusBuilder.getLiveInstances()).append("/") + .append(appStatusBuilder.getDesiredInstances()).append(" instances"); + String containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + // TODO HIVE-15865: Include information about pending requests, and last + // allocation time once YARN Service provides this information. + break; + case RUNNING_ALL: + sb.append("LLAP Application running with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + break; + case COMPLETE: + sb.append("LLAP Application already complete. 
ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + break; + case UNKNOWN: + sb.append("LLAP status unknown"); + break; + default: + throw new IllegalStateException("Unknown State: " + appStatusBuilder.getState()); } - if (llapRegistry != null) { - llapRegistry.stop(); + if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { + sb.append("\n").append(appStatusBuilder.getDiagnostics()); } + + return sb.toString(); + } + + private static String constructCompletedContainerDiagnostics(List<LlapInstance> completedInstances) { + StringBuilder sb = new StringBuilder(); + if (completedInstances == null || completedInstances.size() == 0) { + return ""; + } else { + // TODO HIVE-15865 Ideally sort these by completion time, once that is available. + boolean isFirst = true; + for (LlapInstance instance : completedInstances) { + if (!isFirst) { + sb.append("\n"); + } else { + isFirst = false; + } + + if (instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_PMEM || + instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_VMEM) { + sb.append("\tKILLED container (by YARN for exceeding memory limits): "); + } else { + // TODO HIVE-15865 Handle additional reasons like OS launch failed + sb.append("\tFAILED container: "); + } + sb.append(" ").append(instance.getContainerId()); + sb.append(", Logs at: ").append(instance.getLogUrl()); + } + } + return sb.toString(); + } + + private static void logError(Throwable t) { + LOG.error("FAILED: " + t.getMessage(), t); + System.err.println("FAILED: " + t.getMessage()); } } diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/State.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/State.java new file mode 100644 index 
00000000000..40c189a8a11 --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/State.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.llap.cli.status; + +/** + * Enumeration of the potential states of the Llap. + */ +enum State { + APP_NOT_FOUND, + LAUNCHING, + RUNNING_PARTIAL, + RUNNING_ALL, + COMPLETE, + UNKNOWN +} diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/package-info.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/package-info.java new file mode 100644 index 00000000000..79cadc7cb07 --- /dev/null +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/status/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Package containing the LlapStatusServiceDriver program (and the other classes used by it), + * which monitors whether LLAP is running. + */ +package org.apache.hadoop.hive.llap.cli.status; + diff --git a/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/TestLlapStatusServiceCommandLine.java b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/TestLlapStatusServiceCommandLine.java new file mode 100644 index 00000000000..3e4f6832a4d --- /dev/null +++ b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/TestLlapStatusServiceCommandLine.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
+ */ + +package org.apache.hadoop.hive.llap.cli.status; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import static junit.framework.TestCase.assertEquals; + +import java.util.Properties; + +/** + * Tests for LlapStatusServiceCommandLine. + */ +public class TestLlapStatusServiceCommandLine { + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testArgumentParsingDefault() throws Exception { + LlapStatusServiceCommandLine cl = new LlapStatusServiceCommandLine(new String[] {}); + + assertEquals("findAppTimeout should be the default value if not specified otherwise", + cl.getFindAppTimeoutMs(), LlapStatusServiceCommandLine.DEFAULT_FIND_YARN_APP_TIMEOUT_MS); + + assertEquals("refreshInterval should be the default value if not specified otherwise", + cl.getRefreshIntervalMs(), LlapStatusServiceCommandLine.DEFAULT_STATUS_REFRESH_INTERVAL_MS); + + assertEquals("watchTimeout should be the default value if not specified otherwise", + cl.getWatchTimeoutMs(), LlapStatusServiceCommandLine.DEFAULT_WATCH_MODE_TIMEOUT_MS); + + assertEquals("runningNodesThreshold should be the default value if not specified otherwise", + cl.getRunningNodesThreshold(), LlapStatusServiceCommandLine.DEFAULT_RUNNING_NODES_THRESHOLD); + + assertEquals("hiveConf should be empty properties if not specified otherwise", cl.getHiveConf(), + new Properties()); + + assertEquals("isLaunched should be the true if not specified otherwise", cl.isLaunched(), true); + + assertEquals("watchMode should be the false if not specified otherwise", cl.isWatchMode(), false); + } + + @Test + public void testNegativeRefreshInterval() throws Exception { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Refresh interval should be >0"); + + new LlapStatusServiceCommandLine(new String[] {"--refreshInterval", "-1"}); + } + + @Test + public void testNegativeWatchTimeout() throws Exception { + 
thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Watch timeout should be >0"); + + new LlapStatusServiceCommandLine(new String[] {"--watchTimeout", "-1"}); + } + + @Test + public void testNegativeRunningNodesThreshold() throws Exception { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Running nodes threshold value should be between 0.0 and 1.0 (inclusive)"); + + new LlapStatusServiceCommandLine(new String[] {"--runningNodesThreshold", "-1"}); + } + + @Test + public void testRunningNodesThresholdOverOne() throws Exception { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Running nodes threshold value should be between 0.0 and 1.0 (inclusive)"); + + new LlapStatusServiceCommandLine(new String[] {"--runningNodesThreshold", "1.1"}); + } +} diff --git a/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/package-info.java b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/package-info.java new file mode 100644 index 00000000000..9af5dd8ca39 --- /dev/null +++ b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/status/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Package containing the tests for the LlapStatusServiceDriver program and the other classes used by it. + */ +package org.apache.hadoop.hive.llap.cli.status; + diff --git a/service/src/java/org/apache/hive/http/LlapServlet.java b/service/src/java/org/apache/hive/http/LlapServlet.java index 2bc59b59b27..ecb8e312313 100644 --- a/service/src/java/org/apache/hive/http/LlapServlet.java +++ b/service/src/java/org/apache/hive/http/LlapServlet.java @@ -28,8 +28,11 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor; -import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver; +import org.apache.hadoop.hive.llap.cli.status.ExitCode; +import org.apache.hadoop.hive.llap.cli.status.LlapStatusServiceCommandLine; +import org.apache.hadoop.hive.llap.cli.status.LlapStatusServiceDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @SuppressWarnings("serial") public class LlapServlet extends HttpServlet { @@ -96,8 +99,8 @@ public class LlapServlet extends HttpServlet { LOG.info("Retrieving info for cluster: " + clusterName); LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); - int ret = driver.run(new LlapStatusOptionsProcessor.LlapStatusOptions(clusterName), 0); - if (ret == LlapStatusServiceDriver.ExitCode.SUCCESS.getInt()) { + ExitCode ret = driver.run(LlapStatusServiceCommandLine.parseArguments(new String[] {"-n", clusterName}), 0); + if (ret == ExitCode.SUCCESS) { driver.outputJson(writer); }