[ https://issues.apache.org/jira/browse/FLINK-9891?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16628663#comment-16628663 ]

ASF GitHub Bot commented on FLINK-9891:
---------------------------------------

asfgit closed pull request #6718: [FLINK-9891] Add optional hook to shutdown cluster if a session was created in per-job mode in attached mode
URL: https://github.com/apache/flink/pull/6718
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/docs/ops/cli.md b/docs/ops/cli.md
index f96ccf52ea9..9af5c5b5333 100644
--- a/docs/ops/cli.md
+++ b/docs/ops/cli.md
@@ -257,6 +257,11 @@ Action "run" compiles and runs a program.
     -s,--fromSavepoint <savepointPath>   Path to a savepoint to restore the job
                                           from (for example
                                           hdfs:///flink/savepoint-1537).
+     -sae,--shutdownOnAttachedExit        If the job is submitted in attached
+                                          mode, perform a best-effort cluster
+                                          shutdown when the CLI is terminated
+                                          abruptly, e.g., in response to a user
+                                          interrupt, such as typing Ctrl + C.
   Options for yarn-cluster mode:
      -d,--detached                        If present, runs the job in detached
                                           mode
@@ -265,6 +270,11 @@ Action "run" compiles and runs a program.
                                          connect to a different JobManager than
                                           the one specified in the
                                           configuration.
+     -sae,--shutdownOnAttachedExit        If the job is submitted in attached
+                                          mode, perform a best-effort cluster
+                                          shutdown when the CLI is terminated
+                                          abruptly, e.g., in response to a user
+                                          interrupt, such as typing Ctrl + C.
      -yD <property=value>                 use value for given property
      -yd,--yarndetached                   If present, runs the job in detached
                                           mode (deprecated; use non-YARN
diff --git a/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontend.java b/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontend.java
index e2a260c5478..ae0052c3c38 100644
--- a/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontend.java
+++ b/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontend.java
@@ -58,6 +58,7 @@
 import org.apache.flink.util.ExceptionUtils;
 import org.apache.flink.util.FlinkException;
 import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.ShutdownHookUtil;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Options;
@@ -250,13 +251,22 @@ protected void run(String[] args) throws Exception {
                                        LOG.info("Could not properly shut down the client.", e);
                                }
                        } else {
+                               final Thread shutdownHook;
                                if (clusterId != null) {
                                        client = clusterDescriptor.retrieve(clusterId);
+                                       shutdownHook = null;
                                } else {
                                        // also in job mode we have to deploy a session cluster because the job
                                        // might consist of multiple parts (e.g. when using collect)
                                        final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);
                                        client = clusterDescriptor.deploySessionCluster(clusterSpecification);
+                                       // if not running in detached mode, add a shutdown hook to shut down cluster if client exits
+                                       // there's a race-condition here if cli is killed before shutdown hook is installed
+                                       if (!runOptions.getDetachedMode() && runOptions.isShutdownOnAttachedExit()) {
+                                               shutdownHook = ShutdownHookUtil.addShutdownHook(client::shutDownCluster, client.getClass().getSimpleName(), LOG);
+                                       } else {
+                                               shutdownHook = null;
+                                       }
                                }
 
                                try {
@@ -286,8 +296,11 @@ protected void run(String[] args) throws Exception {
                                                } catch (final Exception e) {
                                                        LOG.info("Could not properly terminate the Flink cluster.", e);
                                                }
+                                               if (shutdownHook != null) {
+                                                       // we do not need the hook anymore as we have just tried to shutdown the cluster.
+                                                       ShutdownHookUtil.removeShutdownHook(shutdownHook, client.getClass().getSimpleName(), LOG);
+                                               }
                                        }
-
                                        try {
                                                client.shutdown();
                                        } catch (Exception e) {
@@ -492,11 +505,10 @@ private static void printJobStatusMessages(List<JobStatusMessage> jobs) {
                jobsByState.entrySet().stream()
                        .sorted(statusComparator)
                        .map(Map.Entry::getValue).flatMap(List::stream).sorted(startTimeComparator)
-                       .forEachOrdered(job -> {
-                       System.out.println(dateFormat.format(new Date(job.getStartTime()))
-                               + " : " + job.getJobId() + " : " + job.getJobName()
-                               + " (" + job.getJobState() + ")");
-               });
+                       .forEachOrdered(job ->
+                               System.out.println(dateFormat.format(new Date(job.getStartTime()))
+                                       + " : " + job.getJobId() + " : " + job.getJobName()
+                                       + " (" + job.getJobState() + ")"));
        }
 
        /**
@@ -827,11 +839,8 @@ protected void executeProgram(PackagedProgram program, ClusterClient<?> client,
         * Creates a Packaged program from the given command line options.
         *
         * @return A PackagedProgram (upon success)
-        * @throws java.io.FileNotFoundException
-        * @throws org.apache.flink.client.program.ProgramInvocationException
         */
-       protected PackagedProgram buildProgram(ProgramOptions options)
-                       throws FileNotFoundException, ProgramInvocationException {
+       PackagedProgram buildProgram(ProgramOptions options) throws FileNotFoundException, ProgramInvocationException {
                String[] programArgs = options.getProgramArgs();
                String jarFilePath = options.getJarFilePath();
                List<URL> classpaths = options.getClasspaths();
@@ -1163,7 +1172,7 @@ else if (new File(CONFIG_DIRECTORY_FALLBACK_2).exists()) {
         * @param address Address to write to the configuration
         * @param config The configuration to write to
         */
-       public static void setJobManagerAddressInConfig(Configuration config, InetSocketAddress address) {
+       static void setJobManagerAddressInConfig(Configuration config, InetSocketAddress address) {
                config.setString(JobManagerOptions.ADDRESS, address.getHostString());
                config.setInteger(JobManagerOptions.PORT, address.getPort());
                config.setString(RestOptions.ADDRESS, address.getHostString());
diff --git a/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontendParser.java b/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontendParser.java
index 357a87e4fbc..8eb0dd6774e 100644
--- a/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontendParser.java
+++ b/flink-clients/src/main/java/org/apache/flink/client/cli/CliFrontendParser.java
@@ -63,6 +63,11 @@
        public static final Option DETACHED_OPTION = new Option("d", "detached", false, "If present, runs " +
                        "the job in detached mode");
 
+       public static final Option SHUTDOWN_IF_ATTACHED_OPTION = new Option(
+               "sae", "shutdownOnAttachedExit", false,
+               "If the job is submitted in attached mode, perform a best-effort cluster shutdown " +
+                       "when the CLI is terminated abruptly, e.g., in response to a user interrupt, such as typing Ctrl + C.");
+
        /**
         * @deprecated use non-prefixed variant {@link #DETACHED_OPTION} for both YARN and non-YARN deployments
         */
@@ -128,6 +133,7 @@
 
                LOGGING_OPTION.setRequired(false);
                DETACHED_OPTION.setRequired(false);
+               SHUTDOWN_IF_ATTACHED_OPTION.setRequired(false);
                YARN_DETACHED_OPTION.setRequired(false);
 
                ARGS_OPTION.setRequired(false);
@@ -170,6 +176,7 @@ private static Options getProgramSpecificOptions(Options options) {
                options.addOption(ARGS_OPTION);
                options.addOption(LOGGING_OPTION);
                options.addOption(DETACHED_OPTION);
+               options.addOption(SHUTDOWN_IF_ATTACHED_OPTION);
                options.addOption(YARN_DETACHED_OPTION);
                return options;
        }
@@ -180,6 +187,7 @@ private static Options getProgramSpecificOptionsWithoutDeprecatedOptions(Options
                options.addOption(PARALLELISM_OPTION);
                options.addOption(LOGGING_OPTION);
                options.addOption(DETACHED_OPTION);
+               options.addOption(SHUTDOWN_IF_ATTACHED_OPTION);
                return options;
        }
 
diff --git a/flink-clients/src/main/java/org/apache/flink/client/cli/ProgramOptions.java b/flink-clients/src/main/java/org/apache/flink/client/cli/ProgramOptions.java
index ccaa4916f9c..da03d64048c 100644
--- a/flink-clients/src/main/java/org/apache/flink/client/cli/ProgramOptions.java
+++ b/flink-clients/src/main/java/org/apache/flink/client/cli/ProgramOptions.java
@@ -36,6 +36,7 @@
 import static org.apache.flink.client.cli.CliFrontendParser.JAR_OPTION;
 import static org.apache.flink.client.cli.CliFrontendParser.LOGGING_OPTION;
 import static org.apache.flink.client.cli.CliFrontendParser.PARALLELISM_OPTION;
+import static org.apache.flink.client.cli.CliFrontendParser.SHUTDOWN_IF_ATTACHED_OPTION;
 import static org.apache.flink.client.cli.CliFrontendParser.YARN_DETACHED_OPTION;
 
 /**
@@ -57,6 +58,8 @@
 
        private final boolean detachedMode;
 
+       private final boolean shutdownOnAttachedExit;
+
        private final SavepointRestoreSettings savepointSettings;
 
        protected ProgramOptions(CommandLine line) throws CliArgsException {
@@ -113,6 +116,7 @@ else if (args.length > 0) {
                stdoutLogging = !line.hasOption(LOGGING_OPTION.getOpt());
                detachedMode = line.hasOption(DETACHED_OPTION.getOpt()) || line.hasOption(
                        YARN_DETACHED_OPTION.getOpt());
+               shutdownOnAttachedExit = line.hasOption(SHUTDOWN_IF_ATTACHED_OPTION.getOpt());
 
                this.savepointSettings = CliFrontendParser.createSavepointRestoreSettings(line);
        }
@@ -145,6 +149,10 @@ public boolean getDetachedMode() {
                return detachedMode;
        }
 
+       public boolean isShutdownOnAttachedExit() {
+               return shutdownOnAttachedExit;
+       }
+
        public SavepointRestoreSettings getSavepointRestoreSettings() {
                return savepointSettings;
        }
diff --git a/flink-yarn/src/main/java/org/apache/flink/yarn/cli/FlinkYarnSessionCli.java b/flink-yarn/src/main/java/org/apache/flink/yarn/cli/FlinkYarnSessionCli.java
index c0180a83be1..e0c0f942405 100644
--- a/flink-yarn/src/main/java/org/apache/flink/yarn/cli/FlinkYarnSessionCli.java
+++ b/flink-yarn/src/main/java/org/apache/flink/yarn/cli/FlinkYarnSessionCli.java
@@ -85,6 +85,7 @@
 import java.util.stream.Stream;
 
 import static org.apache.flink.client.cli.CliFrontendParser.DETACHED_OPTION;
+import static org.apache.flink.client.cli.CliFrontendParser.SHUTDOWN_IF_ATTACHED_OPTION;
 import static org.apache.flink.client.cli.CliFrontendParser.YARN_DETACHED_OPTION;
 import static org.apache.flink.configuration.HighAvailabilityOptions.HA_CLUSTER_ID;
 
@@ -218,6 +219,7 @@ public FlinkYarnSessionCli(
                allOptions.addOption(slots);
                allOptions.addOption(dynamicproperties);
                allOptions.addOption(DETACHED_OPTION);
+               allOptions.addOption(SHUTDOWN_IF_ATTACHED_OPTION);
                allOptions.addOption(YARN_DETACHED_OPTION);
                allOptions.addOption(streaming);
                allOptions.addOption(name);
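
A note on the core mechanism for readers skimming the diff above: in CliFrontend the patch registers a JVM shutdown hook when a per-job session cluster has been deployed in attached mode, and removes that hook again once the cluster has been shut down through the normal code path. Below is a minimal sketch of that register-then-remove pattern using the plain JDK Runtime API, which Flink's ShutdownHookUtil builds on; the SessionCluster interface and the runAttached method are hypothetical stand-ins for illustration, not Flink API.

{code:java}
// Minimal sketch of the hook lifecycle added by this PR, using plain JDK shutdown hooks.
// "SessionCluster" and "runAttached" are hypothetical placeholders, not Flink classes.
public class AttachedExitSketch {

    interface SessionCluster {
        void shutDownCluster(); // best-effort cluster teardown
    }

    static void runAttached(SessionCluster cluster, boolean detached, boolean shutdownOnAttachedExit) {
        Thread shutdownHook = null;
        if (!detached && shutdownOnAttachedExit) {
            // Register a hook so that an abrupt CLI exit (e.g. Ctrl + C) still tears the cluster down.
            // As the patch comment notes, a kill arriving before this line leaves the cluster running.
            shutdownHook = new Thread(cluster::shutDownCluster, "cluster-shutdown-hook");
            Runtime.getRuntime().addShutdownHook(shutdownHook);
        }
        try {
            // ... submit the job and wait for its result in attached mode ...
        } finally {
            cluster.shutDownCluster(); // normal path: explicit shutdown
            if (shutdownHook != null) {
                // The hook is no longer needed once the cluster has already been shut down.
                Runtime.getRuntime().removeShutdownHook(shutdownHook);
            }
        }
    }
}
{code}

Removing the hook after the explicit shutdown avoids a second, redundant shutdown attempt during JVM exit, which is also why the patch calls ShutdownHookUtil.removeShutdownHook right after the cluster has been terminated.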


 
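
The parser-side changes (CliFrontendParser, ProgramOptions, FlinkYarnSessionCli) follow the usual Apache Commons CLI pattern: define an Option, add it to the Options set, and later test for it with CommandLine.hasOption. Below is a small stand-alone sketch of that pattern; only the option name and description are taken from the patch, while the main method and parser setup are illustrative rather than Flink's actual wiring.

{code:java}
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

// Stand-alone illustration of the commons-cli wiring behind the new -sae flag.
// Only the option name/description come from the patch; everything else is illustrative.
public class SaeOptionSketch {
    public static void main(String[] args) throws ParseException {
        Option shutdownIfAttached = new Option(
            "sae", "shutdownOnAttachedExit", false,
            "If the job is submitted in attached mode, perform a best-effort cluster shutdown "
                + "when the CLI is terminated abruptly.");
        shutdownIfAttached.setRequired(false);

        Options options = new Options();
        options.addOption(shutdownIfAttached);

        // stopAtNonOption = true so unrelated program arguments are not rejected
        CommandLine line = new DefaultParser().parse(options, args, true);
        boolean shutdownOnAttachedExit = line.hasOption(shutdownIfAttached.getOpt());
        System.out.println("shutdownOnAttachedExit = " + shutdownOnAttachedExit);
    }
}
{code}

From the command line the flag is then combined with a normal attached submission, e.g. flink run -m yarn-cluster -sae ... <jar>, so that interrupting the CLI with Ctrl + C also triggers a best-effort teardown of the YARN session.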

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Flink cluster is not shutdown in YARN mode when Flink client is stopped
> -----------------------------------------------------------------------
>
>                 Key: FLINK-9891
>                 URL: https://issues.apache.org/jira/browse/FLINK-9891
>             Project: Flink
>          Issue Type: Bug
>          Components: Client, YARN
>    Affects Versions: 1.5.0, 1.5.1
>            Reporter: Sergey Krasovskiy
>            Assignee: Andrey Zagrebin
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 1.7.0, 1.6.2, 1.5.5
>
>
> We are not using session mode or detached mode. The command to run the Flink
> job on YARN is:
> {code:java}
> <flink-1.5.1>/bin/flink run -m yarn-cluster -yn 1 -yqu flink -yjm 768 -ytm 
> 2048 -j ./flink-quickstart-java-1.0-SNAPSHOT.jar -c org.test.WordCount
> {code}
> Flink CLI logs:
> {code:java}
> Setting HADOOP_CONF_DIR=/etc/hadoop/conf because no HADOOP_CONF_DIR was set.
> SLF4J: Class path contains multiple SLF4J bindings.
> SLF4J: Found binding in 
> [jar:file:/opt/flink-streaming/flink-streaming-1.5.1-1.5.1-bin-hadoop27-scala_2.11-1531485329/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class]
> SLF4J: Found binding in 
> [jar:file:/usr/hdp/2.4.2.10-1/hadoop/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
> SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an 
> explanation.
> SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
> 2018-07-18 12:47:03,747 INFO 
> org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service 
> address: http://hmaster-1.ipbl.rgcloud.net:8188/ws/v1/timeline/
> 2018-07-18 12:47:04,222 INFO org.apache.flink.yarn.cli.FlinkYarnSessionCli - 
> No path for the flink jar passed. Using the location of class 
> org.apache.flink.yarn.YarnClusterDescriptor to locate the jar
> 2018-07-18 12:47:04,222 INFO org.apache.flink.yarn.cli.FlinkYarnSessionCli - 
> No path for the flink jar passed. Using the location of class 
> org.apache.flink.yarn.YarnClusterDescriptor to locate the jar
> 2018-07-18 12:47:04,248 WARN 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Neither the 
> HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. The Flink 
> YARN Client needs one of these to be set to properly load the Hadoop 
> configuration for accessing YARN.
> 2018-07-18 12:47:04,409 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Cluster specification: 
> ClusterSpecification{masterMemoryMB=768, taskManagerMemoryMB=2048, 
> numberTaskManagers=1, slotsPerTaskManager=1}
> 2018-07-18 12:47:04,783 WARN 
> org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory - The short-circuit 
> local reads feature cannot be used because libhadoop cannot be loaded.
> 2018-07-18 12:47:04,788 WARN 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - The configuration 
> directory 
> ('/opt/flink-streaming/flink-streaming-1.5.1-1.5.1-bin-hadoop27-scala_2.11-1531485329/conf')
>  contains both LOG4J and Logback configuration files. Please delete or rename 
> one of them.
> 2018-07-18 12:47:07,846 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Submitting application 
> master application_1531474158783_10814
> 2018-07-18 12:47:08,073 INFO 
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl - Submitted application 
> application_1531474158783_10814
> 2018-07-18 12:47:08,074 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Waiting for the cluster 
> to be allocated
> 2018-07-18 12:47:08,076 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Deploying cluster, 
> current state ACCEPTED
> 2018-07-18 12:47:12,864 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - YARN application has 
> been deployed successfully.
> {code}
> Job Manager logs:
> {code:java}
> 2018-07-18 12:47:09,913 INFO 
> org.apache.flink.runtime.entrypoint.ClusterEntrypoint - 
> --------------------------------------------------------------------------------
> 2018-07-18 12:47:09,915 INFO 
> org.apache.flink.runtime.entrypoint.ClusterEntrypoint - Starting 
> YarnSessionClusterEntrypoint (Version: 1.5.1, Rev:3488f8b, Date:10.07.2018 @ 
> 11:51:27 GMT)
> ...
> {code}
> Issues:
>  # The Flink job runs as a Flink session
>  # Ctrl+C or 'stop' does not stop the job or the YARN cluster
>  # Cancelling the job via the Job Manager web UI does not stop the Flink cluster. To kill the
> cluster we need to run: yarn application -kill <id>
> We also tried to run a Flink job with 'mode: legacy' and hit the same
> issues:
>  # Add property 'mode: legacy' to ./conf/flink-conf.yaml
>  # Execute the following command:
> {code:java}
> <flink-1.5.1>/bin/flink run -m yarn-cluster -yn 1 -yqu flink -yjm 768 -ytm 
> 2048 -j ./flink-quickstart-java-1.0-SNAPSHOT.jar -c org.test.WordCount
> {code}
> Flink CLI logs:
> {code:java}
> Setting HADOOP_CONF_DIR=/etc/hadoop/conf because no HADOOP_CONF_DIR was set.
> SLF4J: Class path contains multiple SLF4J bindings.
> SLF4J: Found binding in 
> [jar:file:/opt/flink-streaming/flink-streaming-1.5.1-1.5.1-bin-hadoop27-scala_2.11-1531485329/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class]
> SLF4J: Found binding in 
> [jar:file:/usr/hdp/2.4.2.10-1/hadoop/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
> SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an 
> explanation.
> SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
> 2018-07-18 16:07:13,820 INFO 
> org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service 
> address: http://hmaster-1.ipbl.rgcloud.net:8188/ws/v1/timeline/
> 2018-07-18 16:07:14,165 INFO org.apache.flink.yarn.cli.FlinkYarnSessionCli - 
> No path for the flink jar passed. Using the location of class 
> org.apache.flink.yarn.LegacyYarnClusterDescriptor to locate the jar
> 2018-07-18 16:07:14,165 INFO org.apache.flink.yarn.cli.FlinkYarnSessionCli - 
> No path for the flink jar passed. Using the location of class 
> org.apache.flink.yarn.LegacyYarnClusterDescriptor to locate the jar
> 2018-07-18 16:07:14,182 WARN 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Neither the 
> HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. The Flink 
> YARN Client needs one of these to be set to properly load the Hadoop 
> configuration for accessing YARN.
> 2018-07-18 16:07:14,356 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Cluster specification: 
> ClusterSpecification{masterMemoryMB=768, taskManagerMemoryMB=2048, 
> numberTaskManagers=1, slotsPerTaskManager=1}
> 2018-07-18 16:07:14,703 WARN 
> org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory - The short-circuit 
> local reads feature cannot be used because libhadoop cannot be loaded.
> 2018-07-18 16:07:14,708 WARN 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - The configuration 
> directory ('/home/skrasovs/flink-conf') contains both LOG4J and Logback 
> configuration files. Please delete or rename one of them.
> 2018-07-18 16:07:17,678 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Submitting application 
> master application_1531474158783_10843
> 2018-07-18 16:07:17,717 INFO 
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl - Submitted application 
> application_1531474158783_10843
> 2018-07-18 16:07:17,717 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Waiting for the cluster 
> to be allocated
> 2018-07-18 16:07:17,720 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - Deploying cluster, 
> current state ACCEPTED
> 2018-07-18 16:07:23,527 INFO 
> org.apache.flink.yarn.AbstractYarnClusterDescriptor - YARN application has 
> been deployed successfully.
> Using the parallelism provided by the remote cluster (1). To use another 
> parallelism, set it at the ./bin/flink client.
> Starting execution of program
> 2018-07-18 16:07:23,551 INFO org.apache.flink.yarn.YarnClusterClient - 
> Starting program in interactive mode (detached: false)
> {code}
> Job Manager logs:
> {code:java}
> 2018-07-18 16:07:19,831 INFO 
> org.apache.flink.yarn.YarnApplicationMasterRunner - 
> --------------------------------------------------------------------------------
> 2018-07-18 16:07:19,833 INFO 
> org.apache.flink.yarn.YarnApplicationMasterRunner - Starting YARN 
> ApplicationMaster / ResourceManager / JobManager (Version: 1.5.1, 
> Rev:3488f8b, Date:10.07.2018 @ 11:51:27 GMT)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
