Repository: spark Updated Branches: refs/heads/branch-1.6 179f6e323 -> 5e9cefc8c
[SPARK-13642][YARN][1.6-BACKPORT] Properly handle signal kill in ApplicationMaster ## What changes were proposed in this pull request? This patch fixes a race condition in ApplicationMaster when it receives a signal. In the current implementation, if a signal is received and no exception is thrown, the application finishes with a successful state in YARN, and no further attempt is made. In fact the application was killed by a signal at runtime, so another attempt is expected. This patch adds a signal handler: if a signal is received, the application is marked as finished with failure rather than success. ## How was this patch tested? This patch was tested in the following situations: Application finishes normally. Application finishes by calling System.exit(n). Application is killed by the yarn command. ApplicationMaster is killed by "SIGTERM" sent by the kill pid command. ApplicationMaster is killed by the NM with "SIGTERM" in case of NM failure. Author: jerryshao <ss...@hortonworks.com> Closes #11690 from jerryshao/SPARK-13642-1.6-backport. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e9cefc8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e9cefc8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e9cefc8 Branch: refs/heads/branch-1.6 Commit: 5e9cefc8ccfaa0ef0bb0f2052f9aa755197b0184 Parents: 179f6e3 Author: jerryshao <ss...@hortonworks.com> Authored: Wed Mar 23 09:14:29 2016 -0500 Committer: Tom Graves <tgra...@yahoo-inc.com> Committed: Wed Mar 23 09:14:29 2016 -0500 ---------------------------------------------------------------------- .../spark/deploy/yarn/ApplicationMaster.scala | 21 ++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/5e9cefc8/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala ---------------------------------------------------------------------- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 50ae7ff..d723586 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -17,17 +17,19 @@ package org.apache.spark.deploy.yarn -import scala.util.control.NonFatal - import java.io.{File, IOException} import java.lang.reflect.InvocationTargetException import java.net.{Socket, URL} import java.util.concurrent.atomic.AtomicReference +import scala.util.control.NonFatal + +import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.yarn.api._ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration +import sun.misc.{Signal, SignalHandler} import org.apache.spark.rpc._ import org.apache.spark.{Logging, SecurityManager, SparkConf, 
SparkContext, SparkEnv, @@ -117,6 +119,20 @@ private[spark] class ApplicationMaster( private var delegationTokenRenewerOption: Option[AMDelegationTokenRenewer] = None + if (SystemUtils.IS_OS_UNIX) { + // Register signal handler for signal "TERM", "INT" and "HUP". For the cases where AM receive a + // signal and stop, from RM's aspect this application needs to be reattempted, rather than mark + // as success. + class AMSignalHandler(name: String) extends SignalHandler { + val prevHandler = Signal.handle(new Signal(name), this) + override def handle(sig: Signal): Unit = { + finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_SIGNAL) + prevHandler.handle(sig) + } + } + Seq("TERM", "INT", "HUP").foreach { sig => new AMSignalHandler(sig) } + } + final def run(): Int = { try { val appAttemptId = client.getAttemptId() @@ -642,6 +658,7 @@ object ApplicationMaster extends Logging { private val EXIT_SC_NOT_INITED = 13 private val EXIT_SECURITY = 14 private val EXIT_EXCEPTION_USER_CLASS = 15 + private val EXIT_SIGNAL = 16 private var master: ApplicationMaster = _ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org