Repository: spark
Updated Branches:
  refs/heads/branch-1.6 179f6e323 -> 5e9cefc8c


[SPARK-13642][YARN][1.6-BACKPORT] Properly handle signal kill in 
ApplicationMaster

## What changes were proposed in this pull request?

This patch fixes the race condition in ApplicationMaster when receiving a 
signal. In the current implementation, if a signal is received and no 
exception is thrown, the application is finished with a successful state in 
YARN, and no further attempt is made. In fact the application was killed by a 
signal at runtime, so another attempt is expected.

This patch adds a signal handler: if a signal is received, the application is 
marked as finished with failure rather than success.

## How was this patch tested?

This patch was tested in the following situations:

Application is finished normally.
Application is finished by calling System.exit(n).
Application is killed by yarn command.
ApplicationMaster is killed by "SIGTERM" sent by the kill pid command.
ApplicationMaster is killed by NM with "SIGTERM" in case of NM failure.

Author: jerryshao <ss...@hortonworks.com>

Closes #11690 from jerryshao/SPARK-13642-1.6-backport.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e9cefc8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e9cefc8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e9cefc8

Branch: refs/heads/branch-1.6
Commit: 5e9cefc8ccfaa0ef0bb0f2052f9aa755197b0184
Parents: 179f6e3
Author: jerryshao <ss...@hortonworks.com>
Authored: Wed Mar 23 09:14:29 2016 -0500
Committer: Tom Graves <tgra...@yahoo-inc.com>
Committed: Wed Mar 23 09:14:29 2016 -0500

----------------------------------------------------------------------
 .../spark/deploy/yarn/ApplicationMaster.scala   | 21 ++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5e9cefc8/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
----------------------------------------------------------------------
diff --git 
a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala 
b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 50ae7ff..d723586 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -17,17 +17,19 @@
 
 package org.apache.spark.deploy.yarn
 
-import scala.util.control.NonFatal
-
 import java.io.{File, IOException}
 import java.lang.reflect.InvocationTargetException
 import java.net.{Socket, URL}
 import java.util.concurrent.atomic.AtomicReference
 
+import scala.util.control.NonFatal
+
+import org.apache.commons.lang3.SystemUtils
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.yarn.api._
 import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.conf.YarnConfiguration
+import sun.misc.{Signal, SignalHandler}
 
 import org.apache.spark.rpc._
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, 
SparkEnv,
@@ -117,6 +119,20 @@ private[spark] class ApplicationMaster(
 
   private var delegationTokenRenewerOption: Option[AMDelegationTokenRenewer] = 
None
 
+  if (SystemUtils.IS_OS_UNIX) {
+    // Register signal handler for signal "TERM", "INT" and "HUP". For the 
cases where AM receive a
+    // signal and stop, from RM's aspect this application needs to be 
reattempted, rather than mark
+    // as success.
+    class AMSignalHandler(name: String) extends SignalHandler {
+      val prevHandler = Signal.handle(new Signal(name), this)
+      override def handle(sig: Signal): Unit = {
+        finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_SIGNAL)
+        prevHandler.handle(sig)
+      }
+    }
+    Seq("TERM", "INT", "HUP").foreach { sig => new AMSignalHandler(sig) }
+  }
+
   final def run(): Int = {
     try {
       val appAttemptId = client.getAttemptId()
@@ -642,6 +658,7 @@ object ApplicationMaster extends Logging {
   private val EXIT_SC_NOT_INITED = 13
   private val EXIT_SECURITY = 14
   private val EXIT_EXCEPTION_USER_CLASS = 15
+  private val EXIT_SIGNAL = 16
 
   private var master: ApplicationMaster = _
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to