Github user BryanCutler commented on a diff in the pull request: https://github.com/apache/spark/pull/11746#discussion_r59657657 --- Diff: core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala --- @@ -68,56 +70,72 @@ private[deploy] class DriverRunner( private var clock: Clock = new SystemClock() private var sleeper = new Sleeper { - def sleep(seconds: Int): Unit = (0 until seconds).takeWhile(f => {Thread.sleep(1000); !killed}) + def sleep(seconds: Int): Unit = Thread.sleep(seconds * 1000) } /** Starts a thread to run and manage the driver. */ private[worker] def start() = { - new Thread("DriverRunner for " + driverId) { + workerThread = new Thread("DriverRunner for " + driverId) { override def run() { + var shutdownHook: AnyRef = null try { - val driverDir = createWorkingDirectory() - val localJarFilename = downloadUserJar(driverDir) - - def substituteVariables(argument: String): String = argument match { - case "{{WORKER_URL}}" => workerUrl - case "{{USER_JAR}}" => localJarFilename - case other => other + shutdownHook = ShutdownHookManager.addShutdownHook { () => + logInfo(s"Worker shutting down, killing driver $driverId") + kill() } - // TODO: If we add ability to submit multiple jars they should also be added here - val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager, - driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables) - launchDriver(builder, driverDir, driverDesc.supervise) + // prepare driver jars, launch driver and set final state from process exit code + val exitCode = prepareAndLaunchDriver() + finalState = if (exitCode == 0) Some(DriverState.FINISHED) else Some(DriverState.FAILED) } catch { - case e: Exception => finalException = Some(e) + case interrupted: InterruptedException => + logInfo("Runner thread for driver " + driverId + " interrupted") + killProcessAndFinalize(DriverState.KILLED, interrupted) + case e: Exception => + killProcessAndFinalize(DriverState.ERROR, e) + } + finally { + if 
(shutdownHook != null) ShutdownHookManager.removeShutdownHook(shutdownHook) } - val state = - if (killed) { - DriverState.KILLED - } else if (finalException.isDefined) { - DriverState.ERROR - } else { - finalExitCode match { - case Some(0) => DriverState.FINISHED - case _ => DriverState.FAILED - } - } + // notify worker of final driver state, possible exception + worker.send(DriverStateChanged(driverId, finalState.get, finalException)) + } + // kill the process if started, set shared finalizing variables + def killProcessAndFinalize(state: DriverState.DriverState, e: Exception): Unit = { + killProcess() finalState = Some(state) + finalException = Some(e) + } + } + + workerThread.start() + } - worker.send(DriverStateChanged(driverId, state, finalException)) + /** Kill driver process and wait for it to exit. */ + private def killProcess(): Unit = { + if (process != null) { + logInfo("Killing driver process!") + val exitCode = Utils.terminateProcess(process, DRIVER_TERMINATE_TIMEOUT_MS) + if (exitCode.isEmpty) { + logWarning("Failed to terminate driver process: " + process + + ". This process will likely be orphaned.") } - }.start() + } } - /** Terminate this driver (or prevent it from ever starting if not yet started) */ - private[worker] def kill() { - synchronized { - process.foreach(p => p.destroy()) - killed = true + /** Stop this driver, including the process it launched */ + private[worker] def kill(): Unit = { + if (workerThread != null) { + // make sure process does not start if being interrupted + this.synchronized { + // the workerThread will kill the child process when interrupted + workerThread.interrupt() + workerThread.join() --- End diff -- > This doesn't fix the race condition. Image the following execution order: That order can't happen because the ShutdownHook isn't added until the thread is started, so `workerThread` will be assigned at that point.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it enabled, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org