Github user uce commented on a diff in the pull request: https://github.com/apache/flink/pull/2652#discussion_r83841451 --- Diff: flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java --- @@ -1251,33 +1299,124 @@ public void run() { catch (InterruptedException e) { // we can ignore this } + } + catch (Throwable t) { + logger.error("Error in the task canceler", t); + } + } + } + + /** + * Watchdog for the cancellation. If the task is stuck in cancellation, + * we notify the task manager about a fatal error. + */ + private static class TaskCancellationWatchDog extends TimerTask { + + /** + * Pass logger in order to prevent that the compiler needs to inject static bridge methods + * to access it. + */ + private final Logger logger; + + /** Thread executing the Task. */ + private final Thread executor; + + /** Interrupt interval. */ + private final long interruptInterval; + + /** Timeout after which a fatal error notification happens. */ + private final long interruptTimeout; + + /** TaskManager to notify about a timeout */ + private final TaskManagerConnection taskManager; + + /** Task name (for logging and error messages). */ + private final String taskName; + + /** Synchronization with the {@link TaskCanceler} thread. */ + private final CountDownLatch taskCancellerLatch; + + public TaskCancellationWatchDog( + Logger logger, + Thread executor, + long interruptInterval, + long interruptTimeout, + TaskManagerConnection taskManager, + String taskName, + CountDownLatch taskCancellerLatch) { + + this.logger = checkNotNull(logger); + this.executor = checkNotNull(executor); + this.interruptInterval = checkNotNull(interruptInterval); + this.interruptTimeout = checkNotNull(interruptTimeout); + this.taskManager = checkNotNull(taskManager); + this.taskName = checkNotNull(taskName); + this.taskCancellerLatch = checkNotNull(taskCancellerLatch); + } + + @Override + public void run() { + try { + // Synchronize with task canceler + taskCancellerLatch.await(); + } catch (Exception e) { + String msg = String.format("Exception while waiting on task " + + "canceller to cancel task '%s'.", taskName); + taskManager.notifyFatalError(msg, e); + return; + } + + long intervalNanos = TimeUnit.NANOSECONDS.convert(interruptInterval, TimeUnit.MILLISECONDS); + long timeoutNanos = TimeUnit.NANOSECONDS.convert(interruptTimeout, TimeUnit.MILLISECONDS); + long deadline = System.nanoTime() + timeoutNanos; + + try { + // Initial wait before interrupting periodically + Thread.sleep(interruptInterval); + } catch (InterruptedException ignored) { + } + + // It is possible that the user code does not react to the task canceller. + // for that reason, we spawn this separate thread that repeatedly interrupts + // the user code until it exits. If the suer user code does not exit within + // the timeout, we notify the job manager about a fatal error. + while (executor.isAlive()) { + long now = System.nanoTime(); + + // build the stack trace of where the thread is stuck, for the log + StringBuilder bld = new StringBuilder(); + StackTraceElement[] stack = executor.getStackTrace(); + for (StackTraceElement e : stack) { + bld.append(e).append('\n'); + } - // it is possible that the user code does not react immediately. for that - // reason, we spawn a separate thread that repeatedly interrupts the user code until - // it exits - while (executer.isAlive()) { - // build the stack trace of where the thread is stuck, for the log - StringBuilder bld = new StringBuilder(); - StackTraceElement[] stack = executer.getStackTrace(); - for (StackTraceElement e : stack) { - bld.append(e).append('\n'); - } + if (now >= deadline) { + long duration = TimeUnit.SECONDS.convert(interruptInterval, TimeUnit.MILLISECONDS); + String msg = String.format("Task '%s' did not react to cancelling signal in " + + "the last %d seconds, but is stuck in method:\n %s", + taskName, + duration, + bld.toString()); + + taskManager.notifyFatalError(msg, null); + return; // done, don't forget to leave the loop + } else { logger.warn("Task '{}' did not react to cancelling signal, but is stuck in method:\n {}", taskName, bld.toString()); - executer.interrupt(); + executor.interrupt(); try { - executer.join(taskCancellationIntervalMillis); - } - catch (InterruptedException e) { - // we can ignore this + long timeLeftNanos = Math.min(intervalNanos, deadline - now - intervalNanos); --- End diff -- No, your suggestion is correct.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---