[ https://issues.apache.org/jira/browse/DRILL-6039?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16703751#comment-16703751 ]
ASF GitHub Bot commented on DRILL-6039: --------------------------------------- asfgit closed pull request #1536: DRILL-6039: Fixed drillbit.sh script to do graceful shutdown URL: https://github.com/apache/drill/pull/1536 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/distribution/src/resources/drill-config.sh b/distribution/src/resources/drill-config.sh index d23788b006a..a4686c50354 100644 --- a/distribution/src/resources/drill-config.sh +++ b/distribution/src/resources/drill-config.sh @@ -334,6 +334,7 @@ fi # provided in drill-env.sh. export DRILL_PID_DIR=${DRILL_PID_DIR:-$DRILL_HOME} +export GRACEFUL_SIGFILE=${GRACEFUL_SIGFILE:-"graceful"} # Prepare log file prefix and the main Drillbit log file. diff --git a/distribution/src/resources/drillbit.sh b/distribution/src/resources/drillbit.sh index 88d56c8a14f..5ad87b15cdb 100755 --- a/distribution/src/resources/drillbit.sh +++ b/distribution/src/resources/drillbit.sh @@ -87,6 +87,7 @@ export args # Set default scheduling priority DRILL_NICENESS=${DRILL_NICENESS:-0} +GRACEFUL_FILE=$DRILL_PID_DIR/$GRACEFUL_SIGFILE waitForProcessEnd() { @@ -94,11 +95,19 @@ waitForProcessEnd() commandName=$2 kill_drillbit=$3 processedAt=`date +%s` + triggered_shutdown=false origcnt=${DRILL_STOP_TIMEOUT:-120} while kill -0 $pidKilled > /dev/null 2>&1; do echo -n "." sleep 1; + #Incase of graceful shutdown, create graceful file and wait till the process ends. + if [ "$kill_drillbit" = false ]; then + if [ "$triggered_shutdown" = false ]; then + touch $GRACEFUL_FILE + triggered_shutdown=true + fi + fi if [ "$kill_drillbit" = true ] ; then # if process persists more than $DRILL_STOP_TIMEOUT (default 120 sec) no mercy if [ $(( `date +%s` - $processedAt )) -gt $origcnt ]; then @@ -125,6 +134,15 @@ check_before_start() exit 1 fi fi + #remove any previous uncleaned graceful file + if [ -f "$GRACEFUL_FILE" ]; then + rm $GRACEFUL_FILE + rm_status=$? + if [ $rm_status -ne 0 ];then + echo "Error: Failed to remove $GRACEFUL_FILE!" + exit $rm_status + fi + fi } check_after_start(){ @@ -204,7 +222,9 @@ stop_bit ( ) if kill -0 $pidToKill > /dev/null 2>&1; then echo "Stopping $command" echo "`date` Terminating $command pid $pidToKill" >> "$DRILLBIT_LOG_PATH" - kill $pidToKill > /dev/null 2>&1 + if [ $kill_drillbit = true ]; then + kill $pidToKill > /dev/null 2>&1 + fi waitForProcessEnd $pidToKill $command $kill_drillbit retval=0 else diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/server/Drillbit.java b/exec/java-exec/src/main/java/org/apache/drill/exec/server/Drillbit.java index dd1c5f19faf..a0c63ab6dcc 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/server/Drillbit.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/server/Drillbit.java @@ -17,6 +17,13 @@ */ package org.apache.drill.exec.server; +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Path; +import java.nio.file.StandardWatchEventKinds; +import java.nio.file.WatchEvent; +import java.nio.file.WatchKey; +import java.nio.file.WatchService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -90,6 +97,8 @@ private DrillbitStateManager stateManager; private boolean quiescentMode; private boolean forcefulShutdown = false; + GracefulShutdownThread gracefulShutdownThread; + private boolean interruptPollShutdown = true; public void setQuiescentMode(boolean quiescentMode) { this.quiescentMode = quiescentMode; @@ -212,6 +221,8 @@ public void run() throws Exception { drillbitContext.startRM(); Runtime.getRuntime().addShutdownHook(new ShutdownThread(this, new StackTrace())); + gracefulShutdownThread = new GracefulShutdownThread(this, new StackTrace()); + gracefulShutdownThread.start(); logger.info("Startup completed ({} ms).", w.elapsed(TimeUnit.MILLISECONDS)); } @@ -291,6 +302,11 @@ public synchronized void close() { logger.info("Shutdown completed ({} ms).", w.elapsed(TimeUnit.MILLISECONDS) ); stateManager.setState(DrillbitState.SHUTDOWN); + // Interrupt GracefulShutdownThread since Drillbit close is not called from it. + if (interruptPollShutdown) { + gracefulShutdownThread.interrupt(); + } + } private void javaPropertiesToSystemOptions() { @@ -335,6 +351,55 @@ private void javaPropertiesToSystemOptions() { } } + + // Polls for graceful file to check if graceful shutdown is triggered from the script. + private static class GracefulShutdownThread extends Thread { + + private final Drillbit drillbit; + private final StackTrace stackTrace; + public GracefulShutdownThread(final Drillbit drillbit, final StackTrace stackTrace) { + this.drillbit = drillbit; + this.stackTrace = stackTrace; + } + + @Override + public void run () { + try { + pollShutdown(drillbit); + } catch (InterruptedException e) { + logger.debug("Interrupted GracefulShutdownThread"); + } catch (IOException e) { + throw new RuntimeException("Caught exception while polling for gracefulshutdown\n" + stackTrace, e); + } + } + + private void pollShutdown(Drillbit drillbit) throws IOException, InterruptedException { + final Path drillPidDirPath = FileSystems.getDefault().getPath(System.getenv("DRILL_PID_DIR")); + final String gracefulFileName = System.getenv("GRACEFUL_SIGFILE"); + boolean triggered_shutdown = false; + WatchKey wk = null; + try (final WatchService watchService = FileSystems.getDefault().newWatchService()) { + drillPidDirPath.register(watchService, StandardWatchEventKinds.ENTRY_MODIFY, StandardWatchEventKinds.ENTRY_CREATE); + while (!triggered_shutdown) { + wk = watchService.take(); + for (WatchEvent<?> event : wk.pollEvents()) { + final Path changed = (Path) event.context(); + if (changed != null && changed.endsWith(gracefulFileName)) { + drillbit.interruptPollShutdown = false; + triggered_shutdown = true; + drillbit.close(); + break; + } + } + } + } finally { + if (wk != null) { + wk.cancel(); + } + } + } + } + /** * Shutdown hook for Drillbit. Closes the drillbit, and reports on errors that * occur during closure, as well as the location the drillbit was started from. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > drillbit.sh graceful_stop does not wait for fragments to complete before > stopping the drillbit > ---------------------------------------------------------------------------------------------- > > Key: DRILL-6039 > URL: https://issues.apache.org/jira/browse/DRILL-6039 > Project: Apache Drill > Issue Type: Bug > Components: Execution - Flow > Affects Versions: 1.3.0 > Reporter: Krystal > Assignee: Venkata Jyothsna Donapati > Priority: Major > Labels: ready-to-commit > Fix For: 1.15.0 > > > git.commit.id.abbrev=eb0c403 > I have 3-nodes cluster with drillbits running on each node. I kicked off a > long running query. In the middle of the query, I did a "./drillbit.sh > graceful_stop" on one of the non-foreman node. The node was stopped within a > few seconds and the query failed with error: > Error: SYSTEM ERROR: IOException: Filesystem closed > Fragment 4:15 -- This message was sent by Atlassian JIRA (v7.6.3#76005)