Apparently due to some library bug, forking sometimes fails: the new process is running, but it doesn't start executing. Therefore, we retry the attempt several times.
Signed-off-by: Petr Pudlak <[email protected]> --- I tested the patch with simulated fork failures, works as expected. src/Ganeti/Query/Exec.hs | 93 ++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/src/Ganeti/Query/Exec.hs b/src/Ganeti/Query/Exec.hs index 57045ce..23b4ea7 100644 --- a/src/Ganeti/Query/Exec.hs +++ b/src/Ganeti/Query/Exec.hs @@ -52,9 +52,10 @@ module Ganeti.Query.Exec ) where import Control.Concurrent -import Control.Exception.Lifted (finally) +import Control.Exception.Lifted (onException) import Control.Monad import Control.Monad.Error +import Control.Monad.Trans.Maybe () import Data.Functor import qualified Data.Map as M import Data.Maybe (listToMaybe, mapMaybe) @@ -63,6 +64,7 @@ import System.Environment import System.IO.Error (tryIOError) import System.Posix.Process import System.Posix.IO +import System.Posix.Signals (sigTERM, signalProcess) import System.Posix.Types (Fd, ProcessID) import System.Time import Text.Printf @@ -74,6 +76,7 @@ import qualified Ganeti.Path as P import Ganeti.Types import Ganeti.UDSServer import Ganeti.Utils +import Ganeti.Utils.MonadPlus isForkSupported :: IO Bool isForkSupported = return $ not rtsSupportsBoundThreads @@ -168,7 +171,7 @@ forkWithPipe conf childAction = do -- | Forks the job process and starts processing of the given job. -- Returns the livelock of the job and its process ID. 
-forkJobProcess :: (Error e) +forkJobProcess :: (Error e, Show e) => JobId -- ^ a job to process -> FilePath -- ^ the daemons own livelock file -> (FilePath -> ResultT e IO ()) @@ -179,39 +182,53 @@ forkJobProcess jid luxiLivelock update = do logDebug $ "Setting the lockfile temporarily to " ++ luxiLivelock update luxiLivelock - (pid, master) <- liftIO $ forkWithPipe connectConfig (runJobProcess jid) - - let logChildStatus = do - logDebug $ "Getting the status of job process " - ++ show (fromJobId jid) - status <- liftIO $ getProcessStatus False True pid - logDebug $ "Child process (job " ++ show (fromJobId jid) - ++ ") status: " ++ maybe "running" show status - - flip finally logChildStatus $ do - update luxiLivelock - - let recv = liftIO $ recvMsg master - send = liftIO . sendMsg master - logDebug "Getting the lockfile of the client" - lockfile <- recv - logDebug $ "Setting the lockfile to the final " ++ lockfile - update lockfile - logDebug "Confirming the client it can start" - send "" - - -- from now on, we communicate with the job's Python process - - logDebug "Waiting for the job to ask for the job id" - _ <- recv - logDebug "Writing job id to the client" - send . show $ fromJobId jid - - logDebug "Waiting for the job to ask for the lock file name" - _ <- recv - logDebug "Writing the lock file name to the client" - send lockfile - - logDebug "Closing the pipe to the client" - liftIO $ closeClient master - return (lockfile, pid) + -- Due to some bug in GHC forking process, we want to retry, + -- if the forked process fails to start to communicate. + -- If it fails later on, the failure is handled by 'ResultT' + -- and no retry is performed. 
+ resultOpt <- retryMaybeN 3 $ \_ -> do + (pid, master) <- liftIO $ forkWithPipe connectConfig (runJobProcess jid) + + let onError = do + logDebug "Closing the pipe to the client" + withErrorLogAt WARNING "Closing the communication pipe failed" + (liftIO (closeClient master)) `mplus` return () + logDebug $ "Getting the status of job process " + ++ show (fromJobId jid) + status <- liftIO $ getProcessStatus False True pid + case status of + Just s -> logDebug $ "Child process (job " ++ show (fromJobId jid) + ++ ") status: " ++ show s + Nothing -> do + logDebug $ "Child process (job " ++ show (fromJobId jid) + ++ ") running, killing by SIGTERM" + liftIO $ signalProcess sigTERM pid + + flip onException onError $ do + let recv = liftIO $ recvMsg master + send = liftIO . sendMsg master + logDebug "Getting the lockfile of the client" + -- If we fail to receive a message from the client, fail the MaybeT + -- computation here using `mzero` to retry. + lockfile <- recv `orElse` mzero + + logDebug $ "Setting the lockfile to the final " ++ lockfile + lift $ update lockfile + logDebug "Confirming the client it can start" + send "" + + -- from now on, we communicate with the job's Python process + + logDebug "Waiting for the job to ask for the job id" + _ <- recv + logDebug "Writing job id to the client" + send . show $ fromJobId jid + + logDebug "Waiting for the job to ask for the lock file name" + _ <- recv + logDebug "Writing the lock file name to the client" + send lockfile + + return (lockfile, pid) + + maybe (failError "The client process timed out repeatedly") return resultOpt -- 1.9.1.423.g4596e3a
