If luxi returned an error when getting statuses for any maintenance job that was submitted and has not yet completed, maintd would immediately fail, and could not recover from this condition on subsequent rounds of maintenance.
This could happen, if for example, ganeti was manually shut down while the job was running, and the job file was archived after ganeti restarted. Fix by treating missing/unparseable jobs as failed. Signed-off-by: Brian Foley <bpfo...@google.com> --- src/Ganeti/Jobs.hs | 2 +- src/Ganeti/MaintD/Server.hs | 24 +++++++++++++++++++++--- src/Ganeti/Utils.hs | 11 ++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/Ganeti/Jobs.hs b/src/Ganeti/Jobs.hs index 21b8815..e31d74e 100644 --- a/src/Ganeti/Jobs.hs +++ b/src/Ganeti/Jobs.hs @@ -152,7 +152,7 @@ execJobsWait opcodes callback client = do callback jids' waitForJobs jids' client --- | Wait for one job units it is finished, using the WaitForJobChange +-- | Wait for one job until it is finished, using the WaitForJobChange -- luxi command. Return the JobId and the and the final job status. waitForJob :: L.Client -> L.JobId -> ResultT String IO (L.JobId, JobStatus) waitForJob c jid = waitForJob' J.JSNull 0 where diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs index 670d2bb..d834c9b 100644 --- a/src/Ganeti/MaintD/Server.hs +++ b/src/Ganeti/MaintD/Server.hs @@ -56,7 +56,7 @@ import System.Time (getClockTime) import qualified Text.JSON as J import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT - , withErrorT, isBad) + , withErrorT, isBad, isOk) import qualified Ganeti.Constants as C import Ganeti.Daemon ( OptType, CheckFn, PrepFn, MainFn, oDebug , oNoVoting, oYesDoIt, oPort, oBindAddress, oNoDaemonize) @@ -76,7 +76,7 @@ import Ganeti.MaintD.MemoryState import qualified Ganeti.Path as Path import Ganeti.Runtime (GanetiDaemon(GanetiMaintd)) import Ganeti.Types (JobId(..), JobStatus(..)) -import Ganeti.Utils (threadDelaySeconds) +import Ganeti.Utils (threadDelaySeconds, partitionM) import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404) import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay , maintenanceBalancing) @@ -128,10 +128,28 @@ maintenance memstate = do logDebug $ "Jobs submitted in the last round: " ++ show (map fromJobId oldjobs) luxiSocket <- liftIO Path.defaultQuerySocket + + -- Filter out any jobs in the maintenance list which can't be parsed by luxi + -- anymore. This can happen if the job file is corrupted, missing or archived. + -- We have to query one job at a time, as luxi returns a single error if any + -- job in the query list can't be read/parsed. + (okjobs, badjobs) <- bracket + (mkResultT . liftM (either (Bad . show) Ok) + . tryIOError $ L.getLuxiClient luxiSocket) + (liftIO . L.closeClient) + $ mkResultT . liftM Ok + . (\c -> partitionM (\j -> liftM isOk $ L.queryJobsStatus c [j]) oldjobs) + + unless (null badjobs) $ do + logInfo . (++) "Unparsable jobs (marking as failed): " + . show $ map fromJobId badjobs + mapM_ (failIncident memstate) badjobs + jobresults <- bracket (mkResultT . liftM (either (Bad . show) Ok) . tryIOError $ L.getLuxiClient luxiSocket) (liftIO . L.closeClient) - $ mkResultT . waitForJobs oldjobs + $ mkResultT . (\c -> waitForJobs okjobs c) + let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults unless (null failedjobs) $ do logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs diff --git a/src/Ganeti/Utils.hs b/src/Ganeti/Utils.hs index fbe3a36..42a8db9 100644 --- a/src/Ganeti/Utils.hs +++ b/src/Ganeti/Utils.hs @@ -101,6 +101,7 @@ module Ganeti.Utils , threadDelaySeconds , monotoneFind , iterateJust + , partitionM ) where import Prelude () @@ -871,7 +872,15 @@ monotoneFind heuristics p xs = else monotoneFind heuristics p xs' _ -> Nothing --- | Iterate a funtion as long as it returns Just values, collecting +-- | Iterate a function as long as it returns Just values, collecting -- all the Justs that where obtained. iterateJust :: (a -> Maybe a) -> a -> [a] iterateJust f a = a : maybe [] (iterateJust f) (f a) + +-- | A version of partition with a monadic predicate +-- Implementation taken from David Fox's Extras package. +partitionM :: (Monad m) => (a -> m Bool) -> [a] -> m ([a], [a]) +partitionM p xs = foldM f ([], []) xs + where f (a, b) x = do + pv <- p x + return $ if pv then (x : a, b) else (a, x : b) -- 2.8.0.rc3.226.g39d4020