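Currently, the maintenance daemon just waits for its jobs to
finish before running the next maintenance round. However, if
one of the submitted jobs fails, the corresponding maintenance event
should be marked as failed. Do so now: after waiting, collect all jobs
whose final status is not JOB_STATUS_SUCCESS, log them, and call
failIncident for each of them.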

Signed-off-by: Klaus Aehlig <[email protected]>
---
 src/Ganeti/MaintD/Server.hs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs
index b22a06e..670d2bb 100644
--- a/src/Ganeti/MaintD/Server.hs
+++ b/src/Ganeti/MaintD/Server.hs
@@ -70,11 +70,12 @@ import Ganeti.MaintD.Autorepairs (harepTasks)
 import Ganeti.MaintD.Balance (balanceTask)
 import Ganeti.MaintD.CleanupIncidents (cleanupIncidents)
 import Ganeti.MaintD.CollectIncidents (collectIncidents)
+import Ganeti.MaintD.FailIncident (failIncident)
 import Ganeti.MaintD.HandleIncidents (handleIncidents)
 import Ganeti.MaintD.MemoryState
 import qualified Ganeti.Path as Path
 import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
-import Ganeti.Types (JobId(..))
+import Ganeti.Types (JobId(..), JobStatus(..))
 import Ganeti.Utils (threadDelaySeconds)
 import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404)
 import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay
@@ -127,10 +128,14 @@ maintenance memstate = do
   logDebug $ "Jobs submitted in the last round: "
              ++ show (map fromJobId oldjobs)
   luxiSocket <- liftIO Path.defaultQuerySocket
-  bracket (mkResultT . liftM (either (Bad . show) Ok)
-            . tryIOError $ L.getLuxiClient luxiSocket)
-          (liftIO . L.closeClient)
-          $ void . mkResultT . waitForJobs oldjobs
+  jobresults <- bracket (mkResultT . liftM (either (Bad . show) Ok)
+                         . tryIOError $ L.getLuxiClient luxiSocket)
+                  (liftIO . L.closeClient)
+                  $ mkResultT . waitForJobs oldjobs
+  let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults
+  unless (null failedjobs) $ do
+    logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs
+    mapM_ (failIncident memstate) failedjobs
   unless (null oldjobs)
     . liftIO $ clearJobs memstate
   logDebug "New round of maintenance started"
-- 
2.5.0.457.gab17608

Reply via email to