As per our design, once a job belonging to the handling of an incident has failed, the handling of that incident is aborted, and the incident is tagged and marked as failed. Implement this.
Signed-off-by: Klaus Aehlig <[email protected]> --- Makefile.am | 1 + src/Ganeti/MaintD/FailIncident.hs | 91 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 src/Ganeti/MaintD/FailIncident.hs diff --git a/Makefile.am b/Makefile.am index 06361b5..acb80c3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -982,6 +982,7 @@ HS_LIB_SRCS = \ src/Ganeti/MaintD/Balance.hs \ src/Ganeti/MaintD/CleanupIncidents.hs \ src/Ganeti/MaintD/CollectIncidents.hs \ + src/Ganeti/MaintD/FailIncident.hs \ src/Ganeti/MaintD/HandleIncidents.hs \ src/Ganeti/MaintD/MemoryState.hs \ src/Ganeti/MaintD/Server.hs \ diff --git a/src/Ganeti/MaintD/FailIncident.hs b/src/Ganeti/MaintD/FailIncident.hs new file mode 100644 index 0000000..c2d9db4 --- /dev/null +++ b/src/Ganeti/MaintD/FailIncident.hs @@ -0,0 +1,91 @@ +{-| Incident failing in the maintenance daemon + +This module implements the treatment of an incident, once +a job failed. + +-} + +{- + +Copyright (C) 2015 Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.FailIncident
+  ( failIncident
+  ) where
+
+import Control.Exception.Lifted (bracket)
+import Control.Lens.Setter (over)
+import Control.Monad (liftM, when)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+import System.IO.Error (tryIOError)
+
+import Ganeti.BasicTypes (ResultT, mkResultT, GenericResult(..))
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.Objects.Lens (incidentJobsL)
+import Ganeti.Objects.Maintenance (Incident(..), RepairStatus(..))
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, fromJobId, TagKind(..))
+
+-- | Record the failure of a single incident.
+--
+-- A job is submitted that sets the tag @maintd:repairfailed:\<uuid\>@ on
+-- the node of the incident. Afterwards the incident is stored back into
+-- the memory state with repair status 'RSFailed', with that tag, and with
+-- the ids of the tagging job appended to its job list. If the luxi client
+-- cannot be obtained or the tagging step yields an error, the computation
+-- stops in 'ResultT' before the memory state is touched.
+markAsFailed :: IORef MemoryState -> Incident -> ResultT String IO ()
+markAsFailed memstate incident = do
+  logInfo $ "Marking incident " ++ uuid ++ " as failed"
+  now <- liftIO currentTimestamp
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  -- 'bracket' makes sure the client is closed again once the job is done.
+  tagJids <- bracket (openClient luxiSocket)
+                     (liftIO . L.closeClient)
+                     (mkResultT . execJobsWaitOkJid [[ tagOp now ]])
+  liftIO . updateIncident memstate . over incidentJobsL (++ tagJids)
+    $ incident { incidentRepairStatus = RSFailed
+               , incidentTag = failureTag
+               }
+  where
+    uuid = incidentUuid incident
+    failureTag = "maintd:repairfailed:" ++ uuid
+    -- Obtain a luxi client, reporting an IO error as a 'Bad' result.
+    openClient path = mkResultT . liftM (either (Bad . show) Ok)
+                      . tryIOError $ L.getLuxiClient path
+    -- The annotated opcode setting the failure tag on the incident's node.
+    tagOp timestamp = annotateOpCode "marking incident handling as failed"
+                        timestamp
+                      . OpTagsSet TagKindNode [ failureTag ]
+                      . Just $ incidentNode incident
+
+-- | Treat the failure of the given job: every incident that lists the job
+-- among its jobs is tagged and marked as failed. If no incident lists the
+-- job, this is only logged.
+failIncident :: IORef MemoryState -> JobId -> ResultT String IO ()
+failIncident memstate jid = do
+  affected <- liftM (filter ownsJob) $ getIncidents memstate
+  if null affected
+    then logInfo $ "Job " ++ show (fromJobId jid)
+                   ++ " does not belong to an incident"
+    else mapM_ (markAsFailed memstate) affected
+  where
+    ownsJob incident = jid `elem` incidentJobs incident
--
2.5.0.457.gab17608
