On Thu, Jul 09, 2015 at 04:11:36PM +0200, 'Klaus Aehlig' via ganeti-devel wrote:
As per our design, this task will get a restriction on
which nodes it may operate and will return an updated
list of nodes not affected and the list of job ids it
submitted.
Signed-off-by: Klaus Aehlig <[email protected]>
---
Makefile.am | 1 +
src/Ganeti/MaintD/Autorepairs.hs | 242 +++++++++++++++++++++++++++++++++++++++
2 files changed, 243 insertions(+)
create mode 100644 src/Ganeti/MaintD/Autorepairs.hs
diff --git a/Makefile.am b/Makefile.am
index 2f70784..3cfdb46 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -973,6 +973,7 @@ HS_LIB_SRCS = \
src/Ganeti/Logging/Lifted.hs \
src/Ganeti/Logging/WriterLog.hs \
src/Ganeti/Luxi.hs \
+ src/Ganeti/MaintD/Autorepairs.hs \
src/Ganeti/MaintD/Server.hs \
src/Ganeti/MaintD/Utils.hs \
src/Ganeti/Network.hs \
diff --git a/src/Ganeti/MaintD/Autorepairs.hs b/src/Ganeti/MaintD/Autorepairs.hs
new file mode 100644
index 0000000..b14f182
--- /dev/null
+++ b/src/Ganeti/MaintD/Autorepairs.hs
@@ -0,0 +1,242 @@
+{-| Auto-repair task of the maintenance daemon.
+
+This module implements the non-pure parts of harep-style
+repairs carried out by the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Autorepairs
+ ( harepTasks
+ ) where
+
+import Control.Arrow (second, (***))
+import Control.Monad (forM)
+import Control.Exception (bracket)
+import Data.Maybe (isJust, fromJust)
+import qualified Data.Set as Set
+import System.IO.Error (tryIOError)
+import System.Time (getClockTime)
+
+import Ganeti.BasicTypes
+import Ganeti.Errors (formatError)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.HTools.Repair
+import Ganeti.HTools.Types
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid, submitJobs)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, JobStatus(..), TagKind(..), mkNonNegative)
+import Ganeti.Utils (newUUID)
+
+-- | Emit a notice-level log message and hand the same text back
+-- wrapped in a 'Bad' result, so callers can log and fail in one step.
+logAndBad :: String -> IO (Result a)
+logAndBad msg = logNotice msg >> return (Bad msg)
+
+-- | Apply and remove tags from an instance as indicated by its
+-- `InstanceData`; returns the updated instance data (with the
+-- pending tag removals cleared) and the ids of the tagging jobs
+-- that were submitted.
+commitChange :: L.Client
+             -> InstanceData
+             -> ResultT String IO (InstanceData, [JobId])
+commitChange client instData = do
+  now <- liftIO currentTimestamp
+  let iname = Instance.name $ arInstance instData
+      staleTags = tagsToRemove instData
+  addJobs <- case getArData (arState instData) of
+    Nothing -> return []
+    Just arData -> do
+      let tag = arTag arData
+      logDebug $ "Adding tag " ++ tag ++ " to " ++ iname
+      mkResultT $ execJobsWaitOkJid
+                    [[ annotateOpCode "harep state tagging" now
+                       . OpTagsSet TagKindInstance [tag]
+                       $ Just iname ]]
+                    client
+  rmJobs <- case staleTags of
+    [] -> return []
+    _ -> do
+      logDebug $ "Removing tags " ++ show staleTags ++ " from " ++ iname
+      mkResultT $ execJobsWaitOkJid
+                    [[ annotateOpCode "harep state tag removal" now
+                       . OpTagsDel TagKindInstance staleTags
+                       $ Just iname ]]
+                    client
+  return (instData { tagsToRemove = [] }, addJobs ++ rmJobs)
+
+-- | Query the jobs of a pending repair and return the new instance data.
+-- While any job is still running nothing changes; once all jobs have
+-- finished, the instance moves to 'ArHealthy' (all succeeded) or
+-- 'ArFailedRepair' (otherwise) and the tag changes are committed.
+processPending :: L.Client
+               -> InstanceData
+               -> IO (Result (InstanceData, [JobId]))
+processPending client instData = runResultT $ case arState instData of
+  (ArPendingRepair arData) -> do
+    sts <- liftIO . L.queryJobsStatus client $ arJobs arData
+    time <- liftIO getClockTime
+    case sts of
+      Bad e -> mkResultT . logAndBad
+                 $ "Could not check job status: " ++ formatError e
+      Ok sts' ->
+        if any (<= JOB_STATUS_RUNNING) sts' then
+          -- Some repair jobs are still running; look again next round.
+          return (instData, [])
+        else do
+          let iname = Instance.name $ arInstance instData
+              srcSt = arStateName $ arState instData
+              arState' =
+                if all (== JOB_STATUS_SUCCESS) sts' then
+                  ArHealthy . Just
+                    . updateTag $ arData { arResult = Just ArSuccess
+                                         , arTime = time }
+                else
+                  ArFailedRepair . updateTag
+                    $ arData { arResult = Just ArFailure, arTime = time }
+              destSt = arStateName arState'
+              instData' = instData { arState = arState'
+                                   , tagsToRemove = delCurTag instData
+                                   }
+          -- Fixed typo: the message previously read "form" for "from".
+          logInfo $ "Moving " ++ iname ++ " from " ++ show srcSt ++ " to "
+                    ++ show destSt
+          commitChange client instData'
+  _ -> return (instData, [])
+
+-- | Perform the suggested repair on an instance if its policy allows it.
+-- Returns the indices of the nodes touched by the repair together with
+-- the list of submitted job ids (repair jobs plus tagging jobs).
+doRepair :: L.Client
+         -> InstanceData
+         -> (AutoRepairType, [OpCode])
+         -> IO (Result ([Idx], [JobId]))
+doRepair client instData (rtype, opcodes) = runResultT $ do
+  let inst = arInstance instData
+      ipol = Instance.arPolicy inst
+      iname = Instance.name inst
+  case ipol of
+    ArEnabled maxtype -> do
+      uuid <- liftIO newUUID
+      time <- liftIO getClockTime
+      if rtype > maxtype then do
+        -- The needed repair exceeds what the policy allows; only record
+        -- the fact via a tag, without submitting any repair jobs.
+        let arState' = ArNeedsRepair (
+              updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
+            instData' = instData { arState = arState'
+                                 , tagsToRemove = delCurTag instData
+                                 }
+        logInfo $ "Not performing repair of type " ++ show rtype ++ " on "
+                  ++ iname ++ " because only repairs up to " ++ show maxtype
+                  ++ " are allowed"
+        (_, jobs) <- commitChange client instData'
+        return ([], jobs)
+      else do
+        now <- liftIO currentTimestamp
+        logInfo $ "Executing " ++ show rtype ++ " repair on " ++ iname
+        -- As in harep, we delay the actual repair, to allow the tagging
+        -- to happen first; again this is only about speeding up the harep
+        -- round, not about correctness.
+        let opcodes' = OpTestDelay { opDelayDuration = 10
+                                   , opDelayOnMaster = True
+                                   , opDelayOnNodes = []
+                                   , opDelayOnNodeUuids = Nothing
+                                   , opDelayRepeat = fromJust $ mkNonNegative 0
+                                   , opDelayInterruptible = False
+                                   , opDelayNoLocks = False
+                                   } : opcodes
+        jids <- liftIO $ submitJobs
+                  [ map (annotateOpCode "harep-style repair" now) opcodes' ]
+                  client
+        case jids of
+          Bad e -> mkResultT . logAndBad
+                     $ "Failure submitting repair jobs: " ++ e
+          Ok jids' -> do
+            let arState' = ArPendingRepair (
+                  updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
+                instData' = instData { arState = arState'
+                                     , tagsToRemove = delCurTag instData
+                                     }
+            (_, tagjobs) <- commitChange client instData'
+            -- A negative node index marks "no such node" (e.g. no
+            -- secondary for non-DRBD instances); filter those out.
+            let nodes = filter (>= 0) [Instance.pNode inst, Instance.sNode inst]
+            return (nodes, jids' ++ tagjobs)
+    otherSt -> do
+      -- Fixed typos: "becuase" and the doubled "in in" (as flagged in review).
+      logDebug $ "Not repairing " ++ iname ++ " because it is in state "
+                 ++ show otherSt
+      return ([], [])
+
+-- | Harep-like repair tasks: first update the status of all pending
+-- repairs, then detect newly broken instances and submit repair jobs
+-- for those that are still in 'ArHealthy' state.
+harepTasks :: (Node.List, Instance.List) -- ^ Current cluster configuration
+           -> Set.Set Int -- ^ Node indices on which actions may be taken
+           -> ResultT String IO (Set.Set Int, [JobId])
+              -- ^ untouched nodes and jobs submitted
+harepTasks (nl, il) nidxs = do
+  logDebug $ "harep tasks on nodes " ++ show (Set.toList nidxs)
+  iniData <- mkResultT . return . mapM setInitialState $ Container.elems il
+
+  -- First step: check all pending repairs, see if they are completed.
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  either_iData <- liftIO . tryIOError
+                    . bracket (L.getLuxiClient luxiSocket) L.closeClient
+                    $ forM iniData . processPending
+  (iData', jobs) <- mkResultT $ case either_iData of
+    Left e -> logAndBad $ "Error while harep status update: " ++ show e
+    Right r ->
+      if any isBad r
+        then logAndBad $ "Bad harep processing pending: " ++ show (justBad r)
+        else return . Ok . second concat . unzip $ justOk r
+
+  -- Second step: detect any problems.
+  let repairs = map (detectBroken nl . arInstance) iData'
+
+  -- Third step: create repair jobs for broken instances that are in ArHealthy.
+  let repairIfHealthy c i = case arState i of
+        ArHealthy _ -> doRepair c i
+        _ -> const . return $ Ok ([], [])
+      maybeRepair c (i, r) = maybe (return $ Ok ([], []))
+                                   (repairIfHealthy c i) r
+  either_repairJobs <- liftIO . tryIOError
+                         . bracket (L.getLuxiClient luxiSocket) L.closeClient
+                         $ forM (zip iData' repairs) . maybeRepair
+
+  (ntouched, jobs') <- mkResultT $ case either_repairJobs of
+    -- Fixed: this string literal was broken across two lines by mail
+    -- wrapping, which would not even parse; rejoined into one literal.
+    Left e -> logAndBad $ "Error while attempting repair: " ++ show e
+    Right r ->
+      if any isBad r
+        then logAndBad $ "Error submitting repair jobs: " ++ show (justBad r)
+        else return . Ok . (concat *** concat) . unzip $ justOk r
+
+  return (nidxs Set.\\ Set.fromList ntouched, jobs ++ jobs')
--
2.4.3.573.g4eafbef
Rest LGTM, no need to resend