In this way, they can be reused by the maintenance daemon. Note that the parts of harep living in IO are very specific to the stand-alone--tool approach, where showing messages on stderr and dying on first error are OK.
Signed-off-by: Klaus Aehlig <[email protected]> --- Makefile.am | 1 + src/Ganeti/HTools/Program/Harep.hs | 247 +----------------------------- src/Ganeti/HTools/Repair.hs | 305 +++++++++++++++++++++++++++++++++++++ 3 files changed, 308 insertions(+), 245 deletions(-) create mode 100644 src/Ganeti/HTools/Repair.hs diff --git a/Makefile.am b/Makefile.am index 240628c..b60661e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -943,6 +943,7 @@ HS_LIB_SRCS = \ src/Ganeti/HTools/Program/Hsqueeze.hs \ src/Ganeti/HTools/Program/Hroller.hs \ src/Ganeti/HTools/Program/Main.hs \ + src/Ganeti/HTools/Repair.hs \ src/Ganeti/HTools/Tags.hs \ src/Ganeti/HTools/Tags/Constants.hs \ src/Ganeti/HTools/Types.hs \ diff --git a/src/Ganeti/HTools/Program/Harep.hs b/src/Ganeti/HTools/Program/Harep.hs index 54628b0..c81bef8 100644 --- a/src/Ganeti/HTools/Program/Harep.hs +++ b/src/Ganeti/HTools/Program/Harep.hs @@ -42,10 +42,7 @@ module Ganeti.HTools.Program.Harep import Control.Exception (bracket) import Control.Lens (over) import Control.Monad -import Data.Function -import Data.List import Data.Maybe -import Data.Ord import System.Time import qualified Data.Map as Map @@ -57,21 +54,18 @@ import Ganeti.JQueue.Objects (Timestamp) import Ganeti.Jobs import Ganeti.OpCodes import Ganeti.OpCodes.Lens (metaParamsL, opReasonL) -import Ganeti.OpParams import Ganeti.Types import Ganeti.Utils -import qualified Ganeti.Constants as C import qualified Ganeti.Luxi as L import qualified Ganeti.Path as Path import Ganeti.HTools.CLI +import qualified Ganeti.HTools.Container as Container import Ganeti.HTools.Loader import Ganeti.HTools.ExtLoader -import qualified Ganeti.HTools.Tags.Constants as Tags +import Ganeti.HTools.Repair import Ganeti.HTools.Types -import qualified Ganeti.HTools.Container as Container import qualified Ganeti.HTools.Instance as Instance -import qualified Ganeti.HTools.Node as Node import Ganeti.Version (version) @@ -99,135 +93,6 @@ annotateOpCode reason ts = . setOpComment ("automated repairs by harep " ++ version) . wrapOpCode -data InstanceData = InstanceData { arInstance :: Instance.Instance - , arState :: AutoRepairStatus - , tagsToRemove :: [String] - } - deriving (Eq, Show) - --- | Parse a tag into an 'AutoRepairData' record. --- --- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's --- malformed. -parseInitTag :: String -> Maybe AutoRepairData -parseInitTag tag = - let parsePending = do - subtag <- chompPrefix Tags.autoRepairTagPending tag - case sepSplit ':' subtag of - [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs - _ -> fail ("Invalid tag: " ++ show tag) - - parseResult = do - subtag <- chompPrefix Tags.autoRepairTagResult tag - case sepSplit ':' subtag of - [rtype, uuid, ts, result, jobs] -> do - arData <- makeArData rtype uuid ts jobs - result' <- autoRepairResultFromRaw result - return arData { arResult = Just result' } - _ -> fail ("Invalid tag: " ++ show tag) - - makeArData rtype uuid ts jobs = do - rtype' <- autoRepairTypeFromRaw rtype - ts' <- tryRead "auto-repair time" ts - jobs' <- mapM makeJobIdS $ sepSplit '+' jobs - return AutoRepairData { arType = rtype' - , arUuid = uuid - , arTime = TOD ts' 0 - , arJobs = jobs' - , arResult = Nothing - , arTag = tag - } - in - parsePending `mplus` parseResult - --- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type. -getArData :: AutoRepairStatus -> Maybe AutoRepairData -getArData status = - case status of - ArHealthy (Just d) -> Just d - ArFailedRepair d -> Just d - ArPendingRepair d -> Just d - ArNeedsRepair d -> Just d - _ -> Nothing - --- | Return a short name for each auto-repair status. --- --- This is a more concise representation of the status, because the default --- "Show" formatting includes all the accompanying auto-repair data. -arStateName :: AutoRepairStatus -> String -arStateName status = - case status of - ArHealthy _ -> "Healthy" - ArFailedRepair _ -> "Failure" - ArPendingRepair _ -> "Pending repair" - ArNeedsRepair _ -> "Needs repair" - --- | Return a new list of tags to remove that includes @arTag@ if present. -delCurTag :: InstanceData -> [String] -delCurTag instData = - let arData = getArData $ arState instData - rmTags = tagsToRemove instData - in - case arData of - Just d -> arTag d : rmTags - Nothing -> rmTags - --- | Set the initial auto-repair state of an instance from its auto-repair tags. --- --- The rules when there are multiple tags is: --- --- * the earliest failure result always wins --- --- * two or more pending repairs results in a fatal error --- --- * a pending result from id X and a success result from id Y result in error --- if Y is newer than X --- --- * if there are no pending repairs, the newest success result wins, --- otherwise the pending result is used. -setInitialState :: Instance.Instance -> Result InstanceData -setInitialState inst = - let arData = mapMaybe parseInitTag $ Instance.allTags inst - -- Group all the AutoRepairData records by id (i.e. by repair task), and - -- present them from oldest to newest. - arData' = sortBy (comparing arUuid) arData - arGroups = groupBy ((==) `on` arUuid) arData' - arGroups' = sortBy (comparing $ minimum . map arTime) arGroups - in - foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups' - --- | Update the initial status of an instance with new repair task tags. --- --- This function gets called once per repair group in an instance's tag, and it --- determines whether to set the status of the instance according to this new --- group, or to keep the existing state. See the documentation for --- 'setInitialState' for the rules to be followed when determining this. -arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData -arStatusCmp instData arData = - let curSt = arState instData - arData' = sortBy (comparing keyfn) arData - keyfn d = (arResult d, arTime d) - newData = last arData' - newSt = case arResult newData of - Just ArSuccess -> ArHealthy $ Just newData - Just ArEnoperm -> ArHealthy $ Just newData - Just ArFailure -> ArFailedRepair newData - Nothing -> ArPendingRepair newData - in - case curSt of - ArFailedRepair _ -> Ok instData -- Always keep the earliest failure. - ArHealthy _ -> Ok instData { arState = newSt - , tagsToRemove = delCurTag instData - } - ArPendingRepair d -> Bad ( - "An unfinished repair was found in instance " ++ - Instance.name (arInstance instData) ++ ": found tag " ++ - show (arTag newData) ++ ", but older pending tag " ++ - show (arTag d) ++ "exists.") - - ArNeedsRepair _ -> Bad - "programming error: ArNeedsRepair found as an initial state" - -- | Query jobs of a pending repair, returning the new instance data. processPending :: Options -> L.Client -> InstanceData -> IO InstanceData processPending opts client instData = @@ -262,20 +127,6 @@ processPending opts client instData = _ -> return instData --- | Update the tag of an 'AutoRepairData' record to match all the other fields. -updateTag :: AutoRepairData -> AutoRepairData -updateTag arData = - let ini = [autoRepairTypeToRaw $ arType arData, - arUuid arData, - clockTimeToString $ arTime arData] - end = [intercalate "+" . map (show . fromJobId) $ arJobs arData] - (pfx, middle) = - case arResult arData of - Nothing -> (Tags.autoRepairTagPending, []) - Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs]) - in - arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) } - -- | Apply and remove tags from an instance as indicated by 'InstanceData'. -- -- If the /arState/ of the /InstanceData/ record has an associated @@ -307,100 +158,6 @@ commitChange opts client instData = do return instData { tagsToRemove = [] } --- | Detect brokenness with an instance and suggest repair type and jobs to run. -detectBroken :: Node.List -> Instance.Instance - -> Maybe (AutoRepairType, [OpCode]) -detectBroken nl inst = - let disk = Instance.diskTemplate inst - iname = Instance.name inst - offPri = Node.offline $ Container.find (Instance.pNode inst) nl - offSec = Node.offline $ Container.find (Instance.sNode inst) nl - in - case disk of - DTDrbd8 - | offPri && offSec -> - Just ( - ArReinstall, - [ OpInstanceRecreateDisks { opInstanceName = iname - , opInstanceUuid = Nothing - , opRecreateDisksInfo = RecreateDisksAll - , opNodes = [] - -- FIXME: there should be a better way to - -- specify opcode parameters than abusing - -- mkNonEmpty in this way (using the fact - -- that Maybe is used both for optional - -- fields, and to express failure). - , opNodeUuids = Nothing - , opIallocator = mkNonEmpty "hail" - } - , OpInstanceReinstall { opInstanceName = iname - , opInstanceUuid = Nothing - , opOsType = Nothing - , opTempOsParams = Nothing - , opOsparamsPrivate = Nothing - , opOsparamsSecret = Nothing - , opForceVariant = False - } - ]) - | offPri -> - Just ( - ArFailover, - [ OpInstanceFailover { opInstanceName = iname - , opInstanceUuid = Nothing - -- FIXME: ditto, see above. - , opShutdownTimeout = fromJust $ mkNonNegative - C.defaultShutdownTimeout - , opIgnoreConsistency = False - , opTargetNode = Nothing - , opTargetNodeUuid = Nothing - , opIgnoreIpolicy = False - , opIallocator = Nothing - , opMigrationCleanup = False - } - ]) - | offSec -> - Just ( - ArFixStorage, - [ OpInstanceReplaceDisks { opInstanceName = iname - , opInstanceUuid = Nothing - , opReplaceDisksMode = ReplaceNewSecondary - , opReplaceDisksList = [] - , opRemoteNode = Nothing - -- FIXME: ditto, see above. - , opRemoteNodeUuid = Nothing - , opIallocator = mkNonEmpty "hail" - , opEarlyRelease = False - , opIgnoreIpolicy = False - } - ]) - | otherwise -> Nothing - - DTPlain - | offPri -> - Just ( - ArReinstall, - [ OpInstanceRecreateDisks { opInstanceName = iname - , opInstanceUuid = Nothing - , opRecreateDisksInfo = RecreateDisksAll - , opNodes = [] - -- FIXME: ditto, see above. - , opNodeUuids = Nothing - , opIallocator = mkNonEmpty "hail" - } - , OpInstanceReinstall { opInstanceName = iname - , opInstanceUuid = Nothing - , opOsType = Nothing - , opTempOsParams = Nothing - , opOsparamsPrivate = Nothing - , opOsparamsSecret = Nothing - , opForceVariant = False - } - ]) - | otherwise -> Nothing - - _ -> Nothing -- Other cases are unimplemented for now: DTDiskless, - -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt. - -- | Perform the suggested repair on an instance if its policy allows it. doRepair :: Options -> L.Client -- ^ The Luxi client diff --git a/src/Ganeti/HTools/Repair.hs b/src/Ganeti/HTools/Repair.hs new file mode 100644 index 0000000..4220635 --- /dev/null +++ b/src/Ganeti/HTools/Repair.hs @@ -0,0 +1,305 @@ +{-| Implementation of the auto-repair logic for Ganeti. + +-} + +{- + +Copyright (C) 2013, 2015 Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-} + +module Ganeti.HTools.Repair + ( InstanceData(..) + , parseInitTag + , getArData + , arStateName + , delCurTag + , setInitialState + , arStatusCmp + , updateTag + , detectBroken + ) where + +import Control.Monad (mplus, foldM) +import Data.Function (on) +import Data.List (sortBy, groupBy, intercalate) +import Data.Maybe (mapMaybe, fromJust) +import Data.Ord (comparing) +import System.Time (ClockTime(TOD)) + +import Ganeti.BasicTypes (GenericResult(..), Result) +import qualified Ganeti.Constants as C +import qualified Ganeti.HTools.Container as Container +import qualified Ganeti.HTools.Instance as Instance +import qualified Ganeti.HTools.Node as Node +import qualified Ganeti.HTools.Tags.Constants as Tags +import Ganeti.HTools.Types +import Ganeti.OpCodes (OpCode(..)) +import Ganeti.OpParams ( RecreateDisksInfo(RecreateDisksAll) + , ReplaceDisksMode(ReplaceNewSecondary) + ) +import Ganeti.Types (makeJobIdS, fromJobId, mkNonEmpty, mkNonNegative) +import Ganeti.Utils (chompPrefix, sepSplit, tryRead, clockTimeToString) + +-- | Description of an instance annotated with repair-related information. +data InstanceData = InstanceData { arInstance :: Instance.Instance + , arState :: AutoRepairStatus + , tagsToRemove :: [String] + } + deriving (Eq, Show) + + +-- | Parse a tag into an 'AutoRepairData' record. +-- +-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's +-- malformed. +parseInitTag :: String -> Maybe AutoRepairData +parseInitTag tag = + let parsePending = do + subtag <- chompPrefix Tags.autoRepairTagPending tag + case sepSplit ':' subtag of + [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs + _ -> fail ("Invalid tag: " ++ show tag) + + parseResult = do + subtag <- chompPrefix Tags.autoRepairTagResult tag + case sepSplit ':' subtag of + [rtype, uuid, ts, result, jobs] -> do + arData <- makeArData rtype uuid ts jobs + result' <- autoRepairResultFromRaw result + return arData { arResult = Just result' } + _ -> fail ("Invalid tag: " ++ show tag) + + makeArData rtype uuid ts jobs = do + rtype' <- autoRepairTypeFromRaw rtype + ts' <- tryRead "auto-repair time" ts + jobs' <- mapM makeJobIdS $ sepSplit '+' jobs + return AutoRepairData { arType = rtype' + , arUuid = uuid + , arTime = TOD ts' 0 + , arJobs = jobs' + , arResult = Nothing + , arTag = tag + } + in + parsePending `mplus` parseResult + +-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type. +getArData :: AutoRepairStatus -> Maybe AutoRepairData +getArData status = + case status of + ArHealthy (Just d) -> Just d + ArFailedRepair d -> Just d + ArPendingRepair d -> Just d + ArNeedsRepair d -> Just d + _ -> Nothing + +-- | Return a short name for each auto-repair status. +-- +-- This is a more concise representation of the status, because the default +-- "Show" formatting includes all the accompanying auto-repair data. +arStateName :: AutoRepairStatus -> String +arStateName status = + case status of + ArHealthy _ -> "Healthy" + ArFailedRepair _ -> "Failure" + ArPendingRepair _ -> "Pending repair" + ArNeedsRepair _ -> "Needs repair" + +-- | Return a new list of tags to remove that includes @arTag@ if present. +delCurTag :: InstanceData -> [String] +delCurTag instData = + let arData = getArData $ arState instData + rmTags = tagsToRemove instData + in + case arData of + Just d -> arTag d : rmTags + Nothing -> rmTags + +-- | Set the initial auto-repair state of an instance from its auto-repair tags. +-- +-- The rules when there are multiple tags is: +-- +-- * the earliest failure result always wins +-- +-- * two or more pending repairs results in a fatal error +-- +-- * a pending result from id X and a success result from id Y result in error +-- if Y is newer than X +-- +-- * if there are no pending repairs, the newest success result wins, +-- otherwise the pending result is used. +setInitialState :: Instance.Instance -> Result InstanceData +setInitialState inst = + let arData = mapMaybe parseInitTag $ Instance.allTags inst + -- Group all the AutoRepairData records by id (i.e. by repair task), and + -- present them from oldest to newest. + arData' = sortBy (comparing arUuid) arData + arGroups = groupBy ((==) `on` arUuid) arData' + arGroups' = sortBy (comparing $ minimum . map arTime) arGroups + in + foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups' + +-- | Update the initial status of an instance with new repair task tags. +-- +-- This function gets called once per repair group in an instance's tag, and it +-- determines whether to set the status of the instance according to this new +-- group, or to keep the existing state. See the documentation for +-- 'setInitialState' for the rules to be followed when determining this. +arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData +arStatusCmp instData arData = + let curSt = arState instData + arData' = sortBy (comparing keyfn) arData + keyfn d = (arResult d, arTime d) + newData = last arData' + newSt = case arResult newData of + Just ArSuccess -> ArHealthy $ Just newData + Just ArEnoperm -> ArHealthy $ Just newData + Just ArFailure -> ArFailedRepair newData + Nothing -> ArPendingRepair newData + in + case curSt of + ArFailedRepair _ -> Ok instData -- Always keep the earliest failure. + ArHealthy _ -> Ok instData { arState = newSt + , tagsToRemove = delCurTag instData + } + ArPendingRepair d -> Bad ( + "An unfinished repair was found in instance " ++ + Instance.name (arInstance instData) ++ ": found tag " ++ + show (arTag newData) ++ ", but older pending tag " ++ + show (arTag d) ++ "exists.") + + ArNeedsRepair _ -> Bad + "programming error: ArNeedsRepair found as an initial state" + +-- | Update the tag of an 'AutoRepairData' record to match all the other fields. +updateTag :: AutoRepairData -> AutoRepairData +updateTag arData = + let ini = [autoRepairTypeToRaw $ arType arData, + arUuid arData, + clockTimeToString $ arTime arData] + end = [intercalate "+" . map (show . fromJobId) $ arJobs arData] + (pfx, middle) = + case arResult arData of + Nothing -> (Tags.autoRepairTagPending, []) + Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs]) + in + arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) } + +-- | Detect brokenness with an instance and suggest repair type and jobs to run. +detectBroken :: Node.List -> Instance.Instance + -> Maybe (AutoRepairType, [OpCode]) +detectBroken nl inst = + let disk = Instance.diskTemplate inst + iname = Instance.name inst + offPri = Node.offline $ Container.find (Instance.pNode inst) nl + offSec = Node.offline $ Container.find (Instance.sNode inst) nl + in + case disk of + DTDrbd8 + | offPri && offSec -> + Just ( + ArReinstall, + [ OpInstanceRecreateDisks { opInstanceName = iname + , opInstanceUuid = Nothing + , opRecreateDisksInfo = RecreateDisksAll + , opNodes = [] + -- FIXME: there should be a better way to + -- specify opcode parameters than abusing + -- mkNonEmpty in this way (using the fact + -- that Maybe is used both for optional + -- fields, and to express failure). + , opNodeUuids = Nothing + , opIallocator = mkNonEmpty "hail" + } + , OpInstanceReinstall { opInstanceName = iname + , opInstanceUuid = Nothing + , opOsType = Nothing + , opTempOsParams = Nothing + , opOsparamsPrivate = Nothing + , opOsparamsSecret = Nothing + , opForceVariant = False + } + ]) + | offPri -> + Just ( + ArFailover, + [ OpInstanceFailover { opInstanceName = iname + , opInstanceUuid = Nothing + -- FIXME: ditto, see above. + , opShutdownTimeout = fromJust $ mkNonNegative + C.defaultShutdownTimeout + , opIgnoreConsistency = False + , opTargetNode = Nothing + , opTargetNodeUuid = Nothing + , opIgnoreIpolicy = False + , opIallocator = Nothing + , opMigrationCleanup = False + } + ]) + | offSec -> + Just ( + ArFixStorage, + [ OpInstanceReplaceDisks { opInstanceName = iname + , opInstanceUuid = Nothing + , opReplaceDisksMode = ReplaceNewSecondary + , opReplaceDisksList = [] + , opRemoteNode = Nothing + -- FIXME: ditto, see above. + , opRemoteNodeUuid = Nothing + , opIallocator = mkNonEmpty "hail" + , opEarlyRelease = False + , opIgnoreIpolicy = False + } + ]) + | otherwise -> Nothing + + DTPlain + | offPri -> + Just ( + ArReinstall, + [ OpInstanceRecreateDisks { opInstanceName = iname + , opInstanceUuid = Nothing + , opRecreateDisksInfo = RecreateDisksAll + , opNodes = [] + -- FIXME: ditto, see above. + , opNodeUuids = Nothing + , opIallocator = mkNonEmpty "hail" + } + , OpInstanceReinstall { opInstanceName = iname + , opInstanceUuid = Nothing + , opOsType = Nothing + , opTempOsParams = Nothing + , opOsparamsPrivate = Nothing + , opOsparamsSecret = Nothing + , opForceVariant = False + } + ]) + | otherwise -> Nothing + + _ -> Nothing -- Other cases are unimplemented for now: DTDiskless, + -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt. -- 2.4.3.573.g4eafbef
