On Tue, Sep 22, 2015 at 10:51:39AM +0200, 'Klaus Aehlig' via ganeti-devel wrote:
Among the actions a node-status data collector can request is
submitting a live-command for repair actions. Make maintd honor
these requests.

Signed-off-by: Klaus Aehlig <[email protected]>
---
src/Ganeti/MaintD/HandleIncidents.hs | 72 ++++++++++++++++++++++++++++++++++--
1 file changed, 68 insertions(+), 4 deletions(-)

diff --git a/src/Ganeti/MaintD/HandleIncidents.hs b/src/Ganeti/MaintD/HandleIncidents.hs
index e78dac3..600707d 100644
--- a/src/Ganeti/MaintD/HandleIncidents.hs
+++ b/src/Ganeti/MaintD/HandleIncidents.hs
@@ -48,8 +48,10 @@ import Data.Function (on)
import Data.IORef (IORef)
import qualified Data.Map as Map
import qualified Data.Set as Set
+import qualified Text.JSON as J

import Ganeti.BasicTypes ( GenericResult(..), ResultT, mkResultT, Down(..))
+import qualified Ganeti.Constants as C
import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
import Ganeti.HTools.Cluster.Evacuate (tryNodeEvac, EvacSolution(..))
import qualified Ganeti.HTools.Container as Container
@@ -63,7 +65,7 @@ import Ganeti.Logging.Lifted
import qualified Ganeti.Luxi as L
import Ganeti.MaintD.MemoryState ( MemoryState, getIncidents, rmIncident
                                 , updateIncident, appendJobs)
-import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.MaintD.Utils (annotateOpCode, getRepairCommand)
import Ganeti.Objects.Lens (incidentJobsL)
import Ganeti.Objects.Maintenance ( RepairStatus(..), RepairAction(..)
                                  , Incident(..))
@@ -175,6 +177,69 @@ handleEvacuation client memst (gl, nl, il) ndx migrate freenodes incident = do
                   liftIO $ appendJobs memst jids
                   return freenodes

+-- | Submit the next action for a live-repair incident.
+handleLiveRepairs :: L.Client -- ^ Luxi client to use
+                 -> IORef MemoryState -- ^ memory state of the daemon
+                 -> Idx -- ^ the node to handle the event on
+                 -> Set.Set Int -- ^ unaffected nodes
+                 -> Incident -- ^ the incident
+                 -> ResultT String IO (Set.Set Int) -- ^ nodes still available
+handleLiveRepairs client memst ndx freenodes incident = do
+  let maybeCmd = getRepairCommand incident
+      uuid = incidentUuid incident
+      name = incidentNode incident
+  now <- liftIO currentTimestamp
+  logDebug $ "Handling requested command " ++ show maybeCmd ++ " on " ++ name
+  case () of
+    _ | null $ incidentJobs incident,
+        Just cmd <- maybeCmd,
+        cmd /= "" -> do
+            logDebug "Submitting repair command job"
+            name' <- mkNonEmpty name
+            cmd' <- mkNonEmpty cmd
+            orig' <- mkNonEmpty . J.encode $ incidentOriginal incident
+            jids_r <- liftIO $ submitJobs
+                        [[ annotateOpCode "repair command requested by node" now
+                           OpRepairCommand { opNodeName = name'
+                                           , opRepairCommand = cmd'
+                                           , opInput = Just orig'
+                                           } ]] client
+            case jids_r of
+              Ok jids -> do
+                let incident' = over incidentJobsL (++ jids) incident
+                liftIO $ updateIncident memst incident'
+                liftIO $ appendJobs memst jids
+                logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
+              Bad e -> mkResultT . logAndBad
+                   $ "Failure requesting command " ++ cmd ++ " on " ++ name
+                     ++ ": " ++ e
+      | null $ incidentJobs incident -> do
+            logInfo $ "Marking incident " ++ uuid ++ " as failed;"
+                      ++ " command for live repair not specified"
+            let newtag = C.maintdFailureTagPrefix ++ uuid
+            jids <- mkResultT $ execJobsWaitOkJid
+                      [[ annotateOpCode "marking incident as ill specified" now
+                         . OpTagsSet TagKindNode [ newtag ]
+                         $ Just name ]] client
+            let incident' = over incidentJobsL (++ jids)
+                              $ incident { incidentRepairStatus = RSFailed
+                                         , incidentTag = newtag
+                                         }
+            liftIO $ updateIncident memst incident'
+            liftIO $ appendJobs memst jids
+      | otherwise -> do
+            logDebug "Command execution has succeeded"
+            jids <- mkResultT $ execJobsWaitOkJid
+                      [[ annotateOpCode "repair command requested by node" now
+                         . OpTagsSet TagKindNode [ incidentTag incident ]
+                         $ Just name ]] client
+            let incident' = over incidentJobsL (++ jids)
+                            $ incident { incidentRepairStatus = RSCompleted }
+            liftIO $ updateIncident memst incident'
+            liftIO $ appendJobs memst jids
+  return $ Set.delete ndx freenodes
+
+
-- | Submit the next actions for a single incident, given the unaffected nodes;
-- register all submitted jobs and return the new set of unaffected nodes.
handleIncident :: L.Client
@@ -200,9 +265,8 @@ handleIncident client memstate (gl, nl, il) freeNodes (name, incident) = do
      logDebug $ "Nothing to do for " ++ show incident
      liftIO . rmIncident memstate $ uuidOf incident
      return freeNodes
-    RALiveRepair -> do
-      logInfo "Live repairs not yet implemented"
-      return freeNodes
+    RALiveRepair ->
+      handleLiveRepairs client memstate ndx freeNodes incident
    RAEvacuate ->
      handleEvacuation client memstate (gl, nl, il) ndx True freeNodes incident
    RAEvacuateFailover ->
--
2.6.0.rc0.131.gf624c3d


LGTM, thanks

Reply via email to