.. mark it as failed only if it's not finalized, and let the queue management handle the cleanup.
Signed-off-by: Petr Pudlak <[email protected]> --- src/Ganeti/JQScheduler.hs | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/Ganeti/JQScheduler.hs b/src/Ganeti/JQScheduler.hs index 5991ff5..12f6fa5 100644 --- a/src/Ganeti/JQScheduler.hs +++ b/src/Ganeti/JQScheduler.hs @@ -355,15 +355,6 @@ showQueue (Queue {qEnqueued=waiting, qRunning=running}) = in "Waiting jobs: " ++ showids waiting ++ "; running jobs: " ++ showids running --- | Pure function to remove a job from the list of running --- jobs, if it is still there. Return whether it was still among --- the running jobs. -rmFromRunning :: JobId -> Queue -> (Queue, Bool) -rmFromRunning jid queue = - let running = qRunning queue - (running', removed) = partition ((/=) jid . qjId . jJob) running - in ( queue { qRunning = running' }, not $ null removed) - -- | Check if a job died, and clean up if so. checkForDeath :: JQStatus -> JobWithStat -> IO () checkForDeath state jobWS = do @@ -379,19 +370,19 @@ checkForDeath state jobWS = do logInfo $ "Detected death of job " ++ sjid -- if we manage to remove the job from the queue, we own the job file -- and can manipulate it. - removed <- atomicModifyIORef (jqJobs state) $ rmFromRunning jid - when removed . void . runResultT $ do - logDebug $ "Removed job " ++ sjid ++ " from the list of running" - :: ResultG () - jobWS' <- mkResultT $ readJobFromDisk jid - now <- liftIO currentTimestamp - qDir <- liftIO queueDir - let reason = ( "gnt:daemon:wconfd:deathdetection" - , "detected death of job " ++ sjid - , reasonTrailTimestamp now ) - failedJob = failQueuedJob reason now $ jJob jobWS' - cfg <- mkResultT . readIORef $ jqConfig state - writeAndReplicateJob cfg qDir failedJob + void . manipulateRunningJob state jid . runResultT $ do + jobWS' <- mkResultT $ readJobFromDisk jid :: ResultG JobWithStat + unless (jobFinalized . jJob $ jobWS') . void $ do + -- If the job isn't finalized, but dead, add a corresponding + -- failed status. + now <- liftIO currentTimestamp + qDir <- liftIO queueDir + let reason = ( "gnt:daemon:wconfd:deathdetection" + , "detected death of job " ++ sjid + , reasonTrailTimestamp now ) + failedJob = failQueuedJob reason now $ jJob jobWS' + cfg <- mkResultT . readIORef $ jqConfig state + writeAndReplicateJob cfg qDir failedJob -- | Time-based watcher for updating the job queue. onTimeWatcher :: JQStatus -> IO () -- 1.9.1.423.g4596e3a
