Opening the lock file early allows other programs to lock the
state file to prevent ganeti-watcher from restarting daemons.
Using the pause feature is inherently prone to race conditions.

Previously, a traceback was logged when the lock file couldn't
be acquired. Now a friendlier message is logged instead.
---
 daemons/ganeti-watcher |   32 ++++++++++++++++++++------------
 1 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 7fc2dc5..720e671 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -106,23 +106,16 @@ def RunWatcherHooks():
         logging.debug("Watcher hook %s: success (output: %s)", relname,
                       runresult.output)
 
+
 class WatcherState(object):
   """Interface to a state file recording restart attempts.
 
   """
-  def __init__(self):
+  def __init__(self, statefile):
     """Open, lock, read and parse the file.
 
-    Raises exception on lock contention.
-
     """
-    # The two-step dance below is necessary to allow both opening existing
-    # file read/write and creating if not existing.  Vanilla open will truncate
-    # an existing file -or- allow creating if not existing.
-    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
-    self.statefile = os.fdopen(fd, 'w+')
-
-    utils.LockFile(self.statefile.fileno())
+    self.statefile = statefile
 
     try:
       state_data = self.statefile.read()
@@ -497,12 +490,27 @@ def main():
     logging.debug("Pause has been set, exiting")
     sys.exit(constants.EXIT_SUCCESS)
 
+  # The two-step dance below is necessary to allow both opening existing
+  # file read/write and creating if not existing. Vanilla open will truncate
+  # an existing file -or- allow creating if not existing.
+  statefile_fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
+
+  # Try to acquire lock on state file. If this fails, another watcher instance
+  # might already be running or another program is temporarily blocking the
+  # watcher from running.
+  try:
+    utils.LockFile(statefile_fd)
+  except errors.LockError, err:
+    logging.error("Can't acquire lock on state file %s: %s",
+                  constants.WATCHER_STATEFILE, err)
+    sys.exit(constants.EXIT_FAILURE)
+
   update_file = False
   try:
     StartNodeDaemons()
     RunWatcherHooks()
 
-    notepad = WatcherState()
+    notepad = WatcherState(os.fdopen(statefile_fd, "w+"))
     try:
       try:
         client = cli.GetClient()
@@ -551,7 +559,7 @@ def main():
   except errors.JobQueueDrainError:
     logging.error("Job queue is drained, can't maintain cluster state")
   except Exception, err:
-    logging.error(str(err), exc_info=True)
+    logging.exception(str(err))
     sys.exit(constants.EXIT_FAILURE)
 
 
-- 
1.6.6

Reply via email to