By opening the lock file early, other programs can lock the
state file to prevent ganeti-watcher from restarting daemons.
Using the pause feature is inherently prone to race conditions.
Before a tracecback was logged when the lock file couldn't
be acquired. Now it'll be a more friendly message.
---
daemons/ganeti-watcher | 32 ++++++++++++++++++++------------
1 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 7fc2dc5..720e671 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -106,23 +106,16 @@ def RunWatcherHooks():
logging.debug("Watcher hook %s: success (output: %s)", relname,
runresult.output)
+
class WatcherState(object):
"""Interface to a state file recording restart attempts.
"""
- def __init__(self):
+ def __init__(self, statefile):
"""Open, lock, read and parse the file.
- Raises exception on lock contention.
-
"""
- # The two-step dance below is necessary to allow both opening existing
- # file read/write and creating if not existing. Vanilla open will truncate
- # an existing file -or- allow creating if not existing.
- fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
- self.statefile = os.fdopen(fd, 'w+')
-
- utils.LockFile(self.statefile.fileno())
+ self.statefile = statefile
try:
state_data = self.statefile.read()
@@ -497,12 +490,27 @@ def main():
logging.debug("Pause has been set, exiting")
sys.exit(constants.EXIT_SUCCESS)
+ # The two-step dance below is necessary to allow both opening existing
+ # file read/write and creating if not existing. Vanilla open will truncate
+ # an existing file -or- allow creating if not existing.
+ statefile_fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
+
+ # Try to acquire lock on state file. If this fails, another watcher instance
+ # might already be running or another program is temporarily blocking the
+ # watcher from running.
+ try:
+ utils.LockFile(statefile_fd)
+ except errors.LockError, err:
+ logging.error("Can't acquire lock on state file %s: %s",
+ constants.WATCHER_STATEFILE, err)
+ sys.exit(constants.EXIT_FAILURE)
+
update_file = False
try:
StartNodeDaemons()
RunWatcherHooks()
- notepad = WatcherState()
+ notepad = WatcherState(os.fdopen(statefile_fd, "w+"))
try:
try:
client = cli.GetClient()
@@ -551,7 +559,7 @@ def main():
except errors.JobQueueDrainError:
logging.error("Job queue is drained, can't maintain cluster state")
except Exception, err:
- logging.error(str(err), exc_info=True)
+ logging.exception(str(err))
sys.exit(constants.EXIT_FAILURE)
--
1.6.6