On Fri, Feb 26, 2010 at 4:04 PM, Michael Hanselmann <[email protected]> wrote:
> By opening the lock file early, other programs can lock the
> state file to prevent ganeti-watcher from restarting daemons.
> Using the pause feature is inherently prone to race conditions.
>
> Before a tracecback was logged when the lock file couldn't
> be acquired. Now it'll be a more friendly message.
> ---
> daemons/ganeti-watcher | 32 ++++++++++++++++++++------------
> 1 files changed, 20 insertions(+), 12 deletions(-)
>
> diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
> index 7fc2dc5..720e671 100755
> --- a/daemons/ganeti-watcher
> +++ b/daemons/ganeti-watcher
> @@ -106,23 +106,16 @@ def RunWatcherHooks():
> logging.debug("Watcher hook %s: success (output: %s)", relname,
> runresult.output)
>
> +
> class WatcherState(object):
> """Interface to a state file recording restart attempts.
>
> """
> - def __init__(self):
> + def __init__(self, statefile):
> """Open, lock, read and parse the file.
>
Please document the statefile argument in the docstring.
> - Raises exception on lock contention.
> -
> """
> - # The two-step dance below is necessary to allow both opening existing
> - # file read/write and creating if not existing. Vanilla open will
> truncate
> - # an existing file -or- allow creating if not existing.
> - fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
> - self.statefile = os.fdopen(fd, 'w+')
> -
> - utils.LockFile(self.statefile.fileno())
> + self.statefile = statefile
>
> try:
> state_data = self.statefile.read()
> @@ -497,12 +490,27 @@ def main():
> logging.debug("Pause has been set, exiting")
> sys.exit(constants.EXIT_SUCCESS)
>
> + # The two-step dance below is necessary to allow both opening existing
> + # file read/write and creating if not existing. Vanilla open will truncate
> + # an existing file -or- allow creating if not existing.
> + statefile_fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
> +
> + # Try to acquire lock on state file. If this fails, another watcher
> instance
> + # might already be running or another program is temporarily blocking the
> + # watcher from running.
> + try:
> + utils.LockFile(statefile_fd)
> + except errors.LockError, err:
> + logging.error("Can't acquire lock on state file %s: %s",
> + constants.WATCHER_STATEFILE, err)
> + sys.exit(constants.EXIT_FAILURE)
> +
> update_file = False
> try:
> StartNodeDaemons()
> RunWatcherHooks()
>
> - notepad = WatcherState()
> + notepad = WatcherState(os.fdopen(statefile_fd, "w+"))
Ugh... do we have to do these in main()? Can we abstract it somewhere else?
> try:
> try:
> client = cli.GetClient()
> @@ -551,7 +559,7 @@ def main():
> except errors.JobQueueDrainError:
> logging.error("Job queue is drained, can't maintain cluster state")
> except Exception, err:
> - logging.error(str(err), exc_info=True)
> + logging.exception(str(err))
> sys.exit(constants.EXIT_FAILURE)
>
Thanks,
Guido