On Thu, May 27, 2010 at 09:59:56PM -0400, Tom Limoncelli wrote:
>
> Signed-off-by: Tom Limoncelli <[email protected]>
> ---
> daemons/ganeti-watcher | 25 ++++++++++++++++++++++++-
> lib/utils.py | 13 +++++++++++++
> 2 files changed, 37 insertions(+), 1 deletions(-)
>
> diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
> index 1f82db8..82bd24b 100755
> --- a/daemons/ganeti-watcher
> +++ b/daemons/ganeti-watcher
> @@ -48,6 +48,7 @@ from ganeti import ssconf
> from ganeti import bdev
> from ganeti import hypervisor
> from ganeti.confd import client as confd_client
> +from ganeti.rapi import client as rapi_client
>
>
> MAXTRIES = 5
> @@ -666,7 +667,29 @@ def main():
> client = cli.GetClient()
>
> # we are on master now
> - utils.EnsureDaemon(constants.RAPI)
> +
> + # Restart RAPI if it isn't responding to queries.
> + # Only kill/restart RAPI once. Otherwise just give up.
> + rapi_restarted = False
> + while True:
> + utils.EnsureDaemon(constants.RAPI)
> + logging.debug("Attempting to talk with RAPI")
> + master_rapi = rapi_client.GanetiRapiClient("localhost",
> + ssl_cert_file=constants.RAPI_CERT_FILE)
> + try:
> + master_version = master_rapi.GetVersion()
> + except:
> + logging.error("Could not open connection to RAPI")
> + if rapi_restarted:
You never set this variable to True in any code path. Is this an
incomplete patch?
> + break
> + else:
> + logging.debug("RAPI is running but did not speak. Killing RAPI")
This is a significant condition. I'd suggest logging.warning instead of
debug.
> + utils.StopDaemon(constants.RAPI)
> + continue
> + if master_version == 2:
> + break
> + else:
> + logging.fatal("RAPI version said %s, expecting 2" % master_version)
>
> try:
> watcher = Watcher(options, notepad)
> diff --git a/lib/utils.py b/lib/utils.py
> index 2b3e785..8594779 100644
> --- a/lib/utils.py
> +++ b/lib/utils.py
> @@ -2148,6 +2148,19 @@ def EnsureDaemon(name):
> return True
>
>
> +def StopDaemon(name):
> + """Stop a daemon.
> +
> + """
> + result = RunCmd([constants.DAEMON_UTIL, "stop", name])
I wonder if, for the case of a broken RAPI, a simple stop command is
enough. Ah, we're using start-stop-daemon with --retry, which should
send a SIGKILL at the end, so it should be fine.
iustin