Signed-off-by: Tom Limoncelli <[email protected]>
---
 daemons/ganeti-watcher |   25 ++++++++++++++++++++++++-
 lib/utils.py           |   13 +++++++++++++
 2 files changed, 37 insertions(+), 1 deletions(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 1f82db8..82bd24b 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -48,6 +48,7 @@ from ganeti import ssconf
 from ganeti import bdev
 from ganeti import hypervisor
 from ganeti.confd import client as confd_client
+from ganeti.rapi import client as rapi_client
 
 
 MAXTRIES = 5
@@ -666,7 +667,29 @@ def main():
         client = cli.GetClient()
 
       # we are on master now
-      utils.EnsureDaemon(constants.RAPI)
+
+      # Restart RAPI if it isn't responding to queries.
+      # Only kill/restart RAPI once.  Otherwise just give up.
+      rapi_restarted = False
+      while True:
+        utils.EnsureDaemon(constants.RAPI)
+        logging.debug("Attempting to talk with RAPI")
+        master_rapi = rapi_client.GanetiRapiClient("localhost",
+          ssl_cert_file=constants.RAPI_CERT_FILE)
+        try:
+          master_version = master_rapi.GetVersion()
+        except Exception:
+          logging.error("Could not open connection to RAPI")
+          if rapi_restarted:
+            break
+          logging.debug("RAPI is running but did not speak.  Killing RAPI")
+          utils.StopDaemon(constants.RAPI)
+          rapi_restarted = True  # a second failure must give up, not loop
+          continue
+        if master_version != 2:
+          logging.fatal("RAPI version said %s, expecting 2", master_version)
+        # RAPI answered; logging.fatal() only logs, so leave the loop here
+        break
 
       try:
         watcher = Watcher(options, notepad)
diff --git a/lib/utils.py b/lib/utils.py
index 2b3e785..8594779 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -2148,6 +2148,19 @@ def EnsureDaemon(name):
   return True
 
 
+def StopDaemon(name):
+  """Stop daemon C{name} by running C{constants.DAEMON_UTIL} with "stop".
+
+  @return: True if the command succeeded, False (after logging) otherwise
+  """
+  result = RunCmd([constants.DAEMON_UTIL, "stop", name])
+  if not result.failed:
+    return True
+  logging.error("Can't stop daemon '%s', failure %s, output: %s",
+                name, result.fail_reason, result.output)
+  return False
+
+
 def WritePidFile(name):
   """Write the current process pidfile.
 
-- 
1.7.0.1

Reply via email to