ArielGlenn has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/211432

Change subject: nova monitoring instances and salt keys: add new options
......................................................................

nova monitoring instances and salt keys: add new options

show all instances, show instances not responsive to test.ping,
show instances without salt keys

Change-Id: I1651c9340b34b3967413a22f55a3862bc95920be
---
M modules/openstack/files/monitor_labs_salt_keys.py
1 file changed, 222 insertions(+), 54 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/32/211432/1

diff --git a/modules/openstack/files/monitor_labs_salt_keys.py b/modules/openstack/files/monitor_labs_salt_keys.py
index d5aa043..7bc8c29 100644
--- a/modules/openstack/files/monitor_labs_salt_keys.py
+++ b/modules/openstack/files/monitor_labs_salt_keys.py
@@ -8,10 +8,6 @@
 import salt.key
 from novaclient.v1_1 import client
 
-# fixme sanity checking:
-# we want to make sure we don't get bogus results
-# for instance exists check (if it's broken what happens?)
-
 
 class Whiner(object):
     '''
@@ -41,6 +37,29 @@
     the nova openstack compute api
     '''
 
+    @staticmethod
+    def instance_display(server):
+        '''
+        given a nova server object (returned by listing
+        servers), display a few useful fields from it
+        '''
+
+        print 'Instance:', getattr(server, 'OS-EXT-SRV-ATTR:instance_name'),
+        print 'Status:', server.status,
+        print 'hostname:', server.name,
+        if 'public' in server.addresses and server.addresses['public']:
+            print 'IP:', server.addresses['public'][0]['addr'],
+        else:
+            print 'IP: seems to have none',
+        print 'Hypervisor:',
+        print getattr(server, 'OS-EXT-SRV-ATTR:hypervisor_hostname'),
+        if server.tenant_id == 'deployment-prep':
+            print 'Salt-master: deployment-salt'
+        else:
+            print
+        if hasattr(server, 'fault'):
+            print 'Fault:', server.fault
+
     def __init__(self, authfile, limit=300):
         '''
         authfile: full path to a file of auth creds, see
@@ -60,22 +79,22 @@
         (status is ACTIVE or ERROR or SHUTOFF)
         for all tenants
         '''
-        ec2ids = set()
+        instances = {}
         opts = {'all_tenants': True, 'limit': self.limit}
         while True:
             servers = self.client.servers.list(detailed=True, search_opts=opts)
             if not servers:
                 break
-            new_ec2ids = [getattr(instance, 'OS-EXT-SRV-ATTR:instance_name')
-                          for instance in servers]
-            ec2ids = ec2ids | set(new_ec2ids)
+            for instance in servers:
+                instances[getattr(instance,
+                                  'OS-EXT-SRV-ATTR:instance_name')] = instance
             opts['marker'] = servers[-1].id
             time.sleep(1)
 
-        if not ec2ids:
+        if not instances:
             Whiner.whine("no good nova instances found, something's wrong",
                          fatal=True)
-        return ec2ids
+        return instances
 
     def get_bad_instances(self):
         '''
@@ -84,19 +103,22 @@
         (status is DELETED only, not ERROR or BUILD)
         for all tenants
         '''
-        ec2ids = set()
+        instances = {}
         opts = {'all_tenants': True, 'deleted': True, 'limit': self.limit}
         while True:
             servers = self.client.servers.list(detailed=True, search_opts=opts)
             if not servers:
                 break
-            new_ec2ids = [getattr(instance, 'OS-EXT-SRV-ATTR:instance_name')
-                          for instance in servers if
-                          instance.status == 'DELETED']
-            ec2ids = ec2ids | set(new_ec2ids)
+            for instance in servers:
+                if instance.status == 'DELETED':
+                    instances[getattr(
+                        instance, 'OS-EXT-SRV-ATTR:instance_name')] = instance
             opts['marker'] = servers[-1].id
             time.sleep(1)
-        return ec2ids
+        if not instances:
+            Whiner.whine("no deleted nova instances found, very fishy...",
+                         fatal=True)
+        return instances
 
 
 class NovaAuth(object):
@@ -183,6 +205,7 @@
         self.accepted_keys = []
         for dirname in dirnames:
             self.accepted_keys += accepted[dirname]
+        return self.accepted_keys
 
     def get_unresponsive_hosts(self):
         '''
@@ -210,18 +233,140 @@
         self.key_manager.delete_key(host_name)
 
 
-def canonicalize(salt_hostname, region):
+class Runner(object):
     '''
-    convert the hostname we get back from a salt
-    command to the standard ec2 form
-    'region' should be eqiad/pmtpa/ etc depending
-    on the dc this script runs in
+    handle action requests for display of nova instances
+    or manipulation of their salt keys
     '''
-    domain = '.' + region + '.wmflabs'
-    truncate_by = -1 * len(domain)
-    if salt_hostname.endswith(domain):
-        salt_hostname = salt_hostname[:truncate_by]
-    return salt_hostname
+
+    @staticmethod
+    def canonicalize(hostname, region):
+        '''
+        convert fqdn to the short form by
+        dropping the domain, if there is one
+
+        'region' should be eqiad/pmtpa/ etc depending
+        on the dc this script runs in
+        '''
+        domain = '.' + region + '.wmflabs'
+        truncate_by = -1 * len(domain)
+        if hostname.endswith(domain):
+            hostname = hostname[:truncate_by]
+        return hostname
+
+    def __init__(self, actions, authfile, dryrun):
+        self.actions = actions
+        self.dryrun = dryrun
+        self.nova_client = NovaClient(authfile)
+        self.saltkeys = SaltKeys()
+        self.good_instances = None
+        self.bad_salt_hosts = None
+
+    def run(self):
+        '''
+        actually do the actions the caller requested
+        '''
+
+        if 'missingkey' in self.actions and self.actions['missingkey']:
+            self.do_missingkeys()
+        if 'unresponsive' in self.actions and self.actions['unresponsive']:
+            self.do_unresponsive()
+        if 'cleanup' in self.actions and self.actions['cleanup']:
+            self.do_cleanup()
+        if 'showall' in self.actions and self.actions['showall']:
+           self.do_showall()
+
+    def do_unresponsive(self):
+        '''
+        display information about undeleted nova instances
+        which do not respond to salt ping but are known to salt
+        '''
+
+        if self.bad_salt_hosts is None:
+            self.bad_salt_hosts = self.saltkeys.get_unresponsive_hosts()
+        if not self.bad_salt_hosts:
+            # nothing to do
+            return
+
+        if self.good_instances is None:
+            self.good_instances = self.nova_client.get_good_instances()
+
+        print "instances unreponsive to salt test.ping"
+        print "======================================="
+        for bad_key in self.bad_salt_hosts:
+            canonical_name = Runner.canonicalize(
+                bad_key, self.nova_client.auth.get_region())
+            if canonical_name in self.good_instances:
+                NovaClient.instance_display(
+                    self.good_instances[canonical_name])
+            else:
+                print "Instance ", canonical_name, "seems to be deleted."
+        print
+
+    def do_missingkeys(self):
+        '''
+        display information about undeleted nova instances
+        which are unknown to salt (no salt key)
+        '''
+
+        known_to_salt = self.saltkeys.get_accepted_keys()
+        if not known_to_salt:
+            # nothing to do
+            return
+        salt_canonical_names = [Runner.canonicalize(
+            key, self.nova_client.auth.get_region())
+            for key in known_to_salt]
+
+        if self.good_instances is None:
+            self.good_instances = self.nova_client.get_good_instances()
+
+        print "instances with no salt key:"
+        print "==========================="
+        for ec2_id in self.good_instances:
+            if ec2_id not in salt_canonical_names:
+                NovaClient.instance_display(self.good_instances[ec2_id])
+        print
+
+    def do_showall(self):
+        '''
+        display information about all instances
+        '''
+
+        if self.good_instances is None:
+            self.good_instances = self.nova_client.get_good_instances()
+
+        print "all instances not deleted:"
+        print "=========================="
+        for ec2_id in self.good_instances:
+            NovaClient.instance_display(self.good_instances[ec2_id])
+
+
+    def do_cleanup(self):
+        '''
+        remove salt keys for deleted nova instances
+        '''
+
+        if self.bad_salt_hosts is None:
+            self.bad_salt_hosts = self.saltkeys.get_unresponsive_hosts()
+        if not self.bad_salt_hosts:
+            # nothing to do
+            return
+
+        if self.good_instances is None:
+            self.good_instances = self.nova_client.get_good_instances()
+
+        instance_ids = self.good_instances.keys()
+
+        log("Key deletion")
+        for bad_key in self.bad_salt_hosts:
+            if (Runner.canonicalize(bad_key,
+                                    self.nova_client.auth.get_region())
+                    not in instance_ids):
+                if not self.dryrun:
+                    log("About to delete key %s" % bad_key)
+                    self.saltkeys.delete_bad_key(bad_key)
+                else:
+                    print "would delete", bad_key
 
 
 def log(message):
@@ -242,14 +387,36 @@
         sys.stderr.write(message)
         sys.stderr.write("\n")
     usage_message = """
-Usage: cleanup_labs_saltkeys.py [--authfile]
-       [--dryrun] [--help]
+Usage: monitor_labs_salt_keys.py <action>...
+                         [--authfile] [--dryrun] [--help]
 
-This script removes salt keys for deleted nova instances.
+where <action> is one of --cleanup --missing --no_ping --showall
 
-It relies on salt and on nova.  There should also be a file of
-nova authentication credentials ('authfile') someplace in the
-following format (order of lines does not matter):
+This script can, depending on the options specified, display
+information about labs instances with no salt key or labs instances
+unresponsive to salt, or it can delete salt keys of deleted labs
+instances.
+
+It relies on salt and on nova; it must be run on the salt master.
+
+Options:
+
+  --authfile (-a): path to a file of nova authentication credentials
+                   see 'Authfile Format' for the contents of the file
+                   default: /root/novaenv.sh
+  --cleanup (-c):  cleanup salt keys of deleted instances
+  --dryrun (-d):   don't delete anything, describe what would be done
+  --missing (-m):  show information about instances with missing salt keys
+  --no_ping (-n):  show information about instances unresponsive to salt
+  --showall (-s):  show information about all undeleted instances
+
+  --help (-h):     display this usage message
+
+Authfile Format
+
+The file of authentication credentials must be in the following
+format (order of lines does not matter but each line must occur
+someplace):
 
 export OS_USERNAME="some-name"
 export OS_PASSWORD="password-here"
@@ -258,9 +425,6 @@
 export OS_TENANT_NAME="..."
 
 Lines with # or blank lines are skipped.
-
-If no authfile option is given, the file /root/novaenv.sh is
-used.
 """
     sys.stderr.write(usage_message)
     sys.exit(1)
@@ -269,45 +433,49 @@
 def main():
     'main entry point, does all the work'
     authfile = '/root/novaenv.sh'
+    missingkey = False
+    unresponsive = False
+    cleanup = False
+    showall = False
     dryrun = False
 
     try:
         (options, remainder) = getopt.gnu_getopt(
-            sys.argv[1:], "a:dh",
-            ["auth=", "dryrun", "help"])
+            sys.argv[1:], "a:cdmnsh",
+            ["auth=", "showall", "cleanup", "missing",
+             "no_ping", "dryrun", "help"])
 
     except getopt.GetoptError as err:
         usage("Unknown option specified: " + str(err))
     for (opt, val) in options:
         if opt in ["-a", "--auth"]:
             authfile = val
-        elif opt in ["-h", "--help"]:
-            usage('Help for this script\n')
+        elif opt in ["-c", "--cleanup"]:
+            cleanup = True
         elif opt in ["-d", "--dryrun"]:
             dryrun = True
+        elif opt in ["-m", "--missing"]:
+            missingkey = True
+        elif opt in ["-n", "--no_ping"]:
+            unresponsive = True
+        elif opt in ["-s", "--showall"]:
+            showall = True
+        elif opt in ["-h", "--help"]:
+            usage('Help for this script\n')
         else:
             usage("Unknown option specified: <%s>" % opt)
 
     if len(remainder) > 0:
         usage("Unknown option(s) specified: <%s>" % remainder[0])
 
-    nova_client = NovaClient(authfile)
-    saltkeys = SaltKeys()
-    bad_hosts = saltkeys.get_unresponsive_hosts()
-    if not bad_hosts:
-        # nothing to do
-        return
+    if not cleanup and not missingkey and not unresponsive and not showall:
+        usage("One of the options 'cleanup', 'missing', 'showall' or"
+              "'no_ping' must be specified")
 
-    good_instances = nova_client.get_good_instances()
+    runner = Runner({'cleanup': cleanup, 'missingkey': missingkey,
+                     'unresponsive': unresponsive, 'showall': showall}, authfile, dryrun)
+    runner.run()
 
-    for bad_key in bad_hosts:
-        if (canonicalize(bad_key, nova_client.auth.get_region())
-                not in good_instances):
-            if not dryrun:
-                log("About to delete key %s" % bad_key)
-                saltkeys.delete_bad_key(bad_key)
-            else:
-                print "would delete", bad_key
 
 if __name__ == '__main__':
     main()

-- 
To view, visit https://gerrit.wikimedia.org/r/211432
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1651c9340b34b3967413a22f55a3862bc95920be
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to