Repository: incubator-hawq Updated Branches: refs/heads/HAWQ-617 [created] 19604066b
HAWQ-617. Add ignore-bad-hosts option. Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/19604066 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/19604066 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/19604066 Branch: refs/heads/HAWQ-617 Commit: 19604066b134bac8b7226a3abdb55411c181a4c4 Parents: 8bd1063 Author: Bhuvnesh Chaudhary <bchaudh...@pivotal.io> Authored: Wed Apr 6 14:35:30 2016 -0700 Committer: Bhuvnesh Chaudhary <bchaudh...@pivotal.io> Committed: Wed Apr 6 14:35:30 2016 -0700 ---------------------------------------------------------------------- tools/bin/gppylib/util/ssh_utils.py | 23 +++++++++++++++++++ tools/bin/gpscp | 36 ++++++++++++++++++----------- tools/bin/hawq_ctl | 36 +++++++++++++++++++++-------- tools/bin/hawqconfig | 12 ++++++---- tools/bin/hawqpylib/HAWQ_HELP.py | 1 + tools/bin/hawqpylib/hawqlib.py | 39 ++++++++++++++++++++++++++++++++ tools/doc/gpscp_help | 7 ++++++ 7 files changed, 126 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/gppylib/util/ssh_utils.py ---------------------------------------------------------------------- diff --git a/tools/bin/gppylib/util/ssh_utils.py b/tools/bin/gppylib/util/ssh_utils.py index 3194e11..853c0f5 100644 --- a/tools/bin/gppylib/util/ssh_utils.py +++ b/tools/bin/gppylib/util/ssh_utils.py @@ -160,6 +160,29 @@ class HostList(): return self.list + def removeBadHosts(self): + ''' Update list of host to include only the host on which SSH was successful''' + + pool = WorkerPool() + + for h in self.list: + cmd = Echo('ssh test', '', ctxt=REMOTE, remoteHost=h) + pool.addCommand(cmd) + + pool.join() + pool.haltWork() + + bad_hosts = [] + working_hosts = [] + for cmd in pool.getCompletedItems(): + if not cmd.get_results().wasSuccessful(): + 
bad_hosts.append(cmd.remoteHost) + else: + working_hosts.append(cmd.remoteHost) + + self.list = working_hosts[:] + return bad_hosts + # Session is a command session, derived from a base class cmd.Cmd class Session(cmd.Cmd): '''Implements a list of open ssh sessions ready to execute commands''' http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/gpscp ---------------------------------------------------------------------- diff --git a/tools/bin/gpscp b/tools/bin/gpscp index d00f15d..c02d677 100755 --- a/tools/bin/gpscp +++ b/tools/bin/gpscp @@ -64,6 +64,7 @@ class Global: opt['-f'] = None opt['-J'] = '=:' opt['-r'] = False + opt['--ignore-bad-hosts'] = False filePath = [] GV = Global() @@ -86,18 +87,19 @@ def print_version(): ############# def parseCommandLine(): try: - (options, args) = getopt.getopt(sys.argv[1:], '?vrJ:p:u:h:f:', ['version']) + (options, args) = getopt.getopt(sys.argv[1:], '?vrJ:p:u:h:f:', ['version', 'ignore-bad-hosts']) except Exception, e: usage('[ERROR] ' + str(e)) for (switch, val) in options: - if (switch == '-?'): usage(0) - elif (switch == '-v'): GV.opt[switch] = True - elif (switch == '-f'): GV.opt[switch] = val - elif (switch == '-h'): GV.opt[switch].append(val) - elif (switch == '-J'): GV.opt[switch] = val + ':' - elif (switch == '-r'): GV.opt[switch] = True - elif (switch == '--version'): print_version() + if (switch == '-?'): usage(0) + elif (switch == '-v'): GV.opt[switch] = True + elif (switch == '-f'): GV.opt[switch] = val + elif (switch == '-h'): GV.opt[switch].append(val) + elif (switch == '-J'): GV.opt[switch] = val + ':' + elif (switch == '-r'): GV.opt[switch] = True + elif (switch == '--version'): print_version() + elif (switch == '--ignore-bad-hosts'): GV.opt[switch] = True hf = (len(GV.opt['-h']) and 1 or 0) + (GV.opt['-f'] and 1 or 0) if hf != 1: @@ -131,15 +133,23 @@ try: if GV.opt['-f']: hostlist.parseFile(GV.opt['-f']) - try: - hostlist.checkSSH() - except ssh_utils.SSHError, e: - 
sys.exit('[ERROR] ' + str(e)) + if GV.opt['--ignore-bad-hosts']: + original_hostlist = hostlist.list + bad_hosts = hostlist.removeBadHosts() + if len(bad_hosts) == len(original_hostlist): + sys.exit('[ERROR]: Unable to SSH to any of the hosts {0}'.format(original_hostlist)) + if len(bad_hosts) > 0: + print "[WARN]: Skipping syncing configuration file on hosts {0}, as ssh test failed".format(bad_hosts) + else: + try: + hostlist.checkSSH() + except ssh_utils.SSHError, e: + sys.exit('[ERROR] ' + str(e)) GV.opt['-h'] = hostlist.filterMultiHomedHosts() + if len(GV.opt['-h']) == 0: usage('Error: missing hosts in -h and/or -f arguments') - scp = 'scp -o "BatchMode yes" -o "StrictHostKeyChecking no"' if GV.opt['-r']: scp += ' -r' http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawq_ctl ---------------------------------------------------------------------- diff --git a/tools/bin/hawq_ctl b/tools/bin/hawq_ctl index 7937ac6..a4e0c3c 100755 --- a/tools/bin/hawq_ctl +++ b/tools/bin/hawq_ctl @@ -493,6 +493,7 @@ class HawqStart: self.masteronly = opts.masteronly self.special_mode = opts.special_mode self.restrict = opts.restrict + self.ignore_bad_hosts = opts.ignore_bad_hosts self._get_config() @@ -682,13 +683,24 @@ class HawqStart: def _start_all_segments(self): logger.info("Start all the segments in hawq cluster") - segment_cmd_str = self._start_segment_cmd() logger.info("Start segments in list: %s" % self.host_list) - work_list = [] + bad_hosts = [] + working_hosts = self.host_list + if self.ignore_bad_hosts: + working_hosts, bad_hosts = exclude_bad_hosts(self.host_list) + if len(bad_hosts) == len(self.host_list): + logger.error("Unable to SSH on any of the hosts, skipping segment start operation") + return + if len(bad_hosts) > 0: + logger.warning("Skipping starting segments in the list {0}, SSH test failed".format(bad_hosts)) + self.hosts_count_number -= len(bad_hosts) + + segment_cmd_str = self._start_segment_cmd() q = Queue.Queue() - for host 
in self.host_list: + work_list = [] + for host in working_hosts: work_list.append({"func":remote_ssh,"args":(segment_cmd_str, host, self.user, q)}) - work_list.append({"func":check_progress,"args":(q, self.hosts_count_number, 'start', 0, self.quiet)}) + work_list.append({"func":check_progress,"args":(q, self.hosts_count_number, 'start', len(bad_hosts), self.quiet)}) node_init = HawqCommands(name = 'HAWQ', action_name = 'start', logger = logger) node_init.get_function_list(work_list) node_init.start() @@ -699,7 +711,6 @@ class HawqStart: logger.info("Segments started successfully") return node_init.return_flag - def run(self): if self.node_type == "master": check_return_code(self.start_master(), logger, \ @@ -1145,7 +1156,7 @@ def hawq_activate_standby(opts, hawq_dict): logger.error("Stop master failed, try again with immediate mode") cmd = "%s; hawq stop master -a -M immediate -q;" % source_hawq_env return_result = remote_ssh(cmd, old_master_host_name, '') - if return_resutl != 0: + if return_result != 0: logger.error("Stop master failed, abort") logger.error("Please manually bring hawq cluster down, then do activate standby again") sys.exit(1) @@ -1175,13 +1186,13 @@ def hawq_activate_standby(opts, hawq_dict): # Set current standby host name as the new master host name in configuration. logger.info("Update master host name in hawq-site.xml") - cmd = "%s; hawq config -c hawq_master_address_host -v %s --skipvalidation -q" % \ - (source_hawq_env, hawq_dict['hawq_standby_address_host']) + ignore_bad_hosts = '--ignore-bad-hosts' if opts.ignore_bad_hosts else '' + cmd = "%s; hawq config -c hawq_master_address_host -v %s --skipvalidation -q %s" % (source_hawq_env, hawq_dict['hawq_standby_address_host'], ignore_bad_hosts) check_return_code(remote_ssh(cmd, old_standby_host_name, ''), logger, "Set hawq_master_address_host failed") # Remove the old standby host configuration from hawq-site.xml. 
logger.info("Remove current standby from hawq-site.xml") - cmd = "%s; hawq config -r hawq_standby_address_host --skipvalidation -q" % source_hawq_env + cmd = "%s; hawq config -r hawq_standby_address_host --skipvalidation -q %s" % (source_hawq_env, ignore_bad_hosts) check_return_code(remote_ssh(cmd, old_standby_host_name, ''), logger, "Remove hawq_standby_address_host from configuration failed") cmd = '''echo "gp_persistent_repair_global_sequence = true" >> %s/%s''' % (hawq_dict['hawq_master_directory'], 'postgresql.conf') @@ -1205,7 +1216,7 @@ def hawq_activate_standby(opts, hawq_dict): logger.info("Start hawq cluster") cmd = "%s; hawq start master" % source_hawq_env check_return_code(remote_ssh(cmd, new_master_host_name, ''), logger, "Start master failed") - cmd = "%s; hawq start allsegments" % source_hawq_env + cmd = "%s; hawq start allsegments %s" % (source_hawq_env, ignore_bad_hosts) check_return_code(remote_ssh(cmd, new_master_host_name, ''), logger, "Start all the segments failed") cmd = '''sed -i "/gp_persistent_repair_global_sequence/d" %s/%s''' % (hawq_dict['hawq_master_directory'], 'postgresql.conf') check_return_code(remote_ssh(cmd, new_master_host_name, '')) @@ -1279,6 +1290,10 @@ def create_parser(): parser.add_option('-n', '--no-update', action='store_true', dest='no_update', default=False, help='Do not update system catalog tables.') + parser.add_option('-i', '--ignore-bad-hosts', + dest='ignore_bad_hosts', action='store_true', + default=False, + help='Skips syncing configuration files on hosts on which SSH fails') parser.add_option("--bucket_number", type="int", dest="default_hash_table_bucket_number", @@ -1319,6 +1334,7 @@ def create_parser(): dest="shared_buffers", default="128000kB", help="Sets the shared_buffers for formatting hawq database") + (options, args) = parser.parse_args() if len(args) == 0: parser.print_help() http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqconfig 
---------------------------------------------------------------------- diff --git a/tools/bin/hawqconfig b/tools/bin/hawqconfig index 6618bc3..4a1e3c7 100755 --- a/tools/bin/hawqconfig +++ b/tools/bin/hawqconfig @@ -39,6 +39,8 @@ def parseargs(): parser.add_option('-l', '--list', action='store_true', help="List all HAWQ Properties.") parser.add_option('--skipvalidation', action='store_true', default=False) + parser.add_option('--ignore-bad-hosts', action='store_true', default=False, + help='Skips copying configuration files on host on which SSH fails') parser.add_option('-q', '--quiet', action='store_true', default=False) parser.add_option("-v", "--value", dest="property_value", @@ -174,12 +176,11 @@ def update_hawq_site(org_config_file, hawq_site, property_name, property_value): append_xml_property(org_config_file, property_name, property_value) -def sync_hawq_site(config_dir, host_list): +def sync_hawq_site(config_dir, host_list, ignore_bad_hosts): sync_host_str = "" for node in host_list: sync_host_str += " -h %s" % node - - result = local_ssh("hawq scp %s %s/etc/hawq-site.xml =:%s/etc/" % (sync_host_str, config_dir, config_dir)) + result = local_ssh("hawq scp %s %s %s/etc/hawq-site.xml =:%s/etc/" % (sync_host_str, ignore_bad_hosts, config_dir, config_dir)) if result != 0: sys.exit("sync hawq-site.xml failed.") @@ -196,6 +197,7 @@ if __name__ == '__main__': segment_list = parse_hosts_file(GPHOME) master_host = hawq_site.hawq_dict['hawq_master_address_host'] host_list = segment_list + [master_host] + ignore_bad_hosts = '--ignore-bad-hosts' if options.ignore_bad_hosts else '' if 'hawq_standby_address_host' in hawq_site.hawq_dict: standby_host = hawq_site.hawq_dict['hawq_standby_address_host'] if standby_host not in ('None', 'none', ''): @@ -212,7 +214,7 @@ if __name__ == '__main__': check_property_valid(hawq_site, options.change) update_hawq_site(org_config_file, hawq_site, options.change, options.property_value) - sync_hawq_site(GPHOME, host_list) + 
sync_hawq_site(GPHOME, host_list, ignore_bad_hosts) if not options.quiet: latest_hawq_site = HawqXMLParser(GPHOME) @@ -232,7 +234,7 @@ if __name__ == '__main__': print "Remove %s is not allowed" % options.remove sys.exit(1) remove_property_xml(options.remove, org_config_file, options.quiet) - sync_hawq_site(GPHOME, host_list) + sync_hawq_site(GPHOME, host_list, ignore_bad_hosts) else: print "Please input correct options" sys.exit(1) http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqpylib/HAWQ_HELP.py ---------------------------------------------------------------------- diff --git a/tools/bin/hawqpylib/HAWQ_HELP.py b/tools/bin/hawqpylib/HAWQ_HELP.py index 2c184ac..e0d901f 100755 --- a/tools/bin/hawqpylib/HAWQ_HELP.py +++ b/tools/bin/hawqpylib/HAWQ_HELP.py @@ -164,6 +164,7 @@ The "options" are: -v --verbose Displays detailed status. -r --remove HAWQ GUC name to be removed. --skipvalidation Skip the system validation checks. + --ignore-bad-hosts Skips copying configuration files on host on which SSH fails See 'hawq --help' for more information on other commands. 
""" http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqpylib/hawqlib.py ---------------------------------------------------------------------- diff --git a/tools/bin/hawqpylib/hawqlib.py b/tools/bin/hawqpylib/hawqlib.py index 85354b4..79bcdae 100755 --- a/tools/bin/hawqpylib/hawqlib.py +++ b/tools/bin/hawqpylib/hawqlib.py @@ -24,6 +24,8 @@ from xml.dom import minidom from xml.etree.ElementTree import ElementTree import shutil from gppylib.db import dbconn +from gppylib.commands.base import WorkerPool, REMOTE +from gppylib.commands.unix import Echo import re @@ -484,3 +486,40 @@ def get_hawq_hostname_all(master_port): hawq_host_array = {'master': {master_host: master_status}, 'standby': {standby_host: standby_status}, 'segment': seg_host_list} return hawq_host_array + +def get_host_status(hostlist): + """ + Test if SSH command works on a host and return a dictionary + Return Ex: {host1: True, host2: False} + where True represents SSH command success and False represents failure + """ + if not isinstance(hostlist, list): + raise Exception("Input parameter should be of type list") + + pool = WorkerPool() + + for host in hostlist: + cmd = Echo('ssh test', '', ctxt=REMOTE, remoteHost=host) + pool.addCommand(cmd) + + pool.join() + pool.haltWork() + + host_status_dict = {} + for cmd in pool.getCompletedItems(): + if not cmd.get_results().wasSuccessful(): + host_status_dict[cmd.remoteHost] = False + else: + host_status_dict[cmd.remoteHost] = True + + return host_status_dict + + +def exclude_bad_hosts(host_list): + """ + Split Hosts on which SSH works vs node on which it fails + """ + host_status_dict = get_host_status(host_list) + working_hosts = [host for host in host_status_dict.keys() if host_status_dict[host]] + bad_hosts = list(set(host_list) - set(working_hosts)) + return working_hosts, bad_hosts http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/doc/gpscp_help 
---------------------------------------------------------------------- diff --git a/tools/doc/gpscp_help b/tools/doc/gpscp_help index 4212288..aa5bd25 100755 --- a/tools/doc/gpscp_help +++ b/tools/doc/gpscp_help @@ -84,6 +84,13 @@ character is an equal sign (=). Optional. Reports additional messages in addition to the SCP command output. +--ignore-bad-hosts +Skip copying files to the hosts on which the test SSH attempt failed +and continue with the remaining hosts. A failed test SSH attempt indicates +that either the host is not working or there are issues while attempting +to SSH to these hosts. Once the skipped hosts are brought back online, ensure +that the required files are synced to them. + <file_to_copy> Required. The file name (or absolute path) of a file that