Volans has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399161 )
Change subject: wmf-auto-reimage: improve resume capabilities ...................................................................... wmf-auto-reimage: improve resume capabilities * If the reimage has issues after the debian-installer, it's useful to be able to resume it with the --no-pxe option, but there are still some manual steps, depending on the status of the host's Puppet certificate. * Improve the resume capability when --no-pxe is set to auto-detect the status of the Puppet certificate and automatically generate and sign it if missing. * Increased the timeout for the reboots to 1 hour to have more room to manually fix any issue in the reboot process. Bug: T182702 Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978 --- M modules/profile/files/cumin/wmf_auto_reimage.py M modules/profile/files/cumin/wmf_auto_reimage_host.py M modules/profile/files/cumin/wmf_auto_reimage_lib.py 3 files changed, 51 insertions(+), 22 deletions(-) Approvals: jenkins-bot: Verified Volans: Looks good to me, approved diff --git a/modules/profile/files/cumin/wmf_auto_reimage.py b/modules/profile/files/cumin/wmf_auto_reimage.py index ea2338d..70b9d95 100644 --- a/modules/profile/files/cumin/wmf_auto_reimage.py +++ b/modules/profile/files/cumin/wmf_auto_reimage.py @@ -136,7 +136,7 @@ # Validate hosts if not args.new: - lib.validate_hosts(args.hosts, args.no_verify) + lib.validate_hosts(args.hosts, no_raise=args.no_verify) # Update the Phabricator task if args.phab_task_id is not None: diff --git a/modules/profile/files/cumin/wmf_auto_reimage_host.py b/modules/profile/files/cumin/wmf_auto_reimage_host.py index 6cff051..a6373e6 100644 --- a/modules/profile/files/cumin/wmf_auto_reimage_host.py +++ b/modules/profile/files/cumin/wmf_auto_reimage_host.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Automated reimaging of a list of hosts.""" +"""Automated reimaging of a single host.""" import argparse import logging @@ -97,8 +97,8 @@ rename_from = None # In case of host rename, hold the previous hostname # Validate hosts have a signed Puppet certificate - if not args.new and not args.no_verify: - lib.validate_hosts([args.host], args.no_verify) + if not args.new: + lib.validate_hosts([args.host], no_raise=args.no_verify) # Set Icinga downtime if not args.new and not args.no_downtime: @@ -112,6 +112,10 @@ if args.no_pxe: lib.print_line('Skipping PXE reboot', host=args.host) + if (not lib.validate_hosts([args.host], no_raise=True) and + lib.puppet_check_cert_to_sign(args.host) == 1): + # There is no signed or pending signing certificate for the host + lib.puppet_generate_cert(args.host) else: lib.puppet_remove_host(args.host) # Cleanup Puppet diff --git a/modules/profile/files/cumin/wmf_auto_reimage_lib.py b/modules/profile/files/cumin/wmf_auto_reimage_lib.py index 32d131c..8b8ef9a 100644 --- a/modules/profile/files/cumin/wmf_auto_reimage_lib.py +++ b/modules/profile/files/cumin/wmf_auto_reimage_lib.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Automated reimaging of a list of hosts.""" +"""Library for the wmf-auto-reimage and wmf-auto-reimage-host scripts.""" from __future__ import print_function import argparse @@ -487,6 +487,7 @@ if no_raise: logger.warning(message) + return False else: raise RuntimeError(message) else: @@ -494,6 +495,8 @@ print_line('Validated host', host=host) else: print_line('Validated hosts: {hosts}'.format(hosts=hosts)) + + return True def icinga_downtime(host, user, phab_task): @@ -560,13 +563,43 @@ print_line('{message} on hosts: {hosts}'.format(message=message, hosts=hosts)) +def puppet_check_cert_to_sign(host): + """Check if on the puppetmaster there is a new certificate to sign for the given host. + + Return 0 if there is a pending certificate to be signed, 1 if there isn't and 2 if the + certificate is already signed. + + Arguments: + host -- the host to check for a certificate pending signing. + """ + command = "puppet cert list '{host}' 2> /dev/null".format(host=host) + puppetmaster_host = get_puppet_ca_master() + + try: + exit_code, worker = run_cumin( + 'puppet_check_cert_to_sign', puppetmaster_host, [command]) + except RuntimeError: + return 1 + + for _, output in worker.get_results(): + if host in output.message(): + break + + if output.message().startswith(' "{host}"'.format(host=host)): + return 0 + elif output.message().startswith('+ "{host}"'.format(host=host)): + print_line('Puppet cert already signed', host=host) + return 2 + else: + raise RuntimeError('Unable to find cert to sign') + + def puppet_wait_cert_and_sign(host): """Poll the puppetmaster looking for a new key to sign for the given host. Arguments: host -- the host to monitor for a complete Puppet run """ - wait_command = "puppet cert list '{host}' 2> /dev/null".format(host=host) sign_command = "puppet cert -s '{host}'".format(host=host) puppetmaster_host = get_puppet_ca_master() start = datetime.utcnow() @@ -581,28 +614,20 @@ print_line('Still waiting for Puppet cert to sign after {min} minutes'.format( min=(retries * WATCHER_LONG_SLEEP) // 60.0), host=host) - try: - exit_code, worker = run_cumin( - 'puppet_wait_cert_and_sign', puppetmaster_host, [wait_command]) - except RuntimeError: + check_cert = puppet_check_cert_to_sign(host) + if check_cert == 0: # Found Puppet cert to sign + break + elif check_cert == 1: # Puppet cert to sign still missing if (datetime.utcnow() - start).total_seconds() > timeout: logger.error('Timeout reached') raise RuntimeError('Timeout reached') time.sleep(WATCHER_LONG_SLEEP) continue - - for _, output in worker.get_results(): - if host in output.message(): - break - - if output.message().startswith(' "{host}"'.format(host=host)): - break - elif output.message().startswith('+ "{host}"'.format(host=host)): - print_line('Puppet cert already signed', host=host) + elif check_cert == 2: # Puppet cert already signed return False - else: - raise RuntimeError('Unable to find cert to sign') + else: # Should never happen + raise RuntimeError('Unable to check Puppet certificate status on puppetmaster') run_cumin('puppet_wait_cert_and_sign', puppetmaster_host, [sign_command]) print_line('Signed Puppet cert', host=host) @@ -755,7 +780,7 @@ if start is None: start = datetime.utcnow() check_start = datetime.utcnow() - timeout = 1800 # 30 minutes + timeout = 3600 # 1 hour retries = 0 while True: -- To view, visit https://gerrit.wikimedia.org/r/399161 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978 Gerrit-PatchSet: 3 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Volans <rcocci...@wikimedia.org> Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org> Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: Marostegui <maroste...@wikimedia.org> Gerrit-Reviewer: Muehlenhoff <mmuhlenh...@wikimedia.org> Gerrit-Reviewer: Volans <rcocci...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits