Volans has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/399161 )

Change subject: wmf-auto-reimage: improve resume capabilities
......................................................................


wmf-auto-reimage: improve resume capabilities

* If the reimage has issues after the debian-installer, it's useful to
  be able to resume it with the --no-pxe option, but some manual steps
  are still required, depending on the status of the host's Puppet
  certificate.
* Improve the resume capability: when --no-pxe is set, auto-detect the
  status of the Puppet certificate and automatically generate and sign
  it if missing.
* Increase the timeout for the reboots to 1 hour to leave more room to
  manually fix any issue in the reboot process.

Bug: T182702
Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978
---
M modules/profile/files/cumin/wmf_auto_reimage.py
M modules/profile/files/cumin/wmf_auto_reimage_host.py
M modules/profile/files/cumin/wmf_auto_reimage_lib.py
3 files changed, 51 insertions(+), 22 deletions(-)

Approvals:
  jenkins-bot: Verified
  Volans: Looks good to me, approved



diff --git a/modules/profile/files/cumin/wmf_auto_reimage.py 
b/modules/profile/files/cumin/wmf_auto_reimage.py
index ea2338d..70b9d95 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage.py
@@ -136,7 +136,7 @@
 
     # Validate hosts
     if not args.new:
-        lib.validate_hosts(args.hosts, args.no_verify)
+        lib.validate_hosts(args.hosts, no_raise=args.no_verify)
 
     # Update the Phabricator task
     if args.phab_task_id is not None:
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_host.py 
b/modules/profile/files/cumin/wmf_auto_reimage_host.py
index 6cff051..a6373e6 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_host.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_host.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-"""Automated reimaging of a list of hosts."""
+"""Automated reimaging of a single host."""
 
 import argparse
 import logging
@@ -97,8 +97,8 @@
     rename_from = None  # In case of host rename, hold the previous hostname
 
     # Validate hosts have a signed Puppet certificate
-    if not args.new and not args.no_verify:
-        lib.validate_hosts([args.host], args.no_verify)
+    if not args.new:
+        lib.validate_hosts([args.host], no_raise=args.no_verify)
 
     # Set Icinga downtime
     if not args.new and not args.no_downtime:
@@ -112,6 +112,10 @@
 
     if args.no_pxe:
         lib.print_line('Skipping PXE reboot', host=args.host)
+        if (not lib.validate_hosts([args.host], no_raise=True) and
+                lib.puppet_check_cert_to_sign(args.host) == 1):
+            # There is no signed or pending signing certificate for the host
+            lib.puppet_generate_cert(args.host)
     else:
         lib.puppet_remove_host(args.host)  # Cleanup Puppet
 
diff --git a/modules/profile/files/cumin/wmf_auto_reimage_lib.py 
b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
index 32d131c..8b8ef9a 100644
--- a/modules/profile/files/cumin/wmf_auto_reimage_lib.py
+++ b/modules/profile/files/cumin/wmf_auto_reimage_lib.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-"""Automated reimaging of a list of hosts."""
+"""Library for the wmf-auto-reimage and wmf-auto-reimage-host scripts."""
 from __future__ import print_function
 
 import argparse
@@ -487,6 +487,7 @@
 
         if no_raise:
             logger.warning(message)
+            return False
         else:
             raise RuntimeError(message)
     else:
@@ -494,6 +495,8 @@
             print_line('Validated host', host=host)
         else:
             print_line('Validated hosts: {hosts}'.format(hosts=hosts))
+
+    return True
 
 
 def icinga_downtime(host, user, phab_task):
@@ -560,13 +563,43 @@
         print_line('{message} on hosts: {hosts}'.format(message=message, 
hosts=hosts))
 
 
+def puppet_check_cert_to_sign(host):
+    """Check if on the puppetmaster there is a new certificate to sign for the 
given host.
+
+    Return 0 if there is a pending certificate to be signed, 1 if there isn't 
and 2 if the
+    certificate is already signed.
+
+    Arguments:
+    host  -- the host to check for a certificate pending signing.
+    """
+    command = "puppet cert list '{host}' 2> /dev/null".format(host=host)
+    puppetmaster_host = get_puppet_ca_master()
+
+    try:
+        exit_code, worker = run_cumin(
+            'puppet_check_cert_to_sign', puppetmaster_host, [command])
+    except RuntimeError:
+        return 1
+
+    for _, output in worker.get_results():
+        if host in output.message():
+            break
+
+    if output.message().startswith('  "{host}"'.format(host=host)):
+        return 0
+    elif output.message().startswith('+ "{host}"'.format(host=host)):
+        print_line('Puppet cert already signed', host=host)
+        return 2
+    else:
+        raise RuntimeError('Unable to find cert to sign')
+
+
 def puppet_wait_cert_and_sign(host):
     """Poll the puppetmaster looking for a new key to sign for the given host.
 
     Arguments:
     host  -- the host to monitor for a complete Puppet run
     """
-    wait_command = "puppet cert list '{host}' 2> /dev/null".format(host=host)
     sign_command = "puppet cert -s '{host}'".format(host=host)
     puppetmaster_host = get_puppet_ca_master()
     start = datetime.utcnow()
@@ -581,28 +614,20 @@
             print_line('Still waiting for Puppet cert to sign after {min} 
minutes'.format(
                 min=(retries * WATCHER_LONG_SLEEP) // 60.0), host=host)
 
-        try:
-            exit_code, worker = run_cumin(
-                'puppet_wait_cert_and_sign', puppetmaster_host, [wait_command])
-        except RuntimeError:
+        check_cert = puppet_check_cert_to_sign(host)
+        if check_cert == 0:  # Found Puppet cert to sign
+            break
+        elif check_cert == 1:  # Puppet cert to sign still missing
             if (datetime.utcnow() - start).total_seconds() > timeout:
                 logger.error('Timeout reached')
                 raise RuntimeError('Timeout reached')
 
             time.sleep(WATCHER_LONG_SLEEP)
             continue
-
-        for _, output in worker.get_results():
-            if host in output.message():
-                break
-
-        if output.message().startswith('  "{host}"'.format(host=host)):
-            break
-        elif output.message().startswith('+ "{host}"'.format(host=host)):
-            print_line('Puppet cert already signed', host=host)
+        elif check_cert == 2:  # Puppet cert already signed
             return False
-        else:
-            raise RuntimeError('Unable to find cert to sign')
+        else:  # Should never happen
+            raise RuntimeError('Unable to check Puppet certificate status on 
puppetmaster')
 
     run_cumin('puppet_wait_cert_and_sign', puppetmaster_host, [sign_command])
     print_line('Signed Puppet cert', host=host)
@@ -755,7 +780,7 @@
     if start is None:
         start = datetime.utcnow()
     check_start = datetime.utcnow()
-    timeout = 1800  # 30 minutes
+    timeout = 3600  # 1 hour
     retries = 0
 
     while True:

-- 
To view, visit https://gerrit.wikimedia.org/r/399161
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I41f92341ea9650c1a330492a8211d21b2a347978
Gerrit-PatchSet: 3
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Volans <rcocci...@wikimedia.org>
Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org>
Gerrit-Reviewer: Marostegui <maroste...@wikimedia.org>
Gerrit-Reviewer: Muehlenhoff <mmuhlenh...@wikimedia.org>
Gerrit-Reviewer: Volans <rcocci...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to