jenkins-bot has submitted this change and it was merged.

Change subject: Handle webservice calls erroring out
......................................................................


Handle webservice calls erroring out

- Log errors and also push them to graphite
- Add a 15s timeout to each webservice call

Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1
---
M tools/manifest/servicemonitor.py
1 file changed, 21 insertions(+), 12 deletions(-)

Approvals:
  Yuvipanda: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/tools/manifest/servicemonitor.py b/tools/manifest/servicemonitor.py
index 6b65571..a3912fc 100644
--- a/tools/manifest/servicemonitor.py
+++ b/tools/manifest/servicemonitor.py
@@ -11,14 +11,25 @@
 
     def _start_webservice(self, manifest):
         self.log.info('Starting webservice for tool %s', manifest.tool.name)
-        return subprocess.check_output([
-            '/usr/bin/sudo',
-            '-i', '-u', manifest.tool.username,
-            '/usr/local/bin/webservice',
-            '--release', manifest.webservice_release,
-            manifest.webservice_server,
-            'start',
-        ])
+        try:
+            subprocess.check_output([
+                '/usr/bin/sudo',
+                '-i', '-u', manifest.tool.username,
+                '/usr/local/bin/webservice',
+                '--release', manifest.webservice_release,
+                manifest.webservice_server,
+                'start',
+            ], timeout=15)  # 15 second timeout!
+            self.log.info('Started webservice for %s', manifest.tool.name)
+            return True
+        except subprocess.CalledProcessError as e:
+            self.log.exception('Could not start webservice for tool %s', manifest.tool.name)
+            self.stats.incr('webservice_startfailed')
+            manifest.tool.log('Could not start webservice - webservice tool exited with error code %s' % e.returncode)
+        except subprocess.TimeoutExpired:
+            self.log.exception('Timed out attempting to start webservice for tool %s', manifest.tool.name)
+            self.stats.incr('webservice_startfailed')
+            manifest.tool.log('Timed out attempting to start webservice (15s)')
 
     def run(self):
         qstat_xml = ET.fromstring(subprocess.check_output(['/usr/bin/qstat', '-u', '*', '-xml']))
@@ -28,11 +39,9 @@
                 continue
             job = qstat_xml.find('.//job_list[JB_name="%s"]' % self._webjob_name(manifest))
             if job is None or 'r' not in job.findtext('.//state'):
-                self._start_webservice(manifest)
                 manifest.tool.log('No running webservice job found, starting it')
-                self.log.info('Started webservice for %s', manifest.tool.name)
-                self.stats.incr('webservice.%s.restarted' % manifest.tool.name)
-                restarts_count += 1
+                if self._start_webservice(manifest):
+                    restarts_count += 1
         self.log.info('Service monitor run completed, %s webservices restarted', restarts_count)
         self.stats.incr('webservices_restarted', restarts_count)
 

-- 
To view, visit https://gerrit.wikimedia.org/r/202342
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1
Gerrit-PatchSet: 3
Gerrit-Project: operations/software/tools-manifest
Gerrit-Branch: master
Gerrit-Owner: Yuvipanda <yuvipa...@gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipe...@gmail.com>
Gerrit-Reviewer: Yuvipanda <yuvipa...@gmail.com>
Gerrit-Reviewer: coren <mpellet...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to