jenkins-bot has submitted this change and it was merged. Change subject: Handle webservice calls erroring out ......................................................................
Handle webservice calls erroring out - Log errors and also push them to graphite - Add a 15s timeout to each webservice call Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1 --- M tools/manifest/servicemonitor.py 1 file changed, 21 insertions(+), 12 deletions(-) Approvals: Yuvipanda: Looks good to me, approved jenkins-bot: Verified diff --git a/tools/manifest/servicemonitor.py b/tools/manifest/servicemonitor.py index 6b65571..a3912fc 100644 --- a/tools/manifest/servicemonitor.py +++ b/tools/manifest/servicemonitor.py @@ -11,14 +11,25 @@ def _start_webservice(self, manifest): self.log.info('Starting webservice for tool %s', manifest.tool.name) - return subprocess.check_output([ - '/usr/bin/sudo', - '-i', '-u', manifest.tool.username, - '/usr/local/bin/webservice', - '--release', manifest.webservice_release, - manifest.webservice_server, - 'start', - ]) + try: + subprocess.check_output([ + '/usr/bin/sudo', + '-i', '-u', manifest.tool.username, + '/usr/local/bin/webservice', + '--release', manifest.webservice_release, + manifest.webservice_server, + 'start', + ], timeout=15) # 15 second timeout! + self.log.info('Started webservice for %s', manifest.tool.name) + return True + except subprocess.CalledProcessError as e: + self.log.exception('Could not start webservice for tool %s', manifest.tool.name) + self.stats.incr('webservice_startfailed') + manifest.tool.log('Could not start webservice - webservice tool exited with error code %s' % e.returncode) + except subprocess.TimeoutExpired: + self.log.exception('Timed out attempting to start webservice for tool %s', manifest.tool.name) + self.stats.incr('webservice_startfailed') + manifest.tool.log('Timed out attempting to start webservice (15s)') def run(self): qstat_xml = ET.fromstring(subprocess.check_output(['/usr/bin/qstat', '-u', '*', '-xml'])) @@ -28,11 +39,9 @@ continue job = qstat_xml.find('.//job_list[JB_name="%s"]' % self._webjob_name(manifest)) if job is None or 'r' not in job.findtext('.//state'): - self._start_webservice(manifest) manifest.tool.log('No running webservice job found, starting it') - self.log.info('Started webservice for %s', manifest.tool.name) - self.stats.incr('webservice.%s.restarted' % manifest.tool.name) - restarts_count += 1 + if self._start_webservice(manifest): + restarts_count += 1 self.log.info('Service monitor run completed, %s webservices restarted', restarts_count) self.stats.incr('webservices_restarted', restarts_count) -- To view, visit https://gerrit.wikimedia.org/r/202342 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie53664655def94ebd924d56dec1e00040be7ecd1 Gerrit-PatchSet: 3 Gerrit-Project: operations/software/tools-manifest Gerrit-Branch: master Gerrit-Owner: Yuvipanda <yuvipa...@gmail.com> Gerrit-Reviewer: Legoktm <legoktm.wikipe...@gmail.com> Gerrit-Reviewer: Yuvipanda <yuvipa...@gmail.com> Gerrit-Reviewer: coren <mpellet...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits