Ori.livneh has uploaded a new change for review. https://gerrit.wikimedia.org/r/59059
Change subject: Monitor MediaWiki fatals and exceptions in Ganglia ...................................................................... Monitor MediaWiki fatals and exceptions in Ganglia This change extends the EventLogging Puppet module to configure Ganglia monitoring of MediaWiki fatals and exceptions. Change I1632a6b19 configured fluorine to forward MediaWiki fatals and exceptions to vanadium via UDP port 8423. This change configures an EventLogging UDP-to-ZMQ router that publishes the same stream using ZeroMQ on TCP 8423. (ZeroMQ facilitates having multiple subscribers consume the stream; UDP with SO_REUSEADDR does not work well with unicast.) This change also sets up a metric gathering module that reports errors (broken down by type) to Ganglia. Error types are detected using simple substring matching. Port 8423 is hard-coded in three places (twice in this change, once in I1632a6b19), which is unfortunate. Instead of plopping static configuration files in /etc/supervisor, the EventLogging Puppet module should declare parametrized resource types for common patterns, like UDP-to-ZMQ forwarding. I intend to do this sometime in the next month or two. Change-Id: I55450783d018ed7fd7399ee5adf4305af156a59b --- A modules/eventlogging/files/mwerrors.conf A modules/eventlogging/files/mwerrors.py A modules/eventlogging/files/mwerrors.pyconf M modules/eventlogging/manifests/init.pp A modules/eventlogging/manifests/mediawiki_errors.pp 5 files changed, 214 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/59/59059/1 diff --git a/modules/eventlogging/files/mwerrors.conf b/modules/eventlogging/files/mwerrors.conf new file mode 100644 index 0000000..2982121 --- /dev/null +++ b/modules/eventlogging/files/mwerrors.conf @@ -0,0 +1,10 @@ +; Supervisord configuration for 'mwerrors' Ganglia module. +; Managed by Puppet: puppet:///files/eventlogging/mwerrors.conf +; Forward MediaWiki fatals / exceptions to ZeroMQ + +[group:mwerrors] +programs = udp2zmq_8423 + +[program:udp2zmq_8423] +command = udp2zmq 8423 +user = eventlogging diff --git a/modules/eventlogging/files/mwerrors.py b/modules/eventlogging/files/mwerrors.py new file mode 100755 index 0000000..510956f --- /dev/null +++ b/modules/eventlogging/files/mwerrors.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Gmond metric-gathering module for MediaWiki fatals and exceptions + + Reads fatals / exceptions from a ZeroMQ publisher. MediaWiki logs to a file + or a UDP socket, so for this to work you will also need a UDP-to-ZMQ router. + See 'udp2zmq' in EventLogging. + + When invoked by itself, runs a self-test. + + Usage: mwerrors.py tcp://HOST:PORT + + Written by Ori Livneh <o...@wikimedia.org> + +""" +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +import errno +import threading +import time + +import zmq + + +patterns = ( + # Substring to match # Metric # Metric title + ('Fatal error: Out of memory', 'oom', 'Out-of-memory fatals'), + ('Fatal error: Maximum execution time', 'timelimit', 'Time limit fatals'), + ('Fatal error:', 'fatal', 'Miscellaneous fatals'), + ('Exception from', 'exception', 'Exceptions'), + ('Catchable fatal error', 'catchable', 'Catchable fatals'), + ('DatabaseBase->reportQueryError', 'query', 'Query errors'), +) + + +def count_errors(counter, endpoint): + """Count error types in error stream.""" + ctx = zmq.Context.instance() + sock = ctx.socket(zmq.SUB) + sock.connect(endpoint) + sock.setsockopt(zmq.SUBSCRIBE, b'') + + while 1: + try: + line = sock.recv() + for pattern, name, description in patterns: + if pattern in line: + counter[name] += 1 + break + except zmq.ZMQError as e: + # Calls interrupted by EINTR should be re-tried. + if e.errno == errno.EINTR: + continue + raise + + +def metric_init(params): + """ + Initialize; part of Gmond interface + + `params` is a dictionary of configuration options, generated by + Ganglia out of values specified in the module's .pyconf file. It + should contain an 'endpoint' key, specifying the address of the + streaming endpoint. Example: + + param endpoint { + value = 'tcp://127.0.0.1:8423' + } + + """ + endpoint = params['endpoint'] + counter = {name: 0 for pattern, name, description in patterns} + + thread = threading.Thread(target=count_errors, args=(counter, endpoint)) + thread.daemon = True + thread.start() + + time.sleep(2) + + return [{ + 'name': name, + 'value_type': 'uint', + 'format': '%d', + 'units': 'errors', + 'slope': 'positive', + 'time_max': 15, + 'description': description, + 'groups': 'mediawiki', + 'call_back': counter.get, + } for pattern, name, description in patterns] + + +def metric_cleanup(): + """Teardown; part of Gmond interface""" + pass + + +if __name__ == '__main__': + # Self-test: report metrics to stdout every 10 seconds. + import sys + + if len(sys.argv) != 2: + sys.exit('Usage: %s tcp://HOST:PORT' % __file__) + + params = {'endpoint': sys.argv[1]} + metrics = metric_init(params) + + print('Streaming errors from %(endpoint)s...' % params) + + while 1: + print('\n{:-^32}'.format(time.asctime())) + for metric in metrics: + call_back = metric['call_back'] + name = metric['name'] + description = metric['description'] + print('{:.<30}{}'.format(description, call_back(name))) + time.sleep(10) + +# vim: set et ft=python ts=4 sw=4: diff --git a/modules/eventlogging/files/mwerrors.pyconf b/modules/eventlogging/files/mwerrors.pyconf new file mode 100644 index 0000000..6435a29 --- /dev/null +++ b/modules/eventlogging/files/mwerrors.pyconf @@ -0,0 +1,49 @@ +/** + * MediaWiki exceptions & fatals monitoring + * File managed by Puppet: puppet:///files/eventlogging/mwerrors.pyconf + */ + +modules { + module { + name = "mwerrors" + language = "python" + } +} + + +collection_group { + + collect_every = 15 + time_threshold = 30 + + metric { + name = "oom" + title = "Out-of-memory fatals" + value_threshold = 1 + } + metric { + name = "timelimit" + title = "Time limit fatals" + value_threshold = 1 + } + metric { + name = "fatal" + title = "Miscellaneous fatals" + value_threshold = 1 + } + metric { + name = "exception" + title = "Exceptions" + value_threshold = 1 + } + metric { + name = "catchable" + title = "Catchable fatals" + value_threshold = 1 + } + metric { + name = "query" + title = "Query errors" + value_threshold = 1 + } +} diff --git a/modules/eventlogging/manifests/init.pp b/modules/eventlogging/manifests/init.pp index 986ba74..8687a43 100644 --- a/modules/eventlogging/manifests/init.pp +++ b/modules/eventlogging/manifests/init.pp @@ -8,6 +8,8 @@ destinations => [ 'stat1.wikimedia.org' ], } + class { 'eventlogging::mediawiki_errors': } + package { [ 'python-jsonschema', 'python-mysqldb', diff --git a/modules/eventlogging/manifests/mediawiki_errors.pp b/modules/eventlogging/manifests/mediawiki_errors.pp new file mode 100644 index 0000000..2870b84 --- /dev/null +++ b/modules/eventlogging/manifests/mediawiki_errors.pp @@ -0,0 +1,31 @@ +# Monitor MediaWiki errors using Ganglia +class eventlogging::mediawiki_errors { + + file { '/usr/lib/ganglia/python_modules/mwerrors.py': + ensure => present, + source => 'puppet:///modules/eventlogging/mwerrors.py', + require => [ + File['/usr/lib/ganglia/python_modules'], + Package['python-zmq'], + ], + } + + file { '/etc/supervisor/conf.d/mwerrors.conf': + source => 'puppet:///modules/eventlogging/mwerrors.conf', + require => [ Package['supervisor'], Systemuser['eventlogging'] ], + notify => Service['supervisor'], + mode => '0444', + } + + file { '/etc/ganglia/conf.d/mwerrors.pyconf': + ensure => present, + source => 'puppet:///modules/eventlogging/mwerrors.pyconf', + require => [ + File['/etc/ganglia/conf.d'], + File['/usr/lib/ganglia/python_modules/mwerrors.py'], + File['/etc/supervisor/conf.d/mwerrors.conf'], + ], + notify => Service[gmond], + } + +} -- To view, visit https://gerrit.wikimedia.org/r/59059 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I55450783d018ed7fd7399ee5adf4305af156a59b Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ori.livneh <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits