Ottomata has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/389722 )
Change subject: [WIP] EventLogging analytics capsule discrepancy fixes ...................................................................... [WIP] EventLogging analytics capsule discrepancy fixes Bug: T179625 Change-Id: I3d5f8650c416c71f7b8cc904a58f979c852a723f --- D modules/eventlogging/files/filters.py A modules/eventlogging/files/plugins.py M modules/role/manifests/eventlogging/analytics/mysql.pp M modules/role/manifests/eventlogging/analytics/processor.pp M modules/role/manifests/eventlogging/analytics/server.pp 5 files changed, 89 insertions(+), 32 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/22/389722/1 diff --git a/modules/eventlogging/files/filters.py b/modules/eventlogging/files/filters.py deleted file mode 100644 index e411a89..0000000 --- a/modules/eventlogging/files/filters.py +++ /dev/null @@ -1,23 +0,0 @@ -import json - - -def should_insert_event(e): - """ - Given an Event dict e, returns true if this event should be inserted into the - EventLogging storage (MySQL), or false otherwise. This is used - to filter out events generated by unwanted bots. - """ - # If no userAgent information, then insert anyway. - if 'userAgent' not in e: - return True - - user_agent_dict = json.loads(e['userAgent']) - - is_bot = user_agent_dict.get('is_bot', False) - is_mediawiki = user_agent_dict.get('is_mediawiki', False) - - # Don't insert events generated by bots unless they are mediawiki bots. - if is_bot and not is_mediawiki: - return False - else: - return True diff --git a/modules/eventlogging/files/plugins.py b/modules/eventlogging/files/plugins.py new file mode 100644 index 0000000..4850fd8 --- /dev/null +++ b/modules/eventlogging/files/plugins.py @@ -0,0 +1,74 @@ +import json +import dateutil.parser +from datetime import datetime +import unittest + + +# Format string for :func:`datetime.datetime.strptime` for MediaWiki +# timestamps. See `<https://www.mediawiki.org/wiki/Manual:Timestamp>`_. 
+MEDIAWIKI_TIMESTAMP_FORMAT = '%Y%m%d%H%M%S' +def inject_mediawiki_timestamp(e): + """ + Convert dt to backwards compatible Mediawiki timestamp field. + If dt is not in event, use current time. T179540 + """ + + if 'dt' in e: + dt = dateutil.parser.parse(e['dt']) + else: + dt = datetime.utcnow() + + e['timestamp'] = dt.strftime(MEDIAWIKI_TIMESTAMP_FORMAT) + return e + + +def mysql_mapper(e): + """ + The WMF EventLogging Analytics MySQL log database has a lot of curious + legacy compatibility problems. This function converts an event + to a format that the MySQL database expects. + """ + if 'userAgent' in e and isinstance(e['userAgent'], dict): + # Get rid of unwanted bots. T67508 + is_bot = e['userAgent'].get('is_bot', False) + is_mediawiki = e['userAgent'].get('is_mediawiki', False) + # Don't insert events generated by bots unless they are mediawiki bots. + if is_bot and not is_mediawiki: + # Returning None will cause map:// + # reader to exclude this event. + return None + + # MySQL expects that userAgent is a string, so we + # convert it to JSON string now. T153207 + e['userAgent'] = json.dumps(e['userAgent']) + + # Historically, EventCapsule did not have `dt` so we remove it from + # insertion into MySQL. 
+ if 'dt' in e: + del e['dt'] + + return e + + +# ##### Tests ###### +# To run: +# python -m unittest -v plugins.py +# Or: +# python plugins.py +# +class TestEventLoggingPlugins(unittest.TestCase): + def test_inject_mediawiki_timestamp(self): + e = {'dt': '2017-11-01T11:00:00', 'userAgent': {}} + should_be = {'dt': '2017-11-01T11:00:00', 'timestamp': '20171101110000', 'userAgent': {}} + self.assertEqual(inject_mediawiki_timestamp(e), should_be) + + def test_mysql_mapper(self): + e1 = {'dt': '2017-11-01T11:00:00', 'timestamp': '20171101110000', 'userAgent': {'browser_family': 'Chrome'}} + should_be1 = {'timestamp': '20171101110000', 'userAgent': '{"browser_family": "Chrome"}'} + self.assertEqual(mysql_mapper(e1), should_be1) + + e2 = {'dt': '2017-11-01T11:00:00', 'timestamp': '20171101110000', 'userAgent': {'is_bot': True}} + self.assertEqual(mysql_mapper(e2), None) + +if __name__ == '__main__': + unittest.main(verbosity=2) \ No newline at end of file diff --git a/modules/role/manifests/eventlogging/analytics/mysql.pp b/modules/role/manifests/eventlogging/analytics/mysql.pp index 8ba8972..cac5874 100644 --- a/modules/role/manifests/eventlogging/analytics/mysql.pp +++ b/modules/role/manifests/eventlogging/analytics/mysql.pp @@ -25,10 +25,6 @@ labs => '127.0.0.1/log', } - eventlogging::plugin { 'filters': - source => 'puppet:///modules/eventlogging/filters.py', - } - # Run N parallel mysql consumers processors. # These will auto balance amongst themselves. $mysql_consumers = hiera( @@ -63,16 +59,16 @@ # For beta cluster, set in https://wikitech.wikimedia.org/wiki/Hiera:Deployment-prep $statsd_host = hiera('eventlogging_statsd_host', 'statsd.eqiad.wmnet') - # Filtering function to use on events consumed by mysql - $filter_function = '&function=should_insert_event' + # Map function to use on events consumed by mysql. 
T179625 + $map_function = '&function=mysql_mapper' - # Custom URI scheme to pass events through filter - $filter_scheme = 'filter://' + # Custom URI scheme to pass events through map function + $map_scheme = 'map://' # Kafka consumer group for this consumer is mysql-m4-master eventlogging::service::consumer { $mysql_consumers: # auto commit offsets to kafka more often for mysql consumer - input => "${filter_scheme}${kafka_consumer_uri}&auto_commit_interval_ms=1000${kafka_api_version_param}${filter_function}", + input => "${map_scheme}${kafka_consumer_uri}&auto_commit_interval_ms=1000${kafka_api_version_param}${map_function}", output => "mysql://${mysql_user}:${mysql_pass}@${mysql_db}?charset=utf8&statsd_host=${statsd_host}&replace=True", sid => 'eventlogging_consumer_mysql_00', # Restrict permissions on this config file since it contains a password. diff --git a/modules/role/manifests/eventlogging/analytics/processor.pp b/modules/role/manifests/eventlogging/analytics/processor.pp index 24e4b1b..9d42e32 100644 --- a/modules/role/manifests/eventlogging/analytics/processor.pp +++ b/modules/role/manifests/eventlogging/analytics/processor.pp @@ -54,6 +54,12 @@ default => "&api_version=${kafka_api_version}" } + # Inject mediawiki_timestamp into this data for backwards compatibility. T179625 + $map_function = '&function=inject_mediawiki_timestamp' + + # Custom URI scheme to pass events through map function + $map_scheme = 'map://' + # Increase number and backoff time of retries for async # analytics uses. If metadata changes, we should give # more time to retry. 
NOTE: testing this in production diff --git a/modules/role/manifests/eventlogging/analytics/server.pp b/modules/role/manifests/eventlogging/analytics/server.pp index 219f01f..2c78208 100644 --- a/modules/role/manifests/eventlogging/analytics/server.pp +++ b/modules/role/manifests/eventlogging/analytics/server.pp @@ -35,6 +35,10 @@ $kafka_mixed_uri = "${kafka_consumer_scheme}/${kafka_brokers_string}?topic=eventlogging-valid-mixed" $kafka_client_side_raw_uri = "${kafka_consumer_scheme}/${kafka_brokers_string}?topic=eventlogging-client-side" + eventlogging::plugin { 'plugins': + source => 'puppet:///modules/eventlogging/plugins.py', + } + # This check was written for eventlog1001, so only include it there., if $::hostname == 'eventlog1001' { -- To view, visit https://gerrit.wikimedia.org/r/389722 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3d5f8650c416c71f7b8cc904a58f979c852a723f Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <ao...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits