Ottomata has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/389722 )

Change subject: [WIP] EventLogging analytics capsule discrepancy fixes
......................................................................

[WIP] EventLogging analytics capsule discrepancy fixes

Bug: T179625
Change-Id: I3d5f8650c416c71f7b8cc904a58f979c852a723f
---
D modules/eventlogging/files/filters.py
A modules/eventlogging/files/plugins.py
M modules/role/manifests/eventlogging/analytics/mysql.pp
M modules/role/manifests/eventlogging/analytics/processor.pp
M modules/role/manifests/eventlogging/analytics/server.pp
5 files changed, 89 insertions(+), 32 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/22/389722/1

diff --git a/modules/eventlogging/files/filters.py 
b/modules/eventlogging/files/filters.py
deleted file mode 100644
index e411a89..0000000
--- a/modules/eventlogging/files/filters.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import json
-
-
-def should_insert_event(e):
-    """
-    Given an Event dict e, returns true if this event should be inserted into 
the
-    EventLogging storage (MySQL), or false otherwise.  This is used
-    to filter out events generated by unwanted bots.
-    """
-    # If no userAgent information, then insert anyway.
-    if 'userAgent' not in e:
-        return True
-
-    user_agent_dict = json.loads(e['userAgent'])
-
-    is_bot = user_agent_dict.get('is_bot', False)
-    is_mediawiki = user_agent_dict.get('is_mediawiki', False)
-
-    # Don't insert events generated by bots unless they are mediawiki bots.
-    if is_bot and not is_mediawiki:
-        return False
-    else:
-        return True
diff --git a/modules/eventlogging/files/plugins.py 
b/modules/eventlogging/files/plugins.py
new file mode 100644
index 0000000..4850fd8
--- /dev/null
+++ b/modules/eventlogging/files/plugins.py
@@ -0,0 +1,74 @@
+import json
+import dateutil.parser
+from datetime import datetime
+import unittest
+
+
+# Format string for :func:`datetime.datetime.strptime` for MediaWiki
+# timestamps. See `<https://www.mediawiki.org/wiki/Manual:Timestamp>`_.
+MEDIAWIKI_TIMESTAMP_FORMAT = '%Y%m%d%H%M%S'
+def inject_mediawiki_timestamp(e):
+    """
+    Convert dt to backwards compatible Mediawiki timestamp field.
+    If dt is not in event, use current time. T179540
+    """
+
+    if 'dt' in e:
+        dt = dateutil.parser.parse(e['dt'])
+    else:
+        dt = datetime.utcnow()
+
+    e['timestamp'] = dt.strftime(MEDIAWIKI_TIMESTAMP_FORMAT)
+    return e
+
+
+def mysql_mapper(e):
+    """
+    The WMF EventLogging Analytics MySQL log database has a lot of curious
+    legacy compatibility problems.  This function converts an event
+    to a format that the MySQL database expects.
+    """
+    if 'userAgent' in e and isinstance(e['userAgent'], dict):
+        # Get rid of unwanted bots. T67508
+        is_bot = e['userAgent'].get('is_bot', False)
+        is_mediawiki = e['userAgent'].get('is_mediawiki', False)
+        # Don't insert events generated by bots unless they are mediawiki bots.
+        if is_bot and not is_mediawiki:
+            # Returning None will cause map://
+            # reader to exclude this event.
+            return None
+
+        # MySQL expects that userAgent is a string, so we
+        # convert it to JSON string now.  T153207
+        e['userAgent'] = json.dumps(e['userAgent'])
+
+    # Historically, EventCapsule did not have `dt`, so we remove it before
+    # insertion into MySQL.
+    if 'dt' in e:
+        del e['dt']
+
+    return e
+
+
+# ##### Tests ######
+# To run:
+#   python -m unittest -v plugins.py
+# Or:
+#   python plugins.py
+#
+class TestEventLoggingPlugins(unittest.TestCase):
+    def test_inject_mediawiki_timestamp(self):
+        e = {'dt': '2017-11-01T11:00:00', 'userAgent': {}}
+        should_be = {'dt': '2017-11-01T11:00:00', 'timestamp': 
'20171101110000', 'userAgent': {}}
+        self.assertEqual(inject_mediawiki_timestamp(e), should_be)
+
+    def test_mysql_mapper(self):
+        e1 = {'dt': '2017-11-01T11:00:00', 'timestamp': '20171101110000', 
'userAgent': {'browser_family': 'Chrome'}}
+        should_be1 = {'timestamp': '20171101110000', 'userAgent': 
'{"browser_family": "Chrome"}'}
+        self.assertEqual(mysql_mapper(e1), should_be1)
+
+        e2 = {'dt': '2017-11-01T11:00:00', 'timestamp': '20171101110000', 
'userAgent': {'is_bot': True}}
+        self.assertEqual(mysql_mapper(e2), None)
+
+if __name__ == '__main__':
+    unittest.main(verbosity=2)
\ No newline at end of file
diff --git a/modules/role/manifests/eventlogging/analytics/mysql.pp 
b/modules/role/manifests/eventlogging/analytics/mysql.pp
index 8ba8972..cac5874 100644
--- a/modules/role/manifests/eventlogging/analytics/mysql.pp
+++ b/modules/role/manifests/eventlogging/analytics/mysql.pp
@@ -25,10 +25,6 @@
         labs       => '127.0.0.1/log',
     }
 
-    eventlogging::plugin { 'filters':
-        source => 'puppet:///modules/eventlogging/filters.py',
-    }
-
     # Run N parallel mysql consumers processors.
     # These will auto balance amongst themselves.
     $mysql_consumers = hiera(
@@ -63,16 +59,16 @@
     # For beta cluster, set in 
https://wikitech.wikimedia.org/wiki/Hiera:Deployment-prep
     $statsd_host          = hiera('eventlogging_statsd_host', 
'statsd.eqiad.wmnet')
 
-    # Filtering function to use on events consumed by mysql
-    $filter_function      = '&function=should_insert_event'
+    # Map function to use on events consumed by mysql. T179625
+    $map_function      = '&function=mysql_mapper'
 
-    # Custom URI scheme to pass events through filter
-    $filter_scheme        = 'filter://'
+    # Custom URI scheme to pass events through map function
+    $map_scheme        = 'map://'
 
     # Kafka consumer group for this consumer is mysql-m4-master
     eventlogging::service::consumer { $mysql_consumers:
         # auto commit offsets to kafka more often for mysql consumer
-        input  => 
"${filter_scheme}${kafka_consumer_uri}&auto_commit_interval_ms=1000${kafka_api_version_param}${filter_function}",
+        input  => 
"${map_scheme}${kafka_consumer_uri}&auto_commit_interval_ms=1000${kafka_api_version_param}${map_function}",
         output => 
"mysql://${mysql_user}:${mysql_pass}@${mysql_db}?charset=utf8&statsd_host=${statsd_host}&replace=True",
         sid    => 'eventlogging_consumer_mysql_00',
         # Restrict permissions on this config file since it contains a 
password.
diff --git a/modules/role/manifests/eventlogging/analytics/processor.pp 
b/modules/role/manifests/eventlogging/analytics/processor.pp
index 24e4b1b..9d42e32 100644
--- a/modules/role/manifests/eventlogging/analytics/processor.pp
+++ b/modules/role/manifests/eventlogging/analytics/processor.pp
@@ -54,6 +54,12 @@
         default => "&api_version=${kafka_api_version}"
     }
 
+    # Inject mediawiki_timestamp into this data for backwards compatibility. 
T179625
+    $map_function      = '&function=inject_mediawiki_timestamp'
+
+    # Custom URI scheme to pass events through map function
+    $map_scheme        = 'map://'
+
     # Increase number and backoff time of retries for async
     # analytics uses.  If metadata changes, we should give
     # more time to retry. NOTE: testing this in production
diff --git a/modules/role/manifests/eventlogging/analytics/server.pp 
b/modules/role/manifests/eventlogging/analytics/server.pp
index 219f01f..2c78208 100644
--- a/modules/role/manifests/eventlogging/analytics/server.pp
+++ b/modules/role/manifests/eventlogging/analytics/server.pp
@@ -35,6 +35,10 @@
     $kafka_mixed_uri = 
"${kafka_consumer_scheme}/${kafka_brokers_string}?topic=eventlogging-valid-mixed"
     $kafka_client_side_raw_uri = 
"${kafka_consumer_scheme}/${kafka_brokers_string}?topic=eventlogging-client-side"
 
+    eventlogging::plugin { 'plugins':
+        source => 'puppet:///modules/eventlogging/plugins.py',
+    }
+
     # This check was written for eventlog1001, so only include it there.,
     if $::hostname == 'eventlog1001' {
 

-- 
To view, visit https://gerrit.wikimedia.org/r/389722
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3d5f8650c416c71f7b8cc904a58f979c852a723f
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <ao...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to