jenkins-bot has submitted this change and it was merged.

Change subject: Add support for "%{fieldName}i"-style format specifiers
......................................................................


Add support for "%{fieldName}i"-style format specifiers

We need to add a user-agent field matcher to the cilent-side event parser's
format string, but we don't have a specifier for it. We could pick an
unassigned letter and turn it into a user-agent format specifier (e.g. '%a').
But that's ad-hoc. It's nicer to support varnishncsa's notion of naming
capturing groups via squiggly brackets.

Thus, introduce the following format specifiers:

* "%{..}i": Tab-delimited string
* "%{..}s": Space-delimited string
* "%{..}d": Integer

..where '..' represents the desired name for the capturing group.
With this change, we can deprecate %l and %n, so do that too.

As an example, we can replace this format string:

    %q %l %n %t %h

With this one:

    %q %{recvFrom}s %{seqId}d %t %h %{userAgent}i

Change-Id: I07c6c785648397c2fe51247edb27653885c0cbed
---
M server/eventlogging/parse.py
M server/eventlogging/schema.py
M server/tests/test_parser.py
3 files changed, 40 insertions(+), 15 deletions(-)

Approvals:
  Ori.livneh: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/server/eventlogging/parse.py b/server/eventlogging/parse.py
index 3e867b9..41cc4f6 100644
--- a/server/eventlogging/parse.py
+++ b/server/eventlogging/parse.py
@@ -19,14 +19,24 @@
   +--------+-----------------------------+
   |   %j   | JSON object                 |
   +--------+-----------------------------+
-  |   %l   | Hostname of origin          |
+  |   %l   | Hostname of origin*         |
   +--------+-----------------------------+
-  |   %n   | Sequence ID                 |
+  |   %n   | Sequence ID*                |
   +--------+-----------------------------+
   |   %q   | Query-string-encoded JSON   |
   +--------+-----------------------------+
-  |   %t   | Timestamp in NCSA format.   |
+  |   %t   | Timestamp in NCSA format    |
   +--------+-----------------------------+
+  | %{..}i | Tab-delimited string        |
+  +--------+-----------------------------+
+  | %{..}s | Space-delimited string      |
+  +--------+-----------------------------+
+  | %{..}d | Integer                     |
+  +--------+-----------------------------+
+
+   '..' is the desired property name for the capturing group.
+
+   *: Deprecated; use "%{..}x"-style format specifiers instead.
 
 """
 from __future__ import division, unicode_literals
@@ -108,13 +118,18 @@
 
 #: A mapping of format specifiers to a tuple of (regexp, caster).
 format_specifiers = {
-    '%h': (r'(?P<clientIp>\S+)', hash_ip),
-    '%j': (r'(?P<capsule>\S+)', json.loads),
-    '%l': (r'(?P<recvFrom>\S+)', str),
-    '%n': (r'(?P<seqId>\d+)', int),
-    '%q': (r'(?P<capsule>\?\S+)', decode_qson),
-    '%t': (r'(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})',
-           ncsa_to_epoch),
+    'h': (r'(?P<clientIp>\S+)', hash_ip),
+    'j': (r'(?P<capsule>\S+)', json.loads),
+    's': (r'(?P<%s>\S+)', str),
+    'i': (r'(?P<%s>[^\t]+)', str),
+    'd': (r'(?P<%s>\d+)', int),
+    'q': (r'(?P<capsule>\?\S+)', decode_qson),
+    't': (r'(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})',
+          ncsa_to_epoch),
+
+    #: Deprecated format specifiers:
+    'l': (r'(?P<recvFrom>\S+)', str),
+    'n': (r'(?P<seqId>\d+)', int),
 }
 
 
@@ -134,13 +149,17 @@
 
         #: Compiled regexp.
         format = re.sub(' ', r'\s+', format)
-        self.re = re.compile(re.sub(r'(?<!%)%[hjlnqt]', self._repl, format))
+        raw = re.sub(r'(?<!%)%({(\w+)})?([dhijlnqst])', self._repl, format)
+        self.re = re.compile(raw)
 
     def _repl(self, spec):
         """Replace a format specifier with its expanded regexp matcher
         and append its caster to the list. Called by :func:`re.sub`.
         """
-        matcher, caster = format_specifiers[spec.group()]
+        _, name, specifier = spec.groups()
+        matcher, caster = format_specifiers[specifier]
+        if name:
+            matcher = matcher % name
         self.casters.append(caster)
         return matcher
 
diff --git a/server/eventlogging/schema.py b/server/eventlogging/schema.py
index d7c98ae..67dd318 100644
--- a/server/eventlogging/schema.py
+++ b/server/eventlogging/schema.py
@@ -72,6 +72,9 @@
         # KeyError exception will be raised. We re-raise it as a
         # :exc:`ValidationError` to provide a simpler API for callers.
         raise jsonschema.ValidationError('Missing key: %s' % ex)
+    if capsule['revision'] < 1:
+        raise jsonschema.ValidationError(
+            'Invalid revision ID: %(revision)s' % capsule)
     schema = get_schema(scid, encapsulate=True)
     jsonschema.Draft3Validator.check_schema(schema)
     jsonschema.Draft3Validator(schema).validate(capsule)
diff --git a/server/tests/test_parser.py b/server/tests/test_parser.py
index df7ad8e..54b8ce9 100644
--- a/server/tests/test_parser.py
+++ b/server/tests/test_parser.py
@@ -31,14 +31,16 @@
     maxDiff = None
 
     def test_parse_client_side_events(self):
-        """Parser test: client-side events (%q %l %n %t %h)."""
-        parser = eventlogging.LogParser('%q %l %n %t %h')
+        """Parser test: client-side events
+        (%q %{recvFrom}s %{seqId}d %t %h %{userAgent}i)."""
+        parser = eventlogging.LogParser('%q %{recvFrom}s %{seqId}d %t %h'
+                                        '%{userAgent}i')
         raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic'
                '%22%2C%22revision%22%3A13%2C%22clientValidated%22%3Atrue%2C'
                '%22event%22%3A%7B%22articleId%22%3A1%2C%22articleTitle%22%3'
                'A%22H%C3%A9ctor%20Elizondo%22%7D%2C%22webHost%22%3A%22test.'
                'wikipedia.org%22%7D; cp3022.esams.wikimedia.org 132073 2013'
-               '-01-19T23:16:38 86.149.229.149')
+               '-01-19T23:16:38 86.149.229.149 Mozilla/5.0')
         parsed = {
             'uuid': '799341a01ba957c79b15dc4d2d950864',
             'recvFrom': 'cp3022.esams.wikimedia.org',
@@ -50,6 +52,7 @@
             'clientIp': eventlogging.parse.hash_ip('86.149.229.149'),
             'schema': 'Generic',
             'revision': 13,
+            'userAgent': 'Mozilla/5.0',
             'event': {
                 'articleTitle': 'Héctor Elizondo',
                 'articleId': 1

-- 
To view, visit https://gerrit.wikimedia.org/r/113457
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I07c6c785648397c2fe51247edb27653885c0cbed
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/EventLogging
Gerrit-Branch: master
Gerrit-Owner: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: Mattflaschen <mflasc...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org>
Gerrit-Reviewer: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to