jenkins-bot has submitted this change and it was merged. Change subject: Add support for "%{fieldName}i"-style format specifiers ......................................................................
Add support for "%{fieldName}i"-style format specifiers We need to add a user-agent field matcher to the cilent-side event parser's format string, but we don't have a specifier for it. We could pick an unassigned letter and turn it into a user-agent format specifier (e.g. '%a'). But that's ad-hoc. It's nicer to support varnishncsa's notion of naming capturing groups via squiggly brackets. Thus, introduce the following format specifiers: * "%{..}i": Tab-delimited string * "%{..}s": Space-delimited string * "%{..}d": Integer ..where '..' represents the desired name for the capturing group. With this change, we can deprecate %l and %n, so do that too. As an example, we can replace this format string: %q %l %n %t %h With this one: %q %{recvFrom}s %{seqId}d %t %h %{userAgent}i Change-Id: I07c6c785648397c2fe51247edb27653885c0cbed --- M server/eventlogging/parse.py M server/eventlogging/schema.py M server/tests/test_parser.py 3 files changed, 40 insertions(+), 15 deletions(-) Approvals: Ori.livneh: Looks good to me, approved jenkins-bot: Verified diff --git a/server/eventlogging/parse.py b/server/eventlogging/parse.py index 3e867b9..41cc4f6 100644 --- a/server/eventlogging/parse.py +++ b/server/eventlogging/parse.py @@ -19,14 +19,24 @@ +--------+-----------------------------+ | %j | JSON object | +--------+-----------------------------+ - | %l | Hostname of origin | + | %l | Hostname of origin* | +--------+-----------------------------+ - | %n | Sequence ID | + | %n | Sequence ID* | +--------+-----------------------------+ | %q | Query-string-encoded JSON | +--------+-----------------------------+ - | %t | Timestamp in NCSA format. | + | %t | Timestamp in NCSA format | +--------+-----------------------------+ + | %{..}i | Tab-delimited string | + +--------+-----------------------------+ + | %{..}s | Space-delimited string | + +--------+-----------------------------+ + | %{..}d | Integer | + +--------+-----------------------------+ + + '..' is the desired property name for the capturing group. + + *: Deprecated; use "%{..}x"-style format specifiers instead. """ from __future__ import division, unicode_literals @@ -108,13 +118,18 @@ #: A mapping of format specifiers to a tuple of (regexp, caster). format_specifiers = { - '%h': (r'(?P<clientIp>\S+)', hash_ip), - '%j': (r'(?P<capsule>\S+)', json.loads), - '%l': (r'(?P<recvFrom>\S+)', str), - '%n': (r'(?P<seqId>\d+)', int), - '%q': (r'(?P<capsule>\?\S+)', decode_qson), - '%t': (r'(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', - ncsa_to_epoch), + 'h': (r'(?P<clientIp>\S+)', hash_ip), + 'j': (r'(?P<capsule>\S+)', json.loads), + 's': (r'(?P<%s>\S+)', str), + 'i': (r'(?P<%s>[^\t]+)', str), + 'd': (r'(?P<%s>\d+)', int), + 'q': (r'(?P<capsule>\?\S+)', decode_qson), + 't': (r'(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', + ncsa_to_epoch), + + #: Deprecated format specifiers: + 'l': (r'(?P<recvFrom>\S+)', str), + 'n': (r'(?P<seqId>\d+)', int), } @@ -134,13 +149,17 @@ #: Compiled regexp. format = re.sub(' ', r'\s+', format) - self.re = re.compile(re.sub(r'(?<!%)%[hjlnqt]', self._repl, format)) + raw = re.sub(r'(?<!%)%({(\w+)})?([dhijlnqst])', self._repl, format) + self.re = re.compile(raw) def _repl(self, spec): """Replace a format specifier with its expanded regexp matcher and append its caster to the list. Called by :func:`re.sub`. """ - matcher, caster = format_specifiers[spec.group()] + _, name, specifier = spec.groups() + matcher, caster = format_specifiers[specifier] + if name: + matcher = matcher % name self.casters.append(caster) return matcher diff --git a/server/eventlogging/schema.py b/server/eventlogging/schema.py index d7c98ae..67dd318 100644 --- a/server/eventlogging/schema.py +++ b/server/eventlogging/schema.py @@ -72,6 +72,9 @@ # KeyError exception will be raised. We re-raise it as a # :exc:`ValidationError` to provide a simpler API for callers. raise jsonschema.ValidationError('Missing key: %s' % ex) + if capsule['revision'] < 1: + raise jsonschema.ValidationError( + 'Invalid revision ID: %(revision)s' % capsule) schema = get_schema(scid, encapsulate=True) jsonschema.Draft3Validator.check_schema(schema) jsonschema.Draft3Validator(schema).validate(capsule) diff --git a/server/tests/test_parser.py b/server/tests/test_parser.py index df7ad8e..54b8ce9 100644 --- a/server/tests/test_parser.py +++ b/server/tests/test_parser.py @@ -31,14 +31,16 @@ maxDiff = None def test_parse_client_side_events(self): - """Parser test: client-side events (%q %l %n %t %h).""" - parser = eventlogging.LogParser('%q %l %n %t %h') + """Parser test: client-side events + (%q %{recvFrom}s %{seqId}d %t %h %{userAgent}i).""" + parser = eventlogging.LogParser('%q %{recvFrom}s %{seqId}d %t %h' + '%{userAgent}i') raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic' '%22%2C%22revision%22%3A13%2C%22clientValidated%22%3Atrue%2C' '%22event%22%3A%7B%22articleId%22%3A1%2C%22articleTitle%22%3' 'A%22H%C3%A9ctor%20Elizondo%22%7D%2C%22webHost%22%3A%22test.' 'wikipedia.org%22%7D; cp3022.esams.wikimedia.org 132073 2013' - '-01-19T23:16:38 86.149.229.149') + '-01-19T23:16:38 86.149.229.149 Mozilla/5.0') parsed = { 'uuid': '799341a01ba957c79b15dc4d2d950864', 'recvFrom': 'cp3022.esams.wikimedia.org', @@ -50,6 +52,7 @@ 'clientIp': eventlogging.parse.hash_ip('86.149.229.149'), 'schema': 'Generic', 'revision': 13, + 'userAgent': 'Mozilla/5.0', 'event': { 'articleTitle': 'Héctor Elizondo', 'articleId': 1 -- To view, visit https://gerrit.wikimedia.org/r/113457 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I07c6c785648397c2fe51247edb27653885c0cbed Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/extensions/EventLogging Gerrit-Branch: master Gerrit-Owner: Ori.livneh <o...@wikimedia.org> Gerrit-Reviewer: Mattflaschen <mflasc...@wikimedia.org> Gerrit-Reviewer: Nuria <nu...@wikimedia.org> Gerrit-Reviewer: Ori.livneh <o...@wikimedia.org> Gerrit-Reviewer: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits