Mwalker has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/91524


Change subject: Tool that quickly breaks cache logs into parts
......................................................................

Tool that quickly breaks cache logs into parts

Change-Id: Ib305830a6aba286ec95665607b9a5fcc5521a616
---
A SquidRipper/logparser.py
1 file changed, 173 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools refs/changes/24/91524/1

diff --git a/SquidRipper/logparser.py b/SquidRipper/logparser.py
new file mode 100644
index 0000000..7976610
--- /dev/null
+++ b/SquidRipper/logparser.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python2
+
+import re
+import sys
+from optparse import OptionParser
+import socket
+import struct
+
+# Regex based on http://wikitech.wikimedia.org/view/Squid_log_format
+cacheRegex = re.compile(
+    r"""
+        ^(?P<server>[\S]+) # Name of the squid server
+        \s[-]*
+        (?P<sequence>[0-9]+) # Sequence ID from the squid server
+        \s
+        (?P<timestamp>[0-9-]+T[0-9:.]+) # Timestamp
+        \s
+        (?P<servicetime>[0-9.]+) # Request service time
+        \s
+        (?P<client>[\S]+) # Client IP address
+        \s
+        (?P<httpstatus>[\S]+) # Squid request status and HTTP status code
+        \s
+        (?P<replysize>[0-9]+) # Reply size including HTTP headers
+        \s
+        (?P<httpmethod>[\S]+) # Request type
+        \s
+        (?P<url>[\S]+) # Request URL
+        \s
+        (?P<squidhierarchy>[\S]+) # Squid hierarchy status, peer IP
+        \s
+        (?P<mime>[\S]+) # MIME content type
+        \s
+        (?P<referrer>[\S]+) # Referer header
+        \s
+        (?P<xff>[\S]+) # X-Forwarded-For header
+        \s
+        (?P<useragent>[\S\s]+) # User-Agent header
+        \s
+        (?P<acceptlanguage>[\S\s]+) # Accept-Language header
+        \s
+        .*$
+    """, re.VERBOSE
+)
+
+# Based on urlparse.urlsplit, which is really slow even though it only handles:
+# <scheme>://<netloc>/<path>?<query>#<fragment>
+# This regex does not replicate all of urlsplit's functionality; it is
+# optimized even further for our purposes.
+urlRegex = re.compile(
+    r"""
+        (?P<urlscheme>http|https)
+        ://
+        (?P<urlhost>(?:(?!/|\?|\#)\S)*)
+        /?
+        (?P<urlpath>(?:(?!\?|\#)\S)*)
+        \??
+        (?P<urlquery>(?:(?!\#)\S)*)
+        \#?
+        (?P<urlfragment>[\S]*)
+    """, re.VERBOSE
+)
+
+# Subnets to ignore when --filterIP is given
+localSubnets = [
+    '10.0.0.0/8',
+    '208.80.152.0/22',
+    '208.80.155.0/27',
+    '91.198.174.0/24',
+]
+
+
+def initIpFilter():
+    global localSubnets
+    subnets = []
+    for subnet in localSubnets:
+        (network, prefix) = subnet.split('/')
+        prefix = int(prefix)
+        # '!L' unpacks in network byte order and is always 4 bytes
+        network = struct.unpack('!L', socket.inet_aton(network))[0]
+        # Netmask with the top <prefix> bits set
+        mask = (0xFFFFFFFF << (32 - prefix)) & 0xFFFFFFFF
+        subnets.append((mask, network & mask))
+    localSubnets = subnets
+
+
+def ipFilter(squidParts):
+    global localSubnets
+
+    ip = struct.unpack('!L', socket.inet_aton(squidParts['client']))[0]
+    for (mask, network) in localSubnets:
+        if ip & mask == network:
+            return True
+    return False
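+# Worked example (illustrative): '208.80.152.0/22' yields mask 0xFFFFFC00,
+# so 208.80.154.7 & 0xFFFFFC00 == 208.80.152.0 and the request is filtered,
+# while 208.80.156.1 falls outside the /22 and is kept.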
+
+
+def sslFilter(squidParts):
+    return squidParts['server'][:3] == 'ssl'
+
+
+def getSquidParts(line):
+    match = cacheRegex.match(line)
+    if match:
+        return match.groupdict()
+    else:
+        raise Exception("Cache regex did not match: %s" % line)
+
+
+def getHttpRequest(squidParts):
+    match = urlRegex.match(squidParts['url'])
+    if match:
+        return match.groupdict()
+    else:
+        raise Exception("URL regex did not match: %s" % squidParts['url'])
+
+
+def getHttpParams(httpParts):
+    elements = httpParts['urlquery'].split('&')
+    elementDict = {}
+    for element in elements:
+        try:
+            # Split on the first '=' only so values containing '=' survive
+            (k, v) = element.split('=', 1)
+        except ValueError:
+            k = element
+            v = None
+        elementDict[k] = v
+    return elementDict
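+# Example (illustrative): a query string of 'title=Foo&action=raw&debug'
+# becomes {'title': 'Foo', 'action': 'raw', 'debug': None}.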
+
+
+
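+# Example invocation (illustrative; file and column names assumed): print the
+# client IP and URL path of every request read from stdin, skipping internal
+# IPs and the SSL frontends:
+#   zcat access.log.gz | python2 logparser.py -i -s client urlpath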
+if __name__ == "__main__":
+    parser = OptionParser(usage="usage: %prog [options] <columns...>")
+    parser.add_option(
+        "-i", "--filterIP", dest='filterIP', action='store_true',
+        default=False, help='Filter out local IPs'
+    )
+    parser.add_option(
+        "-s", "--filterSSL", dest='filterSSL', action='store_true',
+        default=False, help='Filter out initial SSL connections'
+    )
+    (options, args) = parser.parse_args()
+
+    if options.filterIP:
+        initIpFilter()
+
+    # Main application loop
+    for line in sys.stdin:
+        try:
+            squidParts = getSquidParts(line)
+            if options.filterIP and ipFilter(squidParts):
+                continue
+            if options.filterSSL and sslFilter(squidParts):
+                continue
+
+            # OK we're supposed to be here, now do the more expensive matching
+            httpParts = getHttpRequest(squidParts)
+            urlParts = getHttpParams(httpParts)
+
+            # And now compose the output line
+            out = []
+            for k in args:
+                if k in squidParts:
+                    out.append(squidParts[k])
+                elif k in httpParts:
+                    out.append(httpParts[k])
+                elif k in urlParts:
+                    out.append(urlParts[k])
+                else:
+                    out.append('-')  # When we cannot otherwise find anything
+            sys.stdout.write(' '.join(out))
+            sys.stdout.write("\n")
+
+        except Exception as e:
+            sys.stderr.write("Could not process line: %s\n" % e)

-- 
To view, visit https://gerrit.wikimedia.org/r/91524
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib305830a6aba286ec95665607b9a5fcc5521a616
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Mwalker <mwal...@wikimedia.org>
