Inductiveload has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/99642


Change subject: Add an interface module to the WikidataQuery API, along with 
ways to generate the queries programmatically. Tests included.
......................................................................

Add an interface module to the WikidataQuery API, along with ways to generate 
the queries programmatically. Tests included.

Change-Id: Id1dd2c48c65b9bfb877ec10ad1b8ea69aa00a39c
---
A pywikibot/data/query.py
A tests/wikidataquery_tests.py
2 files changed, 670 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/42/99642/1

diff --git a/pywikibot/data/query.py b/pywikibot/data/query.py
new file mode 100644
index 0000000..8cb4bbc
--- /dev/null
+++ b/pywikibot/data/query.py
@@ -0,0 +1,467 @@
+# -*- coding: utf-8  -*-
+"""
+Objects representing WikidataQuery query syntax and API
+"""
+#
+# (C) Pywikipedia bot team, 2013
+#
+# Distributed under the terms of the MIT license.
+
+import json
+import urllib2
+import pickle
+import os
+import sha
+import time
+import tempfile
+
+
+def listify(x):
+    """
+    If given a non-list, encapsulate it in a single-element list
+    """
+    return x if isinstance(x, list) else [x]
+
+
+class QuerySet():
+    """
+    A QuerySet represents a set of queries or other query sets, joined
+    by operators (AND and OR).
+
+    A QuerySet stores this information as a list of Query(Sets) and
+    a joiner operator to join them all together
+    """
+
+    def __init__(self, q):
+        """
+        Initialise a query set from a Query or another QuerySet
+        """
+        self.qs = [q]
+
+    def addJoiner(self, args, joiner):
+        """
+        Add to this QuerySet using the given joiner.
+
+        @return a new query set representing the joining of this one and
+            the arguments
+        """
+
+        '''If the given joiner is not the same as we used before in
+        this QuerySet, nest the current one in parens before joining
+        - this makes the implicit grouping of the
+        API explicit.'''
+        if len(self.qs) >1 and joiner != self.joiner:
+            left = QuerySet(self)
+        else:
+            left = self
+
+        left.joiner = joiner
+
+        for a in listify(args):
+            left.qs.append(a)
+
+        return left
+
+    def AND(self, args):
+        """
+        Add the given args (Queries or QuerySets) to the Query set as a
+        logical conjunction (AND)
+        """
+        return self.addJoiner(args, "AND")
+
+    def OR(self, args):
+        """
+        Add the given args (Queries or QuerySets) to the Query set as a
+        logical disjunction (OR)
+        """
+        return self.addJoiner(args, "OR")
+
+    def __str__(self):
+        """
+        Output as an API-ready string
+        """
+
+        def bracketIfQuerySet(q):
+            if isinstance(q, QuerySet) and q.joiner != self.joiner:
+                return "(%s)" % q
+            else:
+                return str(q)
+
+        s = bracketIfQuerySet(self.qs[0])
+
+        for q in self.qs[1:]:
+            s += " %s %s" % (self.joiner, bracketIfQuerySet(q))
+
+        return s
+
+
+class Query():
+    """
+    A query is a single query for the WikidataQuery API, for example
+    claim[100:60] or link[enwiki]
+
+    Construction of a Query can throw a TypeError if you feed it bad
+    parameters. Exactly what these need to be depends on the Query
+    """
+
+    def AND(self, ands):
+        """
+        Produce a query set ANDing this query and all the given query/sets
+        """
+        return QuerySet(self).addJoiner(ands, "AND");
+
+    def OR(self, ors):
+        """
+        Produce a query set ORing this query and all the given query/sets
+        """
+        return QuerySet(self).addJoiner(ors, "OR");
+
+    def formatItem(self, item):
+        """
+        Default item formatting is string, which will work for queries,
+        querysets, ints and strings
+        """
+        return str(item)
+
+    def formatList(self, l):
+        """
+        Format and comma-join a list
+        """
+        return ",".join([self.formatItem(x) for x in l])
+
+    @staticmethod
+    def isOrContainsOnlyTypes(items, types):
+        """
+        Either this item is one of the given types, or it is a list of
+        only those types
+        """
+        if isinstance(items, list):
+            for x in items:
+                found = False
+                for typ in listify(types):
+                    if isinstance(x, typ):
+                        found = True
+                        break
+
+                if not found:
+                    return False
+        else:
+            for typ in listify(types):
+                found = False
+                if isinstance(items, typ):
+                    found = True
+                    break
+
+
+            if not found:
+                return False
+
+        return True
+
+    def validateOrRaise(self):
+        if not self.validate():
+            raise TypeError
+
+
+class Claim(Query):
+    """
+    This is a Query of the form "claim[prop:val]". It is subclassed by
+    the other similar forms like noclaim and string
+    """
+
+    queryType = "claim"
+
+    def __init__(self, prop, items=[]):
+        self.prop = prop
+
+        if isinstance(items, Tree):
+            self.items = items
+        else:
+            self.items = listify(items)
+
+        self.validateOrRaise()
+
+
+    def formatItems(self):
+        res = ''
+        if len(self.items):
+            res += ":" + ",".join([self.formatItem(x) for x in self.items])
+
+        return res
+
+    def validate(self):
+        return self.isOrContainsOnlyTypes(self.items, [int, Tree])
+
+    def __str__(self):
+        if isinstance(self.items, list):
+            return "%s[%s%s]" % (self.queryType, self.prop, self.formatItems())
+        elif isinstance(self.items, Tree): # maybe Query?
+            return "%s[%s:(%s)]" % (self.queryType, self.prop, self.items)
+
+
+class NoClaim(Claim):
+    queryType = "noclaim"
+
+
+class String(Claim):
+    """
+    Query of the form string[PROPERTY:"STRING",...]
+    """
+    queryType = "string"
+
+    def formatItem(self, x):
+        """
+        Strings need quote-wrapping
+        """
+        return '"%s"' % x
+
+    def validate(self):
+        return self.isOrContainsOnlyTypes(self.items, str)
+
+
+class Tree(Query):
+    """
+    Query of the form tree[ITEM,...][PROPERTY,...]<PROPERTY,...>
+    """
+    queryType = "tree"
+
+    def __init__(self, item, forward=[], reverse=[]):
+        """
+        @param item The root item
+        @param forward List of forward properties, can be empty
+        @param reverse List of reverse properties, can be empty
+        """
+        self.item = listify(item)
+        self.forward = listify(forward)
+        self.reverse = listify(reverse)
+
+        self.validateOrRaise()
+
+    def validate(self):
+        return (self.isOrContainsOnlyTypes(self.item, int) and
+            self.isOrContainsOnlyTypes(self.forward, int) and
+            self.isOrContainsOnlyTypes(self.reverse, int))
+
+    def __str__(self):
+        return "%s[%s][%s][%s]" % (self.queryType, self.formatList(self.item),
+                self.formatList(self.forward), self.formatList(self.reverse))
+
+
+class Around(Query):
+    """
+    A query in the form around[PROPERTY,LATITUDE,LONGITUDE,RADIUS]
+    """
+    queryType = "around"
+
+    def __init__(self, prop, lt, lg, rad):
+        self.prop = prop
+        self.lt = lt
+        self.lg = lg
+        self.rad = rad
+
+    def validate(self):
+        return isinstance(self.prop, int)
+
+    def __str__(self):
+        return "%s[%s,%s,%s,%s]" % (self.queryType, self.prop,
+            self.lt, self.lg, self.rad)
+
+
+class Between(Query):
+    """
+    A query in the form between[PROP, BEGIN, END]
+
+    You have to give one of begin or end
+    """
+    queryType = "between"
+
+    def __init__(self, prop, begin='', end=''):
+        self.prop = prop
+        self.begin = begin
+        self.end = end
+
+    def validate(self):
+        return (len(self.begin) or len(self.end)) and isinstance(self.prop, int)
+
+    def __str__(self):
+        return "%s[%s,%s,%s]" % (self.queryType, self.prop, self.begin, self.end)
+
+
+class Link(Query):
+    """
+    A query in the form link[LINK,...], which also includes nolink
+
+    All link elements have to be strings, or validation will throw
+    """
+
+    queryType = "link";
+
+    def __init__(self, link):
+        self.link = listify(link)
+        self.validateOrRaise()
+
+    def validate(self):
+        return self.isOrContainsOnlyTypes(self.link, str)
+
+    def __str__(self):
+        return "%s[%s]" % (self.queryType, self.formatList(self.link))
+
+
+class NoLink(Link):
+    queryType = "nolink"
+
+
+class WikidataQuery():
+    """
+    An interface to the WikidataQuery API. Default host is
+    wikidataquery.eu, but you can substitute a different one.
+
+    Caching defaults to a subdir of the system temp directory with a
+    1 hour max cache age.
+
+    Set a zero or negative cacheMaxAge (in minutes) to disable caching
+    """
+
+    def __init__(self, host="http://208.80.153.172", cacheDir=None,
+            cacheMaxAge=60):
+        self.host = host
+        self.cacheMaxAge = cacheMaxAge;
+
+        if cacheDir:
+            self.cacheDir = cacheDir
+        else:
+            self.cacheDir = os.path.join(tempfile.gettempdir(),
+                "wikidataquery_cache")
+
+    def getUrl(self, queryStr):
+        return "%s/api?%s" % (self.host, queryStr)
+
+    def getQueryString(self, q, labels=[], props=[]):
+        """
+        Get the query string for a given query or queryset
+        @return query string including labels and props
+        """
+        qStr = "q=%s" % urllib2.quote(str(q))
+
+        if labels:
+            qStr += "&labels=%s" % ','.join(labels)
+
+        if props:
+            qStr += "&props=%s" % ','.join(props)
+
+        return qStr
+
+    def getCacheFilename(self, queryStr):
+        """
+        Encode a query into a unique and universally safe format
+        """
+        encQuery = sha.new(queryStr).hexdigest() + ".wdq_cache"
+        return os.path.join(self.cacheDir, encQuery)
+
+    def readFromCache(self, queryStr):
+        """
+        Check if we have cached this data recently enough, read it
+        if we have. Returns None if the data is not there or if it is
+        too old
+        """
+
+        if self.cacheMaxAge <= 0:
+            return None
+
+        cacheFile = self.getCacheFilename(queryStr)
+
+        if os.path.isfile(cacheFile):
+            mtime = os.path.getmtime(cacheFile)
+            now = time.time()
+
+            if ((now - mtime) / 60) < self.cacheMaxAge:
+
+                try:
+                    data = pickle.load(open(cacheFile, 'r'))
+                except pickle.UnpicklingError:
+                    print("Couldn't read cached data!")
+                    data = None
+
+                return data
+
+        return None
+
+    def saveToCache(self, q, data):
+        """
+        Save data from a query to a cache file, if enabled
+        @ returns nothing
+        """
+
+        if self.cacheMaxAge <= 0:
+            return
+
+        # we have to use our own query string, as otherwise we may
+        # not be able to find the cache file again if there are e.g.
+        # whitespace differences
+        cacheFile = self.getCacheFilename(q)
+
+        if os.path.exists(cacheFile) and not os.path.isfile(cacheFile):
+            return
+
+        if not os.path.exists(self.cacheDir):
+            os.makedirs(self.cacheDir)
+
+        try:
+            pickle.dump(data, open(cacheFile, 'w'))
+        except IOError:
+            print("Failed to write cache file")
+
+    def getDataFromHost(self, queryStr):
+        """
+        Go and fetch a query from the host's API
+        """
+        url = self.getUrl(queryStr)
+
+        resp = urllib2.urlopen(url)
+
+        # http request failed for some reason
+        if resp.getcode() != 200:
+            return None
+
+        try:
+            data = json.loads(resp.read())
+        except ValueError:
+            print("Data received from host but no JSON could be decoded")
+            data = None
+
+        return data
+
+
+    def query(self, q, labels=[], props=[]):
+        """
+        Actually run a query over the API
+        @return Python object of the interpreted JSON or None on failure
+        """
+
+        fullQueryString = self.getQueryString(q, labels, props)
+
+        #try to get cached data first
+        data = self.readFromCache(fullQueryString)
+
+        if data:
+            return data
+
+        # the cached data must not be OK, go and get real data from the
+        # host's API
+        data = self.getDataFromHost(fullQueryString)
+
+        # no JSON found
+        if not data:
+            return None
+
+        #cache data for next time
+        self.saveToCache(fullQueryString, data)
+
+        # just make sure a list is present to simplify checks later
+        # pretty sure the API does this anyway
+        if 'items' not in data:
+            data['items'] = []
+
+        return data
+
+
diff --git a/tests/wikidataquery_tests.py b/tests/wikidataquery_tests.py
new file mode 100644
index 0000000..481cb53
--- /dev/null
+++ b/tests/wikidataquery_tests.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8  -*-
+"""
+Test cases for the WikidataQuery query syntax and API
+"""
+#
+# (C) Pywikipedia bot team, 2013
+#
+# Distributed under the terms of the MIT license.
+
+
+import pywikibot.data.query as query
+from utils import PywikibotTestCase, unittest
+
+import os
+import time
+
+class TestApiFunctions(unittest.TestCase):
+
+    def testQueries(self):
+        """
+        Test that we produce the expected query strings and that
+        invalid inputs are rejected correctly
+        """
+
+        q = query.Claim(99)
+        self.assertEqual(str(q), "claim[99]")
+
+        q = query.Claim(99, 100)
+        self.assertEqual(str(q), "claim[99:100]")
+
+        q = query.Claim(99, [100])
+        self.assertEqual(str(q), "claim[99:100]")
+
+        q = query.Claim(99, [100,101])
+        self.assertEqual(str(q), "claim[99:100,101]")
+
+        q = query.NoClaim(99, [100,101])
+        self.assertEqual(str(q), "noclaim[99:100,101]")
+
+        q = query.String(99, "Hello")
+        self.assertEqual(str(q), 'string[99:"Hello"]')
+
+        q = query.String(99, ["Hello"])
+        self.assertEqual(str(q), 'string[99:"Hello"]')
+
+        q = query.String(99, ["Hello", "world"])
+        self.assertEqual(str(q), 'string[99:"Hello","world"]')
+
+        self.assertRaises(TypeError, lambda:query.String(99, 2))
+
+        q = query.Tree(92, [1], 2)
+        self.assertEqual(str(q), 'tree[92][1][2]')
+
+        #missing third arg
+        q = query.Tree(92, 1)
+        self.assertEqual(str(q), 'tree[92][1][]')
+
+        #missing second arg
+        q = query.Tree(92, reverse=3)
+        self.assertEqual(str(q), 'tree[92][][3]')
+
+        q = query.Tree([92,93], 1, [2,7])
+        self.assertEqual(str(q), 'tree[92,93][1][2,7]')
+
+        #bad tree arg types
+        self.assertRaises(TypeError, lambda:query.Tree(99, "hello"))
+
+        q = query.Around(625, 50, 60, 23.4)
+        self.assertEqual(str(q), 'around[625,50,60,23.4]')
+
+        q = query.Between(60, 1999, 2010)
+        self.assertEqual(str(q), 'between[60,1999,2010]')
+
+        q = query.Between(60, 1999)
+        self.assertEqual(str(q), 'between[60,1999,]')
+
+        q = query.Between(60, end=2010)
+        self.assertEqual(str(q), 'between[60,,2010]')
+
+        q = query.Link("enwiki")
+        self.assertEqual(str(q), 'link[enwiki]')
+
+        q = query.NoLink(["enwiki", "frwiki"])
+        self.assertEqual(str(q), 'nolink[enwiki,frwiki]')
+
+        #bad link arg types
+        self.assertRaises(TypeError, lambda:query.Link(99))
+        self.assertRaises(TypeError, lambda:query.Link([99]))
+
+        #claim with tree as arg
+        q = query.Claim(99, query.Tree(1,2,3))
+        self.assertEqual(str(q), "claim[99:(tree[1][2][3])]")
+
+        q = query.Claim(99, query.Tree(1,[2,5],[3,90]))
+        self.assertEqual(str(q), "claim[99:(tree[1][2,5][3,90])]")
+
+    def testQuerySets(self):
+        """
+        Test that we can join queries together correctly
+        """
+
+        # construct via queries
+        qs = query.Claim(99, 100).AND(query.Claim(99, 101))
+
+        self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101]')
+
+        qs = query.Claim(99, 100).AND(query.Claim(99, 101)).AND(query.Claim(95))
+
+        self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]')
+
+        # construct via queries
+        qs = query.Claim(99, 100).AND([query.Claim(99, 101), query.Claim(95)])
+
+        self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]')
+
+        qs = query.Claim(99, 100).OR([query.Claim(99, 101), query.Claim(95)])
+
+        self.assertEqual(str(qs), 'claim[99:100] OR claim[99:101] OR claim[95]')
+
+        q1 = query.Claim(99, 100)
+        q2 = query.Claim(99, 101)
+
+        #different joiners get explicit grouping parens (the api also allows implicit, but we don't do that)
+        qs1 = q1.AND(q2)
+        qs2 = q1.OR(qs1).AND(query.Claim(98))
+
+        self.assertEqual(str(qs2), '(claim[99:100] OR (claim[99:100] AND claim[99:101])) AND claim[98]')
+
+        #if the joiners are the same, no need to group
+        qs1 = q1.AND(q2)
+        qs2 = q1.AND(qs1).AND(query.Claim(98))
+
+        self.assertEqual(str(qs2), 'claim[99:100] AND claim[99:100] AND claim[99:101] AND claim[98]')
+
+        qs1 = query.Claim(100).AND(query.Claim(101))
+        qs2 = qs1.OR(query.Claim(102))
+
+        self.assertEqual(str(qs2), '(claim[100] AND claim[101]) OR claim[102]')
+
+        qs = query.Link("enwiki").AND(query.NoLink("dewiki"))
+
+        self.assertEqual(str(qs), 'link[enwiki] AND nolink[dewiki]')
+
+    def testQueryApiSyntax(self):
+        """
+        Test that we can generate the API query correctly
+        """
+
+        w = query.WikidataQuery("http://example.com")
+
+        qs = w.getQueryString(query.Link("enwiki"))
+        self.assertEqual(qs, "q=link%5Benwiki%5D")
+
+        self.assertEqual(w.getUrl(qs), "http://example.com/api?q=link%5Benwiki%5D")
+
+        #check labels and props work OK
+        qs = w.getQueryString(query.Link("enwiki"), ['en','fr'], ['prop'])
+        self.assertEqual(qs, "q=link%5Benwiki%5D&labels=en,fr&props=prop")
+
+    def testQueryApiGetter(self):
+        """
+        Test that we can actually retrieve data and that caching works
+        """
+        
+        w = query.WikidataQuery(cacheMaxAge = 0)
+
+        #this query doesn't return any items, save a bit of bandwidth!
+        q = query.Claim(105).AND([query.NoClaim(225), query.Claim(100)])
+
+        #check that the cache file is created
+        cacheFile = w.getCacheFilename(w.getQueryString(q, [], []))
+
+        try:
+            os.remove(cacheFile)
+        except OSError:
+            pass
+
+        data = w.query(q)
+
+        self.assertFalse(os.path.exists(cacheFile))
+
+        w = query.WikidataQuery(cacheMaxAge = 0.1)
+
+        data = w.query(q)
+
+        self.assertTrue(os.path.exists(cacheFile))
+
+        self.assertTrue('status' in data)
+        self.assertTrue('items' in data)
+
+        t1 = time.time()
+        data = w.query(q)
+        t2 = time.time()
+
+        # check that the cache access is fast
+        self.assertTrue(t2-t1 < 0.2)
+
+
+if __name__ == '__main__':
+    try:
+        unittest.main()
+    except SystemExit:
+        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/99642
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id1dd2c48c65b9bfb877ec10ad1b8ea69aa00a39c
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Inductiveload <inductivel...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to