Inductiveload has uploaded a new change for review. https://gerrit.wikimedia.org/r/99642
Change subject: Add an interface module to the WikidataQuery API, along with ways to generate the queries programmatically. Tests included. ...................................................................... Add an interface module to the WikidataQuery API, along with ways to generate the queries programmatically. Tests included. Change-Id: Id1dd2c48c65b9bfb877ec10ad1b8ea69aa00a39c --- A pywikibot/data/query.py A tests/wikidataquery_tests.py 2 files changed, 670 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/42/99642/1 diff --git a/pywikibot/data/query.py b/pywikibot/data/query.py new file mode 100644 index 0000000..8cb4bbc --- /dev/null +++ b/pywikibot/data/query.py @@ -0,0 +1,467 @@ +# -*- coding: utf-8 -*- +""" +Objects representing WikidataQuery query syntax and API +""" +# +# (C) Pywikipedia bot team, 2013 +# +# Distributed under the terms of the MIT license. + +import json +import urllib2 +import pickle +import os +import sha +import time +import tempfile + + +def listify(x): + """ + If given a non-list , encapsulate in a single-element list + """ + return x if isinstance(x, list) else [x] + + +class QuerySet(): + """ + A QuerySet represents a set of queries or other query sets, joined + by operators (AND and OR). + + A QuerySet stores this information as a list of Query(Sets) and + a joiner operator to join them all together + """ + + def __init__(self, q): + """ + Initialise a query set from a Query or another QuerySet + """ + self.qs = [q] + + def addJoiner(self, args, joiner): + """ + Add to this QuerySet using the given joiner. 
+ + @return a new query set representing the joining of this one and + the arguments + """ + + '''If the given joiner is not the same as we used before in + this QuerySet, nest the current one in parens before joining + - this makes the implicit grouping of the + API explicit.''' + if len(self.qs) >1 and joiner != self.joiner: + left = QuerySet(self) + else: + left = self + + left.joiner = joiner + + for a in listify(args): + left.qs.append(a) + + return left + + def AND(self, args): + """ + Add the given args (Queries or QuerySets) to the Query set as a + logical conjunction (AND) + """ + return self.addJoiner(args, "AND") + + def OR(self, args): + """ + Add the given args (Queries or QuerySets) to the Query set as a + logical disjunction (OR) + """ + return self.addJoiner(args, "OR") + + def __str__(self): + """ + Output as an API-ready string + """ + + def bracketIfQuerySet(q): + if isinstance(q, QuerySet) and q.joiner != self.joiner: + return "(%s)" % q + else: + return str(q) + + s = bracketIfQuerySet(self.qs[0]) + + for q in self.qs[1:]: + s += " %s %s" % (self.joiner, bracketIfQuerySet(q)) + + return s + + +class Query(): + """ + A query is a single query for the WikidataQuery API, for example + claim[100:60] or link[enwiki] + + Construction of a Query can throw a TypeError if you feed it bad + parameters. 
Exactly what these need to be depends on the Query + """ + + def AND(self, ands): + """ + Produce a query set ANDing this query and all the given query/sets + """ + return QuerySet(self).addJoiner(ands, "AND"); + + def OR(self, ors): + """ + Produce a query set ORing this query and all the given query/sets + """ + return QuerySet(self).addJoiner(ors, "OR"); + + def formatItem(self, item): + """ + Default item formatting is string, which will work for queries, + querysets, ints and strings + """ + return str(item) + + def formatList(self, l): + """ + Format and comma-join a list + """ + return ",".join([self.formatItem(x) for x in l]) + + @staticmethod + def isOrContainsOnlyTypes(items, types): + """ + Either this item is one of the given types, or it is a list of + only those types + """ + if isinstance(items, list): + for x in items: + found = False + for typ in listify(types): + if isinstance(x, typ): + found = True + break + + if not found: + return False + else: + for typ in listify(types): + found = False + if isinstance(items, typ): + found = True + break + + + if not found: + return False + + return True + + def validateOrRaise(self): + if not self.validate(): + raise TypeError + + +class Claim(Query): + """ + This is a Query of the form "claim[prop:val]". It is subclassed by + the other similar forms like noclaim and string + """ + + queryType = "claim" + + def __init__(self, prop, items=[]): + self.prop = prop + + if isinstance(items, Tree): + self.items = items + else: + self.items = listify(items) + + self.validateOrRaise() + + + def formatItems(self): + res = '' + if len(self.items): + res += ":" + ",".join([self.formatItem(x) for x in self.items]) + + return res + + def validate(self): + return self.isOrContainsOnlyTypes(self.items, [int, Tree]) + + def __str__(self): + if isinstance(self.items, list): + return "%s[%s%s]" % (self.queryType, self.prop, self.formatItems()) + elif isinstance(self.items, Tree): # maybe Query? 
+ return "%s[%s:(%s)]" % (self.queryType, self.prop, self.items) + + +class NoClaim(Claim): + queryType = "noclaim" + + +class String(Claim): + """ + Query of the form string[PROPERTY:"STRING",...] + """ + queryType = "string" + + def formatItem(self, x): + """ + Strings need quote-wrapping + """ + return '"%s"' % x + + def validate(self): + return self.isOrContainsOnlyTypes(self.items, str) + + +class Tree(Query): + """ + Query of the form tree[ITEM,...][PROPERTY,...]<PROPERTY,...> + """ + queryType = "tree" + + def __init__(self, item, forward=[], reverse=[]): + """ + @param item The root item + @param forward List of forward properties, can be empty + @param reverse List of reverse properties, can be empty + """ + self.item = listify(item) + self.forward = listify(forward) + self.reverse = listify(reverse) + + self.validateOrRaise() + + def validate(self): + return (self.isOrContainsOnlyTypes(self.item, int) and + self.isOrContainsOnlyTypes(self.forward, int) and + self.isOrContainsOnlyTypes(self.reverse, int)) + + def __str__(self): + return "%s[%s][%s][%s]" % (self.queryType, self.formatList(self.item), + self.formatList(self.forward), self.formatList(self.reverse)) + + +class Around(Query): + """ + A query in the form around[PROPERTY,LATITUDE,LONGITUDE,RADIUS] + """ + queryType = "around" + + def __init__(self, prop, lt, lg, rad): + self.prop = prop + self.lt = lt + self.lg = lg + self.rad = rad + + def validate(self): + return isinstance(self.prop, int) + + def __str__(self): + return "%s[%s,%s,%s,%s]" % (self.queryType, self.prop, + self.lt, self.lg, self.rad) + + +class Between(Query): + """ + A query in the form between[PROP, BEGIN, END] + + You have to give one of begin or end + """ + queryType = "between" + + def __init__(self, prop, begin='', end=''): + self.prop = prop + self.begin = begin + self.end = end + + def validate(self): + return (len(self.begin) or len(self.end)) and isinstance(self.prop, int) + + def __str__(self): + return "%s[%s,%s,%s]" % 
(self.queryType, self.prop, self.begin, self.end) + + +class Link(Query): + """ + A query in the form link[LINK,...], which also includes nolink + + All link elements have to be strings, or validation will throw + """ + + queryType = "link"; + + def __init__(self, link): + self.link = listify(link) + self.validateOrRaise() + + def validate(self): + return self.isOrContainsOnlyTypes(self.link, str) + + def __str__(self): + return "%s[%s]" % (self.queryType, self.formatList(self.link)) + + +class NoLink(Link): + queryType = "nolink" + + +class WikidataQuery(): + """ + An interface to the WikidataQuery API. Default host is + wikidataquery.eu, but you can substitute a different one. + + Caching defaults to a subdir of the system temp directory with a + 1 hour max cache age. + + Set a zero or negative cacheMaxAge to disable caching + """ + + def __init__(self, host="http://208.80.153.172", cacheDir=None, + cacheMaxAge=60): + self.host = host + self.cacheMaxAge = cacheMaxAge; + + if cacheDir: + self.cacheDir = cacheDir + else: + self.cacheDir = os.path.join(tempfile.gettempdir(), + "wikidataquery_cache") + + def getUrl(self, queryStr): + return "%s/api?%s" % (self.host, queryStr) + + def getQueryString(self, q, labels=[], props=[]): + """ + Get the query string for a given query or queryset + @return query string including labels and props + """ + qStr = "q=%s" % urllib2.quote(str(q)) + + if labels: + qStr += "&labels=%s" % ','.join(labels) + + if props: + qStr += "&props=%s" % ','.join(props) + + return qStr + + def getCacheFilename(self, queryStr): + """ + Encode a query into a unique and universally safe format + """ + encQuery = sha.new(queryStr).hexdigest() + ".wdq_cache" + return os.path.join(self.cacheDir, encQuery) + + def readFromCache(self, queryStr): + """ + Check if we have cached this data recently enough, read it + if we have. 
Returns None if the data is not there or if it is + too old + """ + + if self.cacheMaxAge <= 0: + return None + + cacheFile = self.getCacheFilename(queryStr) + + if os.path.isfile(cacheFile): + mtime = os.path.getmtime(cacheFile) + now = time.time() + + if ((now - mtime) / 60) < self.cacheMaxAge: + + try: + data = pickle.load(open(cacheFile, 'r')) + except pickle.UnpicklingError: + print("Couldn't read cached data!") + data = None + + return data + + return None + + def saveToCache(self, q, data): + """ + Save data from a query to a cache file, if enabled + @return nothing + """ + + if self.cacheMaxAge <= 0: + return + + # we have to use our own query string, as otherwise we may not + # be able to find the cache file again if there are e.g. + # whitespace differences + cacheFile = self.getCacheFilename(q) + + if os.path.exists(cacheFile) and not os.path.isfile(cacheFile): + return + + if not os.path.exists(self.cacheDir): + os.makedirs(self.cacheDir) + + try: + pickle.dump(data, open(cacheFile, 'w')) + except IOError: + print("Failed to write cache file") + + def getDataFromHost(self, queryStr): + """ + Go and fetch a query from the host's API + """ + url = self.getUrl(queryStr) + + resp = urllib2.urlopen(url) + + # http request failed for some reason + if resp.getcode() != 200: + return None + + try: + data = json.loads(resp.read()) + except ValueError: + print("Data received from host but no JSON could be decoded") + data = None + + return data + + + def query(self, q, labels=[], props=[]): + """ + Actually run a query over the API + @return Python object of the interpreted JSON or None on failure + """ + + fullQueryString = self.getQueryString(q, labels, props) + + #try to get cached data first + data = self.readFromCache(fullQueryString) + + if data: + return data + + # the cached data must not be OK, go and get real data from the + # host's API + data = self.getDataFromHost(fullQueryString) + + # no JSON found + if not data: + return None + + #cache data for 
next time + self.saveToCache(fullQueryString, data) + + # just make sure a list is present to simplify checks later + # pretty sure the API does this anyway + if 'items' not in data: + data['items'] = [] + + return data + + diff --git a/tests/wikidataquery_tests.py b/tests/wikidataquery_tests.py new file mode 100644 index 0000000..481cb53 --- /dev/null +++ b/tests/wikidataquery_tests.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +""" +Test cases for the WikidataQuery query syntax and API +""" +# +# (C) Pywikipedia bot team, 2013 +# +# Distributed under the terms of the MIT license. + + +import pywikibot.data.query as query +from utils import PywikibotTestCase, unittest + +import os +import time + +class TestApiFunctions(unittest.TestCase): + + def testQueries(self): + """ + Test that we produce the expected query strings and that + invalid inputs are rejected correctly + """ + + q = query.Claim(99) + self.assertEqual(str(q), "claim[99]") + + q = query.Claim(99, 100) + self.assertEqual(str(q), "claim[99:100]") + + q = query.Claim(99, [100]) + self.assertEqual(str(q), "claim[99:100]") + + q = query.Claim(99, [100,101]) + self.assertEqual(str(q), "claim[99:100,101]") + + q = query.NoClaim(99, [100,101]) + self.assertEqual(str(q), "noclaim[99:100,101]") + + q = query.String(99, "Hello") + self.assertEqual(str(q), 'string[99:"Hello"]') + + q = query.String(99, ["Hello"]) + self.assertEqual(str(q), 'string[99:"Hello"]') + + q = query.String(99, ["Hello", "world"]) + self.assertEqual(str(q), 'string[99:"Hello","world"]') + + self.assertRaises(TypeError, lambda:query.String(99, 2)) + + q = query.Tree(92, [1], 2) + self.assertEqual(str(q), 'tree[92][1][2]') + + #missing third arg + q = query.Tree(92, 1) + self.assertEqual(str(q), 'tree[92][1][]') + + #missing second arg + q = query.Tree(92, reverse=3) + self.assertEqual(str(q), 'tree[92][][3]') + + q = query.Tree([92,93], 1, [2,7]) + self.assertEqual(str(q), 'tree[92,93][1][2,7]') + + #bad tree arg types + 
self.assertRaises(TypeError, lambda:query.Tree(99, "hello")) + + q = query.Around(625, 50, 60, 23.4) + self.assertEqual(str(q), 'around[625,50,60,23.4]') + + q = query.Between(60, 1999, 2010) + self.assertEqual(str(q), 'between[60,1999,2010]') + + q = query.Between(60, 1999) + self.assertEqual(str(q), 'between[60,1999,]') + + q = query.Between(60, end=2010) + self.assertEqual(str(q), 'between[60,,2010]') + + q = query.Link("enwiki") + self.assertEqual(str(q), 'link[enwiki]') + + q = query.NoLink(["enwiki", "frwiki"]) + self.assertEqual(str(q), 'nolink[enwiki,frwiki]') + + #bad link arg types + self.assertRaises(TypeError, lambda:query.Link(99)) + self.assertRaises(TypeError, lambda:query.Link([99])) + + #claim with tree as arg + q = query.Claim(99, query.Tree(1,2,3)) + self.assertEqual(str(q), "claim[99:(tree[1][2][3])]") + + q = query.Claim(99, query.Tree(1,[2,5],[3,90])) + self.assertEqual(str(q), "claim[99:(tree[1][2,5][3,90])]") + + def testQuerySets(self): + """ + Test that we can join queries together correctly + """ + + # construct via queries + qs = query.Claim(99, 100).AND(query.Claim(99, 101)) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101]') + + qs = query.Claim(99, 100).AND(query.Claim(99, 101)).AND(query.Claim(95)) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]') + + # construct via queries + qs = query.Claim(99, 100).AND([query.Claim(99, 101), query.Claim(95)]) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]') + + qs = query.Claim(99, 100).OR([query.Claim(99, 101), query.Claim(95)]) + + self.assertEqual(str(qs), 'claim[99:100] OR claim[99:101] OR claim[95]') + + q1 = query.Claim(99, 100) + q2 = query.Claim(99, 101) + + #different joiners get explicit grouping parens (the api also allows implicit, but we don't do that) + qs1 = q1.AND(q2) + qs2 = q1.OR(qs1).AND(query.Claim(98)) + + self.assertEqual(str(qs2), '(claim[99:100] OR (claim[99:100] AND claim[99:101])) AND claim[98]') + + 
#if the joiners are the same, no need to group + qs1 = q1.AND(q2) + qs2 = q1.AND(qs1).AND(query.Claim(98)) + + self.assertEqual(str(qs2), 'claim[99:100] AND claim[99:100] AND claim[99:101] AND claim[98]') + + qs1 = query.Claim(100).AND(query.Claim(101)) + qs2 = qs1.OR(query.Claim(102)) + + self.assertEqual(str(qs2), '(claim[100] AND claim[101]) OR claim[102]') + + qs = query.Link("enwiki").AND(query.NoLink("dewiki")) + + self.assertEqual(str(qs), 'link[enwiki] AND nolink[dewiki]') + + def testQueryApiSyntax(self): + """ + Test that we can generate the API query correctly + """ + + w = query.WikidataQuery("http://example.com") + + qs = w.getQueryString(query.Link("enwiki")) + self.assertEqual(qs, "q=link%5Benwiki%5D") + + self.assertEqual(w.getUrl(qs), "http://example.com/api?q=link%5Benwiki%5D") + + #check labels and props work OK + qs = w.getQueryString(query.Link("enwiki"), ['en','fr'], ['prop']) + self.assertEqual(qs, "q=link%5Benwiki%5D&labels=en,fr&props=prop") + + def testQueryApiGetter(self): + """ + Test that we can actually retrieve data and that caching works + """ + + w = query.WikidataQuery(cacheMaxAge = 0) + + #this query doesn't return any items, save a bit of bandwidth! 
+ q = query.Claim(105).AND([query.NoClaim(225), query.Claim(100)]) + + #check that the cache file is created + cacheFile = w.getCacheFilename(w.getQueryString(q, [], [])) + + try: + os.remove(cacheFile) + except OSError: + pass + + data = w.query(q) + + self.assertFalse(os.path.exists(cacheFile)) + + w = query.WikidataQuery(cacheMaxAge = 0.1) + + data = w.query(q) + + self.assertTrue(os.path.exists(cacheFile)) + + self.assertTrue('status' in data) + self.assertTrue('items' in data) + + t1 = time.time() + data = w.query(q) + t2 = time.time() + + # check that the cache access is fast + self.assertTrue(t2-t1 < 0.2) + + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass -- To view, visit https://gerrit.wikimedia.org/r/99642 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id1dd2c48c65b9bfb877ec10ad1b8ea69aa00a39c Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Inductiveload <inductivel...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits