Nschaaf has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/337626 )
Change subject: Add option to rank translation recs by sitelinks
......................................................................
Add option to rank translation recs by sitelinks
Bug: T157371
Change-Id: I64fe7b620d4230d3548ab24412ceb7c3e39a7926
---
M recommendation/api/external_data/fetcher.py
M recommendation/api/external_data/wikidata.py
M recommendation/api/types/test/test_translation_and_related_articles_relationship.py
M recommendation/api/types/translation/candidate_finders.py
M recommendation/api/types/translation/filters.py
M recommendation/api/types/translation/pageviews.py
A recommendation/api/types/translation/recommendation.py
M recommendation/api/types/translation/test/test_api.py
M recommendation/api/types/translation/test/test_candidate_finders.py
M recommendation/api/types/translation/test/test_filters.py
M recommendation/api/types/translation/test/test_pageviews.py
M recommendation/api/types/translation/translation.py
D recommendation/api/types/translation/utils.py
M recommendation/data/recommendation.ini
M setup.py
15 files changed, 267 insertions(+), 269 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/research/recommendation-api refs/changes/26/337626/1
diff --git a/recommendation/api/external_data/fetcher.py
b/recommendation/api/external_data/fetcher.py
index dda437a..3a5a4fb 100644
--- a/recommendation/api/external_data/fetcher.py
+++ b/recommendation/api/external_data/fetcher.py
@@ -100,6 +100,30 @@
return results
+def get_most_popular_articles(source):
+ days = configuration.get_config_int('popular_pageviews', 'days')
+ date_format = configuration.get_config_value('popular_pageviews',
'date_format')
+ query = configuration.get_config_value('popular_pageviews', 'query')
+ date = (datetime.datetime.utcnow() -
datetime.timedelta(days=days)).strftime(date_format)
+ query = query.format(source=source, date=date)
+ try:
+ data = get(query)
+ except ValueError:
+ log.info('pageview query failed')
+ return []
+
+ if 'items' not in data or len(data['items']) < 1 or 'articles' not in
data['items'][0]:
+ log.info('pageview data is not in a known format')
+ return []
+
+ articles = []
+
+ for article in data['items'][0]['articles']:
+ articles.append({'title': article['article'], 'pageviews':
article['views']})
+
+ return articles
+
+
def build_wiki_search(source, seed, count, morelike):
endpoint = configuration.get_config_value('endpoints',
'wikipedia').format(source=source)
params = configuration.get_config_dict('wiki_search_params')
diff --git a/recommendation/api/external_data/wikidata.py
b/recommendation/api/external_data/wikidata.py
index c3edd72..ed50ac9 100644
--- a/recommendation/api/external_data/wikidata.py
+++ b/recommendation/api/external_data/wikidata.py
@@ -8,43 +8,66 @@
log = logging.getLogger(__name__)
-WikidataItem = collections.namedtuple('WikidataItem', ['id', 'title', 'url'])
+_RawWikidataItem = collections.namedtuple('RawWikidataItem', ['id',
'sitelinks'])
+WikidataItem = collections.namedtuple('WikidataItem', ['id', 'title', 'url',
'sitelink_count'])
-def query(params, expected_sitelinks=1):
- """
- Query the wikidata endpoint and return a list of WikidataItem
+def get_items_in_source_missing_in_target_by_titles(source, target, titles):
+ target_wiki = '{}wiki'.format(target)
+ items = get_items(source, titles=titles, raw_filter=lambda item:
target_wiki not in item.sitelinks)
+ return {item.title: item for item in items}
- This only includes items that have exactly expected_sitelinks sitelink
- """
- endpoint = configuration.get_config_value('endpoints', 'wikidata')
- try:
- data = fetcher.post(endpoint, data=params)
- if 'warnings' in data:
- raise ValueError()
- except ValueError:
- log.info('Bad Wikidata API response')
- return {}
- entities = data.get('entities', {})
+def get_wikidata_items_from_titles(source, titles):
+ return get_items(source, titles=titles)
+
+def get_titles_from_wikidata_items(source, items):
+ return get_items(source, ids=items)
+
+
+def default_filter(_):
+ return True
+
+
+def get_items(source, titles=None, ids=None, raw_filter=default_filter):
+ params = configuration.get_config_dict('wikidata_query_params')
+ params['sites'] = params['sites'].format(source=source)
items = []
-
- for id, entity in entities.items():
- sitelinks = entity.get('sitelinks', {})
- if len(sitelinks.keys()) != expected_sitelinks:
- continue
- sitelink = sitelinks.popitem()[1]
-
- item = WikidataItem(id=id,
- title=sitelink['title'].replace(' ', '_'),
- url=sitelink['url'])
- items.append(item)
-
+ if titles is not None:
+ items = chunk_query_for_parameter(params, 'titles', titles)
+ if ids is not None:
+ items = chunk_query_for_parameter(params, 'ids', ids)
+ items = [extract_from_raw(item, params['sites']) for item in items if
raw_filter(item)]
+ items = [item for item in items if item is not None]
return items
def chunk_query_for_parameter(params, parameter, values):
+ """
+ This takes in general params for a query that needs to be performed
+ for a set of values, and then adds a specified parameter with the
+ chunked values until all the values have been in a query.
+
+ Ex:
+ chunk_query_for_parameter(
+ {'foo': 'bar'},
+ 'additional',
+ ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
+
+ results in the following queries if chunk_size is 3:
+
+ query({'foo': 'bar',
+ 'additional': 'a|b|c'})
+ query({'foo': 'bar',
+ 'additional': 'd|e|f'})
+ query({'foo': 'bar',
+ 'additional': 'g|h|i'})
+ query({'foo': 'bar',
+ 'additional': 'j'})
+
+ the results are appended in the appropriate order and returned
+ """
chunk_size = configuration.get_config_int('external_api_parameters',
'wikidata_chunk_size')
param_groups = []
@@ -61,35 +84,39 @@
return []
-def get_items_in_source_missing_in_target_by_titles(source, target, titles):
- params = configuration.get_config_dict('wikidata_titles_to_items_params')
- params['sites'] = params['sites'].format(source=source)
- # We want the sitefilter to include both the source and target
- # wikis. This sets up the scenario where if there is only 1 sitelink
- # present, that means that the article is missing in the target (since
- # the title will have come from the source wiki)
- params['sitefilter'] = params['sitefilter'].format(target=target)
- params['sitefilter'] += '|{}wiki'.format(source)
-
- items = chunk_query_for_parameter(params, 'titles', titles)
-
- return {item.title: item.id for item in items}
+def query(params):
+ entities = get_entities(params)
+ raw_items = get_raw_items_from_entities(entities)
+ return raw_items
-def get_wikidata_items_from_titles(source, titles):
- params = configuration.get_config_dict('wikidata_titles_to_items_params')
- params['sites'] = params['sites'].format(source=source)
- params['sitefilter'] = params['sitefilter'].format(target=source)
+def get_entities(params):
+ endpoint = configuration.get_config_value('endpoints', 'wikidata')
+ try:
+ data = fetcher.post(endpoint, data=params)
+ if 'warnings' in data:
+ raise ValueError()
+ except ValueError:
+ log.info('Bad Wikidata API response')
+ return {}
- items = chunk_query_for_parameter(params, 'titles', titles)
+ return data.get('entities', {})
+
+def get_raw_items_from_entities(entities):
+ items = []
+ for id, entity in entities.items():
+ sitelinks = entity.get('sitelinks', {})
+ items.append(_RawWikidataItem(id=id, sitelinks=sitelinks))
return items
-def get_titles_from_wikidata_items(source, items):
- params = configuration.get_config_dict('wikidata_items_to_titles_params')
- params['sitefilter'] = params['sitefilter'].format(source=source)
-
- items = chunk_query_for_parameter(params, 'ids', items)
-
- return items
+def extract_from_raw(raw_item, site):
+ try:
+ sitelink = raw_item.sitelinks[site]
+ except KeyError:
+ return None
+ return WikidataItem(id=raw_item.id,
+ title=sitelink['title'].replace(' ', '_'),
+ url=sitelink['url'],
+ sitelink_count=len(raw_item.sitelinks))
diff --git
a/recommendation/api/types/test/test_translation_and_related_articles_relationship.py
b/recommendation/api/types/test/test_translation_and_related_articles_relationship.py
index 543c355..600650f 100644
---
a/recommendation/api/types/test/test_translation_and_related_articles_relationship.py
+++
b/recommendation/api/types/test/test_translation_and_related_articles_relationship.py
@@ -113,7 +113,7 @@
@pytest.fixture
def remove_filters(monkeypatch):
- monkeypatch.setattr(filters, 'apply_filters', lambda source, target,
candidates, count: candidates)
+ monkeypatch.setattr(filters, 'apply_filters', lambda source, target,
candidates: candidates)
@pytest.fixture
@@ -121,13 +121,15 @@
def get_titles_from_wikidata_items(source, items):
return [wikidata.WikidataItem(id=item['wikidata_id'],
url=item['url'],
- title=item['title'])
+ title=item['title'],
+ sitelink_count=1)
for item in RELATED_ARTICLE_RESPONSE if item['wikidata_id'] in
items]
def get_wikidata_items_from_titles(source, titles):
return [wikidata.WikidataItem(id=item['wikidata_id'],
url=item['url'],
- title=item['title'])
+ title=item['title'],
+ sitelink_count=1)
for item in RELATED_ARTICLE_RESPONSE if item['title'] in
titles]
monkeypatch.setattr(wikidata, 'get_titles_from_wikidata_items',
get_titles_from_wikidata_items)
@@ -139,7 +141,8 @@
def expected_recommendations():
return [{'title': item['title'],
'pageviews': None,
- 'wikidata_id': item['wikidata_id']} for item in
RELATED_ARTICLE_RESPONSE]
+ 'wikidata_id': item['wikidata_id'],
+ 'rank': item['score']} for item in RELATED_ARTICLE_RESPONSE]
@pytest.fixture
diff --git a/recommendation/api/types/translation/candidate_finders.py
b/recommendation/api/types/translation/candidate_finders.py
index 34b5c17..00d5ec6 100644
--- a/recommendation/api/types/translation/candidate_finders.py
+++ b/recommendation/api/types/translation/candidate_finders.py
@@ -1,131 +1,67 @@
-import datetime
import logging
import random
from recommendation.api.external_data import fetcher
-from recommendation.api.types.translation.utils import Article
-from recommendation.utils import configuration
+from recommendation.api.types.translation import recommendation
log = logging.getLogger(__name__)
-class CandidateFinder:
- """
- CandidateFinder interface
- """
+def get_top_pageview_candidates(source, _, count):
+ articles = fetcher.get_most_popular_articles(source)
- def get_candidates(self, s, seed, n):
- """
- get list candidate source language articles
- using seed (optional)
- """
+ # shuffle articles
+ articles = sorted(articles, key=lambda x: random.random())
+
+ recommendations = []
+
+ for index, article in enumerate(articles):
+ rec = recommendation.Recommendation(article['title'])
+ rec.rank = index
+ rec.pageviews = article['pageviews']
+ recommendations.append(rec)
+
+ return recommendations[:count]
+
+
+def get_morelike_candidates(source, seed, count):
+ seed_list = fetcher.wiki_search(source, seed, 1)
+
+ if len(seed_list) == 0:
+ log.info('seed does not map to an article')
return []
+ if seed != seed_list[0]:
+ log.debug('seed parameter of %s mapped to article %s', seed,
seed_list[0])
+
+ results = fetcher.wiki_search(source, seed_list[0], count, morelike=True)
+ if results:
+ results.insert(0, seed_list[0])
+ log.info('morelike returned %d results', len(results))
+ else:
+ log.info('morelike search failed; reverting to standard search')
+ results = fetcher.wiki_search(source, seed, count)
+
+ recommendations = []
+
+ for index, title in enumerate(results):
+ rec = recommendation.Recommendation(title)
+ rec.rank = index
+ recommendations.append(rec)
+
+ return recommendations[:count]
-class PageviewCandidateFinder(CandidateFinder):
- """
- Utility Class for getting a list of the most
- popular articles in a source Wikipedia.
- """
+def get_related_articles(source, seed, count):
+ results = fetcher.get_related_articles(source, seed)
+ if len(results) == 0:
+ log.info('Failed related_articles search. Reverting to morelike.
Source: %s Seed: %s', source, seed)
+ return get_morelike_candidates(source, seed, count)
- def query_pageviews(self, s):
- """
- Query pageview API and parse results
- """
- days = configuration.get_config_int('popular_pageviews', 'days')
- date_format = configuration.get_config_value('popular_pageviews',
'date_format')
- query = configuration.get_config_value('popular_pageviews', 'query')
- date = (datetime.datetime.utcnow() -
datetime.timedelta(days=days)).strftime(date_format)
- query = query.format(source=s, date=date)
- try:
- data = fetcher.get(query)
- except ValueError:
- return []
+ recommendations = []
+ for item in results:
+ rec = recommendation.Recommendation(item['title'])
+ rec.wikidata_id = item['wikidata_id']
+ rec.rank = item['score']
+ recommendations.append(rec)
- article_pv_tuples = []
-
- try:
- for d in data['items'][0]['articles']:
- article_pv_tuples.append((d['article'], d['views']))
- except:
- log.info('Could not get most popular articles for %s from pageview
API. Try using a seed article.', s)
-
- return article_pv_tuples
-
- def get_candidates(self, s, seed, n):
- """
- Wrap top articles in a list of Article objects
- """
- articles = []
- article_pv_tuples = sorted(self.query_pageviews(s), key=lambda x:
random.random())
-
- for i, t in enumerate(article_pv_tuples):
- a = Article(t[0])
- a.rank = i
- articles.append(a)
-
- return articles[:n]
-
-
-class MorelikeCandidateFinder(CandidateFinder):
- """
- Utility class for getting articles that are similar to
- a given seed article in a source Wikipedia via "morelike"
- search
- """
-
- def get_morelike_candidates(self, s, query, n):
- """
- Perform a "morelike" search via the Mediawiki search API.
- First map the query to an article via standard search,
- and then get a list of related articles via morelike search
- """
- seed_list = fetcher.wiki_search(s, query, 1)
-
- if len(seed_list) == 0:
- log.info('Seed does not map to an article')
- return []
-
- seed = seed_list[0]
- if seed != query:
- log.info('Query: %s Article: %s', query, seed)
- results = fetcher.wiki_search(s, seed, n, morelike=True)
- if results:
- results.insert(0, seed)
- log.info('Successful Morelike Search')
- return results
- else:
- log.info('Failed Morelike Search. Reverting to standard search')
- return fetcher.wiki_search(s, query, n)
-
- def get_candidates(self, s, seed, n):
- """
- Wrap morelike search results into a list of articles
- """
- results = self.get_morelike_candidates(s, seed, n)
-
- articles = []
-
- for i, title in enumerate(results):
- a = Article(title)
- a.rank = i
- articles.append(a)
-
- return articles[:n]
-
-
-class RelatedArticleFinder(CandidateFinder):
- def get_candidates(self, s, seed, n):
- results = fetcher.get_related_articles(s, seed)
- if len(results) == 0:
- log.info('Failed related_articles search. Reverting to morelike.
Source: %s Seed: %s', s, seed)
- return MorelikeCandidateFinder().get_candidates(s, seed, n)
-
- articles = []
- for item in results:
- a = Article(item['title'])
- a.wikidata_id = item['wikidata_id']
- a.rank = 1.0 - item['score']
- articles.append(a)
-
- return articles[:n]
+ return recommendations[:count]
diff --git a/recommendation/api/types/translation/filters.py
b/recommendation/api/types/translation/filters.py
index 641fe9c..92c6399 100644
--- a/recommendation/api/types/translation/filters.py
+++ b/recommendation/api/types/translation/filters.py
@@ -12,14 +12,14 @@
using Wikidata sitelinks
"""
titles = [article.title for article in candidates]
- title_id_dict =
wikidata.get_items_in_source_missing_in_target_by_titles(source, target, titles)
+ titles_to_items =
wikidata.get_items_in_source_missing_in_target_by_titles(source, target, titles)
filtered_articles = []
for article in candidates:
- if article.title in title_id_dict:
+ if article.title in titles_to_items:
# TODO: change this side-effect to be more explicit / non-stateful
- article.wikidata_id = title_id_dict[article.title]
+ article.incorporate_wikidata_item(titles_to_items[article.title])
filtered_articles.append(article)
return filtered_articles
@@ -41,7 +41,7 @@
return [article for article in candidates if ':' not in article.title and
not article.title.startswith('List')]
-def apply_filters(source, target, candidates, count):
+def apply_filters(source, target, candidates):
log.debug('Number of candidates: %d', len(candidates))
candidates = filter_by_title(candidates)
log.debug('Number of candidates after title: %d', len(candidates))
@@ -50,4 +50,4 @@
candidates = filter_by_disambiguation(source, candidates)
log.debug('Number of candidates after disambiguation: %d', len(candidates))
- return candidates[:count]
+ return candidates
diff --git a/recommendation/api/types/translation/pageviews.py
b/recommendation/api/types/translation/pageviews.py
index c9856cd..f37e80f 100644
--- a/recommendation/api/types/translation/pageviews.py
+++ b/recommendation/api/types/translation/pageviews.py
@@ -6,11 +6,12 @@
def set_pageview_data(source, articles):
with concurrent.futures.ThreadPoolExecutor(max_workers=len(articles)) as
executor:
futures = [executor.submit(_get_and_set_pageview_data, source,
article) for article in articles]
- data = [future.result() for future in
concurrent.futures.as_completed(futures)]
+ data = [future.result() for future in futures]
return data
def _get_and_set_pageview_data(source, article):
- pageviews = fetcher.get_pageviews(source, article.title)
- article.pageviews = pageviews
+ if article.pageviews is None:
+ pageviews = fetcher.get_pageviews(source, article.title)
+ article.pageviews = pageviews
return article
diff --git a/recommendation/api/types/translation/recommendation.py
b/recommendation/api/types/translation/recommendation.py
new file mode 100644
index 0000000..a49ec7a
--- /dev/null
+++ b/recommendation/api/types/translation/recommendation.py
@@ -0,0 +1,21 @@
+class Recommendation:
+ def __init__(self, title):
+ self.title = title
+ self.wikidata_id = None
+ self.rank = None
+ self.pageviews = None
+ self.url = None
+ self.sitelink_count = None
+
+ def __dict__(self):
+ return dict(title=self.title,
+ wikidata_id=self.wikidata_id,
+ rank=self.rank,
+ pageviews=self.pageviews,
+ url=self.url,
+ sitelink_count=self.sitelink_count)
+
+ def incorporate_wikidata_item(self, item):
+ self.wikidata_id = item.id
+ self.url = item.url
+ self.sitelink_count = item.sitelink_count
diff --git a/recommendation/api/types/translation/test/test_api.py
b/recommendation/api/types/translation/test/test_api.py
index 75dff86..d05743f 100644
--- a/recommendation/api/types/translation/test/test_api.py
+++ b/recommendation/api/types/translation/test/test_api.py
@@ -4,18 +4,18 @@
import urllib.parse
from recommendation.api.types.translation import translation
-from recommendation.api.types.translation import utils
from recommendation.api.types.translation import filters
+from recommendation.api.types.translation import recommendation
GOOD_RESPONSE = [
- {'title': 'A', 'pageviews': 10, 'wikidata_id': '123'},
- {'title': 'B', 'pageviews': 11, 'wikidata_id': '122'},
- {'title': 'C', 'pageviews': 12, 'wikidata_id': '121'},
- {'title': 'D', 'pageviews': 13, 'wikidata_id': '120'},
- {'title': 'E', 'pageviews': 14, 'wikidata_id': '119'},
- {'title': 'F', 'pageviews': 15, 'wikidata_id': '118'},
- {'title': 'G', 'pageviews': 16, 'wikidata_id': '117'},
- {'title': 'H', 'pageviews': 17, 'wikidata_id': '116'},
+ {'title': 'A', 'pageviews': 9, 'wikidata_id': '123', 'rank': 9.0},
+ {'title': 'B', 'pageviews': 8, 'wikidata_id': '122', 'rank': 8.0},
+ {'title': 'C', 'pageviews': 7, 'wikidata_id': '121', 'rank': 7.0},
+ {'title': 'D', 'pageviews': 6, 'wikidata_id': '120', 'rank': 6.0},
+ {'title': 'E', 'pageviews': 5, 'wikidata_id': '119', 'rank': 5.0},
+ {'title': 'F', 'pageviews': 4, 'wikidata_id': '118', 'rank': 4.0},
+ {'title': 'G', 'pageviews': 3, 'wikidata_id': '117', 'rank': 3.0},
+ {'title': 'H', 'pageviews': 2, 'wikidata_id': '116', 'rank': 2.0}
]
@@ -120,32 +120,28 @@
def test_recommend_uses_mostpopular_if_no_seed_is_specified(monkeypatch):
- class MockFinder:
- @classmethod
- def get_candidates(cls, s, seed, n):
- return []
+ def mock_finder(*_):
+ return []
- monkeypatch.setattr(translation, 'finder_map', {'mostpopular': MockFinder})
+ monkeypatch.setattr(translation, 'finder_map', {'mostpopular':
mock_finder})
result = translation.recommend(source='xx', target='yy',
search='customsearch', seed=None, count=12,
include_pageviews=True)
assert [] == result
def test_generated_recommend_response_is_marshalled(client, get_url,
monkeypatch):
- class MockFinder:
- @classmethod
- def get_candidates(cls, s, seed, n):
- articles = []
- for item in GOOD_RESPONSE:
- article = utils.Article(item['title'])
- article.pageviews = item['pageviews']
- article.wikidata_id = item['wikidata_id']
- article.rank = article.pageviews
- articles.append(article)
- return articles
+ def mock_finder(*_):
+ articles = []
+ for item in GOOD_RESPONSE:
+ article = recommendation.Recommendation(item['title'])
+ article.pageviews = item['pageviews']
+ article.wikidata_id = item['wikidata_id']
+ article.rank = item['rank']
+ articles.append(article)
+ return articles
- monkeypatch.setattr(translation, 'finder_map', {'mostpopular': MockFinder})
- monkeypatch.setattr(filters, 'apply_filters', lambda source, target, recs,
count: recs)
+ monkeypatch.setattr(translation, 'finder_map', {'mostpopular':
mock_finder})
+ monkeypatch.setattr(filters, 'apply_filters', lambda source, target, recs:
recs)
result = client.get(get_url(dict(s='xx', t='yy', pageviews=False)))
assert GOOD_RESPONSE == json.loads(result.data.decode('utf-8'))
diff --git
a/recommendation/api/types/translation/test/test_candidate_finders.py
b/recommendation/api/types/translation/test/test_candidate_finders.py
index ae5dd62..79632ea 100644
--- a/recommendation/api/types/translation/test/test_candidate_finders.py
+++ b/recommendation/api/types/translation/test/test_candidate_finders.py
@@ -76,17 +76,15 @@
def get_good_response(finder):
- finder_type = type(finder)
- if finder_type is candidate_finders.PageviewCandidateFinder:
+ if finder is candidate_finders.get_top_pageview_candidates:
return PAGEVIEW_RESPONSE
- if finder_type is candidate_finders.MorelikeCandidateFinder:
+ if finder is candidate_finders.get_morelike_candidates:
return MORELIKE_RESPONSE
return {}
def get_bad_response(finder):
- finder_type = type(finder)
- if finder_type is candidate_finders.PageviewCandidateFinder:
+ if finder is candidate_finders.get_top_pageview_candidates:
return BAD_PAGEVIEW_RESPONSE
return {}
@@ -97,11 +95,11 @@
@pytest.fixture(params=[
- candidate_finders.PageviewCandidateFinder,
- candidate_finders.MorelikeCandidateFinder
+ candidate_finders.get_top_pageview_candidates,
+ candidate_finders.get_morelike_candidates
])
def finder(request):
- return request.param()
+ return request.param
@pytest.mark.parametrize('count', [
@@ -110,17 +108,13 @@
])
def test_finder_returns_correct_amount(finder, count):
add_response(json.dumps(get_good_response(finder)), 200)
- result = finder.get_candidates('en', None, count)
+ result = finder('en', None, count)
assert count == len(result)
-
-
-def test_inheritance(finder):
- assert isinstance(finder, candidate_finders.CandidateFinder)
def test_invalid_language_returns_empty_list(finder):
add_response(body=json.dumps(get_bad_response(finder)), status=404)
- result = finder.get_candidates('qqq', None, 1)
+ result = finder('qqq', None, 1)
assert [] == result
@@ -131,16 +125,16 @@
])
def test_finder_returns_empty_list_when_requests_breaks(finder, body, status):
add_response(body=body, status=status)
- assert [] == finder.get_candidates('en', None, 10)
+ assert [] == finder('en', None, 10)
def test_finder_calls_go_through_responses(finder):
- if type(finder) is candidate_finders.MorelikeCandidateFinder:
+ if finder is candidate_finders.get_morelike_candidates:
# the number of calls is determined by other factors
# that are tested more thoroughly elsewhere
return
add_response(body=json.dumps(get_good_response(finder)), status=200)
- finder.get_candidates('en', None, 10)
+ finder('en', None, 10)
assert 1 == len(responses.calls)
url = responses.calls[0].request.url
assert 'http://localhost' == url[:16]
@@ -164,7 +158,7 @@
:param seed_response: the response to the initial seed_list query
:param morelike_response: the response to the morelike query
"""
- finder = candidate_finders.MorelikeCandidateFinder()
+ finder = candidate_finders.get_morelike_candidates
search_pattern = dict(
seed=(seed, 1, False, seed_response),
morelike=(query, 10, True, morelike_response),
@@ -174,5 +168,5 @@
url, params = fetcher.build_wiki_search('en', query, count, morelike)
url += '?' + urllib.parse.urlencode(params)
responses.add(responses.GET, url, json=response, status=200,
match_querystring=True)
- finder.get_candidates('en', seed, 10)
+ finder('en', seed, 10)
assert expected_calls == len(responses.calls)
diff --git a/recommendation/api/types/translation/test/test_filters.py
b/recommendation/api/types/translation/test/test_filters.py
index 5a46499..01f7d61 100644
--- a/recommendation/api/types/translation/test/test_filters.py
+++ b/recommendation/api/types/translation/test/test_filters.py
@@ -3,8 +3,8 @@
import re
from recommendation.api.types.translation import filters
+from recommendation.api.types.translation import recommendation
from recommendation.utils import configuration
-from recommendation.api.types.translation import utils
SOURCE = 'xx'
@@ -19,7 +19,7 @@
def query(the_filter):
if the_filter is filters.filter_by_missing:
- return the_filter(SOURCE, SOURCE, [utils.Article('something')])
+ return the_filter(SOURCE, SOURCE,
[recommendation.Recommendation('something')])
if the_filter is filters.filter_by_disambiguation:
return the_filter(SOURCE, '')
@@ -64,7 +64,7 @@
'Double Word'
])
def test_filter_by_good_title(title):
- candidates = [utils.Article(title)]
+ candidates = [recommendation.Recommendation(title)]
result = filters.filter_by_title(candidates)
assert candidates == result
@@ -75,6 +75,6 @@
'and cannot appear : anywhere'
])
def test_filter_by_bad_title(title):
- candidates = [utils.Article(title)]
+ candidates = [recommendation.Recommendation(title)]
result = filters.filter_by_title(candidates)
assert [] == result
diff --git a/recommendation/api/types/translation/test/test_pageviews.py
b/recommendation/api/types/translation/test/test_pageviews.py
index f4c1a9d..f362b0f 100644
--- a/recommendation/api/types/translation/test/test_pageviews.py
+++ b/recommendation/api/types/translation/test/test_pageviews.py
@@ -6,7 +6,7 @@
from recommendation.api.external_data import fetcher
from recommendation.api.types.translation import pageviews
-from recommendation.api.types.translation import utils
+from recommendation.api.types.translation import recommendation
from recommendation.utils import configuration
TITLE = 'Sample_Title'
@@ -36,7 +36,7 @@
def run_getter():
- articles = [utils.Article(TITLE)]
+ articles = [recommendation.Recommendation(TITLE)]
result = pageviews.set_pageview_data(SOURCE, articles)
assert result == articles
return result
diff --git a/recommendation/api/types/translation/translation.py
b/recommendation/api/types/translation/translation.py
index cedc934..b263414 100644
--- a/recommendation/api/types/translation/translation.py
+++ b/recommendation/api/types/translation/translation.py
@@ -20,7 +20,7 @@
legacy = helper.build_api('legacy', __name__, url_prefix='/api')
-ArticleSpec = collections.namedtuple('Article', ['pageviews', 'title',
'wikidata_id'])
+ArticleSpec = collections.namedtuple('Article', ['pageviews', 'title',
'wikidata_id', 'rank'])
def get_legacy_params():
@@ -66,7 +66,8 @@
legacy_model = legacy.model(ArticleSpec.__name__, ArticleSpec(
pageviews=fields.Integer(description='pageviews', required=False),
title=fields.String(description='title', required=True),
- wikidata_id=fields.String(description='wikidata_id', required=True)
+ wikidata_id=fields.String(description='wikidata_id', required=True),
+ rank=fields.Float(description='rank', required=True)
)._asdict())
legacy_doc = dict(description='Gets recommendations of source articles that
are missing in the target',
@@ -126,6 +127,12 @@
required=False,
default='morelike',
choices=['morelike', 'wiki', 'related_articles'])
+ v1_params.add_argument(
+ 'rank_method',
+ type=str,
+ required=False,
+ default='default',
+ choices=['default', 'sitelinks'])
return v1_params
@@ -155,7 +162,8 @@
article_model = v1.model(ArticleSpec.__name__, ArticleSpec(
pageviews=fields.Integer(description='pageviews', required=False),
title=fields.String(description='title', required=True),
- wikidata_id=fields.String(description='wikidata_id', required=True)
+ wikidata_id=fields.String(description='wikidata_id', required=True),
+ rank=fields.Float(description='rank', required=True)
)._asdict())
@@ -173,35 +181,41 @@
finder_map = {
- 'morelike': candidate_finders.MorelikeCandidateFinder(),
- 'wiki': candidate_finders.MorelikeCandidateFinder(),
- 'mostpopular': candidate_finders.PageviewCandidateFinder(),
- 'related_articles': candidate_finders.RelatedArticleFinder()
+ 'morelike': candidate_finders.get_morelike_candidates,
+ 'wiki': candidate_finders.get_morelike_candidates,
+ 'mostpopular': candidate_finders.get_top_pageview_candidates,
+ 'related_articles': candidate_finders.get_related_articles
}
-def recommend(source, target, search, seed, count, include_pageviews,
max_candidates=500):
+def recommend(source, target, search, seed, count, include_pageviews,
rank_method='default', max_candidates=500):
"""
1. Use finder to select a set of candidate articles
2. Filter out candidates that are not missing, are disambiguation pages,
etc
3. get pageview info for each passing candidate if desired
"""
- recs = []
+ candidates = []
if seed:
finder = finder_map[search]
for seed in seed.split('|'):
- recs.extend(finder.get_candidates(source, seed, max_candidates))
+ candidates.extend(finder(source, seed, max_candidates))
else:
- recs.extend(finder_map['mostpopular'].get_candidates(source, seed,
max_candidates))
+ candidates.extend(finder_map['mostpopular'](source, seed,
max_candidates))
- recs = sorted(recs, key=lambda x: x.rank)
+ recs = filters.apply_filters(source, target, candidates)
- recs = filters.apply_filters(source, target, recs, count)
+ recs = sorted(recs, key=lambda x: x.rank, reverse=True)
+
+ if rank_method == 'sitelinks':
+ for rec in recs:
+ rec.rank = rec.sitelink_count
+ recs = sorted(recs, key=lambda x: x.rank, reverse=True)
+
+ recs = recs[:count]
if recs and include_pageviews:
recs = pageviews.set_pageview_data(source, recs)
- recs = sorted(recs, key=lambda x: x.rank)
- return [{'title': r.title, 'pageviews': r.pageviews, 'wikidata_id':
r.wikidata_id} for r in recs]
+ return [r.__dict__() for r in recs]
diff --git a/recommendation/api/types/translation/utils.py
b/recommendation/api/types/translation/utils.py
deleted file mode 100644
index bae125b..0000000
--- a/recommendation/api/types/translation/utils.py
+++ /dev/null
@@ -1,10 +0,0 @@
-class Article:
- """
- Struct containing meta-data for an article
- """
-
- def __init__(self, title):
- self.title = title
- self.wikidata_id = None
- self.rank = None
- self.pageviews = None
diff --git a/recommendation/data/recommendation.ini
b/recommendation/data/recommendation.ini
index 6b4fb64..5c3166e 100644
--- a/recommendation/data/recommendation.ini
+++ b/recommendation/data/recommendation.ini
@@ -37,20 +37,12 @@
srlimit =
srsearch =
-[wikidata_titles_to_items_params]
+[wikidata_query_params]
action = wbgetentities
props = sitelinks/urls
format = json
+origin = *
sites = {source}wiki
-sitefilter = {target}wiki
-titles =
-
-[wikidata_items_to_titles_params]
-action = wbgetentities
-props = sitelinks/urls
-format = json
-sitefilter = {source}wiki
-ids =
[enabled_services]
gapfinder = True
diff --git a/setup.py b/setup.py
index cae73e8..37e8099 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
setup(
name='recommendation',
- version='0.1.11',
+ version='0.2.0',
url='https://github.com/wikimedia/research-recommendation-api',
license='Apache Software License',
maintainer='Wikimedia Research',
--
To view, visit https://gerrit.wikimedia.org/r/337626
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I64fe7b620d4230d3548ab24412ceb7c3e39a7926
Gerrit-PatchSet: 1
Gerrit-Project: research/recommendation-api
Gerrit-Branch: master
Gerrit-Owner: Nschaaf <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits