[MediaWiki-commits] [Gerrit] research/recommendation-api[master]: Use related-articles in translation recommendation

2016-12-13 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/326903 )

Change subject: Use related-articles in translation recommendation
..


Use related-articles in translation recommendation

Bug: T151793
Change-Id: Iff6b932606cebb7fc239fdb1d64703525069c782
---
M recommendation/api/external_data/fetcher.py
M recommendation/api/external_data/wikidata.py
M recommendation/api/types/translation/candidate_finders.py
M recommendation/api/types/translation/translation.py
M recommendation/data/labs_setup.sh
5 files changed, 31 insertions(+), 4 deletions(-)

Approvals:
  Nschaaf: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/recommendation/api/external_data/fetcher.py 
b/recommendation/api/external_data/fetcher.py
index 76e8b6b..6d6fdf7 100644
--- a/recommendation/api/external_data/fetcher.py
+++ b/recommendation/api/external_data/fetcher.py
@@ -108,3 +108,8 @@
 seed = 'morelike:' + seed
 params['srsearch'] = seed
 return endpoint, params
+
+
+def get_related_articles(source, seed):
+return get('http://recommend-related-articles.wmflabs.org/types/related_articles/v1/articles',
+   dict(source=source, seed=seed, count=500))
diff --git a/recommendation/api/external_data/wikidata.py 
b/recommendation/api/external_data/wikidata.py
index 82f7178..7359fbc 100644
--- a/recommendation/api/external_data/wikidata.py
+++ b/recommendation/api/external_data/wikidata.py
@@ -9,11 +9,11 @@
 WikidataItem = collections.namedtuple('WikidataItem', ['id', 'title', 'url'])
 
 
-def query(params):
+def query(params, expected_sitelinks=1):
 """
 Query the wikidata endpoint and return a list of WikidataItem
 
- This only includes items that have exactly 1 sitelink
+ This only includes items that have exactly expected_sitelinks sitelink
 """
 endpoint = configuration.get_config_value('endpoints', 'wikidata')
 try:
@@ -28,7 +28,7 @@
 
 for id, entity in entities.items():
 sitelinks = entity.get('sitelinks', {})
-if len(sitelinks.keys()) != 1:
+if len(sitelinks.keys()) != expected_sitelinks:
 continue
 sitelink = sitelinks.popitem()[1]
 
@@ -43,7 +43,12 @@
 def get_items_in_source_missing_in_target_by_titles(source, target, titles):
 params = configuration.get_config_dict('wikidata_titles_to_items_params')
 params['sites'] = params['sites'].format(source=source)
+# We want the sitefilter to include both the source and target
+# wikis. This sets up the scenario where if there is only 1 sitelink
+# present, that means that the article is missing in the target (since
+# the title will have come from the source wiki)
 params['sitefilter'] = params['sitefilter'].format(target=target)
+params['sitefilter'] += '|{}wiki'.format(source)
 params['titles'] = '|'.join(titles)
 
 items = query(params)
diff --git a/recommendation/api/types/translation/candidate_finders.py 
b/recommendation/api/types/translation/candidate_finders.py
index 70ae772..ba73bf2 100644
--- a/recommendation/api/types/translation/candidate_finders.py
+++ b/recommendation/api/types/translation/candidate_finders.py
@@ -112,3 +112,19 @@
 articles.append(a)
 
 return articles[:n]
+
+
+class RelatedArticleFinder(CandidateFinder):
+def get_candidates(self, s, seed, n):
+results = fetcher.get_related_articles(s, seed)
+if len(results) == 0:
+return MorelikeCandidateFinder().get_candidates(s, seed, n)
+
+articles = []
+for item in results:
+a = Article(item['title'])
+a.wikidata_id = item['wikidata_id']
+a.rank = item['score']
+articles.append(a)
+
+return articles[:n]
diff --git a/recommendation/api/types/translation/translation.py 
b/recommendation/api/types/translation/translation.py
index d271c50..f22a880 100644
--- a/recommendation/api/types/translation/translation.py
+++ b/recommendation/api/types/translation/translation.py
@@ -166,6 +166,7 @@
 'morelike': candidate_finders.MorelikeCandidateFinder(),
 'wiki': candidate_finders.MorelikeCandidateFinder(),
 'mostpopular': candidate_finders.PageviewCandidateFinder(),
+'related_articles': candidate_finders.RelatedArticleFinder()
 }
 
 
diff --git a/recommendation/data/labs_setup.sh 
b/recommendation/data/labs_setup.sh
index 8290c97..377b2d1 100755
--- a/recommendation/data/labs_setup.sh
+++ b/recommendation/data/labs_setup.sh
@@ -31,7 +31,7 @@
 cp ${TMP_PATH}/recommendation-api/recommendation/data/* ${ETC_PATH}
 cp ${ETC_PATH}/recommendation.nginx /etc/nginx/sites-available/recommendation
 ln -s /etc/nginx/sites-available/recommendation /etc/nginx/sites-enabled/
-cp ${ETC_PATH}/recommendation.service /etc/systemd/system/multi-user.target/wants/
+cp ${ETC_PATH}/recommendation.service /etc/systemd/system/multi-user.target.wants/
 

[MediaWiki-commits] [Gerrit] research/recommendation-api[master]: Use related-articles in translation recommendation

2016-12-13 Thread Nschaaf (Code Review)
Nschaaf has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/326903 )

Change subject: Use related-articles in translation recommendation
..

Use related-articles in translation recommendation

Bug: T151793
Change-Id: Iff6b932606cebb7fc239fdb1d64703525069c782
---
M recommendation/api/external_data/fetcher.py
M recommendation/api/external_data/wikidata.py
M recommendation/api/types/translation/candidate_finders.py
M recommendation/api/types/translation/translation.py
M recommendation/data/labs_setup.sh
5 files changed, 35 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/research/recommendation-api refs/changes/03/326903/1

diff --git a/recommendation/api/external_data/fetcher.py 
b/recommendation/api/external_data/fetcher.py
index 76e8b6b..6d6fdf7 100644
--- a/recommendation/api/external_data/fetcher.py
+++ b/recommendation/api/external_data/fetcher.py
@@ -108,3 +108,8 @@
 seed = 'morelike:' + seed
 params['srsearch'] = seed
 return endpoint, params
+
+
+def get_related_articles(source, seed):
+return get('http://recommend-related-articles.wmflabs.org/types/related_articles/v1/articles',
+   dict(source=source, seed=seed, count=500))
diff --git a/recommendation/api/external_data/wikidata.py 
b/recommendation/api/external_data/wikidata.py
index 82f7178..7359fbc 100644
--- a/recommendation/api/external_data/wikidata.py
+++ b/recommendation/api/external_data/wikidata.py
@@ -9,11 +9,11 @@
 WikidataItem = collections.namedtuple('WikidataItem', ['id', 'title', 'url'])
 
 
-def query(params):
+def query(params, expected_sitelinks=1):
 """
 Query the wikidata endpoint and return a list of WikidataItem
 
- This only includes items that have exactly 1 sitelink
+ This only includes items that have exactly expected_sitelinks sitelink
 """
 endpoint = configuration.get_config_value('endpoints', 'wikidata')
 try:
@@ -28,7 +28,7 @@
 
 for id, entity in entities.items():
 sitelinks = entity.get('sitelinks', {})
-if len(sitelinks.keys()) != 1:
+if len(sitelinks.keys()) != expected_sitelinks:
 continue
 sitelink = sitelinks.popitem()[1]
 
@@ -43,7 +43,12 @@
 def get_items_in_source_missing_in_target_by_titles(source, target, titles):
 params = configuration.get_config_dict('wikidata_titles_to_items_params')
 params['sites'] = params['sites'].format(source=source)
+# We want the sitefilter to include both the source and target
+# wikis. This sets up the scenario where if there is only 1 sitelink
+# present, that means that the article is missing in the target (since
+# the title will have come from the source wiki)
 params['sitefilter'] = params['sitefilter'].format(target=target)
+params['sitefilter'] += '|{}wiki'.format(source)
 params['titles'] = '|'.join(titles)
 
 items = query(params)
diff --git a/recommendation/api/types/translation/candidate_finders.py 
b/recommendation/api/types/translation/candidate_finders.py
index 70ae772..ba73bf2 100644
--- a/recommendation/api/types/translation/candidate_finders.py
+++ b/recommendation/api/types/translation/candidate_finders.py
@@ -112,3 +112,19 @@
 articles.append(a)
 
 return articles[:n]
+
+
+class RelatedArticleFinder(CandidateFinder):
+def get_candidates(self, s, seed, n):
+results = fetcher.get_related_articles(s, seed)
+if len(results) == 0:
+return MorelikeCandidateFinder().get_candidates(s, seed, n)
+
+articles = []
+for item in results:
+a = Article(item['title'])
+a.wikidata_id = item['wikidata_id']
+a.rank = item['score']
+articles.append(a)
+
+return articles[:n]
diff --git a/recommendation/api/types/translation/translation.py 
b/recommendation/api/types/translation/translation.py
index d271c50..f22a880 100644
--- a/recommendation/api/types/translation/translation.py
+++ b/recommendation/api/types/translation/translation.py
@@ -166,6 +166,7 @@
 'morelike': candidate_finders.MorelikeCandidateFinder(),
 'wiki': candidate_finders.MorelikeCandidateFinder(),
 'mostpopular': candidate_finders.PageviewCandidateFinder(),
+'related_articles': candidate_finders.RelatedArticleFinder()
 }
 
 
diff --git a/recommendation/data/labs_setup.sh 
b/recommendation/data/labs_setup.sh
index 8290c97..228241d 100755
--- a/recommendation/data/labs_setup.sh
+++ b/recommendation/data/labs_setup.sh
@@ -8,6 +8,10 @@
 apt-get install -y git nginx npm python3 python3-pip
 pip3 install --upgrade pip
 
+# Need to add uwsgi to the wheels
+apt-get install -y build-essential python3-dev
+pip3 install uwsgi
+
 rm -rf ${TMP_PATH}
 mkdir -p ${TMP_PATH}
 mkdir -p ${SRV_PATH}/resources
@@ -31,7 +35,7 @@
 cp ${TMP_PATH}/recommendation-api/recommendation/data/* ${ETC_PATH}
 cp ${ETC_PATH}/recommendation.nginx