jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/370775 )
Change subject: Harvest monument_article via sparql
......................................................................

Harvest monument_article via sparql

Bug: T172842
Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
---
M erfgoedbot/common.py
M erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/test_common.py
4 files changed, 45 insertions(+), 4 deletions(-)

Approvals:
  Jean-Frédéric: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py
index a6f0369..333409c 100644
--- a/erfgoedbot/common.py
+++ b/erfgoedbot/common.py
@@ -29,6 +29,19 @@
     return '[[{0}]]'.format(page_title)
 
 
+def get_page_from_url(url):
+    """
+    Retrieve the wikipage and site from a page or entity url.
+    """
+    supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
+    pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format(
+        '|'.join(supported_sites))
+    m = re.search(pattern, url)
+    site = (m.group(2), m.group(1))
+    page_name = m.group(4)
+    return (page_name, site)
+
+
 def get_source_page(source, harvest_type=None):
     """
     Retrieve the wikipage and site from the source field.
@@ -43,8 +56,11 @@ site = None page_name = None if harvest_type == 'sparql': - site = ('wikidata', 'www') - page_name = source.split('/')[-1] + try: + return get_page_from_url(source) + except AttributeError: + raise ValueError( + u'Could not find source list ({0})'.format(source)) else: supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format( diff --git a/erfgoedbot/template/wikidata_query.sparql b/erfgoedbot/template/wikidata_query.sparql index 6a3878d..266e6ea 100644 --- a/erfgoedbot/template/wikidata_query.sparql +++ b/erfgoedbot/template/wikidata_query.sparql @@ -1,9 +1,11 @@ # MonumentsDB harvest -SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { +SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { # Make it properties and filter out end time %(select_statement)s . + OPTIONAL { ?monument_article schema:about ?item; + schema:isPartOf <https://%(lang)s.%(project)s.org/>; } . OPTIONAL { ?item wdt:P131 ?admin } . OPTIONAL { ?item wdt:P18 ?image } . OPTIONAL { ?item wdt:P373 ?commonscat } . 
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py index 23e4670..7d86e93 100755 --- a/erfgoedbot/update_database.py +++ b/erfgoedbot/update_database.py @@ -21,6 +21,7 @@ from pywikibot import pagegenerators import monuments_config as mconfig +import common as common from converters import ( extractWikilink, extract_elements_from_template_param, @@ -257,6 +258,9 @@ if params['adminLabel']: params['admin'] = params['adminLabel'].value + if params['monument_article']: + params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value) + params['source'] = params['item'].value params['wd_item'] = params['item'].getID() @@ -473,7 +477,8 @@ sparql_query = sparql_template % dict( select_statement=sparql_select, - lang=countryconfig.get('lang') + lang=countryconfig.get('lang'), + project=countryconfig.get('project') ) # print sparql_query sq = pywikibot.data.sparql.SparqlQuery() diff --git a/tests/test_common.py b/tests/test_common.py index 126eeb7..2f016c0 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -29,6 +29,24 @@ self.assertEquals(result, ('Q123', ('wikidata', 'www'))) +class TestGetPageFromUrl(unittest.TestCase): + + def test_get_page_from_url_entity(self): + source = 'http://www.wikidata.org/entity/Q123' + result = common.get_page_from_url(source) + self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + + def test_get_page_from_url_page(self): + source = 'http://www.wikidata.org/wiki/Q123' + result = common.get_page_from_url(source) + self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + + def test_get_page_from_url_wikipedia(self): + source = 'http://en.wikipedia.org/entity/foo' + result = common.get_page_from_url(source) + self.assertEquals(result, ('foo', ('wikipedia', 'en'))) + + class TestGetSourceLink(unittest.TestCase): def setUp(self): -- To view, visit https://gerrit.wikimedia.org/r/370775 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged 
Gerrit-Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: wikidata
Gerrit-Owner: Lokal Profil <lokal.pro...@gmail.com>
Gerrit-Reviewer: Jean-Frédéric <jeanfrederic.w...@gmail.com>
Gerrit-Reviewer: Multichill <maar...@mdammers.nl>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits