jenkins-bot has submitted this change and it was merged.

Change subject: Make categorisation project aware
......................................................................


Make categorisation project aware

Also:
* Fixed an issue where non-category sitelinks on wikidata
  could be added as a category.
* Added a test for the wikidata method
* Removed unused method and variable.

Not done:
* Could not successfully mock NoPage triggers in the wikidata
  method. Attempt left commented out.

Bug:T111618
Change-Id: Ie1804ec682b284fcbe6cf154b3455992d9112fd5
---
M erfgoedbot/categorize_images.py
M tests/test_categorization.py
2 files changed, 141 insertions(+), 72 deletions(-)

Approvals:
  Jean-Frédéric: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/erfgoedbot/categorize_images.py b/erfgoedbot/categorize_images.py
index 47ca122..95f75e1 100644
--- a/erfgoedbot/categorize_images.py
+++ b/erfgoedbot/categorize_images.py
@@ -41,7 +41,7 @@
     pass
 
 # Contains the commonscat templates for most Wikipedia's (taken from 
ex-commonscat.py)
-commonscatTemplates = {
+wikipedia_commonscat_templates = {
     '_default': (u'Commonscat', []),
     'af': (u'CommonsKategorie', [u'commonscat']),
     'an': (u'Commonscat', [u'Commons cat']),
@@ -269,7 +269,7 @@
 
 def get_new_categories(monumentId, monData, lang, commonsCatTemplates):
     (monumentName, monumentCommonscat,
-     monumentArticleTitle, monumentSource) = monData
+     monumentArticleTitle, monumentSource, project) = monData
     commons_site = pywikibot.Site(u'commons', u'commons')
     newcats = []
     # First try to add a category based on the commonscat field in the list
@@ -285,12 +285,13 @@
         except pywikibot.exceptions.InvalidTitle:
             pywikibot.warning(
                 u'Incorrect category title %s' % (monumentCommonscat,))
+
     # Option two is to use the article about the monument and see if it has
     # Commonscat links
     if not newcats:
         monumentArticle = None
         if monumentArticleTitle:
-            project_site = pywikibot.Site(lang, u'wikipedia')
+            project_site = pywikibot.Site(lang, project)
             monumentArticle = pywikibot.Page(project_site, 
monumentArticleTitle)
         if monumentArticle:
             try:
@@ -312,9 +313,10 @@
             except pywikibot.exceptions.Error as e:
                 pywikibot.error(u'Error occured with monument %s: %s' % (
                     monumentId, str(e)))
+
     # Option three is to see if the list contains Commonscat links (whole list)
     if not newcats:
-        monumentList = getList(lang, monumentSource)
+        monumentList = getList(lang, project, monumentSource)
         # print monumentList
         if not monumentList:
             return False
@@ -378,10 +380,11 @@
 
 
 def getMonData(countrycode, lang, monumentId, conn, cursor):
-    '''
-    Get monument name and source from db
-    '''
-    query = u"""SELECT `name`, `commonscat`, `monument_article`, `source` FROM 
monuments_all WHERE (country=%s AND lang=%s AND id=%s) LIMIT 1"""
+    """Get monument name and source from db."""
+    query = u"SELECT `name`, `commonscat`, `monument_article`, `source`, 
`project` " \
+            u"FROM monuments_all " \
+            u"WHERE (country=%s AND lang=%s AND id=%s) " \
+            u"LIMIT 1"
 
     cursor.execute(query, (countrycode, lang, monumentId))
 
@@ -393,38 +396,18 @@
         return False
 
 
-def getArticle(lang, monumentName):
-    '''
-    Get monument article page from wikilink at monumentName
-    '''
-
-    if monumentName:
-        regex = u'^\[\[(.+?)(\||\])'
-
-        match = re.search(regex, monumentName)
-        if not match:
-            return False
-
-        page_title = match.group(1)
-        site = pywikibot.Site(lang, u'wikipedia')
-
-        return pywikibot.Page(site, page_title)
-    else:
-        return False
-
-
-def getList(lang, monumentSource):
+def getList(lang, project, monumentSource):
     '''
     Get listpage
     '''
     if monumentSource:
-        regex = u'^(https:)?//%s.wikipedia.org/w/index.php\?title=(.+?)&' % (
-            lang,)
+        regex = u'^(https:)?//%s.%s.org/w/index.php\?title=(.+?)&' % (
+            lang, project)
         match = re.search(regex, monumentSource)
         if not match:
             return False
         page_title = match.group(2)
-        site = pywikibot.Site(lang, u'wikipedia')
+        site = pywikibot.Site(lang, project)
         return pywikibot.Page(site, page_title)
     else:
         return False
@@ -504,12 +487,13 @@
 
 
 def get_Commons_category_via_Wikidata(page):
-    '''
+    """
     Get Commons Category from the linked Wikidata item and P373.
 
-    Raises: NoCommonsCatFromWikidataItemException if either there is no linked 
item
-            or it does not bear P373 or a sitelink to Commons
-    '''
+    Raises: NoCommonsCatFromWikidataItemException if either there is no linked
+            item or it does not bear P373 or a sitelink to a category page on
+            Commons.
+    """
     try:
         data_item = page.data_item()
         claims = data_item.get()['claims']
@@ -517,8 +501,12 @@
             return 'Category:' + claims['P373'][0].getTarget()
         else:
             commons_site = pywikibot.Site(u'commons', u'commons')
-            return data_item.getSitelink(commons_site)
-    except (pywikibot.NoPage, KeyError):
+            commons_page = data_item.getSitelink(commons_site)
+            if commons_page.startswith('Category:'):
+                return commons_page
+            else:
+                raise NoCommonsCatFromWikidataItemException(page)
+    except (pywikibot.NoPage, KeyError, NoCommonsCatFromWikidataItemException):
         raise NoCommonsCatFromWikidataItemException(page)
 
 
@@ -544,7 +532,6 @@
 
     site = pywikibot.Site(u'commons', u'commons')
     generator = None
-    genFactory = pagegenerators.GeneratorFactory()
     commonsTemplate = countryconfig.get('commonsTemplate')
 
     if overridecat:
@@ -614,16 +601,20 @@
     page.put(newtext=output, comment=comment)
 
 
-def getCommonscatTemplates(lang=None):
-    '''Get the template name in a language. Expects the language code.
-    Return as tuple containing the primary template and it's alternatives
+def getCommonscatTemplates(lang=None, project=None):
+    """
+    Get the template name in a language on a project.
 
-    '''
+    Expects the language code and project.
+    Return as tuple containing the primary template and it's alternatives
+    """
+    project = project or u'wikipedia'  # default to wikipedia
+
     result = []
-    if lang in commonscatTemplates:
-        (prim, backups) = commonscatTemplates[lang]
+    if project == u'wikipedia' and lang in wikipedia_commonscat_templates:
+        (prim, backups) = wikipedia_commonscat_templates[lang]
     else:
-        (prim, backups) = commonscatTemplates[u'_default']
+        (prim, backups) = wikipedia_commonscat_templates[u'_default']
     result.append(prim)
     result = result + backups
     return result
@@ -654,16 +645,19 @@
             return False
         pywikibot.log(
             u'Working on countrycode "%s" in language "%s"' % (countrycode, 
lang))
-        commonsCatTemplates = getCommonscatTemplates(lang)
+        countryconfig = mconfig.countries.get((countrycode, lang))
+        commonsCatTemplates = getCommonscatTemplates(
+            lang, countryconfig.get('project'))
         # print commonsCatTemplates
-        processCountry(countrycode, lang, mconfig.countries.get(
-            (countrycode, lang)), commonsCatTemplates, conn, cursor, 
overridecat=overridecat)
+        processCountry(countrycode, lang, countryconfig, commonsCatTemplates,
+                       conn, cursor, overridecat=overridecat)
     else:
         statistics = []
         for (countrycode, lang), countryconfig in 
mconfig.countries.iteritems():
             pywikibot.log(
                 u'Working on countrycode "%s" in language "%s"' % 
(countrycode, lang))
-            commonsCatTemplates = getCommonscatTemplates(lang)
+            commonsCatTemplates = getCommonscatTemplates(
+                lang, countryconfig.get('project'))
             result = processCountry(
                 countrycode, lang, countryconfig, commonsCatTemplates, conn, 
cursor)
             if result:
diff --git a/tests/test_categorization.py b/tests/test_categorization.py
index e9eb67e..93e4d1e 100644
--- a/tests/test_categorization.py
+++ b/tests/test_categorization.py
@@ -1,12 +1,10 @@
+"""Unit tests for categorize_images."""
 
+# from pywikibot.site import APISite
+# from pywikibot.exceptions import NoPage
 import unittest
-from mock import Mock
-from erfgoedbot.categorize_images import \
-    replace_default_cat_with_new_categories_in_image_text, \
-    filter_out_categories_to_add, \
-    remove_base_category_from_categories_to_add_if_present, \
-    deduplicate_categories, \
-    NoCategoryToAddException
+import mock
+from erfgoedbot import categorize_images
 
 
 class TestReplaceCategories(unittest.TestCase):
@@ -14,15 +12,16 @@
     """Test the replace_default_cat_with_new_categories_in_image_text 
method."""
 
     def setUp(self):
-        self.mock_old_category = Mock()
+        self.mock_old_category = mock.Mock()
         self.mock_old_category.title.return_value = "A"
-        self.mock_old_category.__str__ = Mock()
+        self.mock_old_category.__str__ = mock.Mock()
         self.mock_old_category.__str__.return_value = 'A'
 
     def 
test_replace_categories_with_no_category_present_should_add_new_category(self):
         old_text = ""
         new_categories = ["B"]
-        new_text = 
replace_default_cat_with_new_categories_in_image_text(old_text, 
self.mock_old_category, new_categories)
+        new_text = 
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+            old_text, self.mock_old_category, new_categories)
         expected_text = u'[[Category:B]]'
         self.assertEquals(new_text, expected_text)
 
@@ -31,34 +30,36 @@
         [[Category:A]]
         """
         new_categories = ["B"]
-        new_text = 
replace_default_cat_with_new_categories_in_image_text(old_text, 
self.mock_old_category, new_categories)
+        new_text = 
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+            old_text, self.mock_old_category, new_categories)
         expected_text = u'[[Category:B]]'
         self.assertEquals(new_text, expected_text)
 
     def 
test_replace_categories_with_no_categories_to_add_raise_Exception(self):
         old_text = ""
         new_categories = []
-        with self.assertRaises(NoCategoryToAddException):
-            new_text = 
replace_default_cat_with_new_categories_in_image_text(old_text, 
self.mock_old_category, new_categories)
+        with self.assertRaises(categorize_images.NoCategoryToAddException):
+            
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+                old_text, self.mock_old_category, new_categories)
 
 
 class TestDeduplicateCategories(unittest.TestCase):
 
     def test_deduplicate_categories(self):
         new_categories = ['A', 'B', 'A']
-        result = deduplicate_categories(new_categories)
+        result = categorize_images.deduplicate_categories(new_categories)
         expected = ['A', 'B']
         self.assertItemsEqual(result, expected)
 
     def test_deduplicate_categories_nothing_to_deduplicate(self):
         new_categories = ['A', 'B']
-        result = deduplicate_categories(new_categories)
+        result = categorize_images.deduplicate_categories(new_categories)
         expected = ['A', 'B']
         self.assertItemsEqual(result, expected)
 
     def test_deduplicate_categories_no_categories(self):
         new_categories = []
-        result = deduplicate_categories(new_categories)
+        result = categorize_images.deduplicate_categories(new_categories)
         expected = []
         self.assertItemsEqual(result, expected)
 
@@ -66,11 +67,13 @@
 class TestRemoveBaseCategory(unittest.TestCase):
 
     def test_remove_category_with_category_present(self):
-        result = remove_base_category_from_categories_to_add_if_present(['A', 
'B'], 'A')
+        result = 
categorize_images.remove_base_category_from_categories_to_add_if_present(
+            ['A', 'B'], 'A')
         self.assertItemsEqual(result, ['B'])
 
     def test_remove_category_without_category_present(self):
-        result = remove_base_category_from_categories_to_add_if_present(['B', 
'C'], 'A')
+        result = 
categorize_images.remove_base_category_from_categories_to_add_if_present(
+            ['B', 'C'], 'A')
         self.assertItemsEqual(result, ['B', 'C'])
 
 
@@ -79,23 +82,95 @@
     def test_filter_categories_all_present(self):
         new_categories = ['A', 'B']
         current_categories = ['A', 'B', 'C']
-        result = filter_out_categories_to_add(new_categories, 
current_categories)
+        result = categorize_images.filter_out_categories_to_add(
+            new_categories, current_categories)
         self.assertEquals(result, [])
 
     def test_filter_out_some_categories_present(self):
         new_categories = ['A', 'B', 'C']
         current_categories = ['A', 'B']
-        result = filter_out_categories_to_add(new_categories, 
current_categories)
+        result = categorize_images.filter_out_categories_to_add(
+            new_categories, current_categories)
         self.assertEquals(result, ['C'])
 
     def test_filter_out_with_no_categories_among_present(self):
         new_categories = ['A', 'B', 'C']
         current_categories = ['D', 'E']
-        result = filter_out_categories_to_add(new_categories, 
current_categories)
+        result = categorize_images.filter_out_categories_to_add(
+            new_categories, current_categories)
         self.assertItemsEqual(result, ['A', 'B', 'C'])
 
     def test_filter_out_with_no_categories_present(self):
         new_categories = ['A', 'B', 'C']
         current_categories = []
-        result = filter_out_categories_to_add(new_categories, 
current_categories)
+        result = categorize_images.filter_out_categories_to_add(
+            new_categories, current_categories)
         self.assertItemsEqual(result, ['A', 'B', 'C'])
+
+
+class TestGetCommonsCategoryViaWikidata(unittest.TestCase):
+
+    """Tests the get_Commons_category_via_Wikidata method."""
+
+    def setUp(self):
+        self.mock_page = mock.create_autospec(
+            categorize_images.pywikibot.Page)
+        self.mock_data_item = mock.create_autospec(
+            categorize_images.pywikibot.ItemPage)
+        self.mock_claim = mock.create_autospec(
+            categorize_images.pywikibot.Claim)
+
+        self.mock_claim.getTarget.return_value = u'A category'
+        self.mock_data_item.get.return_value = {
+            u'claims': {},
+            u'sitelinks': {},
+        }
+        self.claims = {u'P373': [self.mock_claim, ]}
+        self.page_sitelink = u'Some page'
+        self.category_sitelink = u'Category:Some category'
+
+    # def test_get_Commons_category_via_Wikidata_no_data_item(self):
+    #    self.mock_page.data_item.side_effect = NoPage
+    #    with 
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+    #        
categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+
+    # def test_get_Commons_category_via_Wikidata_no_claim_or_sitelink(self):
+    #    expected_site = APISite("commons", "commons")
+    #    self.mock_page.data_item.return_value = self.mock_data_item
+    #    self.mock_data_item.getSitelink.side_effect = NoPage
+    #    with 
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+    #        
categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+    #        self.mock_data_item.getSitelink.assert_called_once_with(
+    #            expected_site)
+
+    def test_get_Commons_category_via_Wikidata_with_claim(self):
+        self.mock_page.data_item.return_value = self.mock_data_item
+        self.mock_data_item.get.return_value[u'claims'] = self.claims
+        expected = u'Category:A category'
+        self.assertEquals(
+            
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+            expected)
+
+    def test_get_Commons_category_via_Wikidata_with_page_sitelink(self):
+        self.mock_page.data_item.return_value = self.mock_data_item
+        self.mock_data_item.getSitelink.return_value = self.page_sitelink
+        with 
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+            categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+
+    def test_get_Commons_category_via_Wikidata_with_category_sitelink(self):
+        self.mock_page.data_item.return_value = self.mock_data_item
+        self.mock_data_item.getSitelink.return_value = self.category_sitelink
+        expected = u'Category:Some category'
+        self.assertEquals(
+            
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+            expected)
+
+    def 
test_get_Commons_category_via_Wikidata_with_claim_and_category_sitelink(self):
+        """Ensure claim value is chosen over sitelink value."""
+        self.mock_page.data_item.return_value = self.mock_data_item
+        self.mock_data_item.get.return_value[u'claims'] = self.claims
+        self.mock_data_item.getSitelink.return_value = self.category_sitelink
+        expected = u'Category:A category'
+        self.assertEquals(
+            
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+            expected)

-- 
To view, visit https://gerrit.wikimedia.org/r/299273
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie1804ec682b284fcbe6cf154b3455992d9112fd5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Jean-Frédéric <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to