Vadiraja.k has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/261877

Change subject: pagegenerators.py: Allow filtering by category
......................................................................

pagegenerators.py: Allow filtering by category

Allow generated pages to be filtered by category.
A category may have a large number of pages; while using -intersect,
filtering pages from other generators is more efficient than getting all
pages in the category.
Add related tests in tests/pagegenerators_tests.py.

Bug: T122392
Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870
---
M pywikibot/pagegenerators.py
M scripts/i18n
M tests/pagegenerators_tests.py
3 files changed, 60 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/77/261877/1

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d3a15a9..f45c1c2 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -62,6 +62,9 @@
 
 parameterHelp = u"""\
 
+-catfilter        Filter the page generator to only yield pages in the
+                  specified category. See -cat for argument format.
+
 -cat              Work on all pages which are in a specific category.
                   Argument can also be given as "-cat:categoryname" or
                   as "-cat:categoryname|fromtitle" (using # instead of |
@@ -350,6 +353,7 @@
         self.articlefilter_list = []
         self.titlefilter_list = []
         self.claimfilter_list = []
+        self.catfilter_list = []
         self.intersect = False
         self.subpage_max_depth = None
         self._site = site
@@ -422,6 +426,7 @@
             if (self.titlefilter_list or
                 self.articlefilter_list or
                 self.claimfilter_list or
+                self.catfilter_list or
                 self.subpage_max_depth is not None or
                     self.qualityfilter_list):
                 pywikibot.warning(
@@ -466,11 +471,14 @@
             dupfiltergen = RegexBodyFilterPageGenerator(
                 PreloadingGenerator(dupfiltergen), self.articlefilter_list)
 
+        if self.catfilter_list:
+            dupfiltergen = CategoryFilterPageGenerator(
+                dupfiltergen, self.catfilter_list, self.site)
+
         return dupfiltergen
 
-    def getCategoryGen(self, arg, recurse=False, content=False,
-                       gen_func=None):
-        """Return generator based on Category defined by arg and gen_func."""
+    def getCategory(self, arg):
+        """Return Category and start as defined by arg."""
         categoryname = arg.partition(':')[2]
         if not categoryname:
             categoryname = i18n.input(
@@ -491,6 +499,12 @@
                                              categoryname)
         cat = pywikibot.Category(pywikibot.Link(categoryname,
                                                 defaultNamespace=14))
+        return cat, startfrom
+
+    def getCategoryGen(self, arg, recurse=False, content=False,
+                       gen_func=None):
+        """Return generator based on Category defined by arg and gen_func."""
+        cat, startfrom = self.getCategory(arg)
 
         return gen_func(cat,
                         start=startfrom,
@@ -651,6 +665,10 @@
         elif arg.startswith('-catr'):
             gen = self.getCategoryGen(arg, recurse=True,
                                       gen_func=CategorizedPageGenerator)
+        elif arg.startswith('-catfilter'):
+            cat, _ = self.getCategory(arg)
+            self.catfilter_list.append(cat)
+            return True
         elif arg.startswith('-category'):
             gen = self.getCategoryGen(arg, gen_func=CategorizedPageGenerator)
         elif arg.startswith('-cat'):
@@ -1550,6 +1568,22 @@
         else:
             yield page
 
+def CategoryFilterPageGenerator(generator, category_list, site=None):
+    """
+    Wrap a generator to filter pages by categories specified.
+
+    @param generator: A generator object
+    @param category_list: categories used to filter generated pages
+    @type category_list: list of category objects
+
+    """
+    if site is None:
+        site = pywikibot.Site()
+    for page in generator:
+        if all(x in site.pagecategories(page) for x in category_list):
+            yield page
+
+
 # name the generator methods
 RegexFilterPageGenerator = RegexFilter.titlefilter
 RegexBodyFilterPageGenerator = RegexFilter.contentfilter
diff --git a/scripts/i18n b/scripts/i18n
index 8e949fc..eb6ada8 160000
--- a/scripts/i18n
+++ b/scripts/i18n
-Subproject commit 8e949fce7f77cc97f682780010c0bcc2e2de8f14
+Subproject commit eb6ada896419336ed036ba2ab799faa5d1b334fe
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ed38066..76d1c28 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -183,6 +183,29 @@
         self.assertEqual(len(tuple(gen)), 9)
 
 
+class TestCategoryFilterPageGenerator(TestCase):
+
+    """Test CategoryFilterPageGenerator method."""
+
+    family = 'wikisource'
+    code = 'en'
+
+    base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s'
+    category_list = ['Category:Validated']
+
+    def setUp(self):
+        super(TestCategoryFilterPageGenerator, self).setUp()
+        self.site = self.get_site()
+        self.titles = [self.base_title % i for i in range(1, 11)]
+        self.catfilter_list = [pywikibot.Category(self.site, cat) for cat in self.category_list]
+
+    def test_CategoryFilterPageGenerator(self):
+        site = self.site
+        gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site)
+        gen = pagegenerators.CategoryFilterPageGenerator(gen, self.catfilter_list, site)
+        self.assertEqual(len(tuple(gen)), 7)
+
+
 class TestQualityFilterPageGenerator(TestCase):
 
     """Test QualityFilterPageGenerator methods."""

-- 
To view, visit https://gerrit.wikimedia.org/r/261877
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Vadiraja.k <vadi.f...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to