Vadiraja.k has uploaded a new change for review. https://gerrit.wikimedia.org/r/261877
Change subject: pagegenerators.py: Allow filtering by category ...................................................................... pagegenerators.py: Allow filtering by category Allow generated pages to be filtered by category. A category may have a large number of pages, while using -intersect filtering pages from other generators is more efficient than getting all pages in the category. Add related tests in pagegenerators_tests.py. Bug: T122392 Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870 --- M pywikibot/pagegenerators.py M scripts/i18n M tests/pagegenerators_tests.py 3 files changed, 60 insertions(+), 3 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/77/261877/1 diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index d3a15a9..f45c1c2 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -62,6 +62,9 @@ parameterHelp = u"""\ +-catfilter Filter the page generator to only yield pages in the + specified category. See -cat for argument format. + -cat Work on all pages which are in a specific category. 
Argument can also be given as "-cat:categoryname" or as "-cat:categoryname|fromtitle" (using # instead of | @@ -350,6 +353,7 @@ self.articlefilter_list = [] self.titlefilter_list = [] self.claimfilter_list = [] + self.catfilter_list = [] self.intersect = False self.subpage_max_depth = None self._site = site @@ -422,6 +426,7 @@ if (self.titlefilter_list or self.articlefilter_list or self.claimfilter_list or + self.catfilter_list or self.subpage_max_depth is not None or self.qualityfilter_list): pywikibot.warning( @@ -466,11 +471,14 @@ dupfiltergen = RegexBodyFilterPageGenerator( PreloadingGenerator(dupfiltergen), self.articlefilter_list) + if self.catfilter_list: + dupfiltergen = CategoryFilterPageGenerator( + dupfiltergen, self.catfilter_list, self.site) + return dupfiltergen - def getCategoryGen(self, arg, recurse=False, content=False, - gen_func=None): - """Return generator based on Category defined by arg and gen_func.""" + def getCategory(self, arg): + """Return Category and start as defined by arg.""" categoryname = arg.partition(':')[2] if not categoryname: categoryname = i18n.input( @@ -491,6 +499,12 @@ categoryname) cat = pywikibot.Category(pywikibot.Link(categoryname, defaultNamespace=14)) + return cat, startfrom + + def getCategoryGen(self, arg, recurse=False, content=False, + gen_func=None): + """Return generator based on Category defined by arg and gen_func.""" + cat, startfrom = self.getCategory(arg) return gen_func(cat, start=startfrom, @@ -651,6 +665,10 @@ elif arg.startswith('-catr'): gen = self.getCategoryGen(arg, recurse=True, gen_func=CategorizedPageGenerator) + elif arg.startswith('-catfilter'): + cat, _ = self.getCategory(arg) + self.catfilter_list.append(cat) + return True elif arg.startswith('-category'): gen = self.getCategoryGen(arg, gen_func=CategorizedPageGenerator) elif arg.startswith('-cat'): @@ -1550,6 +1568,22 @@ else: yield page +def CategoryFilterPageGenerator(generator, category_list, site=None): + """ + Wrap a generator to filter 
pages by categories specified. + + @param generator: A generator object + @param category_list: categories used to filter generated pages + @type category_list: list of category objects + + """ + if site is None: + site = pywikibot.Site() + for page in generator: + if all(x in site.pagecategories(page) for x in category_list): + yield page + + # name the generator methods RegexFilterPageGenerator = RegexFilter.titlefilter RegexBodyFilterPageGenerator = RegexFilter.contentfilter diff --git a/scripts/i18n b/scripts/i18n index 8e949fc..eb6ada8 160000 --- a/scripts/i18n +++ b/scripts/i18n -Subproject commit 8e949fce7f77cc97f682780010c0bcc2e2de8f14 +Subproject commit eb6ada896419336ed036ba2ab799faa5d1b334fe diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index ed38066..76d1c28 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -183,6 +183,29 @@ self.assertEqual(len(tuple(gen)), 9) +class TestCategoryFilterPageGenerator(TestCase): + + """Test CategoryFilterPageGenerator method.""" + + family = 'wikisource' + code = 'en' + + base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s' + category_list = ['Category:Validated'] + + def setUp(self): + super(TestCategoryFilterPageGenerator, self).setUp() + self.site = self.get_site() + self.titles = [self.base_title % i for i in range(1, 11)] + self.catfilter_list = [pywikibot.Category(self.site, cat) for cat in self.category_list] + + def test_CategoryFilterPageGenerator(self): + site = self.site + gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site) + gen = pagegenerators.CategoryFilterPageGenerator(gen, self.catfilter_list, site) + self.assertEqual(len(tuple(gen)), 7) + + class TestQualityFilterPageGenerator(TestCase): """Test QualityFilterPageGenerator methods.""" -- To view, visit https://gerrit.wikimedia.org/r/261877 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: 
Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Vadiraja.k <vadi.f...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits