Merlijn van Deen has uploaded a new change for review. https://gerrit.wikimedia.org/r/86813
Change subject: Pagegenerators: Add filter for article-bodies ...................................................................... Pagegenerators: Add filter for article-bodies Add a filter that matches a regex against the bodies of all pages returned by the following generators. Change-Id: I90d29e4762bec1f8259dbdd5ef2f2adc9bd578f9 --- M pywikibot/pagegenerators.py 1 file changed, 34 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/13/86813/1 diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index 5be6e16..1e1e866 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -159,6 +159,21 @@ -page Work on a single page. Argument can also be given as "-page:pagetitle". + +-contentregex A regular expression that needs to match the article + otherwise the page won't be returned. The filter works + for all subsequent generators. + + Example: + pagegenerators.py \\ + -family:wikipedia -lang:en \\ + -recentchanges:5 \\ + -contentregex:'.*Thor.*' + -cat:Thunder_gods \\ + -cat:Sky_and_weather_gods + This will find the five most recently edited pages + and all pages in the categories 'Thunder gods' and + 'Sky and weather gods' that refer to 'Thor'. """ docuReplacements = {'¶ms;': parameterHelp} @@ -178,6 +193,7 @@ self.namespaces = [] self.step = None self.limit = None + self.articlefilter = None def getCombinedGenerator(self): """Return the combination of all accumulated generators. @@ -453,12 +469,21 @@ else: regex = arg[7:] gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex) + elif arg.startswith('-contentregex'): + if len(arg) == 13: + self.articlefilter = pywikibot.input(u'Please enter your filter-expression:') + else: + self.articlefilter = arg[14:] + return True # No generator is returned, so just stop here. elif arg.startswith('-yahoo'): gen = YahooSearchPageGenerator(arg[7:]) else: pass if gen: - self.gens.append(gen) + if self.articlefilter: + self.gens.append(RegexBodyFilterPageGenerator(gen, self.articlefilter)) + else: + self.gens.append(gen) return True else: return False @@ -776,6 +801,14 @@ yield page +def RegexBodyFilterPageGenerator(generator, regex): + """Yield pages from another generator whose body matches regex with options re.IGNORECASE|re.DOTALL.""" + reg = re.compile(regex, re.IGNORECASE | re.DOTALL) + for page in generator: + if reg.match(page.text): + yield page + + def CombinedPageGenerator(generators): return itertools.chain(*generators) -- To view, visit https://gerrit.wikimedia.org/r/86813 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I90d29e4762bec1f8259dbdd5ef2f2adc9bd578f9 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Merlijn van Deen <valhall...@arctus.nl> Gerrit-Reviewer: NikiWiki <mediawikidev100...@spam.game-host.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits