Merlijn van Deen has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/86813


Change subject: Pagegenerators: Add filter for article-bodies
......................................................................

Pagegenerators: Add filter for article-bodies

Add a filter that matches a regex against the bodies of all pages
returned by the following generators.

Change-Id: I90d29e4762bec1f8259dbdd5ef2f2adc9bd578f9
---
M pywikibot/pagegenerators.py
1 file changed, 34 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/13/86813/1

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 5be6e16..1e1e866 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -159,6 +159,21 @@
 
 -page             Work on a single page. Argument can also be given as
                   "-page:pagetitle".
+
+-contentregex     A regular expression that needs to match the article
+                  otherwise the page won't be returned. The filter works
+                  for all subsequent generators.
+
+                  Example:
+                  pagegenerators.py \\
+                         -family:wikipedia -lang:en \\
+                         -recentchanges:5 \\
+                         -contentregex:'.*Thor.*'
+                         -cat:Thunder_gods \\
+                         -cat:Sky_and_weather_gods
+                  This will find the five most recently edited pages
+                  and all pages in the categories 'Thunder gods' and
+                  'Sky and weather gods' that refer to 'Thor'.
 """
 
 docuReplacements = {'&params;': parameterHelp}
@@ -178,6 +193,7 @@
         self.namespaces = []
         self.step = None
         self.limit = None
+        self.articlefilter = None
 
     def getCombinedGenerator(self):
         """Return the combination of all accumulated generators.
@@ -453,12 +469,21 @@
             else:
                 regex = arg[7:]
             gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
+        elif arg.startswith('-contentregex'):
+            if len(arg) == 13:
+                self.articlefilter = pywikibot.input(u'Please enter your filter-expression:')
+            else:
+                self.articlefilter = arg[14:]
+            return True  # No generator is returned, so just stop here.
         elif arg.startswith('-yahoo'):
             gen = YahooSearchPageGenerator(arg[7:])
         else:
             pass
         if gen:
-            self.gens.append(gen)
+            if self.articlefilter:
+                self.gens.append(RegexBodyFilterPageGenerator(gen, self.articlefilter))
+            else:
+                self.gens.append(gen)
             return True
         else:
             return False
@@ -776,6 +801,14 @@
             yield page
 
 
+def RegexBodyFilterPageGenerator(generator, regex):
+    """Yield pages from another generator whose body matches regex with options re.IGNORECASE|re.DOTALL."""
+    reg = re.compile(regex, re.IGNORECASE | re.DOTALL)
+    for page in generator:
+        if reg.match(page.text):
+            yield page
+
+
 def CombinedPageGenerator(generators):
     return itertools.chain(*generators)
 

-- 
To view, visit https://gerrit.wikimedia.org/r/86813
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I90d29e4762bec1f8259dbdd5ef2f2adc9bd578f9
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: NikiWiki <mediawikidev100...@spam.game-host.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to