jenkins-bot has submitted this change and it was merged.

Change subject: Migrate XmlDumpPageGenerator to pagegenerators
......................................................................
Migrate XmlDumpPageGenerator to pagegenerators

Four scripts now use a generic page generator class:
- noreferences
- reflinks
- template
- weblinkchecker

Two scripts still have their own XML page generator:
- redirect
- replace

Bug: T85334
Change-Id: I5b6268673f5db5cc9506bc0e24ab70f72d9af573
---
M pywikibot/pagegenerators.py
M pywikibot/tools/__init__.py
M scripts/noreferences.py
M scripts/reflinks.py
M scripts/template.py
M scripts/weblinkchecker.py
M tests/reflinks_tests.py
7 files changed, 126 insertions(+), 148 deletions(-)

Approvals:
  XZise: Looks good to me, approved
  jenkins-bot: Verified


diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d9a4ead..9f91111 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -42,10 +42,11 @@
     issue_deprecation_warning,
     DequeGenerator,
     intersect_generators,
+    IteratorNextMixin,
     filter_unique,
 )
 
-from pywikibot import date, config, i18n
+from pywikibot import date, config, i18n, xmlreader
 from pywikibot.comms import http
 from pywikibot.exceptions import ArgumentDeprecationWarning
@@ -2331,6 +2332,62 @@
         yield page
 
 
+class XMLDumpOldPageGenerator(IteratorNextMixin):
+
+    """Xml generator that yields Page objects with old text loaded."""
+
+    @deprecated_args(xmlFilename='filename', xmlStart='start')
+    def __init__(self, filename, start=None, namespaces=[], site=None,
+                 text_predicate=None):
+        """Constructor."""
+        # xmlFilename and xmlStart mapped to not break git blame
+        # use filename and start on new/changed lines
+        xmlFilename = filename
+        xmlStart = start
+
+        if text_predicate is None:
+            text_predicate = lambda text: True
+        self.text_predicate = text_predicate
+
+        self.xmlStart = xmlStart
+        self.namespaces = namespaces
+        self.skipping = bool(xmlStart)
+        self.site = site or pywikibot.Site()
+
+        dump = xmlreader.XmlDump(xmlFilename)
+        self.parser = dump.parse()
+
+    def __next__(self):
+        """Get next Page."""
+        while True:
+            try:
+                entry = next(self.parser)
+            except StopIteration:
+                raise
+            if self.skipping:
+                if entry.title != self.xmlStart:
+                    continue
+                self.skipping = False
+            page = pywikibot.Page(self.site, entry.title)
+            if not self.namespaces == []:
+                if page.namespace() not in self.namespaces:
+                    continue
+            if self.text_predicate(entry.text):
+                page.text = entry.text
+                return page
+
+
+class XMLDumpPageGenerator(XMLDumpOldPageGenerator):
+
+    """Xml generator that yields Page objects without text loaded."""
+
+    def __next__(self):
+        """Get next Page from dump and remove the text."""
+        page = super(XMLDumpPageGenerator, self).__next__()
+        del page.text
+        return page
+
+
 def YearPageGenerator(start=1, end=2050, site=None):
     """
     Year page generator.
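
The merged generator above is driven like any other pagegenerator. A minimal
usage sketch, assuming a configured pywikibot install and a locally
downloaded dump; the site, dump path and predicate below are illustrative
placeholders, not part of the change:

import pywikibot
from pywikibot.pagegenerators import XMLDumpOldPageGenerator

site = pywikibot.Site('en', 'wikipedia')  # placeholder site

# Yield Page objects whose dump text mentions a <ref> tag, resuming at
# the page titled 'Example' and restricted to the main and Talk
# namespaces.  page.text holds the revision text from the dump, not the
# live wiki text.
gen = XMLDumpOldPageGenerator(
    'pages-articles.xml',  # placeholder path to a local dump file
    start='Example',
    namespaces=[0, 1],
    site=site,
    text_predicate=lambda text: '<ref' in text)

for page in gen:
    pywikibot.output(page.title())
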
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 4655d91..4a93ca4 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -854,24 +854,27 @@
     """Unicode string with SelfCallMixin."""
 
 
-class DequeGenerator(collections.deque):
+class IteratorNextMixin(collections.Iterator):
+
+    """Backwards compatibility for Iterators."""
+
+    if PY2:
+
+        def next(self):
+            """Python 2 next."""
+            return self.__next__()
+
+
+class DequeGenerator(IteratorNextMixin, collections.deque):
 
     """A generator that allows items to be added during generating."""
 
-    def __iter__(self):
-        """Return the object which will be iterated."""
-        return self
-
-    def next(self):
+    def __next__(self):
         """Python 3 iterator method."""
         if len(self):
             return self.popleft()
         else:
             raise StopIteration
-
-    def __next__(self):
-        """Python 3 iterator method."""
-        return self.next()
 
 
 class ContextManagerWrapper(object):
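
To see what IteratorNextMixin buys, here is a standalone sketch under the
same PY2 convention as the diff: a class implements only the Python 3
__next__ protocol and the mixin aliases Python 2's next() to it.  CountDown
is a hypothetical example class, and collections.Iterator matches the
2015-era import above (it moved to collections.abc in later Pythons):

import collections
import sys

PY2 = sys.version_info[0] == 2  # stand-in for pywikibot.tools.PY2


class IteratorNextMixin(collections.Iterator):

    """Backwards compatibility for Iterators."""

    if PY2:

        def next(self):
            """Python 2 next."""
            return self.__next__()


class CountDown(IteratorNextMixin):

    """Count down from n to 1; only __next__ is defined."""

    def __init__(self, n):
        self.n = n

    def __next__(self):
        if self.n <= 0:
            raise StopIteration
        self.n -= 1
        return self.n + 1


print(list(CountDown(3)))  # [3, 2, 1] on Python 2 and 3 alike
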
diff --git a/scripts/noreferences.py b/scripts/noreferences.py
index 42e6dd0..e2d793d 100755
--- a/scripts/noreferences.py
+++ b/scripts/noreferences.py
@@ -45,9 +45,14 @@
 
 import re
 
+from functools import partial
+
 import pywikibot
 from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot.pagegenerators import (
+    XMLDumpPageGenerator,
+)
 
 # This is required for the text that is shown when you run this script
 # with the parameter -help.
@@ -440,37 +445,18 @@
 
 maintenance_category = 'cite_error_refs_without_references_category'
 
+_ref_regex = re.compile('</ref>', re.IGNORECASE)
+_references_regex = re.compile('<references.*?/>', re.IGNORECASE)
 
-class XmlDumpNoReferencesPageGenerator(object):
 
-    """
-    Generator which will yield Pages that might lack a references tag.
+def _match_xml_page_text(text):
+    """Match page text."""
+    text = textlib.removeDisabledParts(text)
+    return _ref_regex.search(text) and not _references_regex.search(text)
 
-    These pages will be retrieved from a local XML dump file
-    (pages-articles or pages-meta-current).
-    """
 
-    def __init__(self, xmlFilename):
-        """
-        Constructor.
-
-        Arguments:
-        * xmlFilename - The dump's path, either absolute or relative
-        """
-        self.xmlFilename = xmlFilename
-        self.refR = re.compile('</ref>', re.IGNORECASE)
-        # The references tab can contain additional spaces and a group
-        # attribute.
-        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
-
-    def __iter__(self):
-        """XML iterator."""
-        from pywikibot import xmlreader
-        dump = xmlreader.XmlDump(self.xmlFilename)
-        for entry in dump.parse():
-            text = textlib.removeDisabledParts(entry.text)
-            if self.refR.search(text) and not self.referencesR.search(text):
-                yield pywikibot.Page(pywikibot.Site(), entry.title)
+XmlDumpNoReferencesPageGenerator = partial(
+    XMLDumpPageGenerator, text_predicate=_match_xml_page_text)
 
 
 class NoReferencesBot(Bot):
@@ -488,8 +474,8 @@
         self.site = pywikibot.Site()
         self.comment = i18n.twtranslate(self.site, 'noreferences-add-tag')
 
-        self.refR = re.compile('</ref>', re.IGNORECASE)
-        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
+        self.refR = _ref_regex
+        self.referencesR = _references_regex
         self.referencesTagR = re.compile('<references>.*?</references>',
                                          re.IGNORECASE | re.DOTALL)
         try:
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 3035fc1..328b904 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -56,14 +56,18 @@
 import sys
 import io
 
+from functools import partial
+
 import pywikibot
 
-from pywikibot import i18n, pagegenerators, textlib, xmlreader, Bot
+from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot.pagegenerators import (
+    XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
 from pywikibot.tools.formatter import color_format
 
 from scripts import noreferences
 
-# TODO: Convert to httlib2
 if sys.version_info[0] > 2:
     from urllib.parse import quote
     from urllib.request import urlopen
@@ -185,41 +189,8 @@
 # ( maintained by User:Dispenser )
 listof404pages = '404-links.txt'
 
-
-class XmlDumpPageGenerator(object):
-
-    """Xml generator that yields pages containing bare references."""
-
-    def __init__(self, xmlFilename, xmlStart, namespaces, site=None):
-        self.xmlStart = xmlStart
-        self.namespaces = namespaces
-        self.skipping = bool(xmlStart)
-        self.site = site or pywikibot.Site()
-
-        dump = xmlreader.XmlDump(xmlFilename)
-        self.parser = dump.parse()
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        while True:
-            try:
-                entry = next(self.parser)
-            except StopIteration:
-                raise
-            if self.skipping:
-                if entry.title != self.xmlStart:
-                    continue
-                self.skipping = False
-            page = pywikibot.Page(self.site, entry.title)
-            if not self.namespaces == []:
-                if page.namespace() not in self.namespaces:
-                    continue
-            if linksInRef.search(entry.text):
-                return page
-
-    __next__ = next
+XmlDumpPageGenerator = partial(
+    _XMLDumpPageGenerator, text_predicate=linksInRef.search)
 
 
 class RefLink(object):
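
Note how reflinks keeps its public XmlDumpPageGenerator name: functools.partial
bakes the script-specific predicate into the shared class, and call sites keep
their old positional form.  A sketch of the call-through, with a placeholder
predicate and dump path; constructing the generator still needs a configured
pywikibot, since the shared __init__ falls back to pywikibot.Site():

from functools import partial

from pywikibot.pagegenerators import (
    XMLDumpPageGenerator as _XMLDumpPageGenerator,
)

# Placeholder predicate standing in for linksInRef.search.
XmlDumpPageGenerator = partial(
    _XMLDumpPageGenerator, text_predicate=lambda text: 'http://' in text)

# Old three-argument call sites keep working; the positional arguments
# map onto filename, start and namespaces of the shared constructor.
gen = XmlDumpPageGenerator('pages-articles.xml', 'Example', [0])
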
diff --git a/scripts/template.py b/scripts/template.py
index facda7c..e789176 100755
--- a/scripts/template.py
+++ b/scripts/template.py
@@ -118,12 +118,14 @@
 
 import pywikibot
 
-from pywikibot import i18n, pagegenerators, xmlreader, Bot
+from pywikibot import i18n, pagegenerators, Bot
 from pywikibot.exceptions import ArgumentDeprecationWarning
+from pywikibot.pagegenerators import XMLDumpPageGenerator
+
 from scripts.replace import ReplaceRobot as ReplaceBot
 
 
-class XmlDumpTemplatePageGenerator(object):
+class XmlDumpTemplatePageGenerator(XMLDumpPageGenerator):
 
     """
    Generator which yields Pages that transclude a template.
@@ -144,11 +146,7 @@
         """
         self.templates = templates
         self.xmlfilename = xmlfilename
-
-    def __iter__(self):
-        """Yield page objects until the entire XML dump has been read."""
         mysite = pywikibot.Site()
-        dump = xmlreader.XmlDump(self.xmlfilename)
         # regular expression to find the original template.
         # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
         # The old syntax, {{msg:vfd}}, will also be found.
@@ -164,10 +162,9 @@
         templateRegex = re.compile(
             r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}'
             % '|'.join(templatePatterns))
-        for entry in dump.parse():
-            if templateRegex.search(entry.text):
-                page = pywikibot.Page(mysite, entry.title)
-                yield page
+
+        super(XmlDumpTemplatePageGenerator, self).__init__(
+            xmlfilename, site=mysite, text_predicate=templateRegex.search)
 
 
 class TemplateRobot(ReplaceBot):
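
template.py takes the other route: it subclasses the shared generator, builds
its regex in the constructor, and forwards a bound search method as
text_predicate.  A stripped-down sketch of that shape; the class name and
pattern here are hypothetical, and unlike the merged regex above this one
ignores the msg: prefix and template parameters:

import re

import pywikibot
from pywikibot.pagegenerators import XMLDumpPageGenerator


class SimpleTemplatePageGenerator(XMLDumpPageGenerator):

    """Yield pages whose dump text transcludes one of the templates."""

    def __init__(self, template_titles, xmlfilename):
        # Build the predicate before delegating to the shared class.
        pattern = '|'.join(re.escape(title) for title in template_titles)
        template_regex = re.compile(r'\{\{ *(?:%s)' % pattern)
        super(SimpleTemplatePageGenerator, self).__init__(
            xmlfilename, site=pywikibot.Site(),
            text_predicate=template_regex.search)
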
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 23afde0..015f108 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -108,6 +108,7 @@
 import time
 import sys
 
+from functools import partial
 from warnings import warn
 
 try:
@@ -117,8 +118,11 @@
 
 import pywikibot
 
-from pywikibot import i18n, config, pagegenerators, textlib, xmlreader, weblib
+from pywikibot import i18n, config, pagegenerators, textlib, weblib
 from pywikibot.bot import ExistingPageBot, SingleSiteBot
+from pywikibot.pagegenerators import (
+    XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
 from pywikibot.tools.formatter import color_format
 
 # TODO: Convert to httlib2
@@ -247,48 +251,8 @@
         yield m.group('urlb')
 
 
-class XmlDumpPageGenerator(object):
-
-    """Xml generator that yiels pages containing a web link."""
-
-    def __init__(self, xmlFilename, xmlStart, namespaces):
-        self.xmlStart = xmlStart
-        self.namespaces = namespaces
-        self.skipping = bool(xmlStart)
-        self.site = pywikibot.Site()
-
-        dump = xmlreader.XmlDump(xmlFilename)
-        self.parser = dump.parse()
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        try:
-            for entry in self.parser:
-                if self.skipping:
-                    if entry.title != self.xmlStart:
-                        continue
-                    self.skipping = False
-                page = pywikibot.Page(self.site, entry.title)
-                if self.namespaces:
-                    if page.namespace() not in self.namespaces:
-                        continue
-                found = False
-                for url in weblinksIn(entry.text):
-                    found = True
-                if found:
-                    return page
-        except KeyboardInterrupt:
-            try:
-                if not self.skipping:
-                    pywikibot.output(
-                        u'To resume, use "-xmlstart:%s" on the command line.'
-                        % entry.title)
-            except NameError:
-                pass
-
-    __next__ = next
+XmlDumpPageGenerator = partial(
+    _XMLDumpPageGenerator, text_predicate=weblinksIn)
 
 
 class NotAnURLError(BaseException):
diff --git a/tests/reflinks_tests.py b/tests/reflinks_tests.py
index 603371f..f154738 100644
--- a/tests/reflinks_tests.py
+++ b/tests/reflinks_tests.py
@@ -27,8 +27,8 @@
     def test_non_bare_ref_urls(self):
         """Test pages without bare references are not processed."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('article-pear-0.10.xml'),
-            xmlStart=u'Pear',
+            filename=join_xml_data_path('article-pear-0.10.xml'),
+            start='Pear',
             namespaces=[0, 1],
             site=self.get_site())
         pages = list(gen)
@@ -37,8 +37,8 @@
     def test_simple_bare_refs(self):
         """Test simple bare references in multiple namespaces."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake page',
             namespaces=[0, 1],
             site=self.get_site())
         pages = list(gen)
@@ -48,8 +48,8 @@
     def test_namespace_empty_list(self):
         """Test namespaces=[] processes all namespaces."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start=u'Fake page',
             namespaces=[],
             site=self.get_site())
         pages = list(gen)
@@ -60,8 +60,8 @@
     def test_namespace_None(self):
         """Test namespaces=None processes all namespaces."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake page',
             namespaces=None,
             site=self.get_site())
         pages = list(gen)
@@ -72,8 +72,8 @@
     def test_namespace_string_ids(self):
         """Test namespaces with ids as string."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake page',
             namespaces=["0", "1"],
             site=self.get_site())
         pages = list(gen)
@@ -83,8 +83,8 @@
     def test_namespace_names(self):
         """Test namespaces with namespace names."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake page',
             namespaces=["Talk"],
             site=self.get_site())
         pages = list(gen)
@@ -95,8 +95,8 @@
     def test_start_with_underscore(self):
         """Test with underscore in start page title."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=u'Fake_page',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake_page',
             namespaces=[0, 1],
             site=self.get_site())
         pages = list(gen)
@@ -106,8 +106,8 @@
     def test_without_start(self):
         """Test without a start page title."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart=None,
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start=None,
             namespaces=[0, 1],
             site=self.get_site())
         pages = list(gen)
@@ -118,8 +118,8 @@
     def test_start_prefix(self):
         """Test with a prefix as a start page title."""
         gen = XmlDumpPageGenerator(
-            xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
-            xmlStart='Fake',
+            filename=join_xml_data_path('dummy-reflinks.xml'),
+            start='Fake',
             namespaces=[0, 1],
             site=self.get_site())
         pages = list(gen)
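
Because the new constructor carries @deprecated_args(xmlFilename='filename',
xmlStart='start'), un-migrated callers are not broken: the old keyword
spellings are remapped with a deprecation warning rather than raising a
TypeError.  A sketch, where 'dump.xml' is a placeholder and a configured
pywikibot is assumed:

from pywikibot.pagegenerators import XMLDumpPageGenerator

# Old spelling: still accepted, but emits a deprecation warning.
gen = XMLDumpPageGenerator(xmlFilename='dump.xml', xmlStart='Pear')

# New spelling, matching the updated tests above.
gen = XMLDumpPageGenerator(filename='dump.xml', start='Pear')
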
-- 
To view, visit https://gerrit.wikimedia.org/r/239658
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I5b6268673f5db5cc9506bc0e24ab70f72d9af573
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
Pywikibot-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits