jenkins-bot has submitted this change and it was merged.
Change subject: Add textlib._MultiTemplateMatchBuilder
......................................................................
Add textlib._MultiTemplateMatchBuilder
Various template-matching regexes exist throughout pywikibot
and its scripts, each with its own matching issues.
Create a _MultiTemplateMatchBuilder capable of providing
regex objects for templates, using the regex from
template.TemplateRobot, and use it to fix bugs in cosmetic_changes
replaceDeprecatedTemplates.
template.XmlDumpTemplatePageGenerator duplicates template matching
logic found in TemplateRobot. Replace and deprecate
template.XmlDumpTemplatePageGenerator.
Start test module for template script.
Add TODO for other template matching to be converted to using
_MultiTemplateMatchBuilder.
Change-Id: I0deb795b6634b030c9e655e8e1dbbb925480de5b
---
M pywikibot/cosmetic_changes.py
M pywikibot/textlib.py
M scripts/add_text.py
M scripts/category_redirect.py
M scripts/checkimages.py
M scripts/template.py
M tests/__init__.py
M tests/cosmetic_changes_tests.py
A tests/data/xml/dummy-template.xml
A tests/template_bot_tests.py
10 files changed, 325 insertions(+), 20 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index f466b89..0d77cee 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -71,6 +71,7 @@
import pywikibot
from pywikibot import config, textlib
+from pywikibot.textlib import _MultiTemplateMatchBuilder
from pywikibot.tools import deprecate_arg, first_lower, first_upper
from pywikibot.tools import MediaWikiVersion
@@ -306,6 +307,7 @@
5. interwiki links
"""
+ # TODO: T123150
starsList = [
u'bueno',
u'bom interwiki',
@@ -694,6 +696,8 @@
def replaceDeprecatedTemplates(self, text):
exceptions = ['comment', 'math', 'nowiki', 'pre']
+ builder = _MultiTemplateMatchBuilder(self.site)
+
if self.site.family.name in deprecatedTemplates and \
self.site.code in deprecatedTemplates[self.site.family.name]:
for template in deprecatedTemplates[
@@ -704,12 +708,12 @@
new = ''
else:
new = '{{%s}}' % new
- if self.site.namespaces[10].case == 'first-letter':
- old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:]
+
text = textlib.replaceExcept(
text,
- r'\{\{([mM][sS][gG]:)?%s(?P<parameters>\|[^}]+|)}}' % old,
+ builder.pattern(old),
new, exceptions)
+
return text
# from fixes.py
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 8e73b56..79d30ee 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -42,6 +42,7 @@
deprecated,
DeprecatedRegex,
OrderedDict,
+ StringTypes,
UnicodeType,
issue_deprecation_warning
)
@@ -144,6 +145,52 @@
return s
+class _MultiTemplateMatchBuilder(object):
+
+ """Build template matcher."""
+
+ def __init__(self, site):
+ """Constructor."""
+ self.site = site
+
+ def pattern(self, template, flags=re.DOTALL):
+ """Return a compiled regex to match template."""
+ # TODO: add ability to also match contents within the template
+ # TODO: add option for template to be None to match any template
+ # TODO: use NESTED_TEMPLATE_REGEX with <parameters> instead of <params>
+ namespace = self.site.namespaces[10]
+ if isinstance(template, pywikibot.Page):
+ if template.namespace() == 10:
+ old = template.title(withNamespace=False)
+ else:
+ raise ValueError(
+ '{0} is not a template Page object'.format(template))
+ elif isinstance(template, StringTypes):
+ old = template
+ else:
+ raise ValueError(
+ '{0!r} is not a valid template'.format(template))
+
+ if namespace.case == 'first-letter':
+ pattern = '[' + \
+ re.escape(old[0].upper()) + \
+ re.escape(old[0].lower()) + \
+ ']' + re.escape(old[1:])
+ else:
+ pattern = re.escape(old)
+ pattern = re.sub(r'_|\\ ', r'[_ ]', pattern)
+ templateRegex = re.compile(r'\{\{ *(' + ':|'.join(namespace) +
+ r':|[mM][sS][gG]:)?' + pattern +
+ r'(?P<parameters>\s*\|.+?|) *}}',
+ flags)
+ return templateRegex
+
+ def search_any_predicate(self, templates):
+ """Return a predicate that matches any template."""
+ predicates = [self.pattern(template).search for template in templates]
+ return lambda text: any(predicate(text) for predicate in predicates)
+
+
def _create_default_regexes():
"""Fill (and possibly overwrite) _regex_cache with default regexes."""
_regex_cache.update({
diff --git a/scripts/add_text.py b/scripts/add_text.py
index bfe9365..f6049f5 100755
--- a/scripts/add_text.py
+++ b/scripts/add_text.py
@@ -196,6 +196,7 @@
categoriesInside, site,
True)
# Dealing the stars' issue
+ # TODO: T123150
allstars = []
starstext = textlib.removeDisabledParts(text)
for star in starsList:
diff --git a/scripts/category_redirect.py b/scripts/category_redirect.py
index 7305b76..51e3159 100755
--- a/scripts/category_redirect.py
+++ b/scripts/category_redirect.py
@@ -293,6 +293,7 @@
with open(datafile + ".bak", "wb") as f:
cPickle.dump(record, f, protocol=config.pickle_protocol)
# regex to match soft category redirects
+ # TODO: enhance and use textlib._MultiTemplateMatchBuilder
# note that any templates containing optional "category:" are
# incorrect and will be fixed by the bot
template_regex = re.compile(
diff --git a/scripts/checkimages.py b/scripts/checkimages.py
index f34a18f..516efda 100755
--- a/scripts/checkimages.py
+++ b/scripts/checkimages.py
@@ -1405,6 +1405,7 @@
def isTagged(self):
"""Understand if a file is already tagged or not."""
+ # TODO: enhance and use textlib._MultiTemplateMatchBuilder
# Is the image already tagged? If yes, no need to double-check, skip
for i in i18n.translate(self.site, txt_find):
# If there are {{ use regex, otherwise no (if there's not the
diff --git a/scripts/template.py b/scripts/template.py
index 6aa9563..fba031c 100755
--- a/scripts/template.py
+++ b/scripts/template.py
@@ -118,14 +118,16 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, Bot
+from pywikibot import i18n, pagegenerators, textlib, Bot
from pywikibot.exceptions import ArgumentDeprecationWarning
from pywikibot.pagegenerators import XMLDumpPageGenerator
+from pywikibot.tools import deprecated
from scripts.replace import ReplaceRobot as ReplaceBot
+@deprecated('XMLDumpPageGenerator')
class XmlDumpTemplatePageGenerator(XMLDumpPageGenerator):
"""
@@ -220,20 +222,9 @@
replacements = []
exceptions = {}
- namespace = self.site.namespaces[10]
+ builder = textlib._MultiTemplateMatchBuilder(site)
for old, new in self.templates.items():
- if namespace.case == 'first-letter':
- pattern = '[' + \
- re.escape(old[0].upper()) + \
- re.escape(old[0].lower()) + \
- ']' + re.escape(old[1:])
- else:
- pattern = re.escape(old)
- pattern = re.sub(r'_|\\ ', r'[_ ]', pattern)
- templateRegex = re.compile(r'\{\{ *(' + ':|'.join(namespace) +
- r':|[mM][sS][gG]:)?' + pattern +
- r'(?P<parameters>\s*\|.+?|) *}}',
- re.DOTALL)
+ templateRegex = builder.pattern(old)
if self.getOption('subst') and self.getOption('remove'):
replacements.append((templateRegex,
@@ -344,9 +335,14 @@
oldTemplates.append(oldTemplate)
if xmlfilename:
- gen = XmlDumpTemplatePageGenerator(oldTemplates, xmlfilename)
+ builder = textlib._MultiTemplateMatchBuilder(site)
+ predicate = builder.search_any_predicate(oldTemplates)
+
+ gen = XmlDumpTemplatePageGenerator(
+ xmlfilename, site=site, text_predicate=predicate)
else:
gen = genFactory.getCombinedGenerator()
+
if not gen:
gens = [
pagegenerators.ReferringPageGenerator(t,
onlyTemplateInclusion=True)
diff --git a/tests/__init__.py b/tests/__init__.py
index 1596d05..c7a66bf 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -138,6 +138,7 @@
'isbn',
'protectbot',
'reflinks',
+ 'template_bot',
'replacebot',
'uploadbot',
'weblinkchecker',
diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py
index 4144a33..cf8b7ae 100644
--- a/tests/cosmetic_changes_tests.py
+++ b/tests/cosmetic_changes_tests.py
@@ -102,10 +102,10 @@
def test_replaceDeprecatedTemplates(self):
"""Test replaceDeprecatedTemplates method."""
self.assertEqual(
- '{{Quellen fehlen }}'
+ '{{Belege fehlen}}'
'{{Belege fehlen| }}'
'{{Belege fehlen|foo}}'
- '{{Quellen_fehlen|foo}}',
+ '{{Belege fehlen|foo}}',
self.cct.replaceDeprecatedTemplates(
'{{Quellen fehlen }}'
'{{Quellen fehlen| }}'
diff --git a/tests/data/xml/dummy-template.xml b/tests/data/xml/dummy-template.xml
new file mode 100644
index 0000000..1673c75
--- /dev/null
+++ b/tests/data/xml/dummy-template.xml
@@ -0,0 +1,108 @@
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
+ <siteinfo>
+ <sitename>Wikipedia</sitename>
+ <dbname>enwiki</dbname>
+ <base>http://en.wikipedia.org/wiki/Main_Page</base>
+ <generator>MediaWiki 1.25wmf12</generator>
+ <case>first-letter</case>
+ <namespaces>
+ <namespace key="-2" case="first-letter">Media</namespace>
+ <namespace key="-1" case="first-letter">Special</namespace>
+ <namespace key="0" case="first-letter" />
+ <namespace key="1" case="first-letter">Talk</namespace>
+ <namespace key="2" case="first-letter">User</namespace>
+ <namespace key="3" case="first-letter">User talk</namespace>
+ <namespace key="4" case="first-letter">Wikipedia</namespace>
+ <namespace key="5" case="first-letter">Wikipedia talk</namespace>
+ <namespace key="6" case="first-letter">File</namespace>
+ <namespace key="7" case="first-letter">File talk</namespace>
+ <namespace key="8" case="first-letter">MediaWiki</namespace>
+ <namespace key="9" case="first-letter">MediaWiki talk</namespace>
+ <namespace key="10" case="first-letter">Template</namespace>
+ <namespace key="11" case="first-letter">Template talk</namespace>
+ <namespace key="12" case="first-letter">Help</namespace>
+ <namespace key="13" case="first-letter">Help talk</namespace>
+ <namespace key="14" case="first-letter">Category</namespace>
+ <namespace key="15" case="first-letter">Category talk</namespace>
+ <namespace key="100" case="first-letter">Portal</namespace>
+ <namespace key="101" case="first-letter">Portal talk</namespace>
+ <namespace key="108" case="first-letter">Book</namespace>
+ <namespace key="109" case="first-letter">Book talk</namespace>
+ <namespace key="118" case="first-letter">Draft</namespace>
+ <namespace key="119" case="first-letter">Draft talk</namespace>
+ <namespace key="446" case="first-letter">Education Program</namespace>
+ <namespace key="447" case="first-letter">Education Program talk</namespace>
+ <namespace key="710" case="first-letter">TimedText</namespace>
+ <namespace key="711" case="first-letter">TimedText talk</namespace>
+ <namespace key="828" case="first-letter">Module</namespace>
+ <namespace key="829" case="first-letter">Module talk</namespace>
+ <namespace key="2600" case="first-letter">Topic</namespace>
+ </namespaces>
+ </siteinfo>
+ <page>
+ <title>Fake page with msg</title>
+ <ns>0</ns>
+ <id>12345</id>
+ <revision>
+ <id>123456789</id>
+ <parentid>123456788</parentid>
+ <timestamp>2014-12-24T01:01:01Z</timestamp>
+ <contributor>
+ <username>John Vandenberg</username>
+ <id>31009137</id>
+ </contributor>
+ <minor/>
+ <comment>Foo</comment>
+ <model>wikitext</model>
+ <format>text/x-wiki</format>
+ <text xml:space="preserve" bytes="1">
+ {{ MsG:foo }}
+ </text>
+ <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b1</sha1>
+ </revision>
+ </page>
+ <page>
+ <title>Fake page with unnecessary template prefix</title>
+ <ns>1</ns>
+ <id>54321</id>
+ <revision>
+ <id>987654321</id>
+ <parentid>987654320</parentid>
+ <timestamp>2014-12-24T01:01:02Z</timestamp>
+ <contributor>
+ <username>John Vandenberg</username>
+ <id>31009137</id>
+ </contributor>
+ <minor/>
+ <comment>Lets discuss foo</comment>
+ <model>wikitext</model>
+ <format>text/x-wiki</format>
+ <text xml:space="preserve" bytes="1">
+ {{TEMPLATE:bar}}
+ </text>
+ <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b2</sha1>
+ </revision>
+ </page>
+ <page>
+ <title>Fake page with nested template</title>
+ <ns>1</ns>
+ <id>54322</id>
+ <revision>
+ <id>987654322</id>
+ <parentid>987654320</parentid>
+ <timestamp>2014-12-24T01:01:02Z</timestamp>
+ <contributor>
+ <username>John Vandenberg</username>
+ <id>31009137</id>
+ </contributor>
+ <minor/>
+ <comment>Lets discuss foo</comment>
+ <model>wikitext</model>
+ <format>text/x-wiki</format>
+ <text xml:space="preserve" bytes="1">
+ {{baz|{{boo|}} }}
+ </text>
+ <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b2</sha1>
+ </revision>
+ </page>
+</mediawiki>
diff --git a/tests/template_bot_tests.py b/tests/template_bot_tests.py
new file mode 100644
index 0000000..c27dd67
--- /dev/null
+++ b/tests/template_bot_tests.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+"""Test template bot module."""
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import absolute_import, unicode_literals
+
+__version__ = '$Id$'
+
+import pywikibot
+
+from pywikibot.pagegenerators import XMLDumpPageGenerator
+from pywikibot.textlib import _MultiTemplateMatchBuilder
+
+from tests import join_xml_data_path
+from tests.aspects import unittest, TestCase
+
+
+class TestXMLPageGenerator(TestCase):
+
+ """Test XML Page generator."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ dry = True
+
+ def test_no_match(self):
+ """Test pages without any desired templates."""
+ template = pywikibot.Page(self.site, 'Template:foobar')
+ builder = _MultiTemplateMatchBuilder(self.site)
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 0)
+
+ def test_match(self):
+ """Test pages with one match without parameters."""
+ template = pywikibot.Page(self.site, 'Template:stack begin')
+ builder = _MultiTemplateMatchBuilder(self.site)
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(pages, ['Pear'],
+ site=self.site)
+
+ def test_match_with_params(self):
+ """Test pages with one match with parameters."""
+ template = pywikibot.Page(self.site, 'Template:Taxobox')
+ builder = _MultiTemplateMatchBuilder(self.site)
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(pages, ['Pear'],
+ site=self.site)
+
+ def test_match_any(self):
+ """Test pages with one of many matches."""
+ template1 = pywikibot.Page(self.site, 'Template:stack begin')
+ template2 = pywikibot.Page(self.site, 'Template:foobar')
+ builder = _MultiTemplateMatchBuilder(self.site)
+
+ predicate = builder.search_any_predicate([template1, template2])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(pages, ['Pear'],
+ site=self.site)
+
+ # reorder templates
+ predicate = builder.search_any_predicate([template2, template1])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(pages, ['Pear'],
+ site=self.site)
+
+ def test_match_msg(self):
+ """Test pages with {{msg:..}}."""
+ template = pywikibot.Page(self.site, 'Template:Foo')
+ builder = _MultiTemplateMatchBuilder(self.site)
+
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('dummy-template.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(pages, ['Fake page with msg'],
+ site=self.site)
+
+ @unittest.expectedFailure
+ def test_match_unnecessary_template_prefix(self):
+ """Test pages with {{template:..}}."""
+ template = pywikibot.Page(self.site, 'Template:Bar')
+ builder = _MultiTemplateMatchBuilder(self.site)
+
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('dummy-template.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(
+ pages, ['Fake page with unnecessary template prefix'],
+ site=self.site)
+
+ def test_nested_match(self):
+ """Test pages with one match inside another template."""
+ template = pywikibot.Page(self.site, 'Template:boo')
+ builder = _MultiTemplateMatchBuilder(self.site)
+ predicate = builder.search_any_predicate([template])
+ gen = XMLDumpPageGenerator(
+ filename=join_xml_data_path('dummy-template.xml'),
+ site=self.site,
+ text_predicate=predicate)
+ pages = list(gen)
+ self.assertEqual(len(pages), 1)
+ self.assertPagelistTitles(
+ pages, ['Fake page with nested template'],
+ site=self.site)
+
+
+if __name__ == "__main__":
+ unittest.main()
--
To view, visit https://gerrit.wikimedia.org/r/251906
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I0deb795b6634b030c9e655e8e1dbbb925480de5b
Gerrit-PatchSet: 7
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits