jenkins-bot has submitted this change and it was merged.

Change subject: [IMPROV] page: re.sub instead of manual iteration
......................................................................


[IMPROV] page: re.sub instead of manual iteration

The re.sub function accepts a callable as the replacement, which lets it
replace every match automatically without manually stitching the text
together.

Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
---
M pywikibot/page.py
M tests/page_tests.py
2 files changed, 33 insertions(+), 34 deletions(-)

Approvals:
  Mpaa: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/page.py b/pywikibot/page.py
index f38dc5c..8f9781b 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -4814,42 +4814,33 @@
     # ensuring that illegal &#129; &#141; and &#157, which have no known values,
     # don't get converted to chr(129), chr(141) or chr(157)
     ignore = set(ignore) | set([129, 141, 157])
-    result = u''
-    i = 0
-    found = True
-    while found:
-        text = text[i:]
-        match = entityR.search(text)
-        if match:
-            unicodeCodepoint = None
-            if match.group('decimal'):
-                unicodeCodepoint = int(match.group('decimal'))
-            elif match.group('hex'):
-                unicodeCodepoint = int(match.group('hex'), 16)
-            elif match.group('name'):
-                name = match.group('name')
-                if name in htmlentitydefs.name2codepoint:
-                    # We found a known HTML entity.
-                    unicodeCodepoint = htmlentitydefs.name2codepoint[name]
-            result += text[:match.start()]
-            try:
-                unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
-            except KeyError:
-                pass
-            if unicodeCodepoint and unicodeCodepoint not in ignore:
-                if unicodeCodepoint > sys.maxunicode:
-                    # solve narrow Python 2 build exception (UTF-16)
-                    result += eval(r"u'\U{:08x}'".format(unicodeCodepoint))
-                else:
-                    result += chr(unicodeCodepoint)
+
+    def handle_entity(match):
+        if match.group('decimal'):
+            unicodeCodepoint = int(match.group('decimal'))
+        elif match.group('hex'):
+            unicodeCodepoint = int(match.group('hex'), 16)
+        elif match.group('name'):
+            name = match.group('name')
+            if name in htmlentitydefs.name2codepoint:
+                # We found a known HTML entity.
+                unicodeCodepoint = htmlentitydefs.name2codepoint[name]
             else:
-                # Leave the entity unchanged
-                result += text[match.start():match.end()]
-            i = match.end()
+                unicodeCodepoint = False
+        try:
+            unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
+        except KeyError:
+            pass
+        if unicodeCodepoint and unicodeCodepoint not in ignore:
+            if unicodeCodepoint > sys.maxunicode:
+                # solve narrow Python 2 build exception (UTF-16)
+                return eval(r"u'\U{:08x}'".format(unicodeCodepoint))
+            else:
+                return chr(unicodeCodepoint)
         else:
-            result += text
-            found = False
-    return result
+            # Leave the entity unchanged
+            return match.group(0)
+    return entityR.sub(handle_entity, text)
 
 
 def UnicodeToAsciiHtml(s):
diff --git a/tests/page_tests.py b/tests/page_tests.py
index bc77765..f2ab434 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -796,12 +796,20 @@
         self.assertEqual(pywikibot.page.html2unicode('A&amp;O'), 'A&O')
         self.assertEqual(pywikibot.page.html2unicode('py'), 'py')
         self.assertEqual(pywikibot.page.html2unicode('𐀀'), 
u'\U00010000')
+        self.assertEqual(pywikibot.page.html2unicode('p&y'), 
'p&y')
 
     @unittest.expectedFailure
     def test_recursive_entities(self):
         """Test recursive entities."""
         self.assertEqual(pywikibot.page.html2unicode('A&O'), 'A&O')
 
+    def test_invalid_entities(self):
+        """Test texts with invalid entities."""
+        self.assertEqual(pywikibot.page.html2unicode('A&notaname;O'), 'A&notaname;O')
+        self.assertEqual(pywikibot.page.html2unicode('A&#7f;O'), 'A&#7f;O')
+        self.assertEqual(pywikibot.page.html2unicode('&#7f'), '&#7f')
+        self.assertEqual(pywikibot.page.html2unicode('&#x70y'), '&#x70y')
+
 
 if __name__ == '__main__':
     try:

-- 
To view, visit https://gerrit.wikimedia.org/r/196424
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com>
Gerrit-Reviewer: Ricordisamoa <ricordisa...@openmailbox.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to