jenkins-bot has submitted this change and it was merged. Change subject: [IMPROV] page: re.sub instead of manual iteration ......................................................................
[IMPROV] page: re.sub instead of manual iteration The re.sub method accepts a callable as the replacement, so each match can be substituted automatically without manually stitching the text together. Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a --- M pywikibot/page.py M tests/page_tests.py 2 files changed, 33 insertions(+), 34 deletions(-) Approvals: Mpaa: Looks good to me, approved jenkins-bot: Verified diff --git a/pywikibot/page.py b/pywikibot/page.py index f38dc5c..8f9781b 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -4814,42 +4814,33 @@ # ensuring that illegal &#129; &#141; and &#157;, which have no known values, # don't get converted to chr(129), chr(141) or chr(157) ignore = set(ignore) | set([129, 141, 157]) - result = u'' - i = 0 - found = True - while found: - text = text[i:] - match = entityR.search(text) - if match: - unicodeCodepoint = None - if match.group('decimal'): - unicodeCodepoint = int(match.group('decimal')) - elif match.group('hex'): - unicodeCodepoint = int(match.group('hex'), 16) - elif match.group('name'): - name = match.group('name') - if name in htmlentitydefs.name2codepoint: - # We found a known HTML entity. - unicodeCodepoint = htmlentitydefs.name2codepoint[name] - result += text[:match.start()] - try: - unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint] - except KeyError: - pass - if unicodeCodepoint and unicodeCodepoint not in ignore: - if unicodeCodepoint > sys.maxunicode: - # solve narrow Python 2 build exception (UTF-16) - result += eval(r"u'\U{:08x}'".format(unicodeCodepoint)) - else: - result += chr(unicodeCodepoint) + + def handle_entity(match): + if match.group('decimal'): + unicodeCodepoint = int(match.group('decimal')) + elif match.group('hex'): + unicodeCodepoint = int(match.group('hex'), 16) + elif match.group('name'): + name = match.group('name') + if name in htmlentitydefs.name2codepoint: + # We found a known HTML entity. 
+ unicodeCodepoint = htmlentitydefs.name2codepoint[name] else: - # Leave the entity unchanged - result += text[match.start():match.end()] - i = match.end() + unicodeCodepoint = False + try: + unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint] + except KeyError: + pass + if unicodeCodepoint and unicodeCodepoint not in ignore: + if unicodeCodepoint > sys.maxunicode: + # solve narrow Python 2 build exception (UTF-16) + return eval(r"u'\U{:08x}'".format(unicodeCodepoint)) + else: + return chr(unicodeCodepoint) else: - result += text - found = False - return result + # Leave the entity unchanged + return match.group(0) + return entityR.sub(handle_entity, text) def UnicodeToAsciiHtml(s): diff --git a/tests/page_tests.py b/tests/page_tests.py index bc77765..f2ab434 100644 --- a/tests/page_tests.py +++ b/tests/page_tests.py @@ -796,12 +796,20 @@ self.assertEqual(pywikibot.page.html2unicode('A&O'), 'A&O') self.assertEqual(pywikibot.page.html2unicode('py'), 'py') self.assertEqual(pywikibot.page.html2unicode('𐀀'), u'\U00010000') + self.assertEqual(pywikibot.page.html2unicode('p&y'), 'p&y') @unittest.expectedFailure def test_recursive_entities(self): """Test recursive entities.""" self.assertEqual(pywikibot.page.html2unicode('A&amp;O'), 'A&O') + def test_invalid_entities(self): + """Test texts with invalid entities.""" + self.assertEqual(pywikibot.page.html2unicode('A¬aname;O'), 'A¬aname;O') + self.assertEqual(pywikibot.page.html2unicode('Af;O'), 'Af;O') + self.assertEqual(pywikibot.page.html2unicode('f'), 'f') + self.assertEqual(pywikibot.page.html2unicode('py'), 'py') + if __name__ == '__main__': try: -- To view, visit https://gerrit.wikimedia.org/r/196424 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a Gerrit-PatchSet: 3 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: XZise <commodorefabia...@gmx.de> Gerrit-Reviewer: John Vandenberg 
<jay...@gmail.com> Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com> Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl> Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com> Gerrit-Reviewer: Ricordisamoa <ricordisa...@openmailbox.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits