Author: Matti Picus <matti.pi...@gmail.com> Branch: py3.6 Changeset: r96067:813c99f810ac Date: 2019-02-18 15:07 +0200 http://bitbucket.org/pypy/pypy/changeset/813c99f810ac/
Log: collect surrogate pairs for error diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -239,8 +239,19 @@ upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos) pos = e.pos assert pos >= 0 + # Try to collect surrogates in one pass + # XXX do we care about performance in this case? + # XXX should this loop for more than one pair? + delta = 1 + uchr = rutf8.codepoint_at_pos(s, pos) + if 0xD800 <= uchr <= 0xDBFF: + pos = rutf8.next_codepoint_pos(s, pos) + if pos < size: + uchr = rutf8.codepoint_at_pos(s, pos) + if 0xDC00 <= uchr <= 0xDFFF: + delta += 1 res, newindex, rettype = errorhandler(errors, 'utf8', - 'surrogates not allowed', s, upos, upos + 1) + 'surrogates not allowed', s, upos, upos + delta) if rettype == 'u': for cp in rutf8.Utf8StringIterator(res): result.append(chr(cp)) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit